In [1]:
HF_TOKEN = ""

In [2]:
#!pip install -q datasets trl peft bitsandbytes sentencepiece wandb

[0m

In [3]:
import gc
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import DPOTrainer
import bitsandbytes as bnb
import wandb
import os
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout
from datetime import datetime
from jinja2 import Template
from huggingface_hub import ModelCard, HfApi

In [4]:
username = widgets.Text(
    value="CorticalStack",
    description='HF hub user',
    disabled=False
)
username.style.description_width = 'initial'
display(username)

ref_model_id = widgets.Text(
    value="teknium/OpenHermes-2.5-Mistral-7B",
    description='Ref model id',
    disabled=False
)
ref_model_id.style.description_width = 'initial'
display(ref_model_id)

new_model_id = widgets.Text(
    value="mistral-7b-neuralhermes-2.5-dpo",
    description='New model id',
    disabled=False
)
new_model_id.style.description_width = 'initial'
display(new_model_id)

dpo_dataset = widgets.Text(
    value="Intel/orca_dpo_pairs",
    description='DPO dataset',
    disabled=False
)
dpo_dataset.style.description_width = 'initial'
display(dpo_dataset)

license = widgets.Text(
    value="apache-2.0",
    description='License',
    disabled=False
)
license.style.description_width = 'initial'
display(license)

Text(value='CorticalStack', description='HF hub user', style=TextStyle(description_width='initial'))

Text(value='teknium/OpenHermes-2.5-Mistral-7B', description='Ref model id', style=TextStyle(description_width=…

Text(value='mistral-7b-neuralhermes-2.5-dpo', description='New model id', style=TextStyle(description_width='i…

Text(value='Intel/orca_dpo_pairs', description='DPO dataset', style=TextStyle(description_width='initial'))

Text(value='apache-2.0', description='License', style=TextStyle(description_width='initial'))

In [5]:
r = widgets.IntText(
    value=16,
    description='r',
    disabled=False
)
r.style.description_width = 'initial'
display(r)

lora_alpha = widgets.IntText(
    value=16,
    description='lora alpha',
    disabled=False
)
lora_alpha.style.description_width = 'initial'
display(lora_alpha)

lora_dropout = widgets.FloatText(
    value=0.05,
    description='lora dropout',
    disabled=False
)
lora_dropout.style.description_width = 'initial'
display(lora_dropout)

IntText(value=16, description='r', style=DescriptionStyle(description_width='initial'))

IntText(value=16, description='lora alpha', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.05, description='lora dropout', style=DescriptionStyle(description_width='initial'))

In [6]:
max_steps = widgets.IntText(
    value=100,
    description='Max steps',
    disabled=False
)
max_steps.style.description_width = 'initial'
display(max_steps)

lr_scheduler_type = widgets.Text(
    value="cosine",
    description='LR schedule type',
    disabled=False
)
lr_scheduler_type.style.description_width = 'initial'
display(lr_scheduler_type)

warmup_ratio = widgets.FloatText(
    value=0.01,
    description='Warmup ratio',
    disabled=False
)
warmup_ratio.style.description_width = 'initial'
display(warmup_ratio)

optim = widgets.Text(
    value="paged_adamw_32bit",
    description='Optimizer',
    disabled=False
)
optim.style.description_width = 'initial'
display(optim)

learning_rate = widgets.FloatText(
    value="5e-5",
    description='Learning rate',
    disabled=False
)
learning_rate.style.description_width = 'initial'
display(learning_rate)

per_device_train_batch_size = widgets.IntText(
    value=4,
    description='Per device train batch size',
    disabled=False
)
per_device_train_batch_size.style.description_width = 'initial'
display(per_device_train_batch_size)

gradient_accumulation_steps = widgets.IntText(
    value=4,
    description='Gradient accumulation steps',
    disabled=False
)
gradient_accumulation_steps.style.description_width = 'initial'
display(gradient_accumulation_steps)

max_prompt_length = widgets.IntText(
    value=1024,
    description='Max prompt length',
    disabled=False
)
max_prompt_length.style.description_width = 'initial'
display(max_prompt_length)

max_length = widgets.IntText(
    value=1536,
    description='Max length',
    disabled=False
)
max_length.style.description_width = 'initial'
display(max_length)

beta = widgets.FloatText(
    value="0.1",
    description='Beta',
    disabled=False
)
beta.style.description_width = 'initial'
display(beta)

IntText(value=100, description='Max steps', style=DescriptionStyle(description_width='initial'))

Text(value='cosine', description='LR schedule type', style=TextStyle(description_width='initial'))

FloatText(value=0.01, description='Warmup ratio', style=DescriptionStyle(description_width='initial'))

Text(value='paged_adamw_32bit', description='Optimizer', style=TextStyle(description_width='initial'))

FloatText(value=5e-05, description='Learning rate', style=DescriptionStyle(description_width='initial'))

IntText(value=4, description='Per device train batch size', style=DescriptionStyle(description_width='initial'…

IntText(value=4, description='Gradient accumulation steps', style=DescriptionStyle(description_width='initial'…

IntText(value=1024, description='Max prompt length', style=DescriptionStyle(description_width='initial'))

IntText(value=1536, description='Max length', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.1, description='Beta', style=DescriptionStyle(description_width='initial'))

In [7]:
wandb.login()

wandb_project = "ft-" + new_model_id.value
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

[34m[1mwandb[0m: Currently logged in as: [33mcorticalstack[0m ([33mcorticalstackteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
def chatml_format(example):
    # Format system
    if len(example['system']) > 0:
        message = {"role": "system", "content": example['system']}
        system = tokenizer.apply_chat_template([message], tokenize=False)
    else:
        system = ""

    # Format instruction
    message = {"role": "user", "content": example['question']}
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)

    # Format chosen answer
    chosen = example['chosen'] + "<|im_end|>\n"

    # Format rejected answer
    rejected = example['rejected'] + "<|im_end|>\n"

    return {
        "prompt": system + prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

# Load dataset
dataset = load_dataset(f"{dpo_dataset.value}")['train']

# Save columns
original_columns = dataset.column_names

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(f"{ref_model_id.value}")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Format dataset
dataset = dataset.map(
    chatml_format,
    remove_columns=original_columns
)

# Print sample
dataset[1]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'chosen': 'Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.<|im_end|>\n',
 'rejected': ' Sure! Here\'s a sentence that describes all the data you provided:\n\n"Midsummer House is a moderately priced Chinese restaurant with a customer rating of 3 out of 5, located near All Bar One, offering a variety of delicious dishes."<|im_end|>\n',
 'prompt': '<|im_start|>system\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\n<|im_start|>user\nGenerate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One<|im_end|>\n<|im_start|>assistant\n'}

In [9]:
# LoRA configuration
peft_config = LoraConfig(
    r=r.value,
    lora_alpha=lora_alpha.value,
    lora_dropout=lora_dropout.value,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

# Model to fine-tune
model = AutoModelForCausalLM.from_pretrained(
    ref_model_id.value,
    torch_dtype=torch.float16,
    load_in_4bit=True
)
model.config.use_cache = False

# Reference model
ref_model = AutoModelForCausalLM.from_pretrained(
    ref_model_id.value,
    torch_dtype=torch.float16,
    load_in_4bit=True
)

# Training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size.value,
    gradient_accumulation_steps=gradient_accumulation_steps.value,
    gradient_checkpointing=True,
    learning_rate=learning_rate.value,
    lr_scheduler_type=lr_scheduler_type.value,
    max_steps=max_steps.value,
    save_strategy="no",
    logging_steps=1,
    output_dir=new_model_id.value,
    optim=optim.value,
    warmup_ratio = warmup_ratio.value,
    bf16=True,
    report_to="wandb",
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    #ref_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=beta.value,
    max_prompt_length=max_prompt_length.value,
    max_length=max_length.value,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [10]:
# Fine-tune model with DPO
dpo_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6931
2,0.6931
3,0.0817
4,0.0409
5,0.1266
6,0.1306
7,0.0331
8,0.025
9,0.0145
10,0.0181


TrainOutput(global_step=100, training_loss=0.024510806764007177, metrics={'train_runtime': 2014.7381, 'train_samples_per_second': 0.794, 'train_steps_per_second': 0.05, 'total_flos': 0.0, 'train_loss': 0.024510806764007177, 'epoch': 0.12})

In [11]:
# Save artifacts
dpo_trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")

# Flush memory
del dpo_trainer, model, ref_model
gc.collect()
torch.cuda.empty_cache()

# Reload model in FP16 (instead of NF4)
base_model = AutoModelForCausalLM.from_pretrained(
    ref_model_id.value,
    return_dict=True,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(ref_model_id.value)

# Merge base model with the adapter
model = PeftModel.from_pretrained(base_model, "final_checkpoint")
model = model.merge_and_unload()

# Save model and tokenizer
model.save_pretrained(new_model_id.value)
tokenizer.save_pretrained(new_model_id.value)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('mistral-7b-neuralhermes-2.5-dpo/tokenizer_config.json',
 'mistral-7b-neuralhermes-2.5-dpo/special_tokens_map.json',
 'mistral-7b-neuralhermes-2.5-dpo/tokenizer.model',
 'mistral-7b-neuralhermes-2.5-dpo/added_tokens.json',
 'mistral-7b-neuralhermes-2.5-dpo/tokenizer.json')

In [12]:
template_text = """
---
license: {{ license }}
tags:
- dpo
dataset:
- {{ train_dataset_name }}
base_model:
- {{ model_id }}
---

# {{ new_model_id }}

{{ new_model_id }} is a DPO fine-tuned version of [{{ model_id }}](https://huggingface.co/{{ model_id }}) using the [{{ train_dataset_name }}](https://huggingface.co/datasets/{{ train_dataset_name }}) dataset.

### LoRA
- r: {{ r }}
- LoRA alpha: {{ lora_alpha }}
- LoRA dropout: {{ lora_dropout }}

### Training arguments
- Batch size: {{ per_device_train_batch_size }}
- Gradient accumulation steps: {{ gradient_accumulation_steps }}
- Optimizer: {{ optim }}
- Max steps: {{ max_steps }}
- Learning rate: {{ learning_rate }}
- Learning rate scheduler type: {{ lr_scheduler_type }}
- Beta: {{ beta }}
- Max prompt length: {{ max_prompt_length }}
- Max length: {{ max_length }}
"""

    # Create a Jinja template object
jinja_template = Template(template_text.strip())

In [13]:
# Fill the template
content = jinja_template.render(
          license = license.value,
          train_dataset_name = dpo_dataset.value,
          model_id = ref_model_id.value,
          new_model_id = new_model_id.value,
          r = r.value,
          lora_alpha = lora_alpha.value,
          lora_dropout = lora_dropout.value,
          per_device_train_batch_size = per_device_train_batch_size.value,
          gradient_accumulation_steps = gradient_accumulation_steps.value,
          optim = optim.value,
          max_steps = max_steps.value,
          learning_rate = learning_rate.value,
          lr_scheduler_type = lr_scheduler_type.value,
          beta = beta.value,
          max_prompt_length = max_prompt_length.value,
          max_length = max_length.value
          )

# Save the model card
card = ModelCard(content)
card.save(f"{new_model_id.value}/README.md")

In [14]:
api = HfApi(token=HF_TOKEN)
api.create_repo(
    repo_id=f"{username.value}/{new_model_id.value}",
    repo_type="model",
    exist_ok=True,
)

RepoUrl('https://huggingface.co/CorticalStack/mistral-7b-neuralhermes-2.5-dpo', endpoint='https://huggingface.co', repo_type='model', repo_id='CorticalStack/mistral-7b-neuralhermes-2.5-dpo')

In [15]:
api.upload_folder(
    repo_id=f"{username.value}/{new_model_id.value}",
    folder_path=new_model_id.value,
    token=HF_TOKEN
)

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/mistral-7b-neuralhermes-2.5-dpo/commit/282787d4374fbc17a6266e14e670e9ae7893cf33', commit_message='Upload folder using huggingface_hub', commit_description='', oid='282787d4374fbc17a6266e14e670e9ae7893cf33', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
# Format prompt
message = [
    {"role": "system", "content": "You are a helpful assistant chatbot."},
    {"role": "user", "content": "What is a Large Language Model?"}
]
tokenizer = AutoTokenizer.from_pretrained(new_model_id.value)
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)

# Create pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=new_model_id.value,
    tokenizer=tokenizer
)

# Generate text
sequences = pipeline(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=1,
    max_length=200,
)
print(sequences[0]['generated_text'])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


NameError: name 'transformers' is not defined