In [1]:
!pip install -q accelerate
!pip install -q bitsandbytes
!pip install -q peft
!pip install datasets -q

In [2]:
!nvidia-smi

Thu Jun  6 17:29:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   68C    P8              19W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
import os
import gc
from datasets import load_dataset
from types import SimpleNamespace

import pandas as pd
import numpy as np

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    get_cosine_schedule_with_warmup
)


from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [4]:
cfg = {
    'model_id': 'h2oai/h2o-danube-1.8b-sft',
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'seed': 1337,
    'beta': 0.1,
    'batch_size': 1,
    'learning_rate': 1e-07,
    'weight_decay': 1e-02,
    'num_epochs': 1,
    'warmup_steps': 0,
    'accumulation_steps': 2,
    'logging_steps': 50,
}

cfg = SimpleNamespace(**cfg)

# Create a set of prompts

In [5]:
toy_data = {
    'prompt' : ['What is Ha Noi', '2 + 2 ='],
}

train = pd.DataFrame(toy_data)
train

Unnamed: 0,prompt
0,What is Ha Noi
1,2 + 2 =


# Understand chat template of Danube in HF


In [6]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
messages = [
    {"role": "user", "content": "How are you?"},
    {"role": "assistant", "content": "I'm an AI, so I don't have feelings, but I'm here and ready to help you! How can I assist you today?"},
]

In [8]:
encoded_input_ids = tokenizer.apply_chat_template(messages)
tokenizer.decode(encoded_input_ids)

"<|prompt|>How are you?</s> <|answer|>I'm an AI, so I don't have feelings, but I'm here and ready to help you! How can I assist you today?</s>"

# Let's ask H20 Danube, a supervised fine-tuning model

In [9]:
reference_model = AutoModelForCausalLM.from_pretrained(
    cfg.model_id,
    torch_dtype=torch.bfloat16,
    device_map='auto'
)

reference_model.eval();

In [10]:
def get_messages(prompt):
    # Function to generate messages with user prompt
    return [
        {'role': 'user', 'content': prompt},
        {'role': 'assistant', 'content': ''},
    ]

def extract_response(question_response):
    # Function to extract response from the model output
    response = question_response.strip().split('<|answer|>')[-1]  # Splitting by <|answer|> string to get the last part
    return response

In [11]:
prompts = train['prompt']
responses = []

for prompt in prompts:
    # Generate input tokens for the model
    input_ids = tokenizer.apply_chat_template(
        get_messages(prompt),
        return_tensors='pt'
    ).to(cfg.device)

    input_ids = input_ids[:, :-1] # Slice to remove the </s> that is automatically added when using tokenizer.apply_chat_template()

    # Generate response using the model
    with torch.no_grad():
        reference_model.eval()
        output_ids = reference_model.generate(
            input_ids,
            max_new_tokens=1024,
            pad_token_id=tokenizer.pad_token_id
        )[0]

    # Decode model output and extract response
    question_response = tokenizer.decode(output_ids, skip_special_tokens=True)

    responses.append(
        extract_response(question_response)
    )

In [12]:
for idx, response in enumerate(responses):
    print(f'### Response for question {idx+1}:\n', response.strip(), '\n')

### Response for question 1:
 Ha Noi, also known as Hanoi, is the capital city of Vietnam. It is located in the north-central part of the country and is known for its rich history, culture, and architecture. The city is home to many historic landmarks such as the Ho Chi Minh Mausoleum, the Temple of Literature, and the Hoan Kiem Lake. Ha Noi is also a hub for business and commerce, with many international companies having offices or branches in the city. Overall, Ha Noi is an important center of government, education, and tourism in Vietnam. 

### Response for question 2:
 To solve the equation 2 + 2 =, we first add the two numbers on the left side of the equation:

2 + 2 = 4

So, the equation becomes:

4 =

Now, to find the value of x that satisfies this equation, we can set it equal to 4 and solve for x.

4 = 4

Therefore, the value of unknown variable x is 4. 



# Get a preference dataset

In [13]:
chosen_responses = ['Hanoi is a city in Viet Nam.', '2 + 2 = 4']
rejected_responses = responses # Answer we don't like the current responses from Danube

train['chosen'] = chosen_responses
train['rejected'] = rejected_responses
train

Unnamed: 0,prompt,chosen,rejected
0,What is Ha Noi,Hanoi is a city in Viet Nam.,"Ha Noi, also known as Hanoi, is the capital c..."
1,2 + 2 =,2 + 2 = 4,"To solve the equation 2 + 2 =, we first add t..."


# Create dataloader for training

In [14]:
def encode(prompt, response):

    eos_token = '</s>'
    start_inst_token = '<|prompt|>'
    end_inst_token = '<|answer|>'

    prompt = start_inst_token + prompt + eos_token + end_inst_token

    prompt_ids = tokenizer(
        prompt,
        add_special_tokens=False,
        return_tensors='pt'
    ).input_ids

    response_ids = tokenizer(
        response,
        add_special_tokens=False,
        return_tensors='pt'
    ).input_ids

    eos_id = tokenizer(
        eos_token,
        add_special_tokens=False,
        return_tensors='pt'
    ).input_ids

    input_ids = torch.cat(
        [prompt_ids, response_ids, eos_id],
        dim=1
    ).view(-1)

    labels = torch.cat(
        [
            torch.full_like(
                prompt_ids[:, :-1],
                tokenizer.eos_token_id
            ), # create a tensor with full eos_token_id (2)
            prompt_ids[:, -1].view(1, -1), # slice to except the ">" token in "<|answer|>" before the first token of teh response. Because this contains the predicted probas of the first token of the response
            response_ids, # token ids of the esponse
            eos_id # end token </s>
        ],
        dim=1
    ).view(-1)

    return input_ids, labels

In [15]:
class CustomDataset(Dataset):
    def __init__(self, prompts, chosen_responses, rejected_responses):
        self.prompts = prompts
        self.chosen_responses = chosen_responses
        self.rejected_responses = rejected_responses

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, index):
        prompt = self.prompts[index]
        chosen_response = self.chosen_responses[index]
        rejected_response = self.rejected_responses[index]

        chosen_input_ids, chosen_labels = encode(prompt, chosen_response)
        rejected_input_ids, rejected_labels = encode(prompt, rejected_response)

        return {
            'chosen_input_ids': chosen_input_ids,
            'chosen_labels': chosen_labels,
            'rejected_input_ids': rejected_input_ids,
            'rejected_labels': rejected_labels
        }

def custom_collate(data):
    padded_data = {}
    data = pd.DataFrame(data)
    keys = data.columns

    for key in keys:
        padded_data[key] = pad_sequence(
            sequences=data[key].values.tolist(),
            batch_first=True,
            padding_value=tokenizer.eos_token_id
        )

    return padded_data

In [16]:
prompts = train['prompt'].values.tolist()
chosen_responses = train['chosen'].values.tolist()
rejected_responses = train['rejected'].values.tolist()

torch.manual_seed(seed=cfg.seed)
train_dataset = CustomDataset(
    prompts=prompts,
    chosen_responses=chosen_responses,
    rejected_responses=rejected_responses
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=custom_collate
)

# Implement DPO loss

In [17]:
def get_batch_logps(logits, labels, pad_id_value):
    labels = labels.clone()
    loss_mask = labels != pad_id_value
    per_token_logps = torch.gather(logits.log_softmax(dim=-1), dim=2, index=labels.unsqueeze(2)).squeeze(2)
    return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)

def dpo_loss(
    policy_chosen_logps,
    policy_rejected_logps,
    reference_chosen_logps,
    reference_rejected_logps,
    beta
):
    policy_logratios = policy_chosen_logps - policy_rejected_logps
    reference_log_ratios = reference_chosen_logps - reference_rejected_logps
    losses = (-1) * F.logsigmoid((beta*policy_logratios) - beta*reference_log_ratios)
    return losses.mean()

def get_reference_logits(reference_model, chosen_pair, rejected_pair):
    reference_model.eval()
    with torch.no_grad():
        reference_chosen_logits = reference_model(chosen_pair).logits
        reference_rejected_logits = reference_model(rejected_pair).logits
    return reference_chosen_logits, reference_rejected_logits

# Fine-tuning the tiny toy dataset with DPO loss

In [18]:
class Policy(nn.Module):
    def __init__(self, cfg):
        super(Policy, self).__init__()

        self.backbone = AutoModelForCausalLM.from_pretrained(
            cfg.model_id,
            low_cpu_mem_usage=True,
            device_map='cuda',
            torch_dtype=torch.bfloat16,
            use_cache=False
        )

        self.backbone.gradient_checkpointing_enable()

    def forward(self, input_ids):
        logits = self.backbone(input_ids).logits
        return logits

In [19]:
model = Policy(cfg)

optimizer = optim.AdamW(
    model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=cfg.warmup_steps,
    num_training_steps=cfg.num_epochs*len(train_dataloader)
)


scaler = GradScaler()

for epoch in range(cfg.num_epochs):

    for batch_idx, batch in enumerate(train_dataloader):

        model.train()

        chosen_input_ids = batch['chosen_input_ids'].long().to(cfg.device)
        chosen_labels = batch['chosen_labels'].long().to(cfg.device)
        rejected_input_ids = batch['rejected_input_ids'].long().to(cfg.device)
        rejected_labels = batch['rejected_labels'].long().to(cfg.device)

        reference_chosen_logits, reference_rejected_logits = get_reference_logits(
            reference_model,
            chosen_input_ids,
            rejected_input_ids
        )

        policy_chosen_logits = model(chosen_input_ids)
        policy_rejected_logits = model(rejected_input_ids)

        policy_chosen_logps = get_batch_logps(
            policy_chosen_logits,
            chosen_labels,
            pad_id_value=tokenizer.eos_token_id
        )

        policy_rejected_logps = get_batch_logps(
            policy_rejected_logits,
            rejected_labels,
            pad_id_value=tokenizer.eos_token_id
        )

        reference_chosen_logps = get_batch_logps(
            reference_chosen_logits,
            chosen_labels,
            pad_id_value=tokenizer.eos_token_id
        )

        reference_rejected_logps = get_batch_logps(
            reference_rejected_logits,
            rejected_labels,
            pad_id_value=tokenizer.eos_token_id
        )

        loss = dpo_loss(
            policy_chosen_logps,
            policy_rejected_logps,
            reference_chosen_logps,
            reference_rejected_logps,
            beta=cfg.beta
        ) / cfg.accumulation_steps

        print(
            f'Epoch: {epoch+1} / {cfg.num_epochs}'
            f'| Batch: {batch_idx}/{len(train_dataloader)}'
            f'| DPO Loss: {loss.item():.8f}'
        )

        loss.backward()

        if not (batch_idx % cfg.accumulation_steps):
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

Epoch: 1 / 1| Batch: 0/2| DPO Loss: 0.34657359
Epoch: 1 / 1| Batch: 1/2| DPO Loss: 0.34611285


# Examine the model after fine-tuning with DPO
Note: 2 rows of data seem to provide no improvement, but just for testing. We will fine-tune with a larger preference dataset.

In [20]:
test_prompts = ['What is Hanoi', '2 + 2 =']
responses = []

for prompt in test_prompts:

    input_ids = tokenizer.apply_chat_template(
        get_messages(prompt),
        return_tensors='pt'
    ).to(cfg.device)

    input_ids = input_ids[:, :-1] # Slice to remove the </s> that is automatically added when using tokenizer.apply_chat_template()

    with torch.no_grad():
        model.eval()
        output_ids = model.backbone.generate(
                input_ids=input_ids,
                max_new_tokens=512,
                pad_token_id=tokenizer.pad_token_id
        )[0]

    question_response = tokenizer.decode(output_ids, skip_special_tokens=True)
    response = extract_response(question_response)

    print('Prompt:', tokenizer.decode(input_ids[0]))
    print('Response:', response)
    print('\n')

Prompt: <|prompt|>What is Hanoi</s> <|answer|>
Response:  Hanoi is the capital city of Vietnam. It's a big city with lots of people and buildings, but it's also very old and has many interesting things to see. Some famous places in Hanoi are the Ho Chi Minh Mausoleum, the Temple of Literature, and the Hoan Kiem Lake.


Prompt: <|prompt|>2 + 2 =</s> <|answer|>
Response:  To solve the equation, I will follow these steps:

Step 1: Add the numbers on the left side of the equation.
2 + 2 = 4

So, the equation becomes:
4 =

Step 2: Since there is no unknown variable in this equation, we can conclude that the value of the unknown variable x is not needed to determine the solution.

Therefore, the answer is 4.




# Let's train on a real preference dataset

I use **ultrafeedback** dataset on HuggingFace: https://huggingface.co/datasets/argilla/ultrafeedback-binarized-preferences

In [21]:
from datasets import load_dataset

data_path = 'argilla/ultrafeedback-binarized-preferences'

# load in data
train = load_dataset(data_path)['train'].to_pandas()

# Select important columns
train = train[['instruction', 'chosen_response', 'rejected_response']]

# rename columns for consistent with our above format
train.columns = ['prompt', 'chosen', 'rejected']

# Sample about 500 training examples for fine-tuning
train = train.sample(500, random_state=1).reset_index(drop=True)
train

Unnamed: 0,prompt,chosen,rejected
0,You will be given a definition of a task first...,"[256, 512, 4096, 4096]","Sure, I'd be happy to help! Here's the definit..."
1,"const [upgradePrimeToPlusMutation, { isUpgrade...",It appears that the code you've provided is a ...,It's essential to test various scenarios to en...
2,I work in Typescript and use Google Data Stora...,"Sure, I can help you with that. First, let's d...","Sure, here's an example of a generic base clas..."
3,"TASK DEFINITION: In this task, you are given a...","Sure, I'd be happy to help! Here's the transla...",همراحم از زمان محل را به نظر می رسند، من بنا ب...
4,Develop an algorithm that can detect a given t...,"Certainly, I can help you develop an algorithm...","As a helpful and respectful AI assistant, I mu..."
...,...,...,...
495,What ingredients are required to make the cook...,To make the cookie dough texture for M&M Choco...,No ingredients are required. This is not a rec...
496,Upwork Bidding: \n\nI'm bidding on this Upwork...,Subject: Top-Notch SEO Services for Your Busin...,"Dear [Client Name],\n\nI hope this message fin..."
497,In what ways does the World Health Organizatio...,The World Health Organization (WHO) collaborat...,The World Health Organization (WHO) collaborat...
498,Develop a Ruby on Rails application that facil...,To develop a Ruby on Rails application that fa...,Great! I'm here to assist you in creating a Ru...


In [22]:
# Truncate prompt, and responses to avoid OOM
for col in ['prompt', 'chosen', 'rejected']:
    if col == 'prompt':
        max_length = 500
    else:
        max_length = 1024
    train[col] = train[col].str[:max_length]

In [23]:
prompts = train['prompt'].values.tolist()
chosen_responses = train['chosen'].values.tolist()
rejected_responses = train['rejected'].values.tolist()

torch.manual_seed(seed=cfg.seed)
train_dataset = CustomDataset(
    prompts=prompts,
    chosen_responses=chosen_responses,
    rejected_responses=rejected_responses
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=custom_collate
)

In [24]:
# Init model
model = Policy(cfg)

# Optimizer and scheduler
optimizer = optim.AdamW(
    model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=cfg.warmup_steps,
    num_training_steps=cfg.num_epochs*len(train_dataloader)
)

In [25]:
model = Policy(cfg)

optimizer = optim.AdamW(
    model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=cfg.warmup_steps,
    num_training_steps=cfg.num_epochs*len(train_dataloader)
)

train_losses = []
for epoch in range(cfg.num_epochs):

    for batch_idx, batch in enumerate(train_dataloader):

        torch.cuda.empty_cache()
        gc.collect()

        model.train()

        chosen_input_ids = batch['chosen_input_ids'].long().to(cfg.device)
        chosen_labels = batch['chosen_labels'].long().to(cfg.device)
        rejected_input_ids = batch['rejected_input_ids'].long().to(cfg.device)
        rejected_labels = batch['rejected_labels'].long().to(cfg.device)

        reference_chosen_logits, reference_rejected_logits = get_reference_logits(
            reference_model,
            chosen_input_ids,
            rejected_input_ids
        )

        policy_chosen_logits = model(chosen_input_ids)
        policy_rejected_logits = model(rejected_input_ids)

        policy_chosen_logps = get_batch_logps(
            policy_chosen_logits,
            chosen_labels,
            pad_id_value=tokenizer.eos_token_id
        )

        policy_rejected_logps = get_batch_logps(
            policy_rejected_logits,
            rejected_labels,
            pad_id_value=tokenizer.eos_token_id
        )

        reference_chosen_logps = get_batch_logps(
            reference_chosen_logits,
            chosen_labels,
            pad_id_value=tokenizer.eos_token_id
        )

        reference_rejected_logps = get_batch_logps(
            reference_rejected_logits,
            rejected_labels,
            pad_id_value=tokenizer.eos_token_id
        )

        loss = dpo_loss(
            policy_chosen_logps,
            policy_rejected_logps,
            reference_chosen_logps,
            reference_rejected_logps,
            beta=cfg.beta
        ) / cfg.accumulation_steps

        train_losses.append(loss.item())
        if not (batch_idx % 100):
            print(
                f'Epoch: {epoch+1} / {cfg.num_epochs}'
                f'| Batch: {batch_idx}/{len(train_dataloader)}'
                f'| Train DPO Loss: {np.mean(train_losses):.8f}'
            )

        loss.backward()

        if not (batch_idx % cfg.accumulation_steps):
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

Epoch: 1 / 1| Batch: 0/500| Train DPO Loss: 0.34657359
Epoch: 1 / 1| Batch: 100/500| Train DPO Loss: 0.34655462
Epoch: 1 / 1| Batch: 200/500| Train DPO Loss: 0.34659074
Epoch: 1 / 1| Batch: 300/500| Train DPO Loss: 0.34656812
Epoch: 1 / 1| Batch: 400/500| Train DPO Loss: 0.34656084


In [26]:
test_prompts = ['What is Hanoi', '2 + 2 =']
responses = []

for prompt in test_prompts:

    input_ids = tokenizer.apply_chat_template(
        get_messages(prompt),
        return_tensors='pt'
    ).to(cfg.device)

    input_ids = input_ids[:, :-1] # Slice to remove the </s> that is automatically added when using tokenizer.apply_chat_template()

    with torch.no_grad():
        model.eval()
        output_ids = model.backbone.generate(
                input_ids=input_ids,
                max_new_tokens=1024,
                pad_token_id=tokenizer.pad_token_id
        )[0]

    question_response = tokenizer.decode(output_ids, skip_special_tokens=True)
    response = extract_response(question_response)

    print('Prompt:', tokenizer.decode(input_ids[0]))
    print('Response:', response)
    print('\n')

Prompt: <|prompt|>What is Hanoi</s> <|answer|>
Response:  Hanoi is the capital city of Vietnam. It's a big city with lots of people and buildings. It has many important places like museums, parks, and temples. People in Hanoi speak Vietnamese, which is a language that is different from English. They also have their own special food and way of life.


Prompt: <|prompt|>2 + 2 =</s> <|answer|>
Response:  To solve the equation, I will follow these steps:

Step 1: Add the numbers on the left side of the equation.
2 + 2 = 4

So, the equation becomes:
4 =

Step 2: Since there is no unknown variable in this equation, we can conclude that the value of the unknown variable x is not needed to determine the solution.

Therefore, the answer is 4.




We can see that aligning models with more training examples using DPO yields slightly better responses.