<a href="https://colab.research.google.com/github/fatday/STATS-305B-HW4-Group/blob/main/dpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %load_ext autoreload
# %autoreload 2

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from peft import LoraConfig, get_peft_model
set_seed(42) # DO NOT CHANGE THE SEED

In [None]:

# DO NOT CHANGE!
def sample_model(tokenizer, model, prompt, N=100):
    """
    Samples N completions from the model for a single-turn user prompt.

    The prompt is wrapped as a one-message user chat, rendered through the tokenizer's
    chat template with the generation prompt appended, and passed to ``model.generate``
    with sampling enabled. Sampling is stochastic, so the N completions are not
    guaranteed to be distinct.

    Args:
    tokenizer: The tokenizer object used to encode/decode text.
    model: The language model used for generation.
    prompt (str): The input prompt for which completions will be generated.
    N (int): The number of completions to generate.

    Returns:
    list[str]: A list of N generated completions (each limited to 32 new tokens).
    """

    # Single user turn; add_generation_prompt=True asks the template to append the
    # header that opens the model's turn.
    chat = [{"role": "user", "content": prompt}]
    chat_tokens = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True)

    # Generate N different responses
    # The input is a batch of one prompt placed on the model's device;
    # num_return_sequences=N requests N sampled sequences for that single prompt.
    outputs = model.generate(
        torch.tensor([chat_tokens], device=model.device),
        num_return_sequences=N,
        max_new_tokens=32,
        temperature=0.15,
        top_k=50,
        top_p=0.95,
        do_sample=True
    )

    def extract_response(decoded_text):
        # Keep the text after the last 'model\n' marker and drop the final two characters.
        # NOTE(review): presumably the two dropped characters are trailing whitespace left
        # after special tokens are skipped during decoding -- confirm.
        return decoded_text.rsplit('model\n', 1)[-1][:-2]

    responses = [extract_response(tokenizer.decode(output, skip_special_tokens=True)) for output in outputs]
    return responses

In [None]:

# DO NOT CHANGE!
def fraction_responses_with_because_of(responses):
    """
    Computes the share of responses that begin with the target phrase.

    Args:
    responses (list[str]): Model-generated responses to inspect.

    Returns:
    float: The number of responses starting with "The sky appears blue because of"
           divided by the total number of responses.
    """

    target_prefix = "The sky appears blue because of"

    # Count matching responses in one pass; the division raises ZeroDivisionError
    # when `responses` is empty.
    hits = sum(1 for text in responses if text.startswith(target_prefix))
    return hits / len(responses)

In [None]:
# AUTOGRADER_EXPORT_START

In [None]:
import torch

# PSET 1 START (Only modify this section)

In [None]:

def get_response_idxs(tokenizer, chat_token_ids):
    """
    Locates the response (final model turn) inside a tokenized chat.

    The response is taken to start three tokens after the last <start_of_turn> token
    (skipping <start_of_turn>, the role token and the newline that follows it) and to
    end on the token just before the last <end_of_turn> token.

    Args:
    tokenizer: The tokenizer object used to encode/decode text.
    chat_token_ids (list[int]): The token IDs representing the chat conversation.

    Returns:
    tuple: A tuple (response_start_idx, response_end_idx), both of which are nonnegative integers.
           Both indices are inclusive positions into chat_token_ids.

    Raises:
    ValueError: If <end_of_turn> or <start_of_turn> does not occur in chat_token_ids.
    """

    start_of_turn_id = tokenizer.convert_tokens_to_ids("<start_of_turn>")
    end_of_turn_id = tokenizer.convert_tokens_to_ids("<end_of_turn>")

    # YOUR CODE HERE (~3-5 lines)
    # Searching the reversed list finds the LAST occurrence of each marker;
    # final_pos - reversed_offset converts that back to a forward index.
    reversed_ids = chat_token_ids[::-1]
    final_pos = len(chat_token_ids) - 1
    last_end_of_turn = final_pos - reversed_ids.index(end_of_turn_id)
    last_start_of_turn = final_pos - reversed_ids.index(start_of_turn_id)

    response_end_idx = last_end_of_turn - 1  # token right before <end_of_turn>
    response_start_idx = last_start_of_turn + 3  # skip <start_of_turn>, "model", "\n"
    # END OF YOUR CODE

    return response_start_idx, response_end_idx

In [None]:

def get_response_next_token_probs(tokenizer, model, chat_token_ids):
    """
    Computes the probability the model assigns to each actual response token in a chat.

    The whole chat is run through the model once. For every response token at position t,
    the returned value is the softmax probability of that token under the next-token
    distribution produced at position t - 1.

    Args:
    tokenizer: The tokenizer object used to encode/decode text.
    model: The language model used to generate the logits.
    chat_token_ids (list[int]): The token IDs representing the chat conversation.

    Returns:
    torch.Tensor: A 1D tensor containing the probabilities of the tokens in the response found by appropriately indexing
                  the next token probabilities of the preceding token. Its length is
                  response_end_idx - response_start_idx + 1.
    """

    response_start_idx, response_end_idx = get_response_idxs(tokenizer, chat_token_ids)
    chat_token_ids_tensor = torch.tensor([chat_token_ids]).to(model.device)
    logits = model(chat_token_ids_tensor).logits[0, :, :] # shape (len(chat_token_ids), vocabulary_size)

    # YOUR CODE HERE (~3-5 lines)
    # Logits at position t score the token at position t + 1, so the rows that predict
    # response tokens start..end (inclusive) are rows start-1..end-1.
    response_logits = logits[response_start_idx - 1:response_end_idx, :]
    response_probs = torch.softmax(response_logits, dim=-1)

    # Pick out the probability of the token that actually appears in the response, NOT the
    # most likely token: taking the row-wise max would ignore which completion was supplied.
    response_token_ids = chat_token_ids_tensor[0, response_start_idx:response_end_idx + 1].to(response_probs.device)
    next_token_probs = response_probs.gather(1, response_token_ids.unsqueeze(1)).squeeze(1)
    # END OF YOUR CODE

    return next_token_probs

In [None]:

def compute_dpo_objective(preferred_train_probs, nonpreferred_train_probs, preferred_ref_probs, nonpreferred_ref_probs, beta):
    """
    Computes the Direct Preference Optimization (DPO) objective for one preference pair.

    The objective is
        -log sigmoid(beta * [log(pi(y_w) / pi_ref(y_w)) - log(pi(y_l) / pi_ref(y_l))])
    where the log-probability of a whole completion is the sum of the log-probabilities of
    its tokens. The preferred and non-preferred completions may have different lengths.

    Args:
    preferred_train_probs (torch.Tensor): Token probabilities for the preferred chat sequence from the training model.
    nonpreferred_train_probs (torch.Tensor): Token probabilities for the non-preferred chat sequence from the training model.
    preferred_ref_probs (torch.Tensor): Token probabilities for the preferred chat sequence from the reference model.
    nonpreferred_ref_probs (torch.Tensor): Token probabilities for the non-preferred chat sequence from the reference model.
    beta (float): Controls the KL strength of staying close to the reference model.

    Returns:
    torch.Tensor: The computed DPO objective, a scalar (0-dimensional) tensor.
    """

    # YOUR CODE HERE (~4-6 lines)
    # Sequence-level log-ratios: sum the per-token log-ratios over each completion.
    # Subtracting logs (rather than taking the log of a quotient) avoids forming the ratio.
    preferred_log_ratio = (torch.log(preferred_train_probs) - torch.log(preferred_ref_probs)).sum()
    nonpreferred_log_ratio = (torch.log(nonpreferred_train_probs) - torch.log(nonpreferred_ref_probs)).sum()

    # logsigmoid is the numerically stable form of log(sigmoid(x)).
    margin = beta * (preferred_log_ratio - nonpreferred_log_ratio)
    dpo_obj = -torch.nn.functional.logsigmoid(margin)
    # END OF YOUR CODE

    return dpo_obj

In [None]:


def finetune(tokenizer, optimizer, train_model, ref_model, preferred_chat_ids, nonpreferred_chat_ids, num_gradient_steps, beta):
    """
    Fine-tunes the training model using DPO. Make sure to disable gradients on the reference model!

    Every step recomputes the training model's response-token probabilities for the preferred
    and non-preferred chats, evaluates the DPO objective against the reference model's
    probabilities, backpropagates, and applies one optimizer update.

    Args:
    tokenizer: The tokenizer object used to encode/decode text.
    optimizer: The optimizer for updating the training model's parameters.
    train_model: The model being fine-tuned.
    ref_model: The reference model.
    preferred_chat_ids (list[int]): The token IDs representing the preferred chat sequence.
    nonpreferred_chat_ids (list[int]): The token IDs representing the non-preferred chat sequence.
    num_gradient_steps (int): The number of gradient updates to perform.
    beta (float): A parameter used in computing the DPO objective.

    Returns:
    None
    """

    print('Fine-tuning...')

    # The reference model is never updated here, so its probabilities are computed once,
    # without building an autograd graph, and reused at every step.
    # NOTE(review): this assumes the reference forward pass is deterministic (no active
    # dropout) -- confirm for the model in use.
    with torch.no_grad():
        preferred_ref_probs = get_response_next_token_probs(tokenizer, ref_model, preferred_chat_ids)
        nonpreferred_ref_probs = get_response_next_token_probs(tokenizer, ref_model, nonpreferred_chat_ids)

    for _step in range(num_gradient_steps):
        # YOUR CODE HERE (~9-12 lines)
        # Clear gradients left over from the previous step before accumulating new ones.
        optimizer.zero_grad()

        preferred_train_probs = get_response_next_token_probs(tokenizer, train_model, preferred_chat_ids)
        nonpreferred_train_probs = get_response_next_token_probs(tokenizer, train_model, nonpreferred_chat_ids)

        dpo_obj = compute_dpo_objective(
            preferred_train_probs,
            nonpreferred_train_probs,
            preferred_ref_probs,
            nonpreferred_ref_probs,
            beta,
        )

        dpo_obj.backward()
        optimizer.step()
        # END OF YOUR CODE
    print("Fine-tuning complete!")

# PSET 1 END

In [None]:
# AUTOGRADER_EXPORT_END

In [None]:
# Load the `google/gemma-2-2b-it` checkpoint in bfloat16 together with its tokenizer.
# This instance is used for the sanity checks below and later reused as the DPO
# reference model.
# NOTE(review): the captured output of this cell warns that `torch_dtype` is deprecated
# in favour of `dtype`; switch once the minimum supported transformers version accepts it.
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
     torch_dtype=torch.bfloat16,
     device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
# Example prompt/completion pair used below to sanity-check get_response_idxs and
# get_response_next_token_probs.
sample_prompt = "How is it going?"
sample_completion = "As an AI, I don't have feelings or experiences like humans do, so I don't have a \"going\" in the same way."

In [None]:
# Two-turn chat: the user prompt followed by the assistant completion.
sample_chat = [
    {"role": "user", "content": sample_prompt},
    {"role": "assistant", "content": sample_completion}
]

# Render the same chat twice: once as the templated string (tokenize=False) for
# inspection, once as token IDs (tokenize=True) for the helper functions.
# add_generation_prompt=False because the assistant turn is already present.
sample_chat_tokens = tokenizer.apply_chat_template(sample_chat, tokenize=False, add_generation_prompt=False)
sample_chat_token_ids = tokenizer.apply_chat_template(sample_chat, tokenize=True, add_generation_prompt=False)

print("Chat tokens:")
print(sample_chat_tokens)

print("Chat token IDs:")
print(sample_chat_token_ids)

Chat tokens:
<bos><start_of_turn>user
How is it going?<end_of_turn>
<start_of_turn>model
As an AI, I don't have feelings or experiences like humans do, so I don't have a "going" in the same way.<end_of_turn>

Chat token IDs:
[2, 106, 1645, 108, 2299, 603, 665, 2319, 235336, 107, 108, 106, 2516, 108, 2169, 671, 16481, 235269, 590, 1453, 235303, 235251, 791, 14690, 689, 12219, 1154, 17611, 749, 235269, 712, 590, 1453, 235303, 235251, 791, 476, 664, 9779, 235281, 575, 573, 1809, 1703, 235265, 107, 108]


In [None]:
# Scratch check: decode token positions 13..43, i.e. the slice
# [response_start_idx - 1 : response_end_idx] for the sample chat (start=14, end=44).
# These are the positions whose logits predict the response tokens, which is why the
# text is shifted one token left (leading "\n", no trailing ".").
tokenizer.decode(sample_chat_token_ids[14-1:44])

'\nAs an AI, I don\'t have feelings or experiences like humans do, so I don\'t have a "going" in the same way'

In [None]:
# Sanity check for get_response_idxs on the sample chat: both indices are inclusive,
# so they should point at the first and last tokens of the assistant completion.
response_start_idx, response_end_idx = get_response_idxs(tokenizer, sample_chat_token_ids)
print(f"Response tokens index in sample_chat_tokens range from {response_start_idx} to {response_end_idx}.")

first_response_token_id = sample_chat_token_ids[response_start_idx]
last_response_token_id = sample_chat_token_ids[response_end_idx]
print(f'First response token is "{tokenizer.decode(first_response_token_id)}" with ID {first_response_token_id}')
print(f'Last response token is "{tokenizer.decode(last_response_token_id)}" with ID {last_response_token_id}')

# Make sure your code passes this test!
# The sample completion starts with "As" and ends with ".".
assert tokenizer.decode(first_response_token_id) == "As" and tokenizer.decode(last_response_token_id) == "."

Response tokens index in sample_chat_tokens range from 14 to 44.
First response token is "As" with ID 2169
Last response token is "." with ID 235265


In [None]:
# Sanity check for get_response_next_token_probs on the sample chat. No gradients are
# needed here since nothing is being optimized.
with torch.no_grad():
    next_token_probs = get_response_next_token_probs(tokenizer, model, sample_chat_token_ids)
print(f'Next token probabilities: {next_token_probs}')

# Make sure your code passes this test!
# The average probability over the response tokens must exceed 0.7.
assert next_token_probs.mean() > 0.7

Next token probabilities: tensor([0.8867, 0.9805, 1.0000, 0.9727, 1.0000, 1.0000, 1.0000, 1.0000, 0.8398,
        0.9062, 0.9844, 0.8320, 0.8984, 0.8867, 0.9883, 0.5469, 0.8320, 0.8359,
        0.7148, 1.0000, 1.0000, 0.7188, 0.5625, 0.8711, 0.6797, 0.9727, 0.9531,
        0.9961, 0.3672, 0.5938, 0.6484], dtype=torch.bfloat16)


In [None]:
# Load a second copy of the same checkpoint to be fine-tuned, and wrap it with PEFT
# using a default LoraConfig.
train_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
     torch_dtype=torch.bfloat16,
     device_map='auto'
)
lora_config = LoraConfig()
train_model = get_peft_model(train_model, lora_config)
train_model.train()

# The reference model is the original `model` object loaded earlier (an alias, not a copy).
# NOTE(review): the reference model is also switched to train mode here; if the model has
# active dropout, eval() would make its probabilities deterministic -- confirm this is intended.
ref_model = model
ref_model.train()
print('Loaded models!')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded models!


In [None]:
# The model's response to the prompt usually includes the words "due to" - we want to change that to "because of" using DPO!
# The two completions share the prefix "The sky appears blue" and differ only in the
# final phrase; that difference is the preference signal.
prompt = "Explain why the sky is blue in one sentence."
preferred_completion = "The sky appears blue because of"
nonpreferred_completion = "The sky appears blue due to"

In [None]:
# Build the preference pair: the same user prompt answered with the preferred and the
# non-preferred completion respectively.
preferred_chat = [
    {"role": "user", "content": prompt},
    {"role": "assistant", "content": preferred_completion}
]

nonpreferred_chat = [
    {"role": "user", "content": prompt},
    {"role": "assistant", "content": nonpreferred_completion}
]

# Token IDs for each full chat; add_generation_prompt=False because the assistant turn
# is already included.
preferred_chat_ids = tokenizer.apply_chat_template(preferred_chat, tokenize=True, add_generation_prompt=False)
nonpreferred_chat_ids = tokenizer.apply_chat_template(nonpreferred_chat, tokenize=True, add_generation_prompt=False)

In [None]:
# One-off evaluation of the DPO objective before any training.
# The training-model probabilities are computed with gradients enabled so the objective
# carries a grad_fn.
preferred_train_probs = get_response_next_token_probs(tokenizer, train_model, preferred_chat_ids)
nonpreferred_train_probs = get_response_next_token_probs(tokenizer, train_model, nonpreferred_chat_ids)

# Gradients are not needed for the reference model since we will not be optimizing with respect to it
with torch.no_grad():
    preferred_ref_probs = get_response_next_token_probs(tokenizer, ref_model, preferred_chat_ids)
    nonpreferred_ref_probs = get_response_next_token_probs(tokenizer, ref_model, nonpreferred_chat_ids)

your_favorite_beta = 1 # Feel free to play with beta here. Does anything change?
dpo_obj = compute_dpo_objective(preferred_train_probs, nonpreferred_train_probs, preferred_ref_probs, nonpreferred_ref_probs, beta=your_favorite_beta)
print(dpo_obj)

tensor(0.6914, dtype=torch.bfloat16, grad_fn=<MeanBackward0>)


In [None]:
# Baseline: sample completions from the not-yet-fine-tuned training model, show the first
# ten, and report how many start with the preferred phrasing.
prior_responses = sample_model(tokenizer, train_model, prompt)
print('Sampled responses before fine-tuning:\n' + '\n'.join(prior_responses[:10]))
print(f'Fraction responses with because of: {fraction_responses_with_because_of(prior_responses)}') # should start close to 0

KeyboardInterrupt: 

In [None]:
# DO NOT CHANGE THESE VALUES
# Fixed training configuration: 150 Adam steps at learning rate 2e-6 with beta = 1.
num_gradient_steps = 150
learning_rate = 2e-6
beta = 1
# The optimizer is built over train_model's parameters only; the reference model is not optimized.
optimizer = torch.optim.Adam(train_model.parameters(), lr=learning_rate)

finetune(tokenizer, optimizer, train_model, ref_model, preferred_chat_ids, nonpreferred_chat_ids, num_gradient_steps, beta)

In [None]:
# Save GPU memory
# `ref_model` and `model` are two names for the same object, so both must be deleted
# before the reference model can be garbage-collected.
del ref_model
del model

In [None]:
# After fine-tuning: sample completions from the trained model again, show the first ten,
# and report how many start with the preferred phrasing.
post_tuning_responses = sample_model(tokenizer, train_model, prompt)
print('Sampled responses after fine-tuning:\n' + '\n'.join(post_tuning_responses[:10]))
print(f'Fraction responses with because of: {fraction_responses_with_because_of(post_tuning_responses)}') # should be more than half