In [1]:
import torch
import rich
import datasets
import utils

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the small prepared file (2 questions, per the "q2" in its name) for a first look.
# NOTE(review): torch.load is pickle-based -- only load files from a trusted source.
results1 = torch.load("data/prepared_training_data_temp1.5_bsz2_shot1_q2.pt")
print(results1.keys())

# Bare last expression: display the whole dict (manageable for this tiny file).
results1

dict_keys(['questions', 'chosen_solutions', 'reject_solutions', 'chosen_logprobs', 'reject_logprobs', 'chosen_selfcertainties', 'reject_selfcertainties'])


{'questions': ['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?'],
 'chosen_solutions': ['Natalia sold 48 clips in April. In May, she sold half as many, so the number of clips sold in May is 48/2 = 24 clips.\nTotal number of clips sold in April and May = 48 (April) + 24 (May) = 48 + 24 = 72 clips\n#### 72\n\n',
  'Her hourly rate is $12 and she babysat for 50 minutes. First, convert the minutes to hours: 50 minutes is 50/60 = 0.833 hours.\nNow, multiply her hourly rate by the babysitting hours: $12 * 0.833 = $<<12*0.833=9.996>>9.996\nRounding to the nearest cent, Tonya earned $10.00.\n#### 10.00'],
 'reject_solutions': ['Natalia sold 48 clips in April. In May, she sold half as many, so the number of clips sold in May is 48/2 = 24 clips.\nTotal number of clips sol

In [None]:
# Switch to the larger prepared file ("q500" in its name). This rebinds
# `results1`, so the 2-question data loaded earlier is no longer reachable.
results1 = torch.load("data/prepared_training_data_temp1.5_bsz8_shot2_q500.pt")
print(results1.keys())

dict_keys(['questions', 'chosen_solutions', 'reject_solutions', 'chosen_logprobs', 'reject_logprobs', 'chosen_selfcertainties', 'reject_selfcertainties'])


In [7]:
# Show the first question, then its chosen and rejected solutions,
# each separated by one blank line.
print(
    results1["questions"][0],
    results1["chosen_solutions"][0],
    results1["reject_solutions"][0],
    sep="\n\n",
)

Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Natalia sold 48 clips in April.
In May, she sold half as many as in April, so she sold 48/2 = <<48/2=24>>24 clips.
Altogether, Natalia sold 48 + 24 = <<48+24=72>>72 clips in April and May.
#### 72

In April, Natalia sold clips to 48 friends.
In May, she sold half as many clips as in April, which is 48/2 = 24 clips.
To find out how many clips she sold altogether, add the numbers from April and May: 48 + 24 = <<48+24=72>>72 clips.
      ### 72


In [None]:
# Print every question with its solutions, colour-coded via rich markup.
#
# Two file layouts are handled:
#   * per-question "solutions" + "correctness_mask" lists (the layout this
#     cell was originally written for): green = mask is truthy, red = falsy;
#   * the preference-pair layout of the file loaded above, which has no
#     "solutions"/"correctness_mask" keys: the chosen solution is shown in
#     green and the rejected one in red.
# Without the fallback this cell raises KeyError on the preference-pair file.
has_correctness_mask = "solutions" in results1 and "correctness_mask" in results1

for q_idx in range(len(results1["questions"])):
    question = results1["questions"][q_idx]

    if has_correctness_mask:
        solutions = results1["solutions"][q_idx]
        correctness_mask = results1["correctness_mask"][q_idx]
    else:
        # Green/red here means chosen/rejected, not verified correct/incorrect.
        solutions = [
            results1["chosen_solutions"][q_idx],
            results1["reject_solutions"][q_idx],
        ]
        correctness_mask = [True, False]

    print(f"Question: {question}")
    for i, (sol, corr) in enumerate(zip(solutions, correctness_mask)):
        if corr:
            rich.print(f"[green]Solution {i}: {sol}[/green]")
        else:
            rich.print(f"[red]Solution {i}: {sol}[/red]")


In [20]:
from transformers import AutoTokenizer

# Few-shot prompt with `{n_shot_examples}` and `{question}` placeholders.
# The leading spaces on the continuation lines sit inside the triple-quoted
# literal, so they are part of the prompt text itself.
prompt_template = """You are given a math question. You must provide a concise step-by-step reasoning
        and a final answer. Your response should follow strictly the format of the provided examples where each new line is a reasoning step
        written in a very concise style, and the final answer is on the last line. There should be roughly 2-4 steps, but it is okay
        to have more or less steps if needed.
        
        {n_shot_examples}

        # Question:
        {question}
    """

# Tokenizer of the Phi-3.5 mini instruct checkpoint; prepare_dpo_sample is
# later called with it and relies on its chat template.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

# GSM8K, "main" configuration. The following cell reads the train split's
# `question` and `answer` columns.
ds = datasets.load_dataset("openai/gsm8k", "main")

In [54]:
questions = ds['train']['question']
answers = ds['train']['answer']

# Build the few-shot block with 2 examples via the project helper. The helper
# also returns `questions` and `answers`, which overwrite the full lists above.
# No `num_questions` argument is passed here, so nothing in this cell limits
# how many questions are kept.
# NOTE(review): presumably the returned lists exclude the rows used as
# examples, and the examples may be picked at random -- confirm against
# utils.generate_n_shot_examples.
questions, answers, n_shot_examples = utils.generate_n_shot_examples(
    questions, answers, num_examples=2
)

In [8]:
# Same file that an earlier cell loads into `results1`; bound here under a
# descriptive name for the dataset-building cells that follow.
# NOTE(review): torch.load is pickle-based -- only load files from a trusted source.
prepared_training_data = torch.load("data/prepared_training_data_temp1.5_bsz8_shot2_q500.pt")

In [9]:
# Not the last expression of the cell, so this line produces no output.
prepared_training_data.keys()

# Wrap the loaded dict in a Hugging Face Dataset, one column per listed key,
# in the order given below.
from datasets import Dataset

hf_dataset = Dataset.from_dict({
    column: prepared_training_data[column]
    for column in (
        "questions",
        "chosen_solutions",
        "chosen_logprobs",
        "chosen_selfcertainties",
        "reject_solutions",
        "reject_logprobs",
        "reject_selfcertainties",
    )
})

In [16]:
# Display the first row: one question with its chosen/rejected solutions and
# their logprob and self-certainty values.
hf_dataset[0]

{'questions': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'chosen_solutions': 'Natalia sold 48 clips in April.\nIn May, she sold half as many as in April, so she sold 48/2 = <<48/2=24>>24 clips.\nAltogether, Natalia sold 48 + 24 = <<48+24=72>>72 clips in April and May.\n#### 72',
 'chosen_logprobs': -0.06678149849176407,
 'chosen_selfcertainties': 39.8349609375,
 'reject_solutions': 'In April, Natalia sold clips to 48 friends.\nIn May, she sold half as many clips as in April, which is 48/2 = 24 clips.\nTo find out how many clips she sold altogether, add the numbers from April and May: 48 + 24 = <<48+24=72>>72 clips.\n      ### 72',
 'reject_logprobs': -0.34496068954467773,
 'reject_selfcertainties': 37.653018951416016}

In [10]:
# Write a toy example training loop just to loop through the dataset
# using dataloader
from torch.utils.data import DataLoader

# shuffle=True with no seed or generator passed here, so the batch order can
# differ from run to run.
dataloader = DataLoader(hf_dataset, batch_size=2, shuffle=True)

# Iterates over the whole dataset and prints every batch (no early exit),
# so the output grows with the dataset size.
for batch in dataloader:
    # Simulate a training step
    print("Training on batch:")
    # Each batch is a dict keyed by column name; the "questions" entry holds
    # one item per example in the batch.
    print("Batch size:", len(batch["questions"]))
    for key, value in batch.items():
        print(f"  {key}: {value}")

Training on batch:
Batch size: 2
  questions: ['Hans booked a room in a hotel. The hotel has 10 floors with 10 identical rooms on each floor. Because of an accident, the last floor is unavailable for the guests. Considering there are no other guests, in how many different rooms could Hans be checked in?', 'John buys 10 packs of magic cards.  Each pack has 20 cards and 1/4 of those cards are uncommon.  How many uncommon cards did he get?']
  chosen_solutions: ['The hotel has 10 floors with 10 rooms each: 10 x 10 = <<10*10=100>>100 rooms.\nSince the last floor is unavailable, there are 9 available floors with 10 rooms each: 9 x 10 = <<9*10=90>>90 rooms.\nHans can be checked into one of those 90 different rooms.\n#### 90\n\n', 'There are 10 packs with 20 cards each, so there are a total of 10*20 = <<10*20=200>>200 cards.\n1/4 of those cards are uncommon, so there are 200*(1/4) = <<200*(1/4)=50>>50 uncommon cards.\nJohn got 50 uncommon cards.\n#### 50']
  chosen_logprobs: tensor([-0.3238, 

In [18]:
from typing import Tuple, List, Dict
from datasets import Dataset

def create_train_val_split(dataset: Dataset, val_split: float = 0.1) -> Tuple[Dataset, Dataset]:
    """
    Split dataset into training and validation sets.

    The split is positional and deterministic: the last
    ``int(len(dataset) * val_split)`` rows become the validation set and all
    rows before them become the training set. Rows are not shuffled.

    Args:
        dataset: Dataset to split; must support ``len()`` and ``.select()``.
        val_split: Fraction of rows to hold out for validation, in [0, 1].

    Returns:
        A ``(train_dataset, val_dataset)`` tuple.

    Raises:
        ValueError: If ``val_split`` is not within [0, 1] (including NaN).
    """
    # Outside [0, 1] the size arithmetic below goes negative or exceeds the
    # dataset length and would silently produce a nonsensical split.
    if not 0.0 <= val_split <= 1.0:
        raise ValueError(f"val_split must be between 0 and 1, got {val_split!r}")

    total_size = len(dataset)
    val_size = int(total_size * val_split)
    train_size = total_size - val_size

    # Contiguous ranges: head of the dataset for training, tail for validation.
    train_dataset = dataset.select(range(train_size))
    val_dataset = dataset.select(range(train_size, total_size))

    return train_dataset, val_dataset


In [14]:
# Hold out 10% of the rows for validation, then print both dataset summaries.
train_dataset, val_dataset = create_train_val_split(hf_dataset, val_split=0.1)
print(train_dataset, val_dataset, sep="\n")

Dataset({
    features: ['questions', 'chosen_solutions', 'chosen_logprobs', 'chosen_selfcertainties', 'reject_solutions', 'reject_logprobs', 'reject_selfcertainties'],
    num_rows: 450
})
Dataset({
    features: ['questions', 'chosen_solutions', 'chosen_logprobs', 'chosen_selfcertainties', 'reject_solutions', 'reject_logprobs', 'reject_selfcertainties'],
    num_rows: 50
})


In [43]:
def prepare_dpo_sample(
        question: str,
        chosen_solution: str,
        reject_solution: str,
        tokenizer: AutoTokenizer,
        max_absolute_length: int = 2048
    ) -> Dict[str, torch.Tensor]:
    """
    Build one DPO training example from a question and its two answers.

    Each answer is rendered together with the question through the
    tokenizer's chat template and tokenized as a single sequence. The
    question is additionally rendered on its own, with the generation prompt
    appended, and its token count is used as the index at which the answer
    starts in both full sequences.

    Args:
        question: User-turn text.
        chosen_solution: Assistant-turn text of the preferred answer.
        reject_solution: Assistant-turn text of the dispreferred answer.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``__call__``.
        max_absolute_length: Truncation limit passed to every tokenizer call.

    Returns:
        Dict with ``chosen_input_ids`` / ``chosen_attention_mask`` and
        ``reject_input_ids`` / ``reject_attention_mask`` (leading batch
        dimension squeezed out), plus scalar ``torch.long`` tensors
        ``chosen_answer_start``, ``chosen_answer_length``,
        ``reject_answer_start`` and ``reject_answer_length``.
    """
    def conversation(answer):
        # Two-turn chat: the question followed by the given assistant answer.
        return [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]

    def render(messages, with_generation_prompt):
        # Chat-template the messages into plain text (no tokenization yet).
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=with_generation_prompt
        )

    def encode(text):
        # Single unpadded sequence; truncated only beyond max_absolute_length.
        return tokenizer(
            text,
            truncation=True,
            max_length=max_absolute_length,
            return_tensors="pt",
            padding=False,
        )

    def as_long(value):
        return torch.tensor(value, dtype=torch.long)

    # Full sequences (question + answer) for both responses.
    chosen_text = render(conversation(chosen_solution), False)
    reject_text = render(conversation(reject_solution), False)
    chosen_enc = encode(chosen_text)
    reject_enc = encode(reject_text)

    # Prompt-only sequence: question plus the assistant generation prompt.
    prompt_text = render([{"role": "user", "content": question}], True)
    prompt_enc = encode(prompt_text)

    # The answer is taken to start right after the prompt tokens.
    # NOTE(review): assumes the prompt tokenizes to the same ids on its own as
    # it does at the start of the full sequence -- confirm for the tokenizer used.
    answer_start = prompt_enc["input_ids"].shape[-1]
    chosen_answer_length = chosen_enc["input_ids"].shape[-1] - answer_start
    reject_answer_length = reject_enc["input_ids"].shape[-1] - answer_start

    return {
        "chosen_input_ids": chosen_enc["input_ids"].squeeze(0),
        "chosen_attention_mask": chosen_enc["attention_mask"].squeeze(0),
        "reject_input_ids": reject_enc["input_ids"].squeeze(0),
        "reject_attention_mask": reject_enc["attention_mask"].squeeze(0),
        "chosen_answer_start": as_long(answer_start),
        "chosen_answer_length": as_long(chosen_answer_length),
        "reject_answer_start": as_long(answer_start),
        "reject_answer_length": as_long(reject_answer_length),
    }

In [46]:
# Smoke-test prepare_dpo_sample on the first preference pair.
question, chosen_solution, reject_solution = (
    hf_dataset[0][column]
    for column in ("questions", "chosen_solutions", "reject_solutions")
)
result = prepare_dpo_sample(question, chosen_solution, reject_solution, tokenizer)

# Decode the chosen sequence from the answer boundary onward to check where
# the answer span begins.
print(tokenizer.decode(result["chosen_input_ids"][result["chosen_answer_start"]:]))
# Answer lengths (in tokens) for the chosen and the rejected sequence.
print(result["chosen_answer_length"], result["reject_answer_length"], sep="\n")

Natalia sold 48 clips in April.
In May, she sold half as many as in April, so she sold 48/2 = <<48/2=24>>24 clips.
Altogether, Natalia sold 48 + 24 = <<48+24=72>>72 clips in April and May.
#### 72<|end|><|endoftext|>
tensor(88)
tensor(94)
