In [1]:
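# Uncomment to install the dependencies. Use %pip (not !pip) so the packages are
# installed into the environment of the running kernel.
# %pip install torch
# %pip install transformers
# %pip install datasets
# %pip install trl
# %pip install peft
# %pip install accelerate
# %pip install bitsandbytes
# %pip install scikit-learn
# %pip install numpy
# %pip install pandas
# %pip install matplotlib
# %pip install wandb
# %pip install tqdm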

In [None]:
import torch
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TrainerCallback
from datasets import Dataset
import os
import json
import re
import wandb
import pandas as pd

import trl
from trl import GRPOConfig, GRPOTrainer
print(f"trl version: {trl.__version__}")

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) so that runs are repeatable.
SEED = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(SEED)

# 1. Model Selection and Loading

In [None]:
# Small causal language models to choose from, keyed by a short alias
model_options = {
    "tinyllama": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    "gpt2": "gpt2",  # 124M parameters
    "gpt2-medium": "gpt2-medium",  # 355M parameters
    "opt-125m": "facebook/opt-125m",
    "bloom-560m": "bigscience/bloom-560m"
}

# Alias of the checkpoint that the rest of the notebook works with
MODEL_CHOICE = "tinyllama"
model_name = model_options[MODEL_CHOICE]

print(f"Selected model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the end-of-sequence token when the tokenizer defines no padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Decide where the weights go: automatic placement on CUDA, otherwise pin the
# whole model to MPS or, as a last resort, to the CPU.
if torch.cuda.is_available():
    device_map = "auto"
    print("Using CUDA")
elif torch.backends.mps.is_available():
    device_map = {"": "mps"}
    print("CUDA unavailable, using MPS (Metal Performance Shaders) for Mac")
else:
    device_map = {"": "cpu"}
    print("CUDA and MPS unavailable, using CPU")

# NOTE(review): bfloat16 is requested on every device, including CPU and MPS —
# confirm the target hardware supports it.
model_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": device_map,
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

# 2. Dataset Creation

In [4]:
def generate_math_problem():
    """Build one random arithmetic question.

    An operation is drawn uniformly from +, -, * and /. Operands are chosen so
    that the answer is always a non-negative integer: subtraction never
    subtracts more than the first operand, and division always divides evenly.

    Returns:
        tuple: (prompt, answer) where prompt is the question text and answer
        is the integer result.
    """
    symbol = random.choice(["+", "-", "*", "/"])

    if symbol == "/":
        # Draw the divisor first, then scale it, so the quotient is an integer
        divisor = random.randint(1, 10)
        dividend = divisor * random.randint(1, 10)
        return f"What is {dividend} divided by {divisor}?", dividend // divisor

    if symbol == "*":
        left = random.randint(1, 20)
        right = random.randint(1, 20)
        return f"What is {left} multiplied by {right}?", left * right

    # Addition and subtraction both start from a first operand in 1..100
    left = random.randint(1, 100)
    if symbol == "+":
        right = random.randint(1, 100)
        return f"What is {left} plus {right}?", left + right

    # Subtraction: cap the second operand at the first so the result is >= 0
    right = random.randint(1, min(left, 100))
    return f"What is {left} minus {right}?", left - right

# Create training and validation datasets
def create_datasets(train_size=200, val_size=50):
    """Generate random math problems and split them into two datasets.

    Parameters:
    - train_size: number of examples in the training split
    - val_size: number of examples in the validation split

    Returns:
    - (train_dataset, val_dataset), each a Dataset with "prompt" and "answer" columns
    """
    total_examples = train_size + val_size

    # One record per generated problem
    records = [
        {"prompt": question, "answer": solution}
        for question, solution in (generate_math_problem() for _ in range(total_examples))
    ]

    random.shuffle(records)

    # The first train_size records form the training split, the rest validation
    train_split = Dataset.from_list(records[:train_size])
    val_split = Dataset.from_list(records[train_size:])

    return train_split, val_split

In [None]:
# Build the training and validation splits used by the rest of the notebook
train_dataset, val_dataset = create_datasets(train_size=1000, val_size=50)

# Show the first three training records as a quick sanity check
print("Examples from the training dataset:")
for example_number in (1, 2, 3):
    print(f"Example {example_number}: {train_dataset[example_number - 1]}")

# 3. Base Model Testing

In [None]:
def format_prompt(problem):
    """Wrap a problem statement in the "Problem: ... / Answer:" template.

    Parameters:
    - problem: the question text

    Returns:
    - the prompt string, ending with "Answer:" so the model continues with the answer
    """
    prompt_text = f"Problem: {problem}\nAnswer:"
    return prompt_text

# Create a pipeline for text generation around the loaded model.
# Answers are short, so generation is limited to between 1 and 4 new tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    min_new_tokens=1,
    max_new_tokens=4,
)

# Test the base model on a few examples
print("\nTesting the base model:")
for i in range(3):
    # Keep the raw question separate from the formatted prompt: the prompt
    # already starts with "Problem: ", so printing it behind another
    # "Problem: " label would duplicate the label and append "Answer:".
    problem = train_dataset[i]["prompt"]
    expected = train_dataset[i]["answer"]
    prompt = format_prompt(problem)
    # "generated_text" holds the prompt followed by the generated continuation
    response = pipe(prompt)[0]["generated_text"]

    print(f"\nProblem: {problem}")
    print(f"Expected answer: {expected}")
    print(f"Model response: {response}")

# 4. Reward Function

In [20]:
def reward_function(prompts, completions, **kwargs):
    """
    Reward function for evaluating model responses.

    A response earns 1.0 for producing an integer directly after "Answer:".
    The label may appear inside the completion itself, or at the end of the
    prompt when the completion is only the generated continuation (GRPOTrainer
    is documented to pass completions without the prompt text). On top of the
    format reward, closeness to the ground truth adds 2.0 (exact), 1.0
    (within 5) or 0.5 (within 10).

    Parameters:
    - prompts: list of problems
    - completions: list of model responses (strings)
    - kwargs: additional arguments that GRPOTrainer may pass; 'answer' is read
      as the list of ground-truth values (one per completion)

    Returns:
    - list of rewards (floats), one for each response
    """
    answer_after_label = re.compile(r'Answer:\s*(\d+)')
    leading_number = re.compile(r'\s*(\d+)')

    ground_truth = kwargs.get('answer')
    if ground_truth is None:
        # No ground truth supplied (or passed explicitly as None): format reward only
        ground_truth = [None] * len(prompts)

    rewards = []

    for prompt, completion, gt in zip(prompts, completions, ground_truth):
        # Extract numerical answer
        match = answer_after_label.search(completion)
        if match is None and isinstance(prompt, str) and prompt.rstrip().endswith("Answer:"):
            # The prompt already ends with the label, so the completion is
            # expected to start with the number.
            match = leading_number.match(completion)

        if match is None:
            rewards.append(0.0)
            continue

        reward = 1.0  # format reward

        # Check answer correctness if ground truth is available
        if gt is not None:
            try:
                expected = gt if isinstance(gt, (int, float)) else float(gt)
                error = abs(int(match.group(1)) - expected)
            except (TypeError, ValueError, OverflowError):
                # Ground truth is not numeric: keep the format reward only
                error = None

            # Differentiated reward based on closeness to correct answer
            if error is not None:
                if error == 0:
                    reward += 2.0  # Full reward for correct answer
                elif error <= 5:
                    reward += 1.0  # Partial reward for close answer
                elif error <= 10:
                    reward += 0.5  # Small reward for not too distant answer

        rewards.append(reward)

    return rewards

# 5. GRPO Trainer Configuration

In [None]:
# GRPO configuration with optimal parameters for small models
grpo_config = GRPOConfig(
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # beta=0.1,  # Target KL divergence
    seed=SEED,
    scale_rewards=True,  # Reward scaling
    output_dir="./grpo_checkpoint",  # Directory for checkpoint saving
    logging_steps=10,  # Log every 10 steps
    save_strategy="epoch",  # Save model at the end of each epoch
    eval_strategy="epoch",  # Evaluate model at the end of each epoch
    num_train_epochs=3,
    max_completion_length=4,
    report_to=["wandb"],  # Enable wandb reports
    log_completions=True,  # Enable text example logging
    wandb_log_unique_prompts=True  # Log only unique prompts to save space
)


def _with_formatted_prompt(example):
    """Return the example's prompt wrapped in the "Problem: ... / Answer:" template."""
    return {"prompt": format_prompt(example["prompt"])}


# The datasets store the bare question, while every generation and evaluation
# cell in this notebook prompts the model through format_prompt. Apply the same
# template to the copies handed to the trainer so that training and evaluation
# use identical prompts. The original datasets are left untouched.
# NOTE(review): with prompts ending in "Answer:", the reward function must be
# able to score a completion that does not repeat the "Answer:" label — confirm.
grpo_train_dataset = train_dataset.map(_with_formatted_prompt)
grpo_eval_dataset = val_dataset.map(_with_formatted_prompt)

# GRPO trainer
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    args=grpo_config,
    train_dataset=grpo_train_dataset,
    eval_dataset=grpo_eval_dataset,
    reward_funcs=[reward_function],
)

# 6. Data Preparation

In [16]:
def prepare_dataset_for_grpo(dataset, sample_size=10):
    """Measures the untrained model's average reward on a small sample.

    The global model is moved to the CPU for generation and is always moved
    back to the device it started on, even if generation fails, so that later
    cells keep using the intended device.

    Parameters:
    - dataset: iterable of records with 'prompt' and 'answer' fields
    - sample_size: number of leading examples to generate for (default 10)

    Returns:
    - None; the average reward is printed
    """
    problems = [format_prompt(item['prompt']) for item in dataset][:sample_size]
    expected_answers = [item['answer'] for item in dataset][:sample_size]

    if not problems:
        # Nothing to average over; avoid dividing by zero below
        print("Dataset is empty, skipping the initial reward estimate.")
        return

    initial_responses = []
    print("Getting initial model responses...")

    # Remember the device before moving: Module.to() moves the model in place,
    # so model.device would report "cpu" afterwards.
    original_device = model.device

    try:
        cpu_model = model.to("cpu")

        with torch.no_grad():
            for problem in tqdm(problems):
                inputs = tokenizer(problem, return_tensors="pt")
                outputs = cpu_model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs.get("attention_mask"),
                    max_new_tokens=4,
                    min_new_tokens=1,
                    pad_token_id=tokenizer.eos_token_id
                )
                # The decoded text contains the prompt followed by the continuation
                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                initial_responses.append(response)
    finally:
        model.to(original_device)

    initial_rewards = reward_function(problems, initial_responses, answer=expected_answers)
    avg_initial_reward = sum(initial_rewards) / len(initial_rewards)

    print(f"Average initial reward on sample: {avg_initial_reward:.2f}")


def evaluate_model_for_wandb(model, tokenizer, test_problems=None, epoch=0):
    """
    Evaluates the model on test examples and logs results to wandb.

    Generation runs with the model in eval mode; if the model was in train
    mode on entry (for example when called from a training callback), train
    mode is restored before returning.

    Parameters:
    - model: trained model
    - tokenizer: tokenizer
    - test_problems: list of test problems (if None, predefined ones are used)
    - epoch: epoch number

    Returns:
    - list of dicts with keys "Problem", "Model Response" and "Numerical Answer"
    """
    if test_problems is None:
        test_problems = [
            "What is 15 plus 27?",
            "What is 42 minus 18?",
            "What is 7 multiplied by 8?",
            "What is 45 divided by 9?"
        ]

    results = []

    was_training = getattr(model, "training", False)
    model.eval()
    try:
        for problem in test_problems:
            formatted_prompt = format_prompt(problem)

            inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs.get("attention_mask"),
                max_new_tokens=4,
                min_new_tokens=1,
                pad_token_id=tokenizer.eos_token_id
            )
            # The decoded text contains the prompt followed by the continuation
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            match = re.search(r'Answer:\s*(\d+)', response)
            answer = match.group(1) if match else "Invalid format"

            results.append({
                "Problem": problem,
                "Model Response": response,
                "Numerical Answer": answer
            })
    finally:
        if was_training:
            model.train()

    # Log only when a wandb run is active
    if wandb.run is not None:
        wandb.log({
            "model_samples/epoch": epoch,
            "model_samples/examples": wandb.Table(
                dataframe=pd.DataFrame(results)
            )
        })

    return results


class WandbEvalCallback(TrainerCallback):
    """Trainer callback that logs sample generations to wandb at the end of every epoch.

    The model, tokenizer and problem list are captured at construction time and
    passed to evaluate_model_for_wandb together with the current epoch.
    """

    def __init__(self, model, tokenizer, test_problems=None):
        self.model, self.tokenizer, self.test_problems = model, tokenizer, test_problems

    def on_epoch_end(self, args, state, control, **kwargs):
        # state.epoch is forwarded as the epoch value attached to the logged samples
        evaluate_model_for_wandb(
            model=self.model,
            tokenizer=self.tokenizer,
            test_problems=self.test_problems,
            epoch=state.epoch,
        )
        return control

# Attach the sample-logging callback only when wandb reporting is enabled
if "wandb" in grpo_config.report_to:
    # Two fixed problems per operation, evaluated after every epoch
    test_problems = [
        "What is 15 plus 27?",
        "What is 42 minus 18?",
        "What is 7 multiplied by 8?",
        "What is 45 divided by 9?",
        "What is 33 plus 44?",
        "What is 99 minus 34?",
        "What is 12 multiplied by 5?",
        "What is 72 divided by 8?"
    ]
    wandb_eval_callback = WandbEvalCallback(model, tokenizer, test_problems)
    trainer.add_callback(wandb_eval_callback)

In [None]:
# Take the first 50 training examples and report the untrained model's average
# reward on them. prepare_dataset_for_grpo only generates responses for the
# first 10 problems it receives and prints the result.
train_subset = train_dataset.select(range(50))
prepare_dataset_for_grpo(train_subset)

# 7. Training

In [None]:
# Run GRPO training with the trainer configured earlier in the notebook.
print("\nStarting GRPO training...")
trainer.train()

# 8. Final Comparison Before/After

In [None]:
print("\nFinal comparison of model before and after training:")

# Load a fresh copy of the pretrained weights (before training)
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    low_cpu_mem_usage=True
)

# The earlier `pipe` wraps `model`, the same object that was handed to the
# trainer, so it cannot serve as the "before" baseline. Build a pipeline
# around the freshly loaded weights, with the same generation limits.
original_pipe = pipeline(
    "text-generation",
    model=original_model,
    tokenizer=tokenizer,
    min_new_tokens=1,
    max_new_tokens=4,
)

test_problems = [
    "What is 15 plus 27?",
    "What is 42 minus 18?",
    "What is 7 multiplied by 8?",
    "What is 45 divided by 9?"
]

print("\nComparison of responses on new examples:")
for problem in test_problems:
    # Both models receive exactly the same formatted prompt
    formatted_prompt = format_prompt(problem)

    # Original model
    original_response = original_pipe(formatted_prompt)[0]["generated_text"]

    # Trained model
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(trainer.model.device)
    outputs = trainer.model.generate(
        inputs["input_ids"],
        max_new_tokens=4,
        min_new_tokens=1,
        pad_token_id=tokenizer.eos_token_id
    )
    trained_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\nProblem: {problem}")
    print(f"Original model: {original_response}")
    print(f"Trained model: {trained_response}")