## Idea

## Reward Analysis

## Ablation

## Do we have to soup?

## KL-Hack

In [1]:
#| code-summary: Imports and model evaluation function
#| output: false
#| echo: false
import copy
import os, functools, itertools

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from einops import rearrange

from datasets import load_dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from transformers import set_seed


from trl import SFTConfig, SFTTrainer
from trl import RewardConfig, RewardTrainer
from trl import PPOConfig, PPOTrainer

# Make sure the working directories used throughout the notebook exist.
for d in ['data', 'models', 'logs']: os.makedirs(d, exist_ok = True)

# Pick the best available backend: CUDA first, then Apple MPS, then CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available() else
    ('mps' if torch.backends.mps.is_available() else
    'cpu')
)

# Compare on `device.type` (a plain string). Comparing the `torch.device`
# object itself against the string 'cuda' is not reliable and made this fall
# back to a single worker even when GPUs were present.
num_proc = int(4 * torch.cuda.device_count()) if device.type == 'cuda' else 1

# Seed both the transformers helper and torch for reproducibility.
set_seed(42)
torch.manual_seed(42)
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
m0 = AutoModelForCausalLM.from_pretrained('emiliocantuc/SmolLM2-135M-SFT-0', torch_dtype = torch.bfloat16, device_map = 'auto')
m1 = AutoModelForCausalLM.from_pretrained('emiliocantuc/SmolLM2-135M-SFT-1', torch_dtype = torch.bfloat16, device_map = 'auto')

# Souping: linear interpolation of the two checkpoints' weights,
#   souped = (1 - alpha) * m0 + alpha * m1
alpha = 0.5

# Build each state dict once; calling `state_dict()` inside the loop
# re-creates the whole mapping on every iteration.
state0, state1 = m0.state_dict(), m1.state_dict()

souped_state = {}
for k, w0 in state0.items():
    w1 = state1[k].to(w0.device)
    if w0.is_floating_point():
        # Interpolate in float32 and cast back to the stored dtype so the
        # arithmetic itself is not carried out in bfloat16.
        souped_state[k] = ((1 - alpha) * w0.float() + alpha * w1.float()).to(w0.dtype)
    else:
        # Non-floating tensors cannot be meaningfully averaged; keep m0's copy.
        souped_state[k] = w0.clone()

# overwrite m0 with the souped weights and save it for later cells
m0.load_state_dict(souped_state)
m0.save_pretrained('models/souped/0&1-0.5')

In [3]:
def preprocess_function(examples, tokenizer, max_length):
    """Tokenize the `prompt` field of a batch and keep only the token ids.

    Args:
        examples: Batch mapping that contains a `'prompt'` entry.
        tokenizer: Callable tokenizer; invoked with truncation enabled and
            padding set to `'max_length'`.
        max_length: Length passed to the tokenizer as `max_length`.

    Returns:
        A dict with a single `'input_ids'` key taken from the tokenizer output.
    """
    encoded = tokenizer(
        examples['prompt'],
        truncation = True,
        padding = 'max_length',
        max_length = max_length,
    )
    return {'input_ids': encoded['input_ids']}

In [4]:
# Policy that PPO will optimize.
policy_model = AutoModelForCausalLM.from_pretrained('emiliocantuc/SmolLM2-135M-SFT-0', torch_dtype = torch.bfloat16, device_map = 'auto')

# Weight-averaged ("souped") checkpoint written to disk by the souping cell.
soup_model = AutoModelForCausalLM.from_pretrained('models/souped/0&1-0.5', torch_dtype = torch.bfloat16, device_map = 'auto')

# TODO: switch to the dedicated reward checkpoint:
# reward_model = AutoModelForSequenceClassification.from_pretrained('emiliocantuc/SmolLM2-135M-SFT-0-Reward', torch_dtype = torch.bfloat16, device_map = 'auto', num_labels = 1)
# Until then the reward model is built from the SFT checkpoint, so its
# `score` head is newly initialized (see the warning this cell prints).
# The accidental `model` alias from the chained assignment is dropped so the
# name does not leak into later cells.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    'emiliocantuc/SmolLM2-135M-SFT-0',
    num_labels = 1,
    torch_dtype = torch.bfloat16,
    attn_implementation = 'flash_attention_2',
    device_map = 'auto',
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at emiliocantuc/SmolLM2-135M-SFT-0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
tokenizer_name = 'HuggingFaceTB/SmolLM2-135M-Instruct' # Need the instruct version for chat template
dataset_name = 'HuggingFaceH4/ultrafeedback_binarized'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast = True)

tokenizer.pad_token = tokenizer.eos_token  # Fix padding issue
# Take the pad id from the tokenizer that actually pads the inputs. Each
# model's own `eos_token_id` is not guaranteed to match the tokenizer's, and a
# mismatch would make the models look for a different pad token than the one
# present in the batches.
for model in [policy_model, soup_model, reward_model]:
    model.config.pad_token_id = tokenizer.pad_token_id

dataset = load_dataset(dataset_name)

MAX_LENGTH = 1024 #2048
DEBUG = True  # keep only a handful of rows per split for quick iteration
DEBUG_ROWS = 10

print('Preprocessing the dataset')
if DEBUG:
    # Clamp to the split length so a short split does not raise on `select`.
    dataset = DatasetDict({k: dataset[k].select(range(min(DEBUG_ROWS, len(dataset[k])))) for k in dataset})
    print(dataset)

# NOTE(review): prompts are tokenized as raw text padded to MAX_LENGTH with the
# tokenizer's default padding side; confirm this is what the PPO trainer
# expects (TRL's example pads on the left and applies the chat template).
column_names = dataset.column_names['train_prefs']
dataset = dataset.map(
    preprocess_function,
    fn_kwargs = {'tokenizer': tokenizer, 'max_length': MAX_LENGTH}, #?
    num_proc = min(16, os.cpu_count() or 1),  # cpu_count() may return None
    batched = True,
    remove_columns = column_names,
    desc = 'Preprocessing dataset',
)
dataset.set_format(type = 'torch')

num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.


Preprocessing the dataset
DatasetDict({
    train_prefs: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 10
    })
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 10
    })
    test_prefs: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 10
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 10
    })
    train_gen: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 10
    })
    test_gen: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
   

Preprocessing dataset (num_proc=10): 100%|██████████| 10/10 [00:00<00:00, 26.77 examples/s]
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.
Preprocessing dataset (num_proc=10): 100%|██████████| 10/10 [00:00<00:00, 24.79 examples/s]
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.
Preprocessing dataset (num_proc=10): 100%|██████████| 10/10 [00:00<00:00, 24.28 examples/s]
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.
Preprocessing dataset (num_proc=10): 100%|██████████| 10/10 [00:00<00:00, 26.63 examples/s]
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.
Preprocessing dataset (num_proc=10): 100%|██████████| 10/10 [00:00<00:00, 24.48 examples/s]
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.
Preprocessing dataset (num_proc=10): 100%|██████████| 10/10 [00:00<00:00, 25.01 examples/s]


In [6]:
# PPO Configuration
# NOTE(review): in recent TRL releases the effective batch size appears to be
# derived from `per_device_train_batch_size` (and related settings) inside the
# trainer; confirm that `batch_size = 1` actually takes effect.
config = PPOConfig(
    learning_rate = 5e-6,
    batch_size = 1,
    gradient_checkpointing = True,
    output_dir = 'models/ppo/tmp'
    # Add other PPO parameters as needed
)

# The value model is optimized alongside the policy, so it must be its own
# copy. Passing the reward model object itself would let PPO updates change
# the reward signal the policy is being trained against.
value_model = copy.deepcopy(reward_model)

# Initialize PPO trainer
ppo_trainer = PPOTrainer(
    config,
    model = policy_model,
    ref_model = soup_model,  # KL will be computed against this model
    reward_model = reward_model,
    value_model = value_model,
    processing_class = tokenizer,
    train_dataset = dataset['train_prefs'],
    eval_dataset = dataset['test_prefs'],
)

ppo_trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


===training policy===


[34m[1mwandb[0m: Currently logged in as: [33memiliocantuc[0m ([33memiliocantuc-university-of-michigan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
