In [1]:

import pandas as pd
from transformers import Phi3ForSequenceClassification, BitsAndBytesConfig, AutoTokenizer,  AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, logging
import torch
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import gc
import time
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_from_disk, load_metric
from datasets import Dataset
from torch.utils.data import DataLoader
import transformers

MODEL_NAME = 'microsoft/Phi-3-mini-4k-instruct'
DEVICE = torch.cuda.current_device()



In [2]:

# Load the dataset
test = pd.read_csv('train.csv')

# Take a subset of the data (1000 records)
test = test.sample(n=50, random_state=42)

# Load sample submission
sample_sub = pd.read_csv('sample_submission.csv')

In [3]:
def process(input_str):
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return  ' '.join(sentences)

test.loc[:, 'prompt'] = test['prompt'].apply(process)
test.loc[:, 'response_a'] = test['response_a'].apply(process)
test.loc[:, 'response_b'] = test['response_b'].apply(process)

test['text'] = 'User prompt: ' + test['prompt'] +  '\n\nModel A :\n' + test['response_a'] +'\n\n--------\n\nModel B:\n'  + test['response_b']



In [4]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-4k-instruct')

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

tokens = tokenizer(list(test['text']), truncation=True, max_length=1200, return_tensors='pt', padding=True)

test['labels'] = test[['winner_model_a', 'winner_model_b', 'winner_tie']].idxmax(axis=1).apply(lambda x: {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}[x])

input_ids = tokens['input_ids'].to(DEVICE, dtype=torch.int32)
attention_mask = tokens['attention_mask'].to(DEVICE, dtype=torch.int32)

input_ids_cpu = [tensor.cpu().tolist() for tensor in input_ids]
attention_mask_cpu = [tensor.cpu().tolist() for tensor in attention_mask]

data = pd.DataFrame()
data['input_ids'] = input_ids_cpu
data['attention_mask'] = attention_mask_cpu
data['labels'] = test['labels'].tolist()

# Convert to Dataset
dataset = Dataset.from_pandas(data)

dataloader = DataLoader(dataset, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer), shuffle=False)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Define BitsAndBytes configuration
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

# Load the model
model = Phi3ForSequenceClassification.from_pretrained(MODEL_NAME,
                                                        device_map={'': DEVICE},
                                                        num_labels=3, 
                                                        torch_dtype=torch.float16,
                                                        attn_implementation="flash_attention_2",
                                                        quantization_config=bnb_config)

# Freeze the original model parameters
for param in model.parameters():
    param.requires_grad = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Low-rank adaptation parameter
    lora_alpha=16,
    target_modules=['o_proj', 'qkv_proj'],  # Adjust based on your model architecture
    lora_dropout=0.1,
    bias="none"
)

# Wrap the model with PEFT
model = get_peft_model(model, lora_config)
model.train()
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=2e-5,
    logging_dir='./logs',
    gradient_accumulation_steps=16,
    fp16=True
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


trainable params: 4,718,592 || all params: 3,727,306,752 || trainable%: 0.1266


  0%|          | 0/1 [00:00<?, ?it/s]