In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Any, Dict, List, Optional
from matplotlib.axes import Axes
import torch
from datasets import Dataset
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, TrainingArguments, Trainer
colors = sns.color_palette("pastel")

2024-08-03 18:01:24.360033: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Select which GPU index is visible to this process and configure the
# PyTorch CUDA allocator's maximum split size, in a single environment update.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1",
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128",
    }
)

In [3]:
# Load the training CSV (the path is relative to the notebook's working
# directory) and preview the first rows.
data = pd.read_csv("Data/train.csv")
data.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [4]:
print(f"Data Shape | train {data.shape}")

Data Shape | train (57477, 9)


In [5]:
# Remove the "id" column; no later cell in this notebook reads it.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps the cell
# idempotent: re-running it once "id" is already gone no longer raises KeyError.
data = data.drop(columns="id", errors="ignore")
data.head()

Unnamed: 0,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [6]:
# Remember the row count so the de-duplication cell can report the difference.
no_rows_before = len(data)
# keep=False flags every member of a duplicate group, so this counts all rows
# that take part in a duplication, not only the surplus copies.
number_of_duplicates = data.duplicated(keep=False).sum()
print(f"There exist {number_of_duplicates} duplicated rows.")

There exist 14 duplicated rows.


In [7]:
# Keep the first occurrence of each duplicated row and renumber the index 0..n-1.
data = data.drop_duplicates(keep="first", ignore_index=True)
no_rows_after = len(data)
print(f"After removing duplicates, #samples drops from {no_rows_before} to {no_rows_after}.")

After removing duplicates, #samples drops from 57477 to 57470.


In [8]:
# Collapse the three winner indicator columns into one integer class id:
#   0 -> model A wins, 1 -> model B wins, 2 -> tie.
# Conditions are tested in order, so a tie flag takes precedence over the
# model-B flag, and rows with neither flag set fall back to class 0.
is_tie = (data['winner_tie'] == 1).to_numpy()
b_wins = (data['winner_model_b'] == 1).to_numpy()
labels = np.select([is_tie, b_wins], [2, 1], default=0).astype(np.int32)
data["labels"] = labels

In [9]:
data.head()

Unnamed: 0,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,labels
0,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0,0
1,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0,1
2,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1,2
3,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0,0
4,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0,1


In [10]:
def process(input_str):
    """Decode one JSON-encoded list cell (e.g. ``'["a","b"]'``) into a list of strings.

    The cell is parsed with ``json.loads`` instead of being split on ``'","'``.
    Compared with the previous strip/split approach this:

    * decodes JSON escapes (``\\n``, ``\\"``, ``\\uXXXX``) into the real
      characters, so later newline handling sees actual newlines;
    * does not depend on the exact separator between items;
    * returns ``[]`` for an empty JSON list rather than ``['']``.

    Args:
        input_str: The raw cell value. A value that is already a ``list`` is
            returned as a shallow copy so re-running the calling cell is harmless.

    Returns:
        list[str]: One string per list item. JSON ``null`` items become the
        string ``'null'`` (the value the previous implementation produced for
        them); other non-string items are converted with ``str``. Characters
        that cannot be encoded as UTF-8 (lone surrogates from ``\\uXXXX``
        escapes) are replaced with ``'?'``.
    """
    import json  # local import keeps this cell self-contained

    if isinstance(input_str, list):
        return list(input_str)

    try:
        decoded = json.loads(input_str)
    except (TypeError, ValueError):
        decoded = None

    if not isinstance(decoded, list):
        # Not a JSON list: fall back to the original naive strip/split parsing.
        stripped_str = input_str.strip('[]')
        return [s.strip('"') for s in stripped_str.split('","')]

    sentences = []
    for item in decoded:
        text = 'null' if item is None else str(item)
        # Round-trip through UTF-8 so unencodable lone surrogates are replaced.
        sentences.append(text.encode('utf-8', errors='replace').decode('utf-8'))
    return sentences

# Decode each of the three list-valued text columns with process(), one column at a time.
for column in ('prompt', 'response_a', 'response_b'):
    data[column] = data[column].apply(process)

data.head()

Unnamed: 0,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,labels
0,gpt-4-1106-preview,gpt-4-0613,[Is it morally right to try to have a certain ...,[The question of whether it is morally right t...,"[As an AI, I don't have personal beliefs or op...",1,0,0,0
1,koala-13b,gpt-4-0613,[What is the difference between marriage licen...,[A marriage license is a legal document that a...,[A marriage license and a marriage certificate...,0,1,0,1
2,gpt-3.5-turbo-0613,mistral-medium,[explain function calling. how would you call ...,[Function calling is the process of invoking o...,[Function calling is the process of invoking a...,0,0,1,2
3,llama-2-13b-chat,mistral-7b-instruct,[How can I create a test set for a very rare c...,[Creating a test set for a very rare category ...,[When building a classifier for a very rare ca...,1,0,0,0
4,koala-13b,gpt-3.5-turbo-0314,[What is the best way to travel from Tel-Aviv ...,[The best way to travel from Tel Aviv to Jerus...,[The best way to travel from Tel-Aviv to Jerus...,0,1,0,1


In [11]:
# Marker tokens used when flattening a conversation into one string; they are
# registered with the tokenizer as additional special tokens further down.
# NOTE(review): '[R_STRAT]' looks like a typo for '[R_START]'. The same spelling
# is used in format_conversation, so any rename has to change both places.
special_tokens = ['[R_STRAT]', '[R_END]', '<PROMPT>', '<RESPONSE>', '[NL]', '[NLNL]']

def preprocess_text(text):
    """Replace line breaks in ``text`` with explicit marker tokens.

    Two consecutive newlines become ``' [NLNL] '`` and each remaining single
    newline becomes ``' [NL] '``. The double-newline replacement runs first so
    a blank line is not consumed as two single breaks.
    """
    return text.replace('\n\n', ' [NLNL] ').replace('\n', ' [NL] ')

def format_conversation(row):
    """Flatten one row's multi-turn conversation into a single marked-up string.

    Turn ``i`` is rendered as
    ``<PROMPT> p_i <RESPONSE> [R_STRAT] a_i [R_END] [R_STRAT] b_i [R_END]``
    and the turns are joined with ``' [NLNL] '``. Only as many turns as the
    shortest of ``row['prompt']``, ``row['response_a']`` and
    ``row['response_b']`` are used.

    NOTE(review): both responses go through ``preprocess_text`` but the prompt
    is inserted unchanged — confirm that this asymmetry is intended.
    """
    # zip() stops at the shortest sequence, which gives the same turn count as
    # taking the minimum of the three lengths.
    turns = zip(row['prompt'], row['response_a'], row['response_b'])
    rendered = [
        f"<PROMPT> {question} "
        f"<RESPONSE> [R_STRAT] {preprocess_text(answer_a)} [R_END] "
        f"[R_STRAT] {preprocess_text(answer_b)} [R_END]"
        for question, answer_a, answer_b in turns
    ]
    return ' [NLNL] '.join(rendered)

# Build the flattened conversation string for every row (row-wise apply).
data['text'] = data.apply(format_conversation, axis=1)
# Argument later handed to tokenizer.add_special_tokens.
special_tokens_dict = {'additional_special_tokens': special_tokens}
data.head()

Unnamed: 0,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,labels,text
0,gpt-4-1106-preview,gpt-4-0613,[Is it morally right to try to have a certain ...,[The question of whether it is morally right t...,"[As an AI, I don't have personal beliefs or op...",1,0,0,0,<PROMPT> Is it morally right to try to have a ...
1,koala-13b,gpt-4-0613,[What is the difference between marriage licen...,[A marriage license is a legal document that a...,[A marriage license and a marriage certificate...,0,1,0,1,<PROMPT> What is the difference between marria...
2,gpt-3.5-turbo-0613,mistral-medium,[explain function calling. how would you call ...,[Function calling is the process of invoking o...,[Function calling is the process of invoking a...,0,0,1,2,<PROMPT> explain function calling. how would y...
3,llama-2-13b-chat,mistral-7b-instruct,[How can I create a test set for a very rare c...,[Creating a test set for a very rare category ...,[When building a classifier for a very rare ca...,1,0,0,0,<PROMPT> How can I create a test set for a ver...
4,koala-13b,gpt-3.5-turbo-0314,[What is the best way to travel from Tel-Aviv ...,[The best way to travel from Tel Aviv to Jerus...,[The best way to travel from Tel-Aviv to Jerus...,0,1,0,1,<PROMPT> What is the best way to travel from T...


In [12]:
# Load the pretrained DeBERTa-v3 xsmall tokenizer and a sequence-classification model.
# NOTE(review): no num_labels is passed here, and `model` is rebound later in the
# notebook by a second from_pretrained(..., num_labels=3) call right before training.
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-xsmall')
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-xsmall')

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Duplicate of the dictionary already built at the end of the text-formatting cell.
special_tokens_dict = {'additional_special_tokens': special_tokens}

In [14]:
# Register the marker tokens with the tokenizer.
# NOTE(review): num_added_toks is not used anywhere else in the visible notebook.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Resize the model's token embeddings to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))

# Verify if special tokens are added
print(tokenizer.additional_special_tokens)

['[R_STRAT]', '[R_END]', '<PROMPT>', '<RESPONSE>', '[NL]', '[NLNL]']


In [15]:
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(data)
# A fixed seed makes the 80/20 split (and therefore the 10% training subset
# taken below) identical on every run; without it each run trains and
# evaluates on different rows and results cannot be reproduced.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
# Train on the first 10% of the training split; validate on the full held-out 20%.
train_dataset = dataset['train'].select(range(int(len(dataset['train']) * 0.1)))
val_dataset = dataset['test']

In [16]:
int(len(dataset['train']) * 0.1)

4597

In [17]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of ``examples`` with the notebook's tokenizer.

    Sequences are truncated to, and padded up to, a fixed length of 1024.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding="max_length",
    )

# Tokenize the training and validation datasets
# (batched=True hands tokenize_function a batch of examples per call).
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4597 [00:00<?, ? examples/s]

Map:   0%|          | 0/11494 [00:00<?, ? examples/s]

In [18]:
def prepare_dataset(dataset):
    """Drop the raw source columns and switch the dataset to torch output.

    The column removal yields a new dataset object; the ``"torch"`` format is
    then set on that new object, which is returned.
    """
    unused_columns = [
        "model_a", "model_b",
        "prompt", "response_a", "response_b",
        "winner_model_a", "winner_model_b", "winner_tie",
    ]
    trimmed = dataset.remove_columns(unused_columns)
    trimmed.set_format("torch")
    return trimmed

# Apply the same column clean-up and torch formatting to both splits.
train_dataset = prepare_dataset(train_dataset)
val_dataset = prepare_dataset(val_dataset)


In [19]:
train_dataset.to_pandas().head()

Unnamed: 0,labels,text,input_ids,token_type_ids,attention_mask
0,1,<PROMPT> Explain the self-attention mechanism ...,"[1, 128003, 19719, 262, 934, 271, 60602, 4866,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,0,<PROMPT> Create a text of no more than 350 cha...,"[1, 128003, 5254, 266, 1529, 265, 363, 310, 35...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,0,<PROMPT> write me a haiku about winter <RESPON...,"[1, 128003, 1183, 351, 266, 51314, 314, 2014, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1,<PROMPT> Who is Olivia Rodrigo <RESPONSE> [R_S...,"[1, 128003, 1876, 269, 14829, 36970, 128004, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,2,<PROMPT> Is there any good reason to believe t...,"[1, 128003, 273, 268, 343, 356, 397, 919, 264,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [20]:
# Training configuration shared by both Trainer instances in this notebook.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
)



In [21]:
import torch.nn as nn
# Swap the classification head for a new 3-output linear layer with the same input width.
# NOTE(review): this modifies the model loaded earlier, but `model` is reassigned by a
# later from_pretrained(..., num_labels=3) call before training, so this replacement
# does not reach the model that is actually trained.
model.classifier = nn.Linear(in_features=model.classifier.in_features, out_features=3, bias=True)

In [22]:
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128007, 384, padding_idx=0)
      (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=384, out_features=384, bias=True)
              (key_proj): Linear(in_features=384, out_features=384, bias=True)
              (value_proj): Linear(in_features=384, out_features=384, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine

In [23]:
from transformers import DataCollatorWithPadding

# Padding collator bound to the notebook's tokenizer.
# NOTE(review): it is not passed to either Trainer(...) call visible in this
# notebook; only collate_and_debug references it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:

# Define a custom collate function to add debugging info
def collate_and_debug(batch):
    """Collate ``batch`` with ``data_collator``, printing its size before and after.

    Returns the collated batch unchanged.
    """
    size_before = len(batch)
    print(f"Batch size before collation: {size_before}")
    collated = data_collator(batch)
    print(f"Batch size after collation: {len(collated['input_ids'])}")
    return collated

In [26]:
# Load a fresh pretrained model with a 3-class head and fine-tune it.
# NOTE(review): this rebinds `model`, so the earlier resize_token_embeddings call and
# the manual classifier replacement were applied to a different object and do not
# carry over to the model trained here — confirm that is intended.
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-xsmall', num_labels=3)

# No compute_metrics is supplied to this Trainer, so the epoch evaluation shown
# below lists only the loss columns.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.0959,1.089812


TrainOutput(global_step=575, training_loss=1.0949675982931386, metrics={'train_runtime': 2305.2118, 'train_samples_per_second': 1.994, 'train_steps_per_second': 0.249, 'total_flos': 605666445944832.0, 'train_loss': 1.0949675982931386, 'epoch': 1.0})

In [None]:
def debug_training_loop(trainer, train_dataset):
    """Print the input and label shapes of every batch from the trainer's train dataloader.

    ``train_dataset`` is accepted but not used; batches come from
    ``trainer.get_train_dataloader()``.
    """
    loader = trainer.get_train_dataloader()
    for step, batch in enumerate(loader):
        input_ids = batch['input_ids']
        targets = batch['labels']
        print(f"Step: {step}, Inputs shape: {input_ids.shape}, Labels shape: {targets.shape}")


# Debug the training loop
debug_training_loop(trainer, train_dataset)

In [27]:
# Save the model
# (the model and the tokenizer are written to the same local directory)
model.save_pretrained("fine-tuned-deberta-v3")
tokenizer.save_pretrained("fine-tuned-deberta-v3")

('fine-tuned-deberta-v3/tokenizer_config.json',
 'fine-tuned-deberta-v3/special_tokens_map.json',
 'fine-tuned-deberta-v3/spm.model',
 'fine-tuned-deberta-v3/added_tokens.json')

In [28]:
def compute_metrics(eval_preds):
    """Compute accuracy and multi-class log loss from raw evaluation logits.

    The previous version called ``log_loss`` and ``accuracy_score``, neither of
    which is imported anywhere in this notebook, so it raised ``NameError`` on
    first use. Both metrics are now computed with ``torch``/``numpy``, which
    the notebook already imports.

    Args:
        eval_preds: Object exposing ``predictions`` (logits, with classes on the
            last axis) and ``label_ids`` (integer class ids).
            Assumes one label per row of logits — TODO confirm against the
            Trainer output if a model returning extra outputs is ever used.

    Returns:
        dict: ``{"acc": float, "log_loss": float}`` where ``log_loss`` is the
        mean negative log-probability assigned to the true class.
    """
    logits = torch.from_numpy(np.asarray(eval_preds.predictions)).double()
    labels = torch.from_numpy(np.asarray(eval_preds.label_ids)).long()

    preds = logits.argmax(-1)
    acc = (preds == labels).double().mean().item()

    # log_softmax avoids the underflow that log(softmax(x)) can hit for
    # confident predictions.
    log_probs = logits.log_softmax(-1)
    loss = -log_probs.gather(-1, labels.unsqueeze(-1)).mean().item()
    return {"acc": acc, "log_loss": loss}

In [29]:
# Point the process at GPU index 0 and restate the allocator setting.
# NOTE(review): training has already run in this kernel by this point; changing
# CUDA_VISIBLE_DEVICES after CUDA has been initialised is not expected to move
# the process to another device — confirm, or restart the kernel before evaluating.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128",
    }
)

In [30]:
# Rebuild the Trainer around the current `model`, this time with compute_metrics
# attached so evaluation also reports accuracy and log loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [34]:
# Evaluate the model
# (the recorded run of this cell was interrupted, so no metrics were captured)
metrics = trainer.evaluate()
print(metrics)

KeyboardInterrupt: 

In [1]:
!pip freeze > requirements.txt