In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Any, Dict, List, Optional
from matplotlib.axes import Axes
import torch
from datasets import Dataset
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, TrainingArguments, Trainer
colors = sns.color_palette("pastel")

2024-08-03 21:20:14.291165: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [3]:
data = pd.read_csv("Data/train.csv")
data.drop("id", axis=1, inplace=True)
no_rows_before = len(data)
number_of_duplicates = data.duplicated(keep=False).sum()
print(f"There exist {number_of_duplicates} duplicated rows.")
data = data.drop_duplicates(keep="first", ignore_index=True)
no_rows_after = len(data)
print(f"After removing duplicates, #samples drops from {no_rows_before} to {no_rows_after}.")
labels = np.zeros(len(data), dtype=np.int32)
labels[data['winner_model_a'] == 1] = 0
labels[data['winner_model_b'] == 1] = 1
labels[data['winner_tie'] == 1] = 2
data["labels"] = labels
def process(input_str):
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return sentences

data['prompt'] = data['prompt'].apply(process)
data['response_a'] = data['response_a'].apply(process)
data['response_b'] = data['response_b'].apply(process)


def preprocess_text(text):
    text = text.replace('\n\n', ' [NLNL] ')
    text = text.replace('\n', ' [NL] ')
    return text

def format_conversation(row):
    conversations = []
    num_turns = min(len(row['prompt']), len(row['response_a']), len(row['response_b']))
    
    for i in range(num_turns):
        prompt = f"<PROMPT> {row['prompt'][i]}"
        response_a = f"<RESPONSE> [R_STRAT] {preprocess_text(row['response_a'][i])} [R_END]"
        response_b = f"[R_STRAT] {preprocess_text(row['response_b'][i])} [R_END]"
        conversations.append(f"{prompt} {response_a} {response_b}")
        
    return ' [NLNL] '.join(conversations)

data['text'] = data.apply(format_conversation, axis=1)
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train'].select(range(int(len(dataset['train']) * 0.1)))
val_dataset = dataset['test']
tokenizer = DebertaV2Tokenizer.from_pretrained('fine-tuned-deberta-v3')
model = DebertaV2ForSequenceClassification.from_pretrained('fine-tuned-deberta-v3')
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", max_length=1024, truncation=True)

# Tokenize the training and validation datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
def prepare_dataset(dataset):
    dataset = dataset.remove_columns(["model_a", "model_b", "prompt",	"response_a", "response_b", "winner_model_a",	"winner_model_b", "winner_tie"])
    dataset.set_format("torch")
    return dataset

train_dataset = prepare_dataset(train_dataset)
val_dataset = prepare_dataset(val_dataset)


There exist 14 duplicated rows.
After removing duplicates, #samples drops from 57477 to 57470.


Map:   0%|          | 0/4597 [00:00<?, ? examples/s]

Map:   0%|          | 0/11494 [00:00<?, ? examples/s]

In [4]:
def compute_metrics(eval_preds):
    preds = eval_preds.predictions.argmax(-1)
    labels = eval_preds.label_ids
    probs = torch.from_numpy(eval_preds.predictions).float().softmax(-1).numpy()

    loss = log_loss(y_true=labels, y_pred=probs)
    acc = accuracy_score(y_true=labels, y_pred=preds)
    return {"acc": acc, "log_loss": loss}

In [5]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)



In [6]:
from sklearn.metrics import accuracy_score, log_loss, classification_report

# Evaluate the model
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 1.089062213897705, 'eval_model_preparation_time': 0.0033, 'eval_acc': 0.3668000696015312, 'eval_log_loss': 1.0890621697136182, 'eval_runtime': 1086.9653, 'eval_samples_per_second': 10.574, 'eval_steps_per_second': 1.322}
