For this competition, I chose to fine-tune DeBERTa-v3-small, a lightweight language model built on the Transformer architecture, using PyTorch and the Hugging Face Transformers library. 

DeBERTa-v3-small is a compact version of the DeBERTa family that still delivers impressive results. It’s designed to understand language deeply, but with fewer parameters, which means faster training and lower memory usage without sacrificing too much accuracy (thank god - because running this on Kaggle can be challenging).

🧩 Better Understanding of Language Structure
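DeBERTa separates content-based and position-based attention instead of mixing them into a single score. This “disentangled attention” helps it make more precise comparisons.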

⚡ Efficient for Limited Compute
Since this is a Kaggle competition with time and compute limits, using a smaller model like this meant I could train faster (in under 2 hours!) and iterate more easily - a big win when exploring different strategies and tuning hyperparameters.


Feel free to modify: Callyn V. 

In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# Load the competition CSVs from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
# Template for the submission file: it is copied later and its
# 'id' / 'winner_model_*' columns define the output layout.
sample_submission = pd.read_csv('/kaggle/input/llm-classification-finetuning/sample_submission.csv')


2025-07-23 21:32:18.475859: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753306338.662183      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753306338.712797      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Combine labels into one column
def get_winner(row):
    """Collapse the one-hot winner columns of a row into a single label.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing
            ``winner_model_a`` and ``winner_model_b``.

    Returns:
        ``'a'`` when ``winner_model_a`` equals 1, otherwise ``'b'`` when
        ``winner_model_b`` equals 1, otherwise ``'tie'``.
    """
    # Model A is checked first, so it wins if both flags happen to be set.
    if row['winner_model_a'] == 1:
        return 'a'
    return 'b' if row['winner_model_b'] == 1 else 'tie'

# Class name -> integer id; these ids become the 'labels' column fed to the model.
label2id = dict(zip(('a', 'b', 'tie'), range(3)))

# Collapse the one-hot winner columns into a single categorical column.
train['winner'] = train.apply(get_winner, axis=1)

# 90/10 train/validation split; the fixed seed keeps it reproducible.
# Validation rows are whatever the 90% sample did not pick.
train_split = train.sample(frac=0.9, random_state=42)
valid_split = train.drop(index=train_split.index)


In [5]:
# The tokenizer is read from a local Kaggle dataset rather than downloaded
# from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/deberta-v3-small-offline/deberta-v3-small')


def tokenize_function(examples):
    """Tokenize a batch of (prompt, response_a, response_b) triples.

    Each triple is joined into one string with ' [SEP] ' between the parts,
    then truncated / padded to exactly 512 tokens.

    Args:
        examples: Batch mapping with parallel sequences under the keys
            'prompt', 'response_a' and 'response_b'.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the joined texts.
    """
    triples = zip(examples['prompt'], examples['response_a'], examples['response_b'])
    texts = [' [SEP] '.join(triple) for triple in triples]
    # NOTE(review): the three parts are joined *before* truncation, so a long
    # prompt / response_a can push response_b past the 512-token limit
    # (presumably truncation is from the right, the tokenizer default) —
    # confirm this is acceptable for a pairwise-comparison task.
    return tokenizer(texts, truncation=True, padding='max_length', max_length=512)


In [6]:
def prepare_dataset(df):
    """Turn a labelled DataFrame into a tokenized dataset of torch tensors.

    Args:
        df: DataFrame holding the columns 'prompt', 'response_a',
            'response_b' and 'winner'.

    Returns:
        A ``Dataset`` formatted to expose 'input_ids', 'attention_mask'
        and 'labels' as torch tensors.
    """
    def _encode_labels(batch):
        # Translate the textual winner into its integer class id.
        return {'labels': [label2id[name] for name in batch['winner']]}

    wanted_columns = ['prompt', 'response_a', 'response_b', 'winner']
    dataset = (
        Dataset.from_pandas(df[wanted_columns])
        .map(tokenize_function, batched=True)
        .map(_encode_labels, batched=True)
    )
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset


# Build the training set first, then the validation set.
train_ds, valid_ds = (prepare_dataset(part) for part in (train_split, valid_split))

Map:   0%|          | 0/51729 [00:00<?, ? examples/s]

Map:   0%|          | 0/51729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5748 [00:00<?, ? examples/s]

Map:   0%|          | 0/5748 [00:00<?, ? examples/s]

Model & Trainer

In [7]:
# Load the locally stored DeBERTa-v3-small checkpoint for sequence
# classification with three output classes, matching the three entries
# of label2id ('a', 'b', 'tie').
model = AutoModelForSequenceClassification.from_pretrained(
    '/kaggle/input/deberta-v3-small-offline/deberta-v3-small',
    num_labels=3
)

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation result.

    Args:
        eval_pred: Pair ``(logits, labels)``; it is unpacked positionally.

    Returns:
        dict with one entry, ``'accuracy'``: the score of the argmax
        (last axis) of the logits against the labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return dict(accuracy=accuracy_score(gold, predicted))

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # --- outputs and logging ---
    output_dir='./results',
    logging_dir='./logs',
    logging_first_step=True,
    logging_steps=500,
    report_to=[],      # no external trackers (WandB / Hugging Face Hub)
    # --- schedule: evaluate and checkpoint once per epoch ---
    num_train_epochs=4,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,   # restore the best checkpoint when training finishes
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,         # mixed precision; helps with DeBERTa on P100 (recommended)
)

# Wire the model, data, metric and configuration together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
)


In [10]:
# Print the module tree to inspect the architecture (layers and dimensions).
print(model)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

There are 128,100 tokens in the vocab, each mapped to a 768-dimensional vector.
Each layer has:
* DisentangledSelfAttention: A signature DeBERTa improvement over standard self-attention. It separates content-based and position-based attention.
* A feedforward layer that expands from 768 → 3072 dims (GELU activation).
* output: Projects the feedforward result back to 768 dims and applies layer normalization.

In [None]:
# Run fine-tuning; per training_args, evaluation and checkpointing happen
# once per epoch, which produces the per-epoch table below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0859,1.081407,0.411795
2,1.042,1.06862,0.424495
3,0.9744,1.100474,0.43389


In [None]:
# Tokenize the test set with the same function used for training data.
test_hf = Dataset.from_pandas(test)
test_hf = test_hf.map(tokenize_function, batched=True)
test_hf.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Get predictions (raw logits, one row per test example).
predictions = trainer.predict(test_hf)

# Convert logits to class probabilities with a numerically stable softmax.
# The competition is scored with log loss on predicted probabilities, so
# hard 0/1 one-hot predictions are heavily penalised whenever they are wrong.
# float64 avoids precision problems if the logits come back as float16
# (training uses fp16).
logits = np.asarray(predictions.predictions, dtype=np.float64)
shifted = logits - logits.max(axis=-1, keepdims=True)  # guard against overflow in exp
probs = np.exp(shifted)
probs /= probs.sum(axis=-1, keepdims=True)

# Hard class ids, kept for quick inspection of the predicted distribution.
preds = probs.argmax(axis=-1)

# Map each class id to its submission column and write the probabilities.
# Rows are assumed to be in the same order as sample_submission.
id2label = {0: 'winner_model_a', 1: 'winner_model_b', 2: 'winner_model_tie'}
submission = sample_submission.copy()
for class_id, column in id2label.items():
    submission[column] = probs[:, class_id]

# Drop any extra columns if needed
submission = submission[['id', 'winner_model_a', 'winner_model_b', 'winner_model_tie']]

submission.to_csv('submission.csv', index=False)



In [None]:
# Sanity checks: preview the first rows, then print per-column totals to see
# how predictions are spread over the three classes (the 'id' column is
# summed too and can be ignored).
print(submission.head())
print(submission.sum())   
