In [1]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [2]:
import pandas as pd
import numpy as np

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
    TrainingArguments,
    Trainer,
    EvalPrediction,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    AutoConfig,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import torch
import random
import os
from pathlib import Path
import warnings
from torch import nn
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42
random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
print("device:", device)

device: cuda


In [3]:
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [4]:
train_path = Path("data/training_split.csv") #or data/training_split_eda.csv
val_path = Path("data/validation_split.csv")

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

LABEL2ID = {"negative": 0, "neutral": 1, "positive": 2}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

train_df["label"] = train_df["label"].map(LABEL2ID).astype("float32")
val_df["label"] = val_df["label"].map(LABEL2ID).astype("float32")

assert train_df["label"].isna().sum() == 0
assert val_df["label"].isna().sum() == 0

In [5]:
model_name = "microsoft/deberta-v3-base"
#model_name = "distilbert-base-cased"


tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=False)
train_ds = Dataset.from_pandas(train_df[["sentence", "label"]])
val_ds = Dataset.from_pandas(val_df[["sentence", "label"]])

def tokenize(batch):
    return tokenizer(
        batch["sentence"],
        truncation=True,
        max_length=128
    )

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(["sentence"])
val_ds = val_ds.remove_columns(["sentence"])

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=ID2LABEL,
    label2id=LABEL2ID,

)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro"),
        "mae": mean_absolute_error(labels, preds),
    }



#hyper parameters:
lr = 2e-5
batch_size = 8
num_epochs = 4
warmup_ratio = 0.1
weight_decay = 0.02

collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

steps_per_epoch = len(train_ds) // batch_size
half_epoch = max(1, steps_per_epoch // 2)

arguments = dict(
    output_dir="data/deberta_v3_base3",
    eval_strategy="steps",
    eval_steps=half_epoch,
    save_strategy="steps",
    save_steps=half_epoch,
    logging_strategy="steps",
    logging_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="mae",
    greater_is_better=False,
    do_eval=True,
    logging_dir="data/deberta_v3_base3/logs",
    report_to="none",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    gradient_accumulation_steps=1,
    seed=seed,
    fp16=True,
    save_total_limit=2,
)

training_args = TrainingArguments(**arguments)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Map:   0%|          | 0/91887 [00:00<?, ? examples/s]

Map:   0%|          | 0/10210 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
print('training...')
trainer.train()

trainer.save_model("data/deberta_v3_base3/best")

print('evaluating...')
trainer.evaluate()

training...


Step,Training Loss,Validation Loss,Accuracy,F1,Mae
5742,0.5905,0.58842,0.785994,0.76871,0.239765
11484,0.533,0.536237,0.802547,0.796186,0.231636
17226,0.4492,0.605417,0.807346,0.80246,0.220274
22968,0.4231,0.588672,0.80999,0.804179,0.218609
28710,0.3274,0.779808,0.813124,0.807721,0.211459
34452,0.3018,0.774343,0.820862,0.814641,0.201273
40194,0.2288,0.916907,0.818609,0.812575,0.202938
45936,0.2125,0.934303,0.818022,0.812567,0.20382


evaluating...


{'eval_loss': 0.7743434309959412,
 'eval_accuracy': 0.8208619000979432,
 'eval_f1': 0.8146408833622321,
 'eval_mae': 0.20127326250076294,
 'eval_runtime': 39.5362,
 'eval_samples_per_second': 258.245,
 'eval_steps_per_second': 32.3,
 'epoch': 4.0}

In [8]:
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from datasets import Dataset

test_df = pd.read_csv("data/test.csv")
test_ds = Dataset.from_pandas(test_df[["sentence"]])

def tokenize_fn(batch):
    return tokenizer(batch["sentence"], truncation=True, max_length=128)

test_ds = test_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.remove_columns(["sentence"])
test_ds.set_format(type="torch", columns=["input_ids","attention_mask"])

from transformers import DataCollatorWithPadding
collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

loader = DataLoader(test_ds, batch_size=8, collate_fn=collator)

model.to(device)
model.eval()

all_preds = []
with torch.no_grad():
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        preds  = logits.argmax(dim=-1)
        all_preds.extend(preds.cpu().numpy())

ID2LABEL = {0:"negative", 1:"neutral", 2:"positive"}
labels = [ID2LABEL[i] for i in all_preds]

submission_df = pd.DataFrame({"id":test_df["id"],"label": labels})
submission_df.to_csv("submission_deberta_non_aug.csv", index=False)
print(f" Wrote submission_deberta_class.csv ({len(submission_df)})")

Map:   0%|          | 0/11951 [00:00<?, ? examples/s]

✅ Wrote submission_deberta_class.csv (11951)
