In [20]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [1]:
import pandas as pd
import numpy as np

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report
import torch
import random
import os
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Fix one seed for every RNG in play (stdlib, NumPy, PyTorch) so runs are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Prefer the GPU when one is visible, and seed all CUDA devices as well.
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(seed)
else:
    device = torch.device("cpu")
print("device:", device)

device: cuda


In [2]:
train_path = Path("data/training_split.csv")
val_path   = Path("data/validation_split.csv")

train_df = pd.read_csv(train_path)
val_df   = pd.read_csv(val_path)

# String labels <-> integer class ids; ID2LABEL is the exact inverse of LABEL2ID.
LABEL2ID = {"negative": 0, "neutral": 1, "positive": 2}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}


def _encode_labels(frame, split_name):
    """Return ``frame["label"]`` mapped through ``LABEL2ID``.

    Args:
        frame: DataFrame with a ``label`` column.
        split_name: Name used in the failure message to identify the split.

    Returns:
        The mapped ``label`` Series.

    Raises:
        AssertionError: if any label (including a missing one) has no entry in
            ``LABEL2ID``; the message shows a sample of the offending values.
    """
    encoded = frame["label"].map(LABEL2ID)
    # Series.map leaves NaN wherever the key is absent from the dict, so the NaN
    # positions point back at the raw values that could not be encoded.
    unmapped = frame.loc[encoded.isna(), "label"]
    assert unmapped.empty, (
        f"{split_name} split: {len(unmapped)} row(s) with labels not in "
        f"{sorted(LABEL2ID)}; examples: {unmapped.unique()[:10].tolist()}"
    )
    return encoded


train_df["label"] = _encode_labels(train_df, "train")
val_df["label"] = _encode_labels(val_df, "validation")

In [5]:
# Load the tokenizer through the regular Hub cache. force_download=True made this cell
# re-fetch every tokenizer file on each run, so it needed network access every time.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Only the text and the integer label are carried into the datasets.
train_ds = Dataset.from_pandas(train_df[["sentence", "label"]])
val_ds = Dataset.from_pandas(val_df[["sentence", "label"]])

def tokenize_function(example, max_length=128):
    """Tokenize the ``sentence`` entry of ``example`` with the notebook's ``tokenizer``.

    Args:
        example: Mapping with a ``"sentence"`` entry (a batch of sentences when
            called through ``Dataset.map(..., batched=True)``).
        max_length: Length passed to the tokenizer for both truncation and
            ``"max_length"`` padding. Defaults to 128, the value that used to be
            hard-coded, so existing one-argument calls behave as before.

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        example["sentence"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize each split in batches, then drop the raw text column.
train_ds = train_ds.map(tokenize_function, batched=True).remove_columns(["sentence"])
val_ds = val_ds.map(tokenize_function, batched=True).remove_columns(["sentence"])

# Have both splits return these columns as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
for split_ds in (train_ds, val_ds):
    split_ds.set_format(type="torch", columns=tensor_columns)

# Classification model built from the pretrained checkpoint. The number of classes is
# derived from LABEL2ID so the head size cannot drift from the label maps passed below.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(LABEL2ID),
    id2label=ID2LABEL,
    label2id=LABEL2ID
)

def compute_metrics(eval_pred: EvalPrediction):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over the last axis) and ``label_ids`` (reference class ids).

    Returns:
        ``{"accuracy": <accuracy_score of label_ids vs. argmax predictions>}``.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=-1)
    accuracy = accuracy_score(eval_pred.label_ids, predicted_ids)
    return {"accuracy": accuracy}


# Hyperparameters for the baseline run, kept in a plain dict and unpacked into
# TrainingArguments below.
arguments = dict(
    output_dir="./distilbert_baseline",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed=seed,
    # Mixed precision is only requested when CUDA is present: an unconditional
    # fp16=True makes TrainingArguments raise on a CPU-only machine, which would
    # defeat the CPU fallback this notebook sets up for `device`.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4
)

training_args = TrainingArguments(**arguments)

# Assemble the Trainer from the objects prepared above.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

# Fine-tune on the training split.
print("training...")
trainer.train()

# Score the validation split; kept as the cell's last expression so the returned
# metrics are displayed as the cell output.
print("evaluating...")
trainer.evaluate()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Map:   0%|          | 0/91887 [00:00<?, ? examples/s]

Map:   0%|          | 0/10210 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training...


Step,Training Loss
500,0.7662
1000,0.7062
1500,0.688
2000,0.6646
2500,0.6664
3000,0.6417
3500,0.6363
4000,0.6055
4500,0.6219
5000,0.6126


evaluating...


{'eval_loss': 0.8398929238319397,
 'eval_accuracy': 0.7551420176297747,
 'eval_runtime': 10.0464,
 'eval_samples_per_second': 1016.28,
 'eval_steps_per_second': 63.605,
 'epoch': 3.0}