# 05 - Quick DistilBERT Fine-Tuning

We fine-tune only the classification head on a small sample (5 000 reviews) for 2 epochs:

1. Load & sample SP splits, clean text  
2. Build HF `Dataset`s  
3. Tokenize (max_length=64)  
4. Load & freeze DistilBERT backbone  
5. Define metrics & training arguments  
6. Train with `Trainer` (1–2 epochs, fp16)  
7. Evaluate on test set  
8. Save model & tokenizer  


In [1]:
!pip install --quiet transformers datasets evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Imports
import re
import pandas as pd
import numpy as np
from datasets import Dataset
import evaluate
import torch
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
    DataCollatorWithPadding
)
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

%matplotlib inline

# Download NLTK data
nltk.download("stopwords")
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# 1) Parameters & setup
TASK          = "sp"                             # or "hh"/"bd"
MODEL_NAME    = "distilbert-base-uncased"
OUTPUT_DIR    = f"models/{TASK}_distilbert_quick"

SMALL_N       = 5000                             # number of train samples
MAX_LENGTH    = 64                               # truncate/pad to 64 tokens
BATCH_SIZE    = 64
NUM_EPOCHS    = 2
LEARNING_RATE = 5e-5
SEED          = 42

set_seed(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)


Device: cpu


In [None]:
# 2) Load splits, sample, and clean text
STOP = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    t = text.lower()
    t = re.sub(r"<[^>]+>", " ", t)
    t = re.sub(r"http\S+|www\.\S+", " ", t)
    t = re.sub(r"[^a-z\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return " ".join(lemmatizer.lemmatize(w) for w in t.split() if w not in STOP)

# Load CSVs
train_df = pd.read_csv(f"./project_splits/{TASK}_train.csv")
val_df   = pd.read_csv(f"./project_splits/{TASK}_val.csv")
test_df  = pd.read_csv(f"./project_splits/{TASK}_test.csv")

# Sample SMALL_N from train
train_df = train_df.sample(n=SMALL_N, random_state=SEED)

# Apply cleaning
for df in (train_df, val_df, test_df):
    df["cleaned"] = df["Text"].apply(clean_text)

print(f"Sampled {len(train_df)} train, {len(val_df)} val, {len(test_df)} test")


Sampled 5000 train, 52581 val, 105164 test


In [None]:
# 3) Build Hugging Face Datasets
hf_train = Dataset.from_pandas(train_df[["cleaned","label"]])
hf_val   = Dataset.from_pandas(val_df[["cleaned","label"]])
hf_test  = Dataset.from_pandas(test_df[["cleaned","label"]])


In [None]:
# 4) Tokenization (shorter max_length)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer)

def tokenize_fn(batch):
    return tokenizer(
        batch["cleaned"],
        truncation=True,
        max_length=MAX_LENGTH
    )

hf_train = hf_train.map(tokenize_fn, batched=True)\
                   .remove_columns("cleaned")\
                   .with_format("torch")
hf_val   = hf_val.map(tokenize_fn, batched=True)\
                 .remove_columns("cleaned")\
                 .with_format("torch")
hf_test  = hf_test.map(tokenize_fn, batched=True)\
                  .remove_columns("cleaned")\
                  .with_format("torch")


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/52581 [00:00<?, ? examples/s]

Map:   0%|          | 0/105164 [00:00<?, ? examples/s]

In [None]:
# 5) Load & freeze DistilBERT base
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2
).to(DEVICE)

# Freeze all layers except the classification head
for name, param in model.named_parameters():
    if name.startswith("distilbert"):
        param.requires_grad = False

print("Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 592130


In [None]:
# 6) Define compute_metrics for Trainer
accuracy_metric  = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric    = evaluate.load("recall")
f1_metric        = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy":  accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision_metric.compute(predictions=preds, references=labels)["precision"],
        "recall":    recall_metric.compute(predictions=preds, references=labels)["recall"],
        "f1":        f1_metric.compute(predictions=preds, references=labels)["f1"],
    }


In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",            # run evaluation once per epoch
    save_strategy="epoch",            # save checkpoint once per epoch
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,      # requires eval_strategy == save_strategy
    metric_for_best_model="f1",
    seed=SEED,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=100,
    save_total_limit=2,
    fp16=torch.cuda.is_available()
)


In [None]:
# 8) Initialize Trainer & train
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

train_result = trainer.train()
trainer.save_state()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.416809,0.843974,0.843974,1.0,0.915386
2,0.432200,0.408025,0.843974,0.843974,1.0,0.915386


In [None]:
# 9) Evaluate on test set
test_result = trainer.predict(hf_test)
print("Test metrics:", test_result.metrics)

# Detailed classification report
preds = np.argmax(test_result.predictions, axis=-1)
print(classification_report(test_result.label_ids, preds, digits=3))


Test metrics: {'test_loss': 0.4166714549064636, 'test_accuracy': 0.8439865353162679, 'test_precision': 0.8439865353162679, 'test_recall': 1.0, 'test_f1': 0.915393381841059, 'test_runtime': 10098.214, 'test_samples_per_second': 10.414, 'test_steps_per_second': 0.163}
              precision    recall  f1-score   support

           0      0.000     0.000     0.000     16407
           1      0.844     1.000     0.915     88757

    accuracy                          0.844    105164
   macro avg      0.422     0.500     0.458    105164
weighted avg      0.712     0.844     0.773    105164



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# 10) Confusion matrix plot
cm = confusion_matrix(test_result.label_ids, preds)
plt.figure(figsize=(4,4))
plt.imshow(cm, cmap="Blues")
plt.colorbar()
plt.xticks([0,1], ["Fake","Genuine"])
plt.yticks([0,1], ["Fake","Genuine"])
for i in (0,1):
    for j in (0,1):
        plt.text(j, i, cm[i,j], ha="center", va="center",
                 color="white" if cm[i,j] > cm.max()/2 else "black")
plt.title("Test Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
# 11) Save model & tokenizer
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved to", OUTPUT_DIR)
