<a href="https://colab.research.google.com/github/gulrukhsorakhmadjanova/ml.ai/blob/main/task1_uzcosmos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab setup: upgrade pip, then install the libraries imported by train.py
# (transformers, datasets, evaluate, scikit-learn) plus the experiment
# trackers (wandb, tensorboard) referenced by the cells below.
!pip install --upgrade pip
!pip install transformers datasets evaluate scikit-learn sentencepiece wandb tensorboard
# accelerate is installed quietly (-q) for mixed-precision / higher performance.
# NOTE(review): labelled "optional" originally, but train.py trains through
# transformers.Trainer, which presumably depends on accelerate — confirm
# before removing this line.
!pip install -q accelerate




In [None]:
# Toggle Weights & Biases experiment tracking for this notebook session.
# The choice is published through the WANDB_DISABLED environment variable.
import os

use_wandb = False  # flip to True to log in and track runs with wandb

if not use_wandb:
    os.environ["WANDB_DISABLED"] = "true"
else:
    import wandb

    wandb.login()  # interactive: follow the prompt
    os.environ["WANDB_DISABLED"] = "false"


In [None]:
# Generate train.py, the standalone fine-tuning script that a later cell runs
# with "!python train.py ...". The script is kept as a single string so the
# notebook stays self-contained; quotes inside it are backslash-escaped.
train_py = """
\"""
train.py — Fine-tune BERT (bert-base-uncased) on SST-2 sentiment classification.

✅ Requirements satisfied:
1. Data Preparation via Hugging Face datasets & tokenizer
2. Model Fine-Tuning with Trainer API
3. Evaluation: Accuracy + F1 + classification report + confusion matrix
4. Organized, modular, readable code
5. Deliverables-ready: logs, model saving, optional W&B/TensorBoard
6. Bonus: Early stopping, model checkpointing, gradient clipping, LR scheduler, custom training loop option

Author: Gulrukhsor Akhmadjanova
\"""

import os
import argparse
from datetime import datetime
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
    set_seed,
)
from sklearn.metrics import classification_report, confusion_matrix
import torch


# ---------- Compute metrics ----------
# Cached F1 metric object; loaded lazily so importing this module stays cheap.
_F1_METRIC = None


def _get_f1_metric():
    \"""Load the F1 metric on first use and reuse it for every later evaluation.\"""
    global _F1_METRIC
    if _F1_METRIC is None:
        _F1_METRIC = evaluate.load(\"f1\")
    return _F1_METRIC


def compute_metrics(pred):
    \"""Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (the logits, or a tuple whose
            first item is the logits) and ``label_ids``.

    Returns:
        Dict with float entries ``accuracy`` and ``f1_macro``.
    \"""
    preds = pred.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    y_pred = np.argmax(preds, axis=1)
    y_true = pred.label_ids

    accuracy = (y_pred == y_true).mean()
    f1_score = _get_f1_metric().compute(predictions=y_pred, references=y_true, average=\"macro\")[\"f1\"]
    return {\"accuracy\": float(accuracy), \"f1_macro\": float(f1_score)}


# ---------- Argument parser ----------
def parse_args():
    \"""Parse the command-line options that configure a training run.

    Returns:
        argparse.Namespace holding model/output locations, optimisation
        hyper-parameters, and the ``use_custom_loop`` / ``push_to_hub`` flags.
    \"""
    parser = argparse.ArgumentParser(description=\"Fine-tune BERT on SST-2 dataset\")
    parser.add_argument(\"--model_name\", type=str, default=\"bert-base-uncased\")
    parser.add_argument(\"--output_dir\", type=str, default=\"./sst2_outputs\")
    parser.add_argument(\"--epochs\", type=int, default=3)
    parser.add_argument(\"--per_device_train_batch_size\", type=int, default=8)
    parser.add_argument(\"--per_device_eval_batch_size\", type=int, default=32)
    parser.add_argument(\"--lr\", type=float, default=2e-5)
    parser.add_argument(\"--weight_decay\", type=float, default=0.01)
    parser.add_argument(\"--max_length\", type=int, default=128)
    parser.add_argument(\"--seed\", type=int, default=42)
    parser.add_argument(\"--save_total_limit\", type=int, default=3)
    parser.add_argument(\"--logging_steps\", type=int, default=200)
    parser.add_argument(\"--use_custom_loop\", action=\"store_true\", help=\"Run custom PyTorch loop instead of Trainer\")
    parser.add_argument(\"--push_to_hub\", action=\"store_true\", help=\"Push best model to Hugging Face Hub\")
    return parser.parse_args()


# ---------- Main training ----------
def main():
    \"""Run the full pipeline: load SST-2, tokenize, train, evaluate and save.

    Training goes through the Trainer API unless ``--use_custom_loop`` is
    given, in which case ``custom_train_loop`` is used instead. On the Trainer
    path the selected model is saved under ``<run dir>/best_model`` and a
    classification report plus confusion matrix are printed.
    \"""
    args = parse_args()
    set_seed(args.seed)
    os.makedirs(args.output_dir, exist_ok=True)

    # Load SST-2 dataset
    dataset = load_dataset(\"glue\", \"sst2\")
    print(\"📚 Dataset loaded:\")
    print({split: len(dataset[split]) for split in dataset.keys()})

    # Load tokenizer and model
    tokenizer = BertTokenizerFast.from_pretrained(args.model_name)
    model = BertForSequenceClassification.from_pretrained(args.model_name, num_labels=2)

    # Preprocessing: truncate only; padding is applied per batch by the collator
    def preprocess(examples):
        return tokenizer(examples[\"sentence\"], truncation=True, padding=False, max_length=args.max_length)

    # Drop the raw text (and the example index when present) after tokenizing
    drop_columns = [\"sentence\"]
    if \"idx\" in dataset[\"train\"].column_names:
        drop_columns.append(\"idx\")
    tokenized = dataset.map(preprocess, batched=True, remove_columns=drop_columns)

    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Training arguments (each run gets its own timestamped output folder)
    timestamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")
    training_args = TrainingArguments(
        output_dir=os.path.join(args.output_dir, timestamp),
        eval_strategy=\"epoch\",
        save_strategy=\"epoch\",
        learning_rate=args.lr,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        num_train_epochs=args.epochs,
        weight_decay=args.weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model=\"f1_macro\",
        greater_is_better=True,
        save_total_limit=args.save_total_limit,
        fp16=torch.cuda.is_available(),
        logging_dir=os.path.join(args.output_dir, \"logs\"),
        logging_strategy=\"steps\",
        logging_steps=args.logging_steps,
        report_to=[\"tensorboard\"],
        run_name=f\"sst2-bert-{timestamp}\",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized[\"train\"],
        eval_dataset=tokenized[\"validation\"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Train model
    if args.use_custom_loop:
        custom_train_loop(model, tokenized[\"train\"], tokenized[\"validation\"], tokenizer, args)
    else:
        trainer.train()
        metrics = trainer.evaluate()
        print(\"\\n📊 Validation Metrics:\", metrics)

        # Save best model
        best_dir = os.path.join(training_args.output_dir, \"best_model\")
        trainer.save_model(best_dir)
        tokenizer.save_pretrained(best_dir)
        print(f\"✅ Model saved at: {best_dir}\")

        # Detailed evaluation
        preds_output = trainer.predict(tokenized[\"validation\"])
        preds = preds_output.predictions
        if isinstance(preds, tuple):
            preds = preds[0]
        y_pred = np.argmax(preds, axis=1)
        y_true = preds_output.label_ids

        print(\"\\nClassification Report:\")
        print(classification_report(y_true, y_pred, digits=4))
        print(\"\\nConfusion Matrix:\")
        print(confusion_matrix(y_true, y_pred))

    # NOTE(review): on the custom-loop path this pushes the in-memory model
    # held by the trainer (weights after the last epoch), not necessarily the
    # snapshot saved in custom_best — confirm this is intended.
    if args.push_to_hub:
        trainer.push_to_hub()


# ---------- Optional Custom Loop ----------
def custom_train_loop(model, train_dataset, val_dataset, tokenizer, args):
    \"""
    Custom training loop for bonus points (manual PyTorch training).

    Trains with AdamW, a linear warmup/decay schedule (6% warmup steps) and
    gradient clipping at norm 1.0. After every epoch the model is evaluated on
    the validation set; it is saved to ``<output_dir>/custom_best`` whenever
    macro F1 improves, and training stops after 2 epochs without improvement.

    Args:
        model: Sequence-classification model, trained in place.
        train_dataset: Tokenized training split.
        val_dataset: Tokenized validation split.
        tokenizer: Tokenizer used for per-batch padding and saved with the model.
        args: Parsed CLI arguments (lr, weight_decay, epochs, batch sizes, output_dir).
    \"""
    from torch.utils.data import DataLoader
    # transformers.AdamW was deprecated and then removed from the library, so
    # the PyTorch implementation is used instead.
    from torch.optim import AdamW
    from transformers import get_linear_schedule_with_warmup

    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")
    model.to(device)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=args.per_device_train_batch_size, shuffle=True, collate_fn=data_collator)
    val_loader = DataLoader(val_dataset, batch_size=args.per_device_eval_batch_size, collate_fn=data_collator)

    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    total_steps = len(train_loader) * args.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.06 * total_steps), num_training_steps=total_steps)

    best_f1 = 0.0
    patience_counter = 0
    early_stop_patience = 2

    f1_metric = _get_f1_metric()

    for epoch in range(args.epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)

        # Validation
        model.eval()
        all_preds, all_labels = [], []
        for batch in val_loader:
            # Keep a CPU copy of the labels before the batch moves to the device
            labels = batch[\"labels\"]
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                logits = model(**batch).logits
            all_preds.append(logits.cpu().numpy())
            all_labels.append(labels.numpy())
        all_preds = np.concatenate(all_preds)
        all_labels = np.concatenate(all_labels)
        y_pred = np.argmax(all_preds, axis=1)
        acc = (y_pred == all_labels).mean()
        f1_score = f1_metric.compute(predictions=y_pred, references=all_labels, average=\"macro\")[\"f1\"]

        print(f\"Epoch {epoch+1}/{args.epochs} | Train loss: {avg_train_loss:.4f} | Val acc: {acc:.4f} | Val f1: {f1_score:.4f}\")

        # Early stopping
        if f1_score > best_f1:
            best_f1 = f1_score
            patience_counter = 0
            save_dir = os.path.join(args.output_dir, \"custom_best\")
            os.makedirs(save_dir, exist_ok=True)
            model.save_pretrained(save_dir)
            tokenizer.save_pretrained(save_dir)
            print(f\"✅ New best F1: {best_f1:.4f} -> Model saved to {save_dir}\")
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                print(\"⏹️ Early stopping triggered.\")
                break


if __name__ == \"__main__\":
    main()
"""
# The script contains non-ASCII characters (emoji), so write it as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open("train.py", "w", encoding="utf-8") as f:
    f.write(train_py)
print("✅ train.py created.")

✅ train.py created.


In [None]:
# Launch fine-tuning by running the generated train.py as a subprocess.
# The flags below override the script's defaults (train batch size 8, eval
# batch size 32, as declared in its argument parser).
# Note: In Colab free GPU you may need to reduce per_device_train_batch_size to 8 or 4.
!python train.py --output_dir ./sst2_outputs --epochs 3 --per_device_train_batch_size 16 --per_device_eval_batch_size 64 --lr 2e-5


2025-10-30 22:58:59.682201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761865139.701750    9474 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761865139.707641    9474 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1761865139.723274    9474 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1761865139.723302    9474 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1761865139.723309    9474 computation_placer.cc:177] computation placer alr

In [None]:
# Quick status check: list up to ten entries found under the training
# output folder, or report that the folder is missing / still empty.
import glob
import os

out_dir = "./sst2_outputs"
print("📁 Checking training folder...")

if not os.path.exists(out_dir):
    print("❌ No training folder - training not started or in progress")
else:
    # Recursive listing of everything below the output folder
    files = glob.glob(os.path.join(out_dir, "**", "*"), recursive=True)
    if not files:
        print("📁 Folder exists but empty - training in progress")
    else:
        print("✅ Training folder exists with files:")
        for entry in files[:10]:  # cap the listing at the first 10 entries
            print(f"  - {entry}")

📁 Checking training folder...
✅ Training folder exists with files:
  - ./sst2_outputs/20251030-222650
  - ./sst2_outputs/20251030-225910
  - ./sst2_outputs/20251030-225113
  - ./sst2_outputs/logs
  - ./sst2_outputs/20251030-222650/checkpoint-8420
  - ./sst2_outputs/20251030-222650/checkpoint-4210
  - ./sst2_outputs/20251030-222650/checkpoint-12630
  - ./sst2_outputs/20251030-222650/best_model
  - ./sst2_outputs/20251030-222650/checkpoint-8420/model.safetensors
  - ./sst2_outputs/20251030-222650/checkpoint-8420/scaler.pt


In [None]:
# Evaluate the saved fine-tuned model on the SST-2 validation split and
# report its accuracy.

import glob
import os

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

print("🚀 RUNNING EVALUATION...")

# ====== 1️⃣ Locate Best Model Folder ======
out_dir = "./sst2_outputs"

# Look for BOTH weight file formats a saved model may use
candidates_bin = glob.glob(os.path.join(out_dir, "**", "pytorch_model.bin"), recursive=True)
candidates_safe = glob.glob(os.path.join(out_dir, "**", "model.safetensors"), recursive=True)
candidates = candidates_bin + candidates_safe

if not candidates:
    raise ValueError("❌ No trained model checkpoint found!")

# train.py stores its selected model in a "best_model" folder (Trainer path)
# or a "custom_best" folder (custom loop). Prefer those: the newest weights
# file overall may belong to an intermediate checkpoint-N folder, e.g. when a
# later run was interrupted before its best model was saved.
best_candidates = [
    path
    for path in candidates
    if os.path.basename(os.path.dirname(path)) in ("best_model", "custom_best")
]

if best_candidates:
    best_model_path = max(best_candidates, key=os.path.getmtime)
    best_dir = os.path.dirname(best_model_path)
    print(f"✅ Using best checkpoint: {best_dir}")
else:
    # Fall back to the most recently written weights, but say so explicitly
    best_model_path = max(candidates, key=os.path.getmtime)
    best_dir = os.path.dirname(best_model_path)
    print(f"⚠️ No best_model/custom_best folder found; using most recent checkpoint: {best_dir}")

# ====== 2️⃣ Load Model + Tokenizer ======
tokenizer = BertTokenizer.from_pretrained(best_dir)
model = BertForSequenceClassification.from_pretrained(best_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ====== 3️⃣ Evaluate ======
dataset = load_dataset("glue", "sst2")
test_data = dataset["validation"]


def tokenize_fn(example):
    """Tokenize a batch of sentences, padded/truncated to 128 tokens."""
    return tokenizer(example["sentence"], truncation=True, padding="max_length", max_length=128)


test_data = test_data.map(tokenize_fn, batched=True)
# Expose only the tensors the evaluation loop reads
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_loader = DataLoader(test_data, batch_size=32)

correct, total = 0, 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"\n🎯 FINAL ACCURACY: {accuracy * 100:.2f}%")

🚀 RUNNING EVALUATION...
✅ Using best checkpoint: ./sst2_outputs/20251030-225910/best_model


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Evaluating: 100%|██████████| 28/28 [00:05<00:00,  5.09it/s]


🎯 FINAL ACCURACY: 92.09%



