In [None]:
# --- RUN THIS CELL ONCE AND ONLY ONCE, THEN DELETE THE PIP LINES ---
!pip install datasets transformers accelerate -U
!pip install evaluate -U

In [None]:
# Cell 1 (The Real Starting Cell After Installs)

import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# 1. Choose the pre-trained checkpoint and load its tokenizer
MODEL_NAME = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Reuse the end-of-sequence token as the padding token: padding is required
# to batch sentences of different lengths, and (per the original note) GPT-2
# tokenizers do not define a pad token of their own.
tokenizer.pad_token = tokenizer.eos_token

# 2. Load the SST-2 dataset
# SST-2: Stanford Sentiment Treebank v2 (binary classification), GLUE configuration "sst2"
raw_datasets = load_dataset("glue", "sst2")

# 3. Define the tokenization function
def tokenize_function(examples):
    """Tokenize a batch of SST-2 examples into fixed-length model inputs.

    Reads the ``"sentence"`` field of ``examples`` and feeds it to the
    notebook-level ``tokenizer``, truncating and padding every sequence to
    128 tokens. Returns whatever the tokenizer call returns.
    """
    sentences = examples["sentence"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(sentences, **encoding_options)

# 4. Tokenize every split in batches, drop the raw columns the model does not
#    consume, and rename 'label' to 'labels' (the name expected by the Trainer).
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(["sentence", "idx"])
    .rename_column("label", "labels")
)

# Indexing the datasets should yield PyTorch tensors
tokenized_datasets.set_format("torch")

# Splits used for fine-tuning and for evaluation
train_dataset, eval_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["validation"],
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(eval_dataset)}")

In [None]:
# Build DistilGPT-2 with a sequence-classification head; num_labels=2 because
# SST-2 is a binary (negative / positive) task.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# IMPORTANT: tell the model which token id is padding. It must match the
# tokenizer, whose pad token was set to its EOS token.
# NOTE(review): presumably the classification head uses this id to locate the
# last non-padded token of each sequence — confirm against the model docs.
model.config.pad_token_id = tokenizer.eos_token_id

# Additional config flags.
# NOTE(review): neither flag below selects which token feeds the classifier;
# `is_decoder` in particular may have no effect for this architecture — TODO confirm.
model.config.is_decoder = True
model.config.use_cache = False  # Disable cache for fine-tuning

# Place the model on the selected device
model = model.to(device)

print(f"Model loaded and configured for {model.config.num_labels}-class classification.")

In [None]:
# Load evaluation metrics from the 'evaluate' library.
# Both metric objects are created once here, at cell level, so that
# compute_metrics() does not reload the F1 metric on every evaluation call.
metric = evaluate.load("glue", "sst2")  # accuracy (for sst2)
f1_metric = evaluate.load("f1")         # F1 score


def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class of each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # 1. Accuracy, from the GLUE/SST-2 metric
    accuracy = metric.compute(predictions=predictions, references=labels)['accuracy']

    # 2. F1 score ('binary' average for the two-class SST-2 problem)
    f1_score = f1_metric.compute(
        predictions=predictions, references=labels, average="binary"
    )['f1']

    return {"accuracy": accuracy, "f1": f1_score}

print("Metrics defined (Accuracy and F1-score).")

In [None]:
# Cell 5: Training arguments and fine-tuning (version-tolerant)

import inspect

# The TypeError that previously forced evaluation/saving options to be removed
# comes from keyword renames in newer transformers releases (see the
# TrainingArguments / Trainer documentation):
#   TrainingArguments: evaluation_strategy -> eval_strategy
#   Trainer:           tokenizer           -> processing_class
# Instead of dropping the options, look up which spelling the installed
# version accepts and pass that one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters

EVAL_STRATEGY_KEY = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
TOKENIZER_KEY = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to be accepted.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # 'accuracy' is one of the keys returned by compute_metrics.
    metric_for_best_model="accuracy",
    **{EVAL_STRATEGY_KEY: "epoch"},
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    **{TOKENIZER_KEY: tokenizer},
)

# Start training (fine-tuning)
print("Starting fine-tuning...")
train_results = trainer.train()

# --- FINAL EVALUATION ---
# With load_best_model_at_end=True the model held by the trainer at this point
# is the checkpoint with the best validation accuracy, so these are its metrics.
print("\n--- Running Final Evaluation ---")
final_metrics = trainer.evaluate(eval_dataset)
print("Final Metrics:", final_metrics)

print("Fine-tuning complete.")

In [None]:
# --- Analysis Setup (for Part 3) ---

# Number of validation examples to inspect (used for both the selection and the loop)
NUM_SAMPLES = 5

# Human-readable names for the two SST-2 label ids
label_map = {0: "negative", 1: "positive"}

# Get a few examples from the validation set to inspect
sample_data = eval_dataset.select(range(NUM_SAMPLES))

# Put the model in evaluation mode so inference does not depend on whether a
# training or an evaluation call happened to run last (e.g. dropout state).
model.eval()

print("\n--- Sample Predictions for Qualitative Analysis ---")
for i in range(NUM_SAMPLES):
    # Raw text and gold label come from the original, un-tokenized dataset;
    # sample_data holds the first NUM_SAMPLES validation rows, so index i
    # refers to the same example in both.
    raw_example = raw_datasets['validation'][i]
    sentence = raw_example['sentence']
    true_label = raw_example['label']

    # Tokenized features for the same example
    sample = sample_data[i]

    # Add a batch dimension of size 1 and move the tensors to the model's device
    inputs = {
        'input_ids': sample['input_ids'].unsqueeze(0).to(device),
        'attention_mask': sample['attention_mask'].unsqueeze(0).to(device)
    }

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_label = torch.argmax(outputs.logits, dim=-1).item()

    print(f"Text: {sentence}")
    print(f"  True: {label_map[true_label]}, Predicted: {label_map[predicted_label]}")

In [None]:
# Cell X: FINAL COMPARATIVE ANALYSIS

import math

import pandas as pd

# NOTE: Since TinyGPT (Part 1) was a Language Model (LM),
# it does not have Accuracy or F1 scores on the classification task.

# --- ENTER YOUR TINYGPT (PART 1) RESULTS HERE ---
# These are placeholders. Please replace them with your actual numbers.
TINY_GPT_RESULTS = {
    "Model": "TinyGPT (Scratch LM)",
    "Task": "Language Modeling (WikiText-2)",
    "Val. Loss": 4.52,         # Replace with your final validation loss
    "Perplexity": 91.8,       # Replace with your final perplexity (exp(val_loss))
    "Accuracy": "N/A*",       # N/A for LM
    "F1 Score": "N/A*",       # N/A for LM
    "Mem. (GB)": 1.2,         # Replace with your measured peak memory/VRAM
    "Time (min)": 5.8         # Replace with your total training time in minutes
}

# --- EXTRACT DISTILGPT-2 (PART 2) RESULTS ---
# Requires 'final_metrics' and 'train_results' from the training cell to
# still be in memory.

# Training wall-clock time as recorded by the Trainer (seconds -> minutes),
# instead of a hand-typed constant.
train_minutes = train_results.metrics.get("train_runtime", float("nan")) / 60.0

# Peak GPU memory held by PyTorch tensors in this process, in GiB. This is
# measured rather than assumed; it is NaN when no CUDA device is available.
if torch.cuda.is_available():
    peak_mem_gb = torch.cuda.max_memory_allocated() / 1024 ** 3
else:
    peak_mem_gb = float("nan")

DISTIL_GPT2_RESULTS = {
    "Model": "DistilGPT-2 (Fine-Tuned CLF)",
    "Task": "Sequence Classification (SST-2)",
    "Val. Loss": final_metrics['eval_loss'],
    # NOTE(review): this is exp() of the classification eval loss, not a
    # language-modeling perplexity — do not compare it directly with TinyGPT's.
    "Perplexity": math.exp(final_metrics['eval_loss']),
    "Accuracy": final_metrics['eval_accuracy'],
    "F1 Score": final_metrics['eval_f1'],
    "Mem. (GB)": peak_mem_gb,
    "Time (min)": train_minutes,
}

# --- COMPILE AND DISPLAY TABLE ---
comparison_df = pd.DataFrame([TINY_GPT_RESULTS, DISTIL_GPT2_RESULTS])
comparison_df = comparison_df.set_index('Model')

# Formatting for clarity: coerce each metric column to numbers (non-numeric
# placeholders become NaN) and round it to the listed number of decimals.
COLUMN_DECIMALS = {
    'Val. Loss': 4,
    'Perplexity': 1,
    'Accuracy': 3,
    'F1 Score': 3,
    'Mem. (GB)': 2,
    'Time (min)': 1,
}
for column, decimals in COLUMN_DECIMALS.items():
    comparison_df[column] = pd.to_numeric(comparison_df[column], errors='coerce').round(decimals)

# Classification metrics that do not apply to a model are shown as 'N/A*'
for column in ('Accuracy', 'F1 Score'):
    comparison_df[column] = comparison_df[column].fillna('N/A*')


print("## 📈 Final Model Comparison Table")
print("*(N/A* denotes metrics not applicable to the original task, e.g., Accuracy for Language Modeling.)*")
print("-" * 40)
print(comparison_df.to_markdown())