# 🤖 NLP with Transformers: BERT Classification

Text classification using BERT, RoBERTa, and DistilBERT.

**Level**: Specialized  
**Time**: ~60 minutes

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

## 1. Load Dataset

In [None]:
# Fetch the IMDB dataset, then report split sizes and a preview of one review.
dataset = load_dataset('imdb')
train_split, test_split = dataset['train'], dataset['test']
print(f"Train: {len(train_split)}, Test: {len(test_split)}")
first_review = train_split[0]['text']
print(f"\nSample: {first_review[:200]}...")

## 2. Tokenization

In [None]:
# Checkpoint name; the tokenizer is loaded from it, and later cells reuse
# `model_name` so the model matches the tokenizer.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    encoded rows share the same length.
    """
    texts = examples['text']
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 256,
    }
    return tokenizer(texts, **encoding_options)

# Tokenize (use subset for demo).
# The IMDB splits are stored grouped by label, so taking the first N rows
# without shuffling yields a single-class subset (nothing to learn, and F1 is
# meaningless). Shuffle first, with a fixed seed so the subset is reproducible.
small_train = dataset['train'].shuffle(seed=42).select(range(1000))
small_test = dataset['test'].shuffle(seed=42).select(range(500))

# batched=True hands tokenize_function a batch of rows per call.
tokenized_train = small_train.map(tokenize_function, batched=True)
tokenized_test = small_test.map(tokenize_function, batched=True)

print("✅ Tokenization complete")

## 3. Load Pretrained Model

In [None]:
# Load the checkpoint configured for 2 output labels and move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Total number of scalar parameters across all tensors of the model.
total_params = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {total_params:,}")

## 4. Training with HuggingFace Trainer

In [None]:
def compute_metrics(eval_pred):
    """Return accuracy and F1 for one evaluation pass.

    ``eval_pred`` is unpacked into ``(predictions, labels)``; the predicted
    class for each row is the argmax over axis 1 of the predictions.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(labels, predicted_classes)
    f1 = f1_score(labels, predicted_classes)
    return {'accuracy': accuracy, 'f1': f1}

import dataclasses

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old keyword was later removed (passing it raises
# TypeError). Use whichever name the installed version declares so this cell
# runs on both old and new versions.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# NOTE(review): with the 1000-example demo subset and batch size 16, two
# epochs is roughly 126 optimizer steps on a single device, so
# warmup_steps=100 spends most of training in warmup — confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=50,
    **{_eval_strategy_key: 'epoch'},  # evaluate at the end of every epoch
)

# Bundle the model, training configuration, datasets and metric callback.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': tokenized_train,
    'eval_dataset': tokenized_test,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning.
trainer.train()

## 5. Model Comparison

In [None]:
import pandas as pd

# Static reference table of common encoder models (values are hard-coded
# here, not measured by this notebook).
models_comparison = pd.DataFrame({
    'Model': ['BERT-base', 'RoBERTa-base', 'DistilBERT', 'ALBERT-base', 'DeBERTa-base'],
    'Parameters': ['110M', '125M', '66M', '12M', '140M'],
    'Speed': ['1x', '1x', '2x', '1.7x', '0.8x'],
    'GLUE Score': [79.6, 83.2, 77.0, 80.1, 86.8],
    'Best For': ['General', 'Accuracy', 'Speed', 'Memory', 'Quality']
})

print("📊 Transformer Models Comparison:")
# `display` only exists when running under IPython/Jupyter; fall back to a
# plain print so the cell also works as a regular Python script.
try:
    _render = display
except NameError:
    _render = print
_render(models_comparison)

## 6. Inference

In [None]:
def predict_sentiment(text):
    """Classify ``text`` as positive or negative sentiment.

    Args:
        text: The text to classify; it is tokenized with truncation to at
            most 256 tokens.

    Returns:
        A tuple ``(label, confidence)``. ``label`` is ``'Positive'`` when
        class index 1 has the highest probability and ``'Negative'``
        otherwise; ``confidence`` is the softmax probability of that class.
    """
    # Put the model in eval mode so dropout is disabled and repeated calls on
    # the same text give the same result, whatever mode it was left in.
    model.eval()
    # Move inputs to the device the model is actually on, which may differ
    # from the global `device` if the model was moved after that was set.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=256).to(model.device)
    with torch.no_grad():  # inference only; no gradients needed
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    label = 'Positive' if pred == 1 else 'Negative'
    return label, probs[0][pred].item()

# Test: run the classifier on one clearly positive and one clearly negative review.
test_texts = [
    "This movie was absolutely fantastic! Best film I've seen this year.",
    "Terrible waste of time. The acting was horrible."
]

print("\n🔮 Predictions:")
for text in test_texts:
    sentiment, conf = predict_sentiment(text)
    # Only append an ellipsis when the 50-character preview actually cuts the
    # text short; shorter texts are printed unchanged.
    preview = text if len(text) <= 50 else f"{text[:50]}..."
    print(f"  {sentiment} ({conf:.1%}): {preview}")

## 🎯 Key Takeaways
1. DistilBERT is about 2x faster than BERT-base while retaining roughly 97% of its performance
2. Fine-tuning beats training from scratch
3. HuggingFace Trainer simplifies training
4. Use gradient checkpointing for large models