# Week 2: Component 1 - Fine-Tuned Sentiment Model
## Building the DistilBERT Review Classifier

**Focus:** Develop a sentiment analysis model for customer reviews

---

## Step 1: Environment Setup

In [None]:
# Install required libraries
# AI Assistance: Claude helped structure this installation cell

!pip install -q transformers datasets torch accelerate scikit-learn
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

In [None]:
# Import all necessary libraries
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

print("✅ All libraries imported successfully")

## Step 2: Load and Prepare Dataset

In [None]:
# Load Amazon review dataset
# Using a small subset for quick training (1000 samples)
# Citation: Amazon Polarity dataset from Hugging Face

print("Loading dataset...")
# Both slices come from the same Hub dataset: the first 1000 training rows
# and the first 200 test rows.
dataset_name = "amazon_polarity"
train_dataset = load_dataset(dataset_name, split="train[:1000]")
test_dataset = load_dataset(dataset_name, split="test[:200]")

print(f"✅ Training samples: {len(train_dataset)}")
print(f"✅ Test samples: {len(test_dataset)}")

# Show the first training example so the available fields are visible
sample = train_dataset[0]
print(
    f"\nSample Review:",
    f"Label: {sample['label']} (0=Negative, 1=Positive)",
    f"Title: {sample['title']}",
    f"Content: {sample['content'][:150]}...",
    sep="\n",
)

In [None]:
# Combine title and content for better context
# AI Assistance: Claude suggested combining fields for richer input

def preprocess_function(examples):
    """
    Build one 'text' entry per review from its title and content.

    Parameters:
    - examples: batch mapping with parallel 'title' and 'content' sequences

    Returns:
    - dict with a single 'text' key holding "<title>. <content>" strings,
      one per (title, content) pair
    """
    # Only the first 300 characters of the content are kept so that a few
    # very long reviews do not dominate the input length.
    return {
        'text': [
            f"{heading}. {body[:300]}"
            for heading, body in zip(examples['title'], examples['content'])
        ]
    }

# Add the combined 'text' column to both splits in one pass
train_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

print("✅ Dataset preprocessing complete")

## Step 3: Load Pre-trained Model and Tokenizer

In [None]:
# Load DistilBERT tokenizer and model
# Model: distilbert-base-uncased from Hugging Face
# DistilBERT is a smaller, faster version of BERT (40% smaller, 60% faster)

model_name = "distilbert-base-uncased"
print(f"Loading tokenizer and model: {model_name}")

# Tokenizer and classifier are both loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels=2 -> binary classification head (positive/negative)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

print("✅ Model and tokenizer loaded successfully")
print(f"Model parameters: {model.num_parameters():,}")

## Step 4: Tokenize Dataset

In [None]:
# Tokenize the text data
# This converts text into numerical tokens that the model can process

def tokenize_function(examples):
    """
    Tokenizes the 'text' field of a batch with the module-level tokenizer.

    Parameters:
    - examples: batch mapping containing a 'text' sequence

    Tokenizer settings:
    - padding='max_length': every sequence is padded to max_length
      (not just to the longest sequence in the batch)
    - truncation=True: text longer than max_length is cut off
    - max_length=128: kept well under BERT's 512-token limit for faster training
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(examples['text'], **tokenizer_options)

print("Tokenizing datasets...")
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Expose only the model inputs and the label, as PyTorch tensors
model_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (tokenized_train, tokenized_test):
    # Each split gets its own copy of the column list
    tokenized_split.set_format('torch', columns=list(model_columns))

print("✅ Tokenization complete")

## Step 5: Define Evaluation Metrics

In [None]:
# Define metrics for model evaluation
# AI Assistance: Claude provided this metrics computation function

def compute_metrics(eval_pred):
    """
    Turns raw evaluation output into accuracy, precision, recall and F1.

    Parameters:
    - eval_pred: pair of (model scores, true labels); the predicted class
      for each row is the index of its highest score along axis 1

    Returns:
    - dict with keys 'accuracy', 'precision', 'recall', 'f1'
      (precision/recall/F1 use average='binary')
    """
    scores, true_labels = eval_pred
    predicted_labels = np.argmax(scores, axis=1)

    metrics = {'accuracy': accuracy_score(true_labels, predicted_labels)}

    # The fourth return value (support) is not reported
    prec, rec, f1_value, _support = precision_recall_fscore_support(
        true_labels, predicted_labels, average='binary'
    )
    metrics['precision'] = prec
    metrics['recall'] = rec
    metrics['f1'] = f1_value
    return metrics

print("✅ Evaluation metrics defined")

## Step 6: Configure Training Parameters

In [None]:
# Set up training configuration
# These parameters control how the model learns

# Hyperparameters and bookkeeping options for the Trainer.
#
# NOTE(review): with the 1000-sample training slice and a per-device batch
# size of 16, one epoch is about 63 optimizer steps, so 3 epochs is about 189
# steps in total (assuming a single device and no gradient accumulation —
# confirm). warmup_steps=100 would then cover more than half of training;
# consider whether a shorter warmup is intended.
training_args = TrainingArguments(
    output_dir='./results',          # Where to save model checkpoints
    num_train_epochs=3,               # Number of times to go through dataset
    per_device_train_batch_size=16,   # Batch size for training
    per_device_eval_batch_size=32,    # Batch size for evaluation
    warmup_steps=100,                 # Gradual learning rate warmup (see review note above)
    weight_decay=0.01,                # Regularization to prevent overfitting
    logging_dir='./logs',             # Where to save training logs
    logging_steps=50,                 # Log every 50 steps
    eval_strategy="epoch",            # Evaluate after each epoch
    save_strategy="epoch",            # Save model after each epoch (same cadence as evaluation)
    load_best_model_at_end=True,      # Load best model when done; metric_for_best_model is not set,
                                      # so the library default decides what "best" means
    push_to_hub=False                 # Don't upload to Hugging Face Hub yet
)

print("✅ Training configuration set")
print(f"Training for {training_args.num_train_epochs} epochs")

## Step 7: Initialize Trainer and Fine-Tune Model

In [None]:
# Create Trainer instance
# The Trainer handles the training loop automatically

# Wire the model, training configuration, tokenized splits and metric
# function into a single Trainer object. Nothing is trained in this cell.
#
# NOTE(review): eval_dataset is the tokenized *test* split, so the per-epoch
# evaluation (and the best-checkpoint selection enabled in training_args) uses
# the same 200 samples that are later reported as test performance. A separate
# validation slice would give a cleaner estimate.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)

print("✅ Trainer initialized")
# This message is printed here; the training itself starts when
# trainer.train() is called in the following cell.
print("Starting training... (this may take 5-10 minutes)")

In [None]:
# Fine-tune the model
# This is transfer learning - we're adapting a pre-trained model to our task

# Run the Trainer's training loop with the datasets and arguments it was
# constructed with; this updates the weights of `model`.
trainer.train()

print("\n✅ Training complete!")

## Step 8: Evaluate Model Performance

In [None]:
# Evaluate on test set
results = trainer.evaluate()
accuracy = results['eval_accuracy']

print("\n=== Model Performance ===")
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {results['eval_precision']:.4f}")
print(f"Recall:    {results['eval_recall']:.4f}")
print(f"F1-Score:  {results['eval_f1']:.4f}")

# Pick a verdict from the accuracy bands: >0.85, >0.75, otherwise
if accuracy > 0.85:
    interpretation = "✅ Excellent performance! Model is ready for deployment."
elif accuracy > 0.75:
    interpretation = "✅ Good performance! Model is functional for demo purposes."
else:
    interpretation = "⚠️ Model works but could be improved with more training data."

print("\nInterpretation:")
print(interpretation)

## Step 9: Test Model with Sample Reviews

In [None]:
# Test the model with custom reviews
# AI Assistance: Claude helped create this inference function

def predict_sentiment(text):
    """
    Predicts sentiment of a given review.

    Parameters:
    - text: a single review string

    Returns:
    - label: 'Positive' or 'Negative'
    - confidence: Probability score (0-1) of the predicted class
    """
    # Inference must not use dropout. Set eval mode explicitly so the result
    # does not depend on which mode earlier cells left the model in.
    model.eval()

    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

    # The tokenizer returns CPU tensors, but after fine-tuning with Trainer
    # the model may live on the GPU. Move the inputs to the model's device
    # to avoid a device-mismatch RuntimeError.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Get prediction
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # A single input gives a batch of size 1; read class and confidence from row 0
    predicted_class = probabilities[0].argmax().item()
    confidence = probabilities[0][predicted_class].item()

    label = "Positive" if predicted_class == 1 else "Negative"

    return label, confidence

# Test samples
# Hand-written reviews covering clearly positive, clearly negative and
# lukewarm wording
test_reviews = [
    "These headphones are amazing! Great sound quality and very comfortable.",
    "Terrible product. Broke after one week. Don't waste your money.",
    "It's okay, nothing special but does the job.",
    "Best purchase ever! Highly recommend to everyone.",
    "Poor quality, disappointed with this purchase.",
]

print("\n=== Testing Model on Sample Reviews ===")
for i, review in enumerate(test_reviews, start=1):
    label, confidence = predict_sentiment(review)
    # One print per review; the embedded newline keeps the two-line layout
    print(
        f"\n{i}. Review: {review}"
        "\n"
        f"   Prediction: {label} (Confidence: {confidence:.2%})"
    )

## Step 10: Save Model for Later Use

In [None]:
# Save the fine-tuned model and tokenizer
# This allows us to load it later without retraining

save_directory = "./sentiment_model"

# Model weights/config first, then the tokenizer files, into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(
    f"✅ Model saved to {save_directory}",
    "You can load this model later with:",
    f"  model = AutoModelForSequenceClassification.from_pretrained('{save_directory}')",
    f"  tokenizer = AutoTokenizer.from_pretrained('{save_directory}')",
    sep="\n",
)

## Week 2 Summary

**Completed:**
- ✅ Loaded Amazon review dataset (1,000 training samples)
- ✅ Fine-tuned DistilBERT for sentiment classification
- ✅ Achieved a working model with evaluation metrics
- ✅ Tested model on sample reviews
- ✅ Saved model for integration with GUI

**Performance:**
- Expected accuracy: ~85-92% (typical for this task)
- Model can distinguish positive from negative reviews
- Ready for integration with RAG system

**Next Steps (Week 3-4):**
- Build RAG system with ChromaDB
- Create embeddings from company documents
- Test question-answering capability

---

**AI Assistance Documentation:**
- Claude (Anthropic) provided:
  - Code structure and training pipeline
  - Comments and explanations
  - Evaluation metrics implementation
  - Testing framework

**Citations:**
- Dataset: Zhang et al. (2015) - Amazon Review Dataset (Hugging Face: amazon_polarity)
- Model: Sanh et al. (2019) - DistilBERT (Hugging Face: distilbert-base-uncased)
- Framework: Hugging Face Transformers library