In [None]:
# DistilBERT Experiments for Yelp Reviews Sentiment Analysis

import logging
import os
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import DistilBertTokenizer, AdamW, get_linear_schedule_with_warmup

In [None]:
# Add the project root to path for imports.
# The path is resolved to an absolute one so it keeps working if the working
# directory changes later, and it is appended only once so that re-running
# this cell does not pile up duplicate entries in sys.path.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Import project modules
from src.config import DISTILBERT_CONFIG, NUM_CLASSES, RANDOM_SEED, MODELS_DIR
from src.data.data_loader import YelpDataLoader
from src.data.preprocessor import DistilBERTPreprocessor
from src.models.distilbert_model import DistilBERTSentimentModel
from src.training.trainer import DistilBERTTrainer
# from src.training.metrics import compute_metrics
# from src.utils.visualization import plot_training_history, plot_confusion_matrix

In [None]:
# Set device: use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set random seeds for reproducibility.
# Python's own `random` module is seeded as well as NumPy and PyTorch, so that
# any stdlib-based shuffling or sampling in the imported project code is
# repeatable too.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Configure logging.
# NOTE: DEBUG is set on the root logger, so debug records from third-party
# libraries that propagate to the root are emitted as well.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [None]:
# 1. Load and Explore the Dataset

## 1.1 Load preprocessed data
print("Loading processed data...")
data_loader = YelpDataLoader()
train_df, test_df = data_loader.load_processed_data()

print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Show distribution of sentiment labels for each split
for split_name, split_df in (("train", train_df), ("test", test_df)):
    print(f"\nSentiment distribution in {split_name} set:")
    print(split_df['sentiment'].value_counts().sort_index())

# Examine one example per class.
# A fixed random_state makes the cell show the same reviews on every run and
# keeps it from drawing on the global NumPy RNG, so later random steps do not
# depend on how many times this cell was executed.
print("\nSample reviews and their sentiments:")
for sentiment in range(NUM_CLASSES):
    class_rows = train_df[train_df['sentiment'] == sentiment]
    print(f"\nSentiment: {sentiment}")
    if class_rows.empty:
        # .sample(1) raises ValueError on an empty frame; report the gap instead.
        print("Review text: <no training examples for this class>")
        continue
    sample = class_rows.sample(1, random_state=RANDOM_SEED).iloc[0]
    print(f"Review text: {sample['text'][:200]}...")

In [None]:
# 2. Prepare Data for Training

# Initialize tokenizer from the checkpoint named in the project config
pretrained_name = DISTILBERT_CONFIG["pretrained_model_name"]
print(f"Initializing DistilBERT tokenizer: {pretrained_name}")
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)

# 2.1 Create datasets
from src.data.dataset import create_data_loaders

# NOTE(review): the third positional argument (0.1) is presumably the
# validation split fraction -- confirm against create_data_loaders.
loaders = create_data_loaders(
    train_df,
    test_df,
    0.1,
    "distilbert",
    tokenizer=tokenizer,
)

# Check a batch: pull the first training batch and report each tensor's shape
batch = next(iter(loaders["train"]))
for field in ("input_ids", "attention_mask", "labels"):
    print(f"Batch {field} shape: {batch[field].shape}")

In [None]:
# Bind each split's loader to its own name for the cells below.
train_dataloader, val_dataloader, test_dataloader = (
    loaders["train"],
    loaders["val"],
    loaders["test"],
)

In [None]:
# 4. Initialize the DistilBERT Model

# 4.1 Create the model instance
# Constructor arguments are collected first so they can be inspected or
# tweaked in one place before the model is built.
model_kwargs = dict(
    pretrained_model_name=DISTILBERT_CONFIG['pretrained_model_name'],
    num_classes=NUM_CLASSES,
    dropout=0.1,
    freeze_bert_layers=None,  # We'll fine-tune all layers
)
model = DistilBERTSentimentModel(**model_kwargs)

In [None]:
# Move the model onto the device selected earlier (GPU if available, else CPU)
# and rebind `model` to the result.
model = model.to(device)
# Print the model so its structure can be inspected in the cell output.
print(model)

In [None]:
# Count parameters in a single pass: every element counts toward the total,
# and only tensors with requires_grad set count as trainable.
total_params = 0
trainable_params = 0
for param in model.parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        trainable_params += n_elements

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params:.1%})")

In [None]:
# 5. Training Setup

# 5.1 Define optimizer and scheduler
# NOTE(review): no learning-rate scheduler is created in the cells visible
# here -- confirm whether the trainer builds one or whether it is missing.

# Weight decay is fixed in the notebook; the learning rate and epoch count
# are read from the project config.
weight_decay = 0.01
learning_rate = DISTILBERT_CONFIG['learning_rate']
num_epochs = DISTILBERT_CONFIG['epochs']

In [None]:
# Initialize optimizer with selective weight decay.
#
# Biases and LayerNorm parameters are exempt from weight decay. Name matching
# alone is not enough for this: Hugging Face's DistilBERT names its per-block
# norms `sa_layer_norm` / `output_layer_norm`, which the 'LayerNorm.weight'
# substring does not match, so those gains would silently be decayed.
# LayerNorm parameters are therefore also detected by module type.
no_decay = ['bias', 'LayerNorm.weight']
layer_norm_param_ids = {
    id(param)
    for module in model.modules()
    if isinstance(module, nn.LayerNorm)
    for param in module.parameters(recurse=False)
}

decay_params, no_decay_params = [], []
for name, param in model.named_parameters():
    exempt = id(param) in layer_norm_param_ids or any(nd in name for nd in no_decay)
    (no_decay_params if exempt else decay_params).append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': weight_decay},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# eps=1e-6 keeps the epsilon that the transformers implementation used by
# default, so the update rule stays numerically comparable.
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=learning_rate,
    eps=1e-6,
)

In [None]:
# 5.2 Define loss function
# Multi-class cross-entropy with PyTorch's defaults (no class weights, mean
# reduction). nn.CrossEntropyLoss applies log-softmax internally, so it
# expects raw logits.
# NOTE(review): assumes the model returns unnormalized logits -- confirm in
# DistilBERTSentimentModel.
criterion = nn.CrossEntropyLoss()

In [None]:
# Initialize trainer with the model and the target device.
# NOTE(review): accumulation_steps=4 presumably means gradients are
# accumulated over 4 batches per optimizer step -- confirm in
# DistilBERTTrainer.
trainer = DistilBERTTrainer(model, device, accumulation_steps=4)

In [None]:
# Train model and keep whatever the trainer returns as `history`.
# NOTE(review): the message below reports `num_epochs`, but neither the epoch
# count nor the validation loader is passed to benchmark_training -- confirm
# that this call trains for the intended number of epochs.
print(f"Starting training for {num_epochs} epochs...")
history = trainer.benchmark_training(train_dataloader, criterion, optimizer)