In [1]:
# DistilBERT Experiments for Yelp Reviews Sentiment Analysis

import os
import sys
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import DistilBertTokenizer, AdamW, get_linear_schedule_with_warmup

In [None]:
# Add the project root to path for imports.
# The relative path presumably assumes the kernel's working directory sits one
# level below the project root (e.g. a notebooks/ folder) -- TODO confirm.
# The membership check keeps sys.path from growing a duplicate entry every
# time this cell is re-run.
project_root = '..'
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from src.config import DISTILBERT_CONFIG, NUM_CLASSES, RANDOM_SEED, MODELS_DIR
from src.data.data_loader import YelpDataLoader
from src.data.preprocessor import DistilBERTPreprocessor
from src.models.distilbert_model import DistilBERTSentimentModel
from src.training.trainer import DistilBERTTrainer
# from src.training.metrics import compute_metrics
# from src.utils.visualization import plot_training_history, plot_confusion_matrix

2025-04-09 21:48:49,789 - src.data.preprocessor - INFO - Downloading necessary NLTK resources...
[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/david/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2025-04-09 21:48:51.389814: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-09 21:48:51.517988: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in

In [3]:
# Choose the compute device: CUDA when PyTorch reports it available, else CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"Using device: {device}")

# Seed PyTorch and NumPy (plus every CUDA device, when present) with the
# project-wide seed so repeated runs start from the same RNG state.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
if cuda_available:
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Configure logging: INFO level with timestamp, logger name and level.
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)
logger = logging.getLogger(__name__)

Using device: cuda


In [4]:
# 1. Load and Explore the Dataset

## 1.1 Load preprocessed data
print("Loading processed data...")
data_loader = YelpDataLoader()
train_df, test_df = data_loader.load_processed_data()

print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Show distribution of sentiment labels for each split, ordered by label id.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    print(f"\nSentiment distribution in {split_name} set:")
    print(split_df['sentiment'].value_counts().sort_index())

# Examine one example review per sentiment class.
# random_state is pinned so that (a) re-running this cell shows the same
# reviews and (b) this purely exploratory draw does not consume the global
# NumPy RNG state that later cells may rely on.
print("\nSample reviews and their sentiments:")
for sentiment in range(NUM_CLASSES):
    class_rows = train_df[train_df['sentiment'] == sentiment]
    sample = class_rows.sample(1, random_state=RANDOM_SEED).iloc[0]
    print(f"\nSentiment: {sentiment}")
    # Only the first 200 characters are shown to keep the output short.
    print(f"Review text: {sample['text'][:200]}...")

2025-04-09 21:48:53,604 - src.data.data_loader - INFO - Loading processed data from local files...


Loading processed data...
Train set shape: (650000, 5)
Test set shape: (50000, 3)

Sentiment distribution in train set:
sentiment
0    260000
1    130000
2    260000
Name: count, dtype: int64

Sentiment distribution in test set:
sentiment
0    20000
1    10000
2    20000
Name: count, dtype: int64

Sample reviews and their sentiments:

Sentiment: 0
Review text: Awful service, awful food.\n\nWas here several weeks ago as part of a large group; I arrived partway through the meal and did not order anything, so can't speak for the food on that occasion (though i...

Sentiment: 1
Review text: Came here after a show and waiting time was 45 mins to an hour.  Promised our son that we would eat here so we waited.  \n\nI do not care much for the fire decor they have going in the entrance.  Just...

Sentiment: 2
Review text: It finally opened! The sign has been on the building for at least 3 seasons, maybe more, but the wait was worth it. We needed a breakfast spot in this central area sorely. The

In [5]:
# 2. Prepare Data for Training

# Initialize tokenizer from the checkpoint name stored in the project config.
pretrained_name = DISTILBERT_CONFIG["pretrained_model_name"]
print(f"Initializing DistilBERT tokenizer: {pretrained_name}")
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)

# 2.1 Create datasets
from src.data.dataset import create_data_loaders

# Third positional argument; presumably the validation fraction carved out of
# the training frame -- verify against create_data_loaders.
val_fraction = 0.1
loaders = create_data_loaders(
    train_df, test_df, val_fraction, "distilbert", tokenizer=tokenizer
)

# Check a batch: pull the first training batch and report each tensor's shape.
batch = next(iter(loaders["train"]))
for field in ("input_ids", "attention_mask", "labels"):
    print(f"Batch {field} shape: {batch[field].shape}")

Initializing DistilBERT tokenizer: distilbert-base-uncased


2025-04-09 21:48:57,740 - src.data.dataset - INFO - Created data split: 585000 train, 65000 val, 50000 test
2025-04-09 21:48:57,741 - src.data.preprocessor - INFO - Initialized DistilBERT preprocessor with distilbert-base-uncased tokenizer
2025-04-09 21:48:57,742 - src.data.dataset - INFO - Created DataLoaders with batch size 16


Batch input_ids shape: torch.Size([16, 512])
Batch attention_mask shape: torch.Size([16, 512])
Batch labels shape: torch.Size([16])


In [6]:
# 4. Initialize the DistilBERT Model

# 4.1 Create the model instance
# Constructor arguments are collected first so they can be inspected or
# tweaked before the model is built.
model_kwargs = {
    "pretrained_model_name": DISTILBERT_CONFIG["pretrained_model_name"],
    "num_classes": NUM_CLASSES,
    "dropout": 0.1,
    "freeze_bert_layers": None,  # We'll fine-tune all layers
}
model = DistilBERTSentimentModel(**model_kwargs)

2025-04-09 21:49:01,087 - src.models.distilbert_model - INFO - Initialized DistilBERT model with pretrained_model=distilbert-base-uncased, hidden_size=768, num_classes=3, dropout=0.1


In [7]:
# Place the model on the selected device, then print its module tree.
model = model.to(device=device)
print(model)

DistilBERTSentimentModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
           

In [8]:
# Count parameters in a single pass: every element counts toward the total,
# and those with requires_grad set also count as trainable.
total_params = 0
trainable_params = 0
for param in model.parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        trainable_params += n_elements

trainable_share = trainable_params / total_params
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,} ({trainable_share:.1%})")

Total parameters: 66,365,187
Trainable parameters: 66,365,187 (100.0%)


In [9]:
# 5. Training Setup

# 5.1 Define optimizer and scheduler
# Epoch count and learning rate come from the shared project config; the
# weight-decay coefficient is set locally in this notebook.
num_epochs = DISTILBERT_CONFIG["epochs"]
learning_rate = DISTILBERT_CONFIG["learning_rate"]
weight_decay = 1e-2

In [10]:
# Initialize optimizer with weight decay.
#
# Biases and LayerNorm weights are excluded from weight decay. The printed
# model shows two naming styles for LayerNorm modules: `LayerNorm` in the
# embeddings and lower-case `sa_layer_norm` inside each transformer block.
# Both spellings are listed so that no LayerNorm weight is decayed; the
# substring match on 'LayerNorm.weight' alone misses the lower-case modules.
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']

# Split the parameters in one pass over named_parameters().
decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': weight_decay},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

# Use PyTorch's AdamW: the implementation exported by `transformers` is
# deprecated in favour of torch.optim.AdamW. eps=1e-6 keeps the epsilon the
# transformers implementation used by default (PyTorch defaults to 1e-8).
# Weight decay is set explicitly per group, so PyTorch's non-zero default
# weight_decay never applies.
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=learning_rate,
    eps=1e-6,
)



In [11]:
# Calculate total training steps for scheduler.
# Assumes the trainer steps the scheduler once per training batch -- TODO
# confirm against DistilBERTTrainer.
steps_per_epoch = len(loaders["train"])
total_steps = steps_per_epoch * num_epochs

# Warm up linearly over the first 10% of steps.
warmup_fraction = 0.1
warmup_steps = int(warmup_fraction * total_steps)

# Create scheduler with warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

In [12]:
# 5.2 Define loss function
# Cross-entropy with PyTorch's default settings (no class weights).
# NOTE(review): the recorded train label counts are 260000 / 130000 / 260000,
# so class 1 is under-represented -- consider whether weighting is wanted.
criterion = torch.nn.CrossEntropyLoss()

In [13]:
# Initialize trainer
# Hands the model (already moved to `device` in an earlier cell) and the
# device itself to the project's DistilBERTTrainer. The resulting `trainer`
# is the object whose train() method runs the fine-tuning loop below.
trainer = DistilBERTTrainer(model, device)

In [None]:
# Train model
print(f"Starting training for {num_epochs} epochs...")

# Gather the keyword arguments first so the full training configuration is
# visible in one place before the run starts.
# NOTE(review): the original call carried a commented-out
# `early_stopping_patience=args.early_stopping` argument; `args` is not
# defined anywhere in this notebook, so it stays disabled here.
train_kwargs = dict(
    train_dataloader=loaders["train"],
    val_dataloader=loaders["val"],
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=num_epochs,
    output_dir=MODELS_DIR,
    model_name="distilbert",
)
history = trainer.train(**train_kwargs)