In [6]:
# LSTM Experiments for Yelp Reviews Sentiment Analysis

import logging
import random
import sys

import numpy as np
import torch

In [7]:
# Make the parent of the current working directory importable so the `src`
# package resolves. The membership check keeps this cell idempotent: re-running
# it no longer appends a duplicate '..' entry to sys.path each time.
if '..' not in sys.path:
    sys.path.append('..')

# Project modules: shared constants and the Yelp data loader
from src.config import NUM_CLASSES, RANDOM_SEED
from src.data.data_loader import YelpDataLoader

In [8]:
# Select the compute device: use CUDA when torch reports it, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed every RNG the run may draw from so Restart & Run All is repeatable.
# Python's built-in `random` module is seeded as well; it was previously left
# unseeded, so any code drawing from it was not reproducible.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Configure logging: INFO and above, with timestamp, logger name and level.
# basicConfig does nothing if the root logger already has handlers attached.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

Using device: cuda


In [9]:
# 1. Load and Explore the Dataset

## 1.1 Load preprocessed data
print("Loading processed data...")
data_loader = YelpDataLoader()
train_df, test_df = data_loader.load_processed_data()

# Size of each split as (rows, columns)
print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Per-class label counts for each split, ordered by label value
print("\nSentiment distribution in train set:")
train_label_counts = train_df['sentiment'].value_counts().sort_index()
print(train_label_counts)

print("\nSentiment distribution in test set:")
test_label_counts = test_df['sentiment'].value_counts().sort_index()
print(test_label_counts)

# Show one randomly drawn training review per class, cut to 200 characters.
# sample() is given no random_state, so the pick follows the global NumPy RNG.
print("\nSample reviews and their sentiments:")
for sentiment in range(NUM_CLASSES):
    rows_for_class = train_df[train_df['sentiment'] == sentiment]
    sample = rows_for_class.sample(1).iloc[0]
    print(f"\nSentiment: {sentiment}")
    print(f"Review text: {sample['text'][:200]}...")

2025-04-12 01:19:59,696 - src.data.data_loader - INFO - Loading processed data from local files...


Loading processed data...
Train set shape: (650000, 5)
Test set shape: (50000, 3)

Sentiment distribution in train set:
sentiment
0    260000
1    130000
2    260000
Name: count, dtype: int64

Sentiment distribution in test set:
sentiment
0    20000
1    10000
2    20000
Name: count, dtype: int64

Sample reviews and their sentiments:

Sentiment: 0
Review text: Awful service, awful food.\n\nWas here several weeks ago as part of a large group; I arrived partway through the meal and did not order anything, so can't speak for the food on that occasion (though i...

Sentiment: 1
Review text: Came here after a show and waiting time was 45 mins to an hour.  Promised our son that we would eat here so we waited.  \n\nI do not care much for the fire decor they have going in the entrance.  Just...

Sentiment: 2
Review text: It finally opened! The sign has been on the building for at least 3 seasons, maybe more, but the wait was worth it. We needed a breakfast spot in this central area sorely. The

In [10]:
# Build the train/val/test DataLoaders for the LSTM pipeline
from src.config import LSTM_CONFIG, VALIDATION_SPLIT
from src.data.dataset import create_data_loaders

# Only the batch size is overridden. max_seq_length, max_vocab_size and
# num_workers are not passed, so whatever defaults create_data_loaders
# defines for them are used.
lstm_batch_size = LSTM_CONFIG["batch_size"]
loaders = create_data_loaders(
    train_df,
    test_df,
    VALIDATION_SPLIT,
    "lstm",
    batch_size=lstm_batch_size,
)

# Sanity check: pull a single training batch and report its tensor shapes
batch = next(iter(loaders["train"]))
print(f"Batch text tensor shape: {batch['text'].shape}  # [batch_size, max_seq_length]")
print(f"Batch lengths tensor shape: {batch['lengths'].shape}  # [batch_size]")
print(f"Batch labels shape: {batch['labels'].shape}  # [batch_size]")

2025-04-12 01:20:02,278 - src.data.dataset - INFO - Created data split: 585000 train, 65000 val, 50000 test
2025-04-12 01:20:45,492 - src.data.preprocessor - INFO - Building vocabulary...
2025-04-12 01:20:46,264 - src.data.preprocessor - INFO - Vocabulary built with 49999 words
2025-04-12 01:20:46,270 - src.data.dataset - INFO - Fitted preprocessor on 100000 samples with vocab size 49999
2025-04-12 01:20:46,271 - src.data.dataset - INFO - Created lazy-loading dataset with 585000 samples
2025-04-12 01:20:46,271 - src.data.dataset - INFO - Created lazy-loading dataset with 65000 samples
2025-04-12 01:20:46,272 - src.data.dataset - INFO - Created lazy-loading dataset with 50000 samples
2025-04-12 01:20:46,272 - src.data.dataset - INFO - Created DataLoaders with batch size 64 and 4 workers


Batch text tensor shape: torch.Size([64, 512])  # [batch_size, max_seq_length]
Batch lengths tensor shape: torch.Size([64])  # [batch_size]
Batch labels shape: torch.Size([64])  # [batch_size]


In [11]:
# Pull the three per-split DataLoaders out of the `loaders` mapping
train_dataloader, val_dataloader, test_dataloader = (
    loaders[split] for split in ("train", "val", "test")
)

# Optional vocabulary entry; .get() yields None when the key is absent
vocab = loaders.get("vocab")

# Vocabulary size handed to the model constructor.
# NOTE(review): this is the configured cap from LSTM_CONFIG, not the size of
# the fitted vocabulary — confirm the two agree (including special tokens).
vocab_size = LSTM_CONFIG["max_vocab_size"]
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 50000


In [12]:
# Initialize model
from src.models.lstm_model import LSTMSentimentModel


print("Initializing LSTM model...")
model = LSTMSentimentModel(
    vocab_size=vocab_size,
    embedding_dim=LSTM_CONFIG["embedding_dim"],
    hidden_dim=LSTM_CONFIG["hidden_dim"],
    num_layers=LSTM_CONFIG["num_layers"],
    bidirectional=LSTM_CONFIG["bidirectional"],
    num_classes=NUM_CLASSES,
    dropout=LSTM_CONFIG["dropout"],
    # padding_idx=vocab.get("<PAD>", 0),
    padding_idx=0,
    pretrained_embeddings=None,  # TODO: Add support for pretrained embeddings if needed
)

2025-04-12 01:20:46,860 - src.models.lstm_model - INFO - Initialized LSTM model with vocab_size=50000, embedding_dim=300, hidden_dim=256, num_layers=2, dropout=0.3, bidirectional=True, num_classes=3


Initializing LSTM model...


In [13]:
# Initialize trainer
from src.training.trainer import LSTMTrainer

trainer = LSTMTrainer(model, device, accumulation_steps=4)

2025-04-12 01:20:46,877 - src.training.trainer - INFO - Trainer initialized with gradient accumulation over 4 steps


In [14]:
# Number of epochs requested from the benchmark call below
epochs = 1

# Run the trainer's benchmark helper on the training loader. The recorded
# output reports average time per batch and an estimated time per epoch.
# NOTE(review): whether this also performs a full training pass is not
# visible from this notebook — confirm in LSTMTrainer.benchmark_training.
print(f"Starting training for {epochs} epochs...")
history = trainer.benchmark_training(model, train_dataloader, epochs=epochs)

Starting training for 1 epochs...
Avg. time per batch: 0.0853s
Estimated time per epoch: 12.99 minutes
