In [16]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm
import json 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
import random


import torch
import torch.nn as nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt


from transformers import BertTokenizer
nltk.download('punkt_tab')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package punkt_tab to /Users/chi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [17]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim, n_layers, bidirectional, dropout):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])
        return output
    

# Training function
def train_lstm(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    total_acc = 0

    print('dataloader len', len(dataloader))

    # Wrap the dataloader with tqdm for progress tracking
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

       
        # Forward pass
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        preds = torch.argmax(outputs, dim=1)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

    return total_loss / len(dataloader), total_acc / len(dataloader)


# Evaluation function
def evaluate_lstm(model, dataloader):
    model.eval()
    total_acc = 0

    with torch.no_grad():
        # Wrap the dataloader with tqdm for progress tracking
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids)
            preds = torch.argmax(outputs, dim=1)

            total_acc += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

    return total_acc / len(dataloader)



In [18]:
file_path = 'data/sampled_preprocessed.json'
df = pd.read_json(file_path)

In [19]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['is_spoiler'])

# Split the data before applying SMOTE
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Calculate original class distribution
original_class_distribution = train_df['label'].value_counts()



# # Vectorization and SMOTE
# vectorizer = TfidfVectorizer()
# X_train_tfidf = vectorizer.fit_transform(train_df['cleaned_review_text'])  # Convert text to TF-IDF
y_train = train_df['label']

# print("Original dataset shape:", type(X_train_tfidf))


# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

# # Check the resampled data
# print("Resampled dataset shape:", X_resampled.shape)

# train_df_resampled = pd.DataFrame(X_resampled.todense(), columns=vectorizer.get_feature_names_out())
# train_df_resampled['label'] = y_resampled

# # 5. Re-split the temporary set into validation and test sets (50% each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# print("Train (Resampled) Shape:", train_df_resampled.shape)
# print("Validation Shape:", val_df.shape)
# print("Test Shape:", test_df.shape)
# Tokenization
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class SpoilerDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = tokenizer.encode_plus(
            text,
            max_length=512,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [21]:
batch_size = 32
# Create datasets with the resampled training data
train_dataset = SpoilerDataset(train_df['cleaned_review_text'].tolist(), train_df['label'].tolist())
val_dataset = SpoilerDataset(val_df['cleaned_review_text'].tolist(), val_df['label'].tolist())
test_dataset = SpoilerDataset(test_df['cleaned_review_text'].tolist(), test_df['label'].tolist())

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
embedding_dim = 100
hidden_dim = 128
vocab_size = tokenizer.vocab_size
output_dim = 2
n_layers = 3
bidirectional = True
dropout = 0.5



# Instantiate model, loss function, optimizer
lstm_model = LSTMClassifier(embedding_dim, hidden_dim, vocab_size, output_dim, n_layers, bidirectional, dropout)
lstm_model = lstm_model.to(device)
optimizer = Adam(lstm_model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()


In [22]:
train_losses = []
train_accuracies = []
val_accuracies = []

n_epochs = 20
# Initialize tqdm progress bar
for epoch in tqdm(range(n_epochs), desc='Training Epochs'):
    train_loss, train_acc = train_lstm(lstm_model, train_loader, optimizer, criterion)
    val_acc = evaluate_lstm(lstm_model, val_loader)
    
    # Append metrics to lists
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)
    
    # Print epoch information
    tqdm.write(f'Epoch {epoch+1}: Train Loss {train_loss:.4f}, Train Acc {train_acc:.4f}, Val Acc {val_acc:.4f}')
    
    # # Save model checkpoint every 10 epochs
    # if (epoch + 1) % 10 == 0:
    #     checkpoint_path = f'lstm_model_epoch_{epoch+1}.pth'
    #     torch.save(lstm_model.state_dict(), checkpoint_path)
    #     print(f'Model checkpoint saved to {checkpoint_path}')

# Evaluate on test set
test_acc = evaluate_lstm(lstm_model, test_loader)
print(f'Test Accuracy: {test_acc:.4f}')

Training Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

dataloader len 71


Training: 100%|██████████| 71/71 [01:06<00:00,  1.07it/s]
Evaluating: 100%|██████████| 9/9 [00:02<00:00,  3.01it/s]
Training Epochs:   5%|▌         | 1/20 [01:09<21:55, 69.24s/it]

Epoch 1: Train Loss 0.5790, Train Acc 0.7369, Val Acc 0.7409
dataloader len 71


Training: 100%|██████████| 71/71 [01:04<00:00,  1.11it/s]
Evaluating: 100%|██████████| 9/9 [00:02<00:00,  3.01it/s]
Training Epochs:  10%|█         | 2/20 [02:16<20:25, 68.06s/it]

Epoch 2: Train Loss 0.5688, Train Acc 0.7453, Val Acc 0.7305
dataloader len 71


Training: 100%|██████████| 71/71 [01:04<00:00,  1.11it/s]
Evaluating: 100%|██████████| 9/9 [00:02<00:00,  3.06it/s]
Training Epochs:  15%|█▌        | 3/20 [03:23<19:09, 67.61s/it]

Epoch 3: Train Loss 0.5656, Train Acc 0.7466, Val Acc 0.7201
dataloader len 71


Training: 100%|██████████| 71/71 [01:04<00:00,  1.09it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  3.00it/s]
Training Epochs:  20%|██        | 4/20 [04:31<18:03, 67.71s/it]

Epoch 4: Train Loss 0.5602, Train Acc 0.7532, Val Acc 0.7166
dataloader len 71


Training: 100%|██████████| 71/71 [01:07<00:00,  1.06it/s]
Evaluating: 100%|██████████| 9/9 [00:02<00:00,  3.01it/s]
Training Epochs:  25%|██▌       | 5/20 [05:41<17:08, 68.55s/it]

Epoch 5: Train Loss 0.5595, Train Acc 0.7561, Val Acc 0.7236
dataloader len 71


Training: 100%|██████████| 71/71 [01:07<00:00,  1.05it/s]
Evaluating: 100%|██████████| 9/9 [00:02<00:00,  3.01it/s]
Training Epochs:  30%|███       | 6/20 [06:52<16:09, 69.28s/it]

Epoch 6: Train Loss 0.5532, Train Acc 0.7592, Val Acc 0.7270
dataloader len 71


Training: 100%|██████████| 71/71 [01:05<00:00,  1.08it/s]
Evaluating: 100%|██████████| 9/9 [00:02<00:00,  3.02it/s]
Training Epochs:  35%|███▌      | 7/20 [08:00<14:58, 69.10s/it]

Epoch 7: Train Loss 0.5512, Train Acc 0.7586, Val Acc 0.7382
dataloader len 71


Training: 100%|██████████| 71/71 [01:05<00:00,  1.08it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.97it/s]
Training Epochs:  40%|████      | 8/20 [09:09<13:48, 69.05s/it]

Epoch 8: Train Loss 0.5439, Train Acc 0.7616, Val Acc 0.7305
dataloader len 71


Training: 100%|██████████| 71/71 [01:05<00:00,  1.09it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.94it/s]
Training Epochs:  45%|████▌     | 9/20 [10:18<12:36, 68.78s/it]

Epoch 9: Train Loss 0.5451, Train Acc 0.7613, Val Acc 0.7270
dataloader len 71


Training: 100%|██████████| 71/71 [01:05<00:00,  1.09it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.91it/s]
Training Epochs:  50%|█████     | 10/20 [11:26<11:25, 68.60s/it]

Epoch 10: Train Loss 0.5460, Train Acc 0.7616, Val Acc 0.7270
dataloader len 71


Training: 100%|██████████| 71/71 [01:05<00:00,  1.09it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.90it/s]
Training Epochs:  55%|█████▌    | 11/20 [12:34<10:16, 68.47s/it]

Epoch 11: Train Loss 0.5382, Train Acc 0.7652, Val Acc 0.7348
dataloader len 71


Training: 100%|██████████| 71/71 [01:05<00:00,  1.09it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.95it/s]
Training Epochs:  60%|██████    | 12/20 [13:42<09:07, 68.39s/it]

Epoch 12: Train Loss 0.5380, Train Acc 0.7649, Val Acc 0.7270
dataloader len 71


Training: 100%|██████████| 71/71 [01:04<00:00,  1.10it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.91it/s]
Training Epochs:  65%|██████▌   | 13/20 [14:50<07:57, 68.25s/it]

Epoch 13: Train Loss 0.5368, Train Acc 0.7649, Val Acc 0.7340
dataloader len 71


Training: 100%|██████████| 71/71 [01:04<00:00,  1.10it/s]
Evaluating: 100%|██████████| 9/9 [00:04<00:00,  1.94it/s]
Training Epochs:  70%|███████   | 14/20 [16:00<06:51, 68.63s/it]

Epoch 14: Train Loss 0.5413, Train Acc 0.7638, Val Acc 0.7313
dataloader len 71


Training: 100%|██████████| 71/71 [01:07<00:00,  1.06it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.89it/s]
Training Epochs:  75%|███████▌  | 15/20 [17:10<05:45, 69.15s/it]

Epoch 15: Train Loss 0.5362, Train Acc 0.7655, Val Acc 0.7313
dataloader len 71


Training: 100%|██████████| 71/71 [01:06<00:00,  1.07it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.92it/s]
Training Epochs:  80%|████████  | 16/20 [18:20<04:37, 69.29s/it]

Epoch 16: Train Loss 0.5300, Train Acc 0.7662, Val Acc 0.7270
dataloader len 71


Training: 100%|██████████| 71/71 [01:04<00:00,  1.10it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.91it/s]
Training Epochs:  85%|████████▌ | 17/20 [19:27<03:26, 68.72s/it]

Epoch 17: Train Loss 0.5315, Train Acc 0.7671, Val Acc 0.7313
dataloader len 71


Training: 100%|██████████| 71/71 [01:03<00:00,  1.12it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.94it/s]
Training Epochs:  90%|█████████ | 18/20 [20:34<02:16, 68.11s/it]

Epoch 18: Train Loss 0.5286, Train Acc 0.7673, Val Acc 0.7452
dataloader len 71


Training: 100%|██████████| 71/71 [01:03<00:00,  1.11it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.98it/s]
Training Epochs:  95%|█████████▌| 19/20 [21:40<01:07, 67.72s/it]

Epoch 19: Train Loss 0.5305, Train Acc 0.7668, Val Acc 0.7305
dataloader len 71


Training: 100%|██████████| 71/71 [01:03<00:00,  1.12it/s]
Evaluating: 100%|██████████| 9/9 [00:03<00:00,  3.00it/s]
Training Epochs: 100%|██████████| 20/20 [22:47<00:00, 68.36s/it]


Epoch 20: Train Loss 0.5283, Train Acc 0.7681, Val Acc 0.7270


Evaluating: 100%|██████████| 9/9 [00:03<00:00,  2.99it/s]

Test Accuracy: 0.7171



