In [1]:
!pip install torch tensorflow nlpaug scikit-learn

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvi

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
import nlpaug.augmenter.word as naw


In [3]:
# ----------------------
# 1. Configuration
# ----------------------
SEED = 42
BATCH_SIZE = 128
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
DROPOUT = 0.5
NUM_LAYERS = 2
NUM_MODELS = 3
MAX_LEN = 200
EPOCHS = 100
PATIENCE = 5
torch.manual_seed(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [4]:
# ----------------------
# 2. Enhanced Preprocessing
# ----------------------
def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

In [5]:
# ----------------------
# 3. Data Augmentation
# ----------------------
aug = naw.SynonymAug(aug_src='wordnet')

def augment_text(text, num_aug=1):
    return [aug.augment(text)[0] for _ in range(num_aug)]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [6]:
# ----------------------
# 4. Dataset Preparation
# ----------------------
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, augment=False):
        self.reviews = reviews
        self.labels = labels
        self.augment = augment

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        text = self.reviews[idx]
        label = self.labels[idx]

        if self.augment and torch.rand(1).item() > 0.5:
            text = augment_text(text)[0]

        return text, label

In [7]:
# ----------------------
# 5. Model Architecture
# ----------------------
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, 1)

    def forward(self, lstm_output):
        attn_weights = torch.softmax(self.attn(lstm_output).squeeze(-1), dim=1)
        context = torch.sum(lstm_output * attn_weights.unsqueeze(-1), dim=1)
        return context

class SentimentLSTM(nn.Module):
    def __init__(self, embedding_matrix, class_weights):
        super().__init__()
        self.num_words = embedding_matrix.shape[0]

        # Pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix), freeze=False
        )

        # Bidirectional LSTM
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM,
                           num_layers=NUM_LAYERS,
                           bidirectional=True,
                           dropout=DROPOUT,
                           batch_first=True)

        # Attention
        self.attention = Attention(HIDDEN_DIM)
        self.dropout = nn.Dropout(DROPOUT)
        self.fc = nn.Linear(HIDDEN_DIM * 2, 1)

        # Class weights
        self.class_weights = class_weights

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        context = self.attention(lstm_out)
        context = self.dropout(context)
        return torch.sigmoid(self.fc(context)).squeeze()

In [8]:
# ----------------------
# 6. GloVe Embeddings
# ----------------------
def load_glove_embeddings(embedding_file):
    embeddings_index = {}
    with open(embedding_file, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [9]:
# Load and preprocess data
data = pd.read_csv("IMDB Dataset.csv")
data['review'] = data['review'].apply(clean_text)
data['sentiment'] = data['sentiment'].replace({'positive': 1, 'negative': 0})

  data['sentiment'] = data['sentiment'].replace({'positive': 1, 'negative': 0})


In [10]:
# Split data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=SEED)

In [11]:
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data["review"])

In [12]:
# Create sequences
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=MAX_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=MAX_LEN)
y_train = train_data["sentiment"].values
y_test = test_data["sentiment"].values

In [14]:
# Class weights
class_weights = compute_class_weight('balanced', classes=np.array([0,1]), y=y_train)
class_weights = torch.FloatTensor(class_weights).to(device)

In [20]:
# ----------------------
# 6. GloVe Embeddings
# ----------------------
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

def load_glove_embeddings(embedding_file):
    """Loads GloVe embeddings from a text file.

    Args:
        embedding_file (str): Path to the GloVe embeddings file.

    Returns:
        dict: A dictionary mapping words to their corresponding embedding vectors.
    """
    embeddings_index = {}
    with open(embedding_file, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

--2025-03-02 11:55:52--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-02 11:55:52--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-02 11:55:52--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [22]:
# Prepare GloVe embeddings
embeddings_index = load_glove_embeddings("glove.6B.300d.txt")
embedding_matrix = np.zeros((10000, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i < 10000:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [21]:
# Convert to tensors
X_train_tensor = torch.LongTensor(X_train).to(device)
X_test_tensor = torch.LongTensor(X_test).to(device)
y_train_tensor = torch.FloatTensor(y_train).to(device)
y_test_tensor = torch.FloatTensor(y_test).to(device)

In [23]:
# Create models
models = []
for i in range(NUM_MODELS):
    model = SentimentLSTM(embedding_matrix, class_weights).to(device)
    models.append(model)

In [26]:
# Training loop for each model
from torch.utils.data import TensorDataset # import TensorDataset
for model_idx, model in enumerate(models):
    print(f"Training model {model_idx+1}/{NUM_MODELS}")

    # Optimizer and scheduler
    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2)
    criterion = nn.BCELoss(weight=class_weights[1])

    best_acc = 0
    patience_counter = 0

    # Create DataLoader with augmentation
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            # Gradient clipping
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        with torch.no_grad():
            # Split X_test_tensor and y_test_tensor into smaller batches
            VALIDATION_BATCH_SIZE = 32  # Choose a smaller batch size for validation
            num_batches = len(X_test_tensor) // VALIDATION_BATCH_SIZE + (len(X_test_tensor) % VALIDATION_BATCH_SIZE != 0)

            correct = 0
            total = 0

            for i in range(num_batches):
                start_idx = i * VALIDATION_BATCH_SIZE
                end_idx = min((i + 1) * VALIDATION_BATCH_SIZE, len(X_test_tensor))

                batch_inputs = X_test_tensor[start_idx:end_idx]
                batch_labels = y_test_tensor[start_idx:end_idx]

                outputs = model(batch_inputs)
                predicted = (outputs > 0.5).float()
                correct += (predicted == batch_labels).sum().item()
                total += batch_labels.size(0)

            acc = correct / total

        scheduler.step(acc)

        print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f} | Acc: {acc:.4f}")

        # Early stopping
        if acc > best_acc:
            best_acc = acc
            patience_counter = 0
            torch.save(model.state_dict(), f"best_model_{model_idx}.pt")
        else:
            patience_counter += 1

        if patience_counter >= PATIENCE:
            print("Early stopping triggered")
            break

    # Load best model weights
    model.load_state_dict(torch.load(f"best_model_{model_idx}.pt"))

Training model 1/3
Epoch 1 | Loss: 0.3452 | Acc: 0.8647
Epoch 2 | Loss: 0.3047 | Acc: 0.8767
Epoch 3 | Loss: 0.2831 | Acc: 0.8833
Epoch 4 | Loss: 0.2622 | Acc: 0.8900
Epoch 5 | Loss: 0.2518 | Acc: 0.8923
Epoch 6 | Loss: 0.2295 | Acc: 0.8871
Epoch 7 | Loss: 0.2165 | Acc: 0.8786
Epoch 8 | Loss: 0.2073 | Acc: 0.8966
Epoch 9 | Loss: 0.1903 | Acc: 0.8978
Epoch 10 | Loss: 0.1814 | Acc: 0.8969
Epoch 11 | Loss: 0.1685 | Acc: 0.8801
Epoch 12 | Loss: 0.1573 | Acc: 0.8931
Epoch 13 | Loss: 0.1303 | Acc: 0.9011
Epoch 14 | Loss: 0.1264 | Acc: 0.8964
Epoch 15 | Loss: 0.1239 | Acc: 0.8992
Epoch 16 | Loss: 0.1218 | Acc: 0.8973
Epoch 17 | Loss: 0.1176 | Acc: 0.8988
Epoch 18 | Loss: 0.1172 | Acc: 0.8983
Early stopping triggered
Training model 2/3


  model.load_state_dict(torch.load(f"best_model_{model_idx}.pt"))


Epoch 1 | Loss: 0.4898 | Acc: 0.8480
Epoch 2 | Loss: 0.3393 | Acc: 0.8550
Epoch 3 | Loss: 0.2999 | Acc: 0.8797
Epoch 4 | Loss: 0.2732 | Acc: 0.8830
Epoch 5 | Loss: 0.2608 | Acc: 0.8825
Epoch 6 | Loss: 0.2413 | Acc: 0.8898
Epoch 7 | Loss: 0.2240 | Acc: 0.8943
Epoch 8 | Loss: 0.2079 | Acc: 0.8915
Epoch 9 | Loss: 0.2023 | Acc: 0.8793
Epoch 10 | Loss: 0.1861 | Acc: 0.8915
Epoch 11 | Loss: 0.1594 | Acc: 0.8969
Epoch 12 | Loss: 0.1564 | Acc: 0.8983
Epoch 13 | Loss: 0.1537 | Acc: 0.8951
Epoch 14 | Loss: 0.1513 | Acc: 0.8979
Epoch 15 | Loss: 0.1491 | Acc: 0.8964
Epoch 16 | Loss: 0.1451 | Acc: 0.8969
Epoch 17 | Loss: 0.1449 | Acc: 0.8965
Early stopping triggered
Training model 3/3
Epoch 1 | Loss: 0.4908 | Acc: 0.8382
Epoch 2 | Loss: 0.3334 | Acc: 0.8680
Epoch 3 | Loss: 0.2987 | Acc: 0.8783
Epoch 4 | Loss: 0.2687 | Acc: 0.8848
Epoch 5 | Loss: 0.2612 | Acc: 0.8816
Epoch 6 | Loss: 0.2422 | Acc: 0.8880
Epoch 7 | Loss: 0.2252 | Acc: 0.8938
Epoch 8 | Loss: 0.2144 | Acc: 0.8782
Epoch 9 | Loss: 0.2052 

In [27]:
# ----------------------
# 8. Ensemble Prediction
# ----------------------
def ensemble_predict(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=MAX_LEN)
    tensor = torch.LongTensor(padded).to(device)

    predictions = []
    with torch.no_grad():
        for model in models:
            output = model(tensor).item()
            predictions.append(output)

    avg_pred = np.mean(predictions)
    return "positive" if avg_pred > 0.5 else "negative"

In [28]:
# ----------------------
# 9. Evaluation
# ----------------------
# Evaluate ensemble
correct = 0
for text, label in zip(test_data["review"], test_data["sentiment"]):
    pred = ensemble_predict(text)
    correct += (pred == ("positive" if label == 1 else "negative"))

print(f"Ensemble Accuracy: {correct/len(test_data):.4f}")

Ensemble Accuracy: 0.9046


In [29]:
# ----------------------
# 10. Example Usage
# ----------------------
test_reviews = [
    "This movie was an absolute masterpiece!",
    "Terrible waste of time, would not recommend.",
    "The plot was average but acting was good."
]

for review in test_reviews:
    print(f"Review: {review}")
    print(f"Sentiment: {ensemble_predict(review)}\n")

Review: This movie was an absolute masterpiece!
Sentiment: positive

Review: Terrible waste of time, would not recommend.
Sentiment: negative

Review: The plot was average but acting was good.
Sentiment: negative

