In [None]:
!pip install torchinfo

In [None]:

!pip install torchtext==0.17.2

In [None]:
import nltk
nltk.download('punkt_tab')


In [None]:
import os, tarfile, time, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from torchinfo import summary

In [None]:
!pip install wordcloud
from wordcloud import WordCloud

In [None]:
from nltk.tokenize import word_tokenize

# Seed the PyTorch and NumPy global RNGs so repeated runs start from the same state.
torch.manual_seed(42)
np.random.seed(42)
# Default seaborn theme for every figure in the notebook.
sns.set(style="whitegrid")
%matplotlib inline

In [None]:
# Use the GPU when CUDA is available; every model and batch below is moved to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [None]:
if not os.path.exists("aclImdb_v1.tar.gz"):
    !wget -O aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

if not os.path.exists("aclImdb"):
    with tarfile.open("aclImdb_v1.tar.gz", "r:gz") as tar:
        tar.extractall()

print("Dataset downloaded and extracted.")

In [None]:
def load_imdb_data(dir_path):
    """Read every ``.txt`` review under ``dir_path/pos`` and ``dir_path/neg``.

    Args:
        dir_path: Directory that contains the ``pos`` and ``neg`` sub-directories.

    Returns:
        A ``(texts, labels)`` pair of equally long lists. Labels are 1 for
        ``pos`` and 0 for ``neg``. All positive reviews come first, then all
        negative ones, each group ordered by filename.
    """
    texts, labels = [], []
    for sentiment in ['pos', 'neg']:
        sentiment_dir = os.path.join(dir_path, sentiment)
        label = 1 if sentiment == 'pos' else 0
        # os.listdir() returns entries in arbitrary, filesystem-dependent order.
        # Sorting makes the row order (and therefore any seeded split or
        # shuffle performed later) reproducible across machines.
        for filename in sorted(os.listdir(sentiment_dir)):
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(sentiment_dir, filename), encoding="utf-8") as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

In [None]:
# Read both official splits from the extracted archive.
imdb_root = "aclImdb"
train_texts, train_labels = load_imdb_data(os.path.join(imdb_root, "train"))
test_texts, test_labels = load_imdb_data(os.path.join(imdb_root, "test"))

# One row per review: raw text plus its 0/1 sentiment label.
train_df = pd.DataFrame({'review': train_texts, 'sentiment': train_labels})
test_df = pd.DataFrame({'review': test_texts, 'sentiment': test_labels})

for split_name, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} data shape:", split_frame.shape)
print("First 5 rows of training data:")
print(train_df.head(5))

This is IMDb Large Movie Review Dataset, which contains 25,000 movie reviews for training and 25,000 for testing.
Each review is labeled as positive (1) or negative (0), and the data is provided by Stanford.
Source: https://ai.stanford.edu/~amaas/data/sentiment/

In [None]:
# Row counts of the two splits.
for split_label, split_frame in (("training", train_df), ("test", test_df)):
    print(f"Number of {split_label} samples:", len(split_frame))

In [None]:
# Share of each sentiment label in the training split, expressed in percent.
class_counts = train_df['sentiment'].value_counts(normalize=True).mul(100)
print("Class distribution (percentage):\n", class_counts)


In [None]:
# Per-review length features: whitespace-separated words and raw characters.
train_df['word_count'] = train_df['review'].str.split().str.len()
train_df['char_count'] = train_df['review'].str.len()
print("Average word count:", train_df['word_count'].mean())
print("Average character count:", train_df['char_count'].mean())

In [None]:
# Report per-column null counts, then drop incomplete rows from both splits.
print("Missing values in training set:\n", train_df.isna().sum())
train_df, test_df = train_df.dropna(), test_df.dropna()

In [None]:
# Side-by-side distributions of review length, in words (left) and characters (right).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(train_df['word_count'], bins=50, kde=True)
plt.title("Histogram of Review Lengths (Words)")

# Second panel; pyplot draws into whichever subplot was selected last.
plt.subplot(1, 2, 2)
sns.histplot(train_df['char_count'], bins=50, kde=True)
plt.title("Histogram of Review Lengths (Characters)")
plt.show()

In [None]:
def generate_wordcloud(text, title):
    """Render a word cloud of ``text`` in a new figure titled ``title``.

    Args:
        text: Single string whose words are laid out in the cloud.
        title: Title shown above the image.
    """
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation="bilinear")
    ax.set_title(title)
    ax.axis("off")  # the image has no meaningful axes
    plt.show()

# Concatenate all reviews of each class into one string and draw its word cloud.
pos_text = " ".join(train_df.loc[train_df['sentiment'] == 1, 'review'])
neg_text = " ".join(train_df.loc[train_df['sentiment'] == 0, 'review'])
generate_wordcloud(pos_text, "Word Cloud for Positive Reviews")
generate_wordcloud(neg_text, "Word Cloud for Negative Reviews")

In [None]:
# Bar chart of how many training reviews carry each sentiment label.
plt.figure(figsize=(6, 4))
sns.countplot(x='sentiment', data=train_df, color='burlywood')
plt.title("Number of Reviews for Each Sentiment Class")
# Replace the numeric tick labels 0/1 with readable class names.
plt.xticks([0, 1], ["Negative", "Positive"])
plt.show()

In [None]:
from nltk import ngrams, FreqDist

# Build the tokenizer once instead of on every loop iteration.
bigram_tokenizer = get_tokenizer("basic_english")

# Count bigrams review by review. Counting per review (rather than over one
# concatenated token stream) avoids spurious bigrams that pair the last word of
# one review with the first word of the next, and streaming the counts avoids
# holding every token and every bigram of the corpus in memory at once.
bigram_freq = FreqDist()
for review in train_df['review']:
    bigram_freq.update(ngrams(bigram_tokenizer(review), 2))

common_bigrams = bigram_freq.most_common(20)
print("Top 20 bigrams:", common_bigrams)

# Split the (bigram, count) pairs into parallel lists for plotting.
bigram_labels = [' '.join(bigram) for bigram, count in common_bigrams]
counts = [count for bigram, count in common_bigrams]

plt.figure(figsize=(12, 6))
sns.barplot(x=counts, y=bigram_labels, palette="viridis")
plt.title("Top 20 Most Common Bigrams")
plt.xlabel("Frequency")
plt.ylabel("Bigrams")
plt.show()


In [None]:
# Compare the two tokenizers on a single review.
sample_review = train_df['review'].iloc[0]

# time.perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() can be too coarse to resolve one tokenizer call.
start = time.perf_counter()
tokens_nltk = word_tokenize(sample_review)
nltk_time = time.perf_counter() - start
print("nltk.word_tokenize time:", nltk_time, "seconds")

# `basic_tokenizer` is reused by later cells as the notebook's tokenizer.
basic_tokenizer = get_tokenizer("basic_english")
start = time.perf_counter()
tokens_basic = basic_tokenizer(sample_review)
basic_time = time.perf_counter() - start
print("basic_english tokenizer time:", basic_time, "seconds")

In [None]:
tokenizer = basic_tokenizer

def yield_tokens(data_iter, tokenizer):
    """Yield the token list produced by ``tokenizer`` for each text in ``data_iter``."""
    for text in data_iter:
        yield tokenizer(text)

# "<pad>" is listed first so that it receives index 0, the value every padding
# step in this notebook writes. With "<unk>" as the only special token, padding
# and unknown words shared index 0 and the models could not tell them apart.
vocab = build_vocab_from_iterator(
    yield_tokens(train_df['review'], tokenizer),
    specials=["<pad>", "<unk>"],
)
# Tokens not seen in the training split map to "<unk>".
vocab.set_default_index(vocab["<unk>"])
print("Vocabulary size:", len(vocab))

In [None]:
def text_to_sequence(text, tokenizer, vocab):
    """Tokenize ``text`` and return the list of ``vocab[token]`` ids, in order."""
    token_ids = []
    for tok in tokenizer(text):
        token_ids.append(vocab[tok])
    return token_ids

# Encode every review of both splits as a list of vocabulary ids.
for split_frame in (train_df, test_df):
    split_frame['seq'] = split_frame['review'].apply(lambda x: text_to_sequence(x, tokenizer, vocab))

# Fixed model input length: the 95th percentile of training-sequence lengths.
lengths = train_df['seq'].map(len)
max_length = int(np.percentile(lengths, 95))
print("Max sequence length (95th percentile):", max_length)

In [None]:
from torch.nn.utils.rnn import pad_sequence

def _clip_to_tensor(seq):
    """Truncate ``seq`` to ``max_length`` ids and wrap it in a long tensor."""
    return torch.tensor(seq[:max_length], dtype=torch.long)

train_tensors = [_clip_to_tensor(seq) for seq in train_df['seq']]
test_tensors = [_clip_to_tensor(seq) for seq in test_df['seq']]

# Right-pad with 0 up to the longest (already truncated) sequence of each split.
padded_train = pad_sequence(train_tensors, batch_first=True, padding_value=0)
padded_test = pad_sequence(test_tensors, batch_first=True, padding_value=0)

def ensure_fixed_length(tensor, target_length):
    """Return ``tensor`` with its second dimension forced to ``target_length``.

    Args:
        tensor: 2-D tensor indexed as ``[row, position]``.
        target_length: Required size of dimension 1.

    Returns:
        The input right-padded with zeros when it is too short, or truncated
        when it is too long. Dtype and device of the input are preserved.
    """
    missing = target_length - tensor.size(1)
    if missing > 0:
        # new_zeros() inherits dtype and device from `tensor`; a hard-coded
        # CPU int64 pad would promote other integer dtypes and cannot be
        # concatenated with a tensor that lives on another device.
        extra_pad = tensor.new_zeros((tensor.size(0), missing))
        tensor = torch.cat([tensor, extra_pad], dim=1)
    return tensor[:, :target_length]

# Force both splits to exactly `max_length` columns, then store each row as a list.
padded_train, padded_test = (
    ensure_fixed_length(batch, max_length) for batch in (padded_train, padded_test)
)

train_df['padded_seq'] = padded_train.tolist()
test_df['padded_seq'] = padded_test.tolist()

print("All sequences have been padded/truncated to length:", max_length)


In [None]:
# `text_to_sequence`, the `seq` columns and `max_length` are already produced by
# the tokenisation cell earlier in the notebook. Re-defining the function and
# re-tokenising all reviews here repeated that work and shadowed the earlier
# definition, so this cell now only checks that the earlier results exist.
assert 'seq' in train_df.columns and 'seq' in test_df.columns, \
    "Run the tokenisation cell that creates the 'seq' columns first."
assert isinstance(max_length, int) and max_length > 0, \
    "max_length must be computed before padding."
print("Max sequence length (95th percentile):", max_length)

def pad_sequence_fn(seq, max_length):
    """Return ``seq`` as a list of exactly ``max_length`` items.

    Shorter lists are right-padded with 0; longer lists are truncated.
    """
    shortfall = max_length - len(seq)
    if shortfall > 0:
        return seq + [0] * shortfall
    return seq[:max_length]

# Store the fixed-length version of every id sequence for both splits.
for split_frame in (train_df, test_df):
    split_frame['padded_seq'] = split_frame['seq'].apply(lambda x: pad_sequence_fn(x, max_length))

Padding and truncation are important preprocessing steps when we are working with variable-length text inputs for sequence models, and they can have significant impacts on model performance:

1. Truncation: When sequences longer than the chosen maximum length are truncated, we may lose important information, especially if key context or details occur late in the text. This loss can hurt the model's ability to understand or correctly classify a review. However, truncation is necessary to control computational cost and to ensure consistent input sizes.

2. Padding: Padding sequences with zeros ensures that all sequences have the same length, which is necessary for efficient batch processing. However, too much padding introduces many non-informative tokens into the model input. This wastes computation and can degrade what the model learns, because the network must process long runs of padding that carry no signal.

3. Balance: Setting the maximum sequence length at the 95th percentile of review lengths keeps most sequences intact while avoiding excessive padding. This minimizes the risk of losing critical information and limits the amount of padding needed.

In [None]:
# Hold out 20% of the training frame for validation, keeping the class ratio equal in both parts.
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['sentiment'],
)

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset over the ``padded_seq`` and ``sentiment`` columns of a frame.

    Each item is a ``(sequence, label)`` pair of ``torch.long`` tensors; the
    label is a 0-dimensional tensor.
    """
    def __init__(self, df):
        # Convert once up front. Building a fresh tensor from a Python list on
        # every __getitem__ call repeated the same conversion for every sample
        # in every epoch.
        self.sequences = torch.tensor(df['padded_seq'].tolist(), dtype=torch.long)
        self.labels = torch.tensor(df['sentiment'].tolist(), dtype=torch.long)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [None]:
# Wrap the three frames as datasets; `batch_size` is also reused for the model-summary sample input.
batch_size = 64
train_dataset = SentimentDataset(train_data)
val_dataset   = SentimentDataset(val_data)
test_dataset  = SentimentDataset(test_df)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch.nn as nn
from torchinfo import summary

class BaselineLSTM(nn.Module):
    """Embedding -> stacked LSTM -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability passed to ``nn.LSTM``.
        pad_idx: Token id used for right-padding (default 0, the value the
            notebook's padding code writes).
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout, pad_idx=0):
        super(BaselineLSTM, self).__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for id batch ``x`` of shape ``(batch, seq)``."""
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Classify from the output at the last non-padding position of each row.
        # Reading lstm_out[:, -1, :] instead feeds the classifier the state
        # reached after a long run of padding tokens for every review shorter
        # than the padded length. Rows made entirely of padding use position 0.
        positions = torch.arange(x.size(1), device=x.device)
        last_idx = ((x != self.pad_idx) * positions).max(dim=1).values
        batch_idx = torch.arange(x.size(0), device=x.device)
        out = self.fc(lstm_out[batch_idx, last_idx])
        return out

# Baseline hyperparameters. `vocab_size`, `embed_dim` and `num_classes` are also
# read as globals by the training helpers defined later in the notebook.
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
num_layers = 3
num_classes = 2
dropout = 0.5

model = BaselineLSTM(vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout)
model = model.to(device)

print("Baseline LSTM Model Summary:")
# Random token ids of the real batch shape, used only to trace layer shapes for the summary.
sample_input = torch.randint(0, vocab_size, (batch_size, max_length), dtype=torch.long, device=device)
summary(model, input_data=sample_input)


In [None]:
def _run_baseline_epoch(model, loader, criterion, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    The model is trained when ``optimizer`` is given and only evaluated (eval
    mode, gradients disabled) otherwise. The loss is averaged over
    ``len(loader.dataset)``; accuracy is averaged over the samples seen.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            # Weight the batch-mean loss by batch size so a smaller final batch is not over-counted.
            loss_sum += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            seen += labels.size(0)
            correct += (predicted == labels).sum().item()
    return loss_sum / len(loader.dataset), correct / seen


def train_evaluate_model(hidden_dim, num_layers, dropout, learning_rate, num_epochs=5):
    """Train a fresh ``BaselineLSTM`` and report validation and test accuracy.

    Reads ``vocab_size``, ``embed_dim``, ``num_classes``, ``device`` and the
    three data loaders from the notebook's global scope.

    Args:
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability passed to the model.
        learning_rate: Adam learning rate.
        num_epochs: Number of passes over the training loader.

    Returns:
        ``(model, last_epoch_val_acc, test_acc, train_losses, val_losses,
        train_accuracies, val_accuracies)``. The model is the one after the
        final epoch, left in eval mode; the four lists have one entry per epoch.
    """
    model_exp = BaselineLSTM(vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_exp.parameters(), lr=learning_rate)

    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []
    # Defined up front so the return statement is valid even when num_epochs == 0.
    epoch_val_acc = 0.0

    for epoch in range(num_epochs):
        epoch_train_loss, epoch_train_acc = _run_baseline_epoch(model_exp, train_loader, criterion, optimizer)
        epoch_val_loss, epoch_val_acc = _run_baseline_epoch(model_exp, val_loader, criterion)

        train_losses.append(epoch_train_loss)
        train_accuracies.append(epoch_train_acc)
        val_losses.append(epoch_val_loss)
        val_accuracies.append(epoch_val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}: Train Loss={epoch_train_loss:.4f}, Train Acc={epoch_train_acc:.4f}, " \
              f"Val Loss={epoch_val_loss:.4f}, Val Acc={epoch_val_acc:.4f}")

    # Test accuracy is reported for information only; it does not influence training.
    _, test_acc = _run_baseline_epoch(model_exp, test_loader, criterion)

    return model_exp, epoch_val_acc, test_acc, train_losses, val_losses, train_accuracies, val_accuracies


In [None]:
# Trackers filled in by the experiment loop.
experiment_results = []
best_val_acc = 0.0
best_model = None
best_config = None
best_metrics = {}

# (name, hidden_dim, num_layers, learning_rate); every variant is run once per dropout rate.
exp_configs = [
    {
        "name": exp_name,
        "hidden_dim": hidden,
        "num_layers": layers,
        "learning_rate": lr,
        "dropouts": [0.3, 0.5],
    }
    for exp_name, hidden, layers, lr in (
        ("Exp1", 128, 3, 0.001),
        ("Exp2", 256, 3, 0.002),
        ("Exp3", 128, 4, 0.001),
    )
]


In [None]:
import copy

In [None]:
print("Starting experiments...\n")
for config in exp_configs:
    for dropout_val in config["dropouts"]:
        print(f"Running {config['name']} with hidden_dim={config['hidden_dim']}, num_layers={config['num_layers']}, "
              f"learning_rate={config['learning_rate']}, dropout={dropout_val}")
        model_exp, val_acc, test_acc, t_losses, v_losses, t_accs, v_accs = train_evaluate_model(
            hidden_dim=config["hidden_dim"],
            num_layers=config["num_layers"],
            dropout=dropout_val,
            learning_rate=config["learning_rate"],
            num_epochs=5
        )
        print(f"Result: Val Acc={val_acc:.4f}, Test Acc={test_acc:.4f}\n")

        # One summary row per run; the same fields describe the best configuration.
        run_summary = {
            "Experiment": config["name"],
            "Hidden_dim": config["hidden_dim"],
            "Num_layers": config["num_layers"],
            "Learning_rate": config["learning_rate"],
            "Dropout": dropout_val,
            "Val_Acc": val_acc,
            "Test_Acc": test_acc
        }
        experiment_results.append(run_summary)

        # Model selection uses validation accuracy only.
        if not val_acc > best_val_acc:
            continue
        best_val_acc = val_acc
        best_model = copy.deepcopy(model_exp)
        best_config = dict(run_summary)
        best_metrics = {
            "train_losses": t_losses,
            "val_losses": v_losses,
            "train_accuracies": t_accs,
            "val_accuracies": v_accs
        }

In [None]:
# Tabulate one row per baseline run and show the configuration with the best validation accuracy.
results_df = pd.DataFrame(experiment_results)
print("\nSummary of Experiment Results:")
print(results_df)
print("\nBest Model Configuration:")
print(best_config)

In [None]:
# Collect test-set labels and predictions of the selected baseline model.
best_model.eval()
all_labels = []
all_preds = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = best_model(inputs)
        # Predicted class = index of the largest logit.
        _, predicted = torch.max(outputs, 1)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

print("Classification Report for Best Model:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Positive"]))

In [None]:
# Confusion matrix of the baseline predictions collected in the previous cell.
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens",
            xticklabels=["Negative", "Positive"],
            yticklabels=["Negative", "Positive"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix for Best Model")
plt.show()

In [None]:
# Per-epoch loss curves of the best baseline run (x axis is the 0-based epoch index).
plt.plot(best_metrics["train_losses"], marker='o', label="Train Loss")
plt.plot(best_metrics["val_losses"], marker='o', label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs. Validation Loss")
plt.legend()
plt.show()

In [None]:
# Per-epoch accuracy curves of the best baseline run (x axis is the 0-based epoch index).
plt.plot(best_metrics["train_accuracies"], marker='o', label="Train Accuracy")
plt.plot(best_metrics["val_accuracies"], marker='o', label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Training vs. Validation Accuracy")
plt.legend()
plt.show()

In [None]:
# Persist only the parameters (state_dict); loading requires rebuilding the model with the same configuration.
torch.save(best_model.state_dict(), "best_baseline_lstm.pth")
print("Best model weights saved as 'best_baseline_lstm.pth'.")

In [None]:
import pickle

In [None]:
# Store the best run's per-epoch curves so the comparison cell at the end can reload them.
with open("best_training_metrics.pkl", "wb") as f:
    pickle.dump(best_metrics, f)
print("Best model training metrics saved as 'best_training_metrics.pkl'.")

**Improved**

In [None]:
import torch.nn as nn
from torchtext.vocab import GloVe
from torchinfo import summary

class Attention(nn.Module):
    """Attention pooling over the time axis with a learned linear scorer.

    Args:
        hidden_dim: Size of the last dimension of the sequence to pool.
    """
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, outputs, mask=None):
        """Pool ``outputs`` of shape ``(batch, seq, hidden_dim)`` into ``(batch, hidden_dim)``.

        Args:
            outputs: Sequence of hidden states.
            mask: Optional boolean tensor of shape ``(batch, seq)``; ``True``
                marks positions that may receive attention. ``None`` attends
                to every position.

        Returns:
            ``(context, attn_weights)`` where ``attn_weights`` has shape
            ``(batch, seq, 1)`` and sums to 1 along the sequence axis.
        """
        attn_scores = self.attn(outputs)
        if mask is not None:
            # Use the dtype's most negative finite value rather than -inf so a
            # row whose positions are all masked still yields finite (uniform)
            # weights instead of NaN.
            attn_scores = attn_scores.masked_fill(
                ~mask.unsqueeze(-1), torch.finfo(attn_scores.dtype).min
            )
        attn_weights = torch.softmax(attn_scores, dim=1)
        context = torch.sum(attn_weights * outputs, dim=1)
        return context, attn_weights

# Pre-trained 100-dimensional GloVe vectors.
glove = GloVe(name='6B', dim=100)

# Start from random rows and overwrite the row of every vocabulary token GloVe
# knows; tokens absent from GloVe keep their random initialisation.
embedding_matrix = torch.randn(vocab_size, embed_dim)
for token, idx in vocab.get_stoi().items():
    if token in glove.stoi:
        embedding_matrix[idx] = glove.vectors[glove.stoi[token]]

class ImprovedGRUAttentionModel(nn.Module):
    """Frozen pre-trained embeddings -> bidirectional GRU -> attention pooling -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size; must match ``embedding_matrix``.
        hidden_dim: GRU hidden size per direction.
        num_layers: Number of stacked GRU layers.
        num_classes: Number of output logits.
        dropout: Dropout probability passed to ``nn.GRU``.
        embedding_matrix: Tensor of shape ``(vocab_size, embed_dim)`` copied
            into the embedding layer, which is then frozen.
        pad_idx: Token id used for padding (default 0, the value the
            notebook's padding code writes); these positions are excluded
            from attention.
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout, embedding_matrix, pad_idx=0):
        super(ImprovedGRUAttentionModel, self).__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight.data.copy_(embedding_matrix)
        self.embedding.weight.requires_grad = False

        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers,
                          dropout=dropout, batch_first=True, bidirectional=True)
        # Bidirectional output concatenates both directions, hence hidden_dim * 2.
        self.attention = Attention(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return ``(logits, attn_weights)`` for id batch ``x`` of shape ``(batch, seq)``."""
        embedded = self.embedding(x)
        gru_out, _ = self.gru(embedded)
        # Without a mask the softmax spreads attention over padding positions,
        # diluting the weight given to real tokens. Any token whose id equals
        # pad_idx is masked, so a vocabulary that maps unknown words to the same
        # id as padding has those words masked too.
        context, attn_weights = self.attention(gru_out, mask=(x != self.pad_idx))
        out = self.fc(context)
        return out, attn_weights


# Hyperparameters of the model instance used for the architecture summary below.
hidden_dim_adv = 128
num_layers_adv = 4
dropout_adv = 0.5

advanced_model = ImprovedGRUAttentionModel(vocab_size, embed_dim, hidden_dim_adv, num_layers_adv, num_classes, dropout_adv, embedding_matrix)
advanced_model = advanced_model.to(device)

print("Advanced Improved Model (GRU with Attention) Summary:")
# Random token ids of the real batch shape, used only to trace layer shapes for the summary.
sample_input = torch.randint(0, vocab_size, (batch_size, max_length), dtype=torch.long, device=device)
summary(advanced_model, input_data=sample_input)


In [None]:
import torch.optim as optim

def _run_advanced_epoch(model, loader, criterion, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    The model is expected to return ``(logits, attention_weights)``. It is
    trained when ``optimizer`` is given and only evaluated (eval mode,
    gradients disabled) otherwise. The loss is averaged over
    ``len(loader.dataset)``; accuracy is averaged over the samples seen.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs, _ = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            # Weight the batch-mean loss by batch size so a smaller final batch is not over-counted.
            loss_sum += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            seen += labels.size(0)
            correct += (predicted == labels).sum().item()
    return loss_sum / len(loader.dataset), correct / seen


def train_evaluate_advanced_model(hidden_dim, num_layers, dropout, learning_rate, num_epochs=5):
    """Train a fresh ``ImprovedGRUAttentionModel`` and report validation and test accuracy.

    Reads ``vocab_size``, ``embed_dim``, ``num_classes``, ``embedding_matrix``,
    ``device`` and the three data loaders from the notebook's global scope.

    Args:
        hidden_dim: GRU hidden size per direction.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability passed to the model.
        learning_rate: Adam learning rate.
        num_epochs: Number of passes over the training loader.

    Returns:
        ``(model, last_epoch_val_acc, test_acc, train_losses, val_losses,
        train_accuracies, val_accuracies)``. The model is the one after the
        final epoch, left in eval mode; the four lists have one entry per epoch.
    """
    model_exp = ImprovedGRUAttentionModel(vocab_size, embed_dim, hidden_dim, num_layers, num_classes, dropout, embedding_matrix).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_exp.parameters(), lr=learning_rate)

    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []
    # Defined up front so the return statement is valid even when num_epochs == 0.
    epoch_val_acc = 0.0

    for epoch in range(num_epochs):
        epoch_train_loss, epoch_train_acc = _run_advanced_epoch(model_exp, train_loader, criterion, optimizer)
        epoch_val_loss, epoch_val_acc = _run_advanced_epoch(model_exp, val_loader, criterion)

        train_losses.append(epoch_train_loss)
        train_accuracies.append(epoch_train_acc)
        val_losses.append(epoch_val_loss)
        val_accuracies.append(epoch_val_acc)

        print(f"[Advanced] Epoch {epoch+1}/{num_epochs}: Train Loss={epoch_train_loss:.4f}, Train Acc={epoch_train_acc:.4f}, " \
              f"Val Loss={epoch_val_loss:.4f}, Val Acc={epoch_val_acc:.4f}")

    # Test accuracy is reported for information only; it does not influence training.
    _, test_acc = _run_advanced_epoch(model_exp, test_loader, criterion)

    return model_exp, epoch_val_acc, test_acc, train_losses, val_losses, train_accuracies, val_accuracies

In [None]:
# Trackers filled in by the advanced experiment loop.
advanced_experiment_results = []
best_val_acc_adv = 0.0
best_model_adv = None
best_config_adv = None
best_metrics_adv = {}

# (name, hidden_dim, num_layers, learning_rate); every variant is run once per dropout rate.
exp_configs_adv = [
    {
        "name": exp_name,
        "hidden_dim": hidden,
        "num_layers": layers,
        "learning_rate": lr,
        "dropouts": [0.3, 0.5],
    }
    for exp_name, hidden, layers, lr in (
        ("AdvExp1", 128, 4, 0.001),
        ("AdvExp2", 128, 4, 0.002),
        ("AdvExp3", 256, 4, 0.001),
    )
]

In [None]:
print("Starting Advanced Model Experiments...\n")
for config in exp_configs_adv:
    for dropout_val in config["dropouts"]:
        print(f"Running {config['name']} with hidden_dim={config['hidden_dim']}, num_layers={config['num_layers']}, "
              f"learning_rate={config['learning_rate']}, dropout={dropout_val}")
        model_exp, val_acc, test_acc, t_losses, v_losses, t_accs, v_accs = train_evaluate_advanced_model(
            hidden_dim=config["hidden_dim"],
            num_layers=config["num_layers"],
            dropout=dropout_val,
            learning_rate=config["learning_rate"],
            num_epochs=5
        )
        print(f"Result: Val Acc={val_acc:.4f}, Test Acc={test_acc:.4f}\n")

        # One summary row per run; the same fields describe the best configuration.
        run_summary = {
            "Experiment": config["name"],
            "Hidden_dim": config["hidden_dim"],
            "Num_layers": config["num_layers"],
            "Learning_rate": config["learning_rate"],
            "Dropout": dropout_val,
            "Val_Acc": val_acc,
            "Test_Acc": test_acc
        }
        advanced_experiment_results.append(run_summary)

        # Model selection uses validation accuracy only.
        if not val_acc > best_val_acc_adv:
            continue
        best_val_acc_adv = val_acc
        best_model_adv = copy.deepcopy(model_exp)
        best_config_adv = dict(run_summary)
        best_metrics_adv = {
            "train_losses": t_losses,
            "val_losses": v_losses,
            "train_accuracies": t_accs,
            "val_accuracies": v_accs
        }

In [None]:
# Tabulate one row per advanced run and show the configuration with the best validation accuracy.
results_df_adv = pd.DataFrame(advanced_experiment_results)
print("\nSummary of Advanced Model Experiment Results:")
print(results_df_adv)
print("\nBest Advanced Model Configuration:")
print(best_config_adv)

In [None]:
# Collect test-set labels and predictions of the selected advanced model.
best_model_adv.eval()
all_labels_adv = []
all_preds_adv = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # The model also returns attention weights, which are not needed here.
        outputs, _ = best_model_adv(inputs)
        _, predicted = torch.max(outputs, 1)
        all_labels_adv.extend(labels.cpu().numpy())
        all_preds_adv.extend(predicted.cpu().numpy())

print("Classification Report for Best Advanced Model:")
print(classification_report(all_labels_adv, all_preds_adv, target_names=["Negative", "Positive"]))

# Confusion matrix of the same predictions.
cm_adv = confusion_matrix(all_labels_adv, all_preds_adv)
plt.figure(figsize=(6,5))
sns.heatmap(cm_adv, annot=True, fmt="d", cmap="Greens",
            xticklabels=["Negative", "Positive"],
            yticklabels=["Negative", "Positive"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix for Best Advanced Model")
plt.show()


In [None]:
# Per-epoch loss curves of the best advanced run (x axis is the 0-based epoch index).
plt.plot(best_metrics_adv["train_losses"], marker='o', label="Train Loss")
plt.plot(best_metrics_adv["val_losses"], marker='o', label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Advanced Model: Training vs. Validation Loss")
plt.legend()
plt.show()

In [None]:
# Per-epoch accuracy curves of the best advanced run (x axis is the 0-based epoch index).
plt.plot(best_metrics_adv["train_accuracies"], marker='o', label="Train Accuracy")
plt.plot(best_metrics_adv["val_accuracies"], marker='o', label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Advanced Model: Training vs. Validation Accuracy")
plt.legend()
plt.show()

In [None]:
# Persist only the parameters (state_dict); loading requires rebuilding the model with the same configuration.
torch.save(best_model_adv.state_dict(), "best_advanced_gru_attention.pth")
print("Best advanced model weights saved as 'best_advanced_gru_attention.pth'.")

In [None]:
# Store the best advanced run's per-epoch curves for the comparison cell below.
with open("best_training_metrics_adv.pkl", "wb") as f:
    pickle.dump(best_metrics_adv, f)
print("Best model training metrics saved as 'best_training_metrics_adv.pkl'.")

In [None]:
# Reload the metric files written earlier in this notebook.
# NOTE(review): pickle.load executes arbitrary code from the file; only load files this notebook produced.
with open("best_training_metrics.pkl", "rb") as f:
    baseline_metrics = pickle.load(f)

with open("best_training_metrics_adv.pkl", "rb") as f:
    improved_metrics = pickle.load(f)

plt.figure(figsize=(12, 5))

# Left panel: loss curves of both models.
plt.subplot(1, 2, 1)
plt.plot(baseline_metrics["train_losses"], marker='o', label="Baseline Train Loss")
plt.plot(baseline_metrics["val_losses"], marker='o', label="Baseline Val Loss")
plt.plot(improved_metrics["train_losses"], marker='o', label="Improved Train Loss")
plt.plot(improved_metrics["val_losses"], marker='o', label="Improved Val Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs. Validation Loss")
plt.legend()

# Right panel: accuracy curves of both models.
plt.subplot(1, 2, 2)
plt.plot(baseline_metrics["train_accuracies"], marker='o', label="Baseline Train Acc")
plt.plot(baseline_metrics["val_accuracies"], marker='o', label="Baseline Val Acc")
plt.plot(improved_metrics["train_accuracies"], marker='o', label="Improved Train Acc")
plt.plot(improved_metrics["val_accuracies"], marker='o', label="Improved Val Acc")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Training vs. Validation Accuracy")
plt.legend()

plt.tight_layout()
plt.show()

**Overview:**

The IMDb Large Movie Review Dataset has 25,000 movie reviews for training and 25,000 for testing. They are labeled as either positive (1) or negative (0). The dataset is balanced between the two classes and captures a broad range of opinions and language styles. It is provided by Stanford.

**Architectures:**

1. Baseline Architecture:
    - The input words are first mapped into dense vector representations using an embedding layer. In this model, the embedding dimension is set to 100, which was chosen as a balance between capturing sufficient semantic information and maintaining computational efficiency.
    - Three stacked LSTM layers capture the sequential dependencies in the text. Each LSTM layer uses a hidden dimension of 128, which allows the model to learn complex representations of the input sequence. Dropout with a probability of 0.5 is applied between LSTM layers to reduce overfitting.
    - The output of the final LSTM layer is passed to a fully connected layer, which maps the hidden representation to two output logits corresponding to the negative and positive sentiment classes. No activation function is applied, because the cross-entropy loss expects raw logits.

2. Improved advanced architecture (GRU with Attention and Pre-trained GloVe Embeddings):
    - Instead of learning embeddings from scratch, this model initializes its embedding layer with pre-trained GloVe vectors, which are derived from a large external corpus. The embedding weights are frozen during training, so the model relies on the semantic information learned from that external data.
    - The recurrent component is replaced by a bidirectional Gated Recurrent Unit (GRU) network with four GRU layers and a hidden dimension of 128. The bidirectional configuration enables the model to capture context from both the forward and the backward pass over the input sequence, which doubles the size of the hidden-state representation.
    - An attention mechanism is applied on top of the GRU outputs to help the model focus on the most informative parts of the sequence. A dedicated attention layer computes a scalar attention score for each time step; the scores are normalized with softmax to produce attention weights. These weights are used to compute a weighted sum of the GRU outputs, which allows the model to dynamically prioritize relevant words or phrases in the review.
    - Finally, the context vector generated by the attention mechanism is passed through a fully connected layer that outputs logits for the two sentiment classes. Here, the use of a bidirectional GRU and attention mechanism results in an input dimension for the FC layer that is twice the hidden dimension of a single GRU.

**Visualization:**

1. Baseline LSTM
- The baseline model's accuracy stays around 50% for both training and validation, which means the model is effectively guessing and is not learning useful weights.
- The baseline model's validation loss decreases only slightly, so the model has learned little that generalizes to unseen data.

2. Improved model (bidirectional GRU with attention)
- Improved model’s validation accuracy is around 90% by the last epoch, which is high compared to baseline model, which was not performing well. This shows the benefits of using pre-trained embeddings, a bidirectional GRU, and an attention mechanism.
- Here, the improved model’s training and validation loss curves converge smoothly and they have a much smaller gap between training and validation metrics, which indicates that the model is generalized well.


**Strengths and Limitations:**

1. Handling long sequences

- LSTMs are designed to avoid the vanishing/exploding gradient problem found in vanilla RNNs. So, it allows them to better capture long-range dependencies within text.
- But, LSTMs can struggle with extremely long sequences and sometimes they forget the earlier parts of the input if it’s very lengthy or complex. In such cases transformers handle it more effectively.

2. Computational cost

- Compared to basic RNNs, LSTMs can learn better temporal dependencies with fewer issues related to gradient flow.
- However, LSTMs process tokens sequentially, which makes them less parallelizable than Transformer-based models. As sequence length and model size grow, training time increases, and large hidden dimensions or many stacked layers lead to higher computational and memory demands.

3. Interpretability

- LSTM gates offer some insight into how information flows and is retained or forgotten over time.
- The limitation is that they are still black-box neural networks compared to simpler models like logistic regression, where feature weights can be interpreted directly. Without attention or other interpretability tools, it can be difficult to tell which tokens or phrases influence the LSTM's predictions most.

4. Sensitivity to hyperparameters

- Here, LSTMs provide several hyperparameters to tune for improved performance. So, dropout and regularization techniques can be used to handle overfitting and improving generalization.
- But that is also a limitation as the performance of LSTMs can be highly sensitive to these hyperparameters.

**References:**

https://pytorch.org/docs/stable/torch.html <br>
https://pytorch.org/text/stable/index.html <br>
https://amueller.github.io/word_cloud/ <br>
https://www.nltk.org <br>
https://scikit-learn.org/stable/user_guide.html <br>
https://seaborn.pydata.org/tutorial.html <br>
https://matplotlib.org/stable/users/index.html <br>