In [None]:
!pip install wandb

!wandb login 9172fb113e07d174f618e9042047cc5c4adacc0f

Collecting wandb
  Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import wandb

# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive
# that later cells read and write are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# Data processing

In [None]:
def data_init(data_dir="/content/drive/MyDrive/Colab Notebooks/data"):
    """Load the three PTB splits and encode every token as an integer id.

    Each split is read from ``<data_dir>/ptb.{train,valid,test}.txt``. Line
    breaks become a separate ``<eos>`` token. The vocabulary is the sorted set
    of tokens seen in the training split; validation/test tokens outside that
    vocabulary are mapped to the id of ``<unk>``.

    Args:
        data_dir: Directory holding the three ``ptb.*.txt`` files.

    Returns:
        ``(train_ids, valid_ids, test_ids, vocab_size)`` where the first three
        are 1-D numpy arrays of token ids.

    Raises:
        KeyError: if an out-of-vocabulary token is met but the training split
            contains no ``<unk>`` token to map it to.
    """
    import os

    def read_tokens(split):
        path = os.path.join(data_dir, f"ptb.{split}.txt")
        with open(path, encoding="utf-8") as f:
            # Pad the marker with spaces so it is always its own token, even
            # when the file has no whitespace around its line breaks.
            return f.read().strip().replace('\n', ' <eos> ').split()

    train, val, test = (read_tokens(split) for split in ("train", "valid", "test"))

    words = sorted(set(train))
    word2idx = {word: idx for idx, word in enumerate(words)}
    unk_idx = word2idx.get('<unk>')

    def encode(tokens):
        ids = [word2idx.get(w, unk_idx) for w in tokens]
        if unk_idx is None and None in ids:
            raise KeyError("out-of-vocabulary token found but '<unk>' is not in the training vocabulary")
        return np.array(ids)

    trn = np.array([word2idx[w] for w in train])
    return trn, encode(val), encode(test), len(words)

# Encode the corpus once, then eyeball the split sizes and the first tokens.
train_set, val_set, test_set, vocab_size = data_init()

for caption, split in (
    ("Train set shape:", train_set),
    ("Validation set shape:", val_set),
    ("Test set shape:", test_set),
):
    print(caption, split.shape)
print("Vocabulary size:", vocab_size)
for start in (0, 20):
    print(train_set[start:start + 20])

# print(len(train_text), train_text[:10])
# print(len(valid_text), valid_text[:10])
# print(len(test_text), test_text[:10])

Train set shape: (929588,)
Validation set shape: (73759,)
Test set shape: (82429,)
Vocabulary size: 10000
[ 237  807  950 1325 1476 1691 3773 3920 4067 4380 4731 4922 5569 5732
 5876 7091 7175 7366 7769 8203]
[8301 8478 8819 9658   43 6605   44   45 9965 6172 9838 4833 9012 1040
  609   48 6033 2631 6074   45]


In [None]:
# Batch data preparation
def minibatch(data, batch_size, seq_length):
    """Cut a 1-D token-id sequence into (input, target) minibatches.

    The sequence is trimmed to a multiple of ``batch_size`` and laid out as
    ``batch_size`` parallel rows. Each minibatch takes ``seq_length``
    consecutive columns as input ``x`` and the same window shifted one token
    to the right as target ``y``; both are transposed to
    ``(seq_length, batch_size)``.

    Args:
        data: 1-D sequence of token ids (anything ``torch.tensor`` accepts).
        batch_size: Number of parallel rows.
        seq_length: Number of time steps per minibatch.

    Returns:
        List of ``(x, y)`` int64 tensor pairs. Windows that cannot supply a
        full shifted target are dropped, so the list is empty when the data
        is too short.
    """
    data = torch.tensor(data, dtype=torch.int64)
    tokens_per_row = data.size(0) // batch_size
    # Explicit column count (rather than -1) keeps view() valid for 0 columns.
    data = data[:tokens_per_row * batch_size].view(batch_size, tokens_per_row)

    dataset = []
    # The target needs column i + seq_length, so the last valid start is
    # tokens_per_row - seq_length - 1; a later start would give a short y.
    for i in range(0, tokens_per_row - seq_length, seq_length):
        x = data[:, i:i + seq_length].transpose(1, 0)
        y = data[:, i + 1:i + seq_length + 1].transpose(1, 0)
        dataset.append((x, y))
    return dataset

In [None]:
# Smoke-test minibatch() on all three splits
batch_size = 20
seq_length = 20

train_batch, valid_batch, test_batch = (
    minibatch(split, batch_size, seq_length)
    for split in (train_set, val_set, test_set)
)

# Number of minibatches per split
for batches in (train_batch, valid_batch, test_batch):
    print(len(batches))

# Shapes and contents of the first training minibatch (input, then target)
first_x, first_y = train_batch[0]
print(first_x.shape)
print(first_y.shape)
print(first_x)
print(first_y)
print("*********")

# for i, (x, y) in enumerate(valid_batch):
#     print(f"Batch {i}: x shape: {x.shape}, y shape: {y.shape}")

2323
184
206
torch.Size([20, 20])
torch.Size([20, 20])
tensor([[ 237, 9010,   45,  424,  657, 5133,   43, 9846, 4769, 8215, 6237, 2863,
          873, 7829,  424,   45, 7877,  406, 5442,   95],
        [ 807, 9928, 8093, 9805, 5086, 9012, 1097, 9012, 8713, 9961, 5234,   43,
         5782,   43,   44,   45, 5825, 6142, 1965, 6237],
        [ 950, 8304, 6142, 6378, 4470, 2764, 9888,   44,  609,   43, 9119, 1573,
         2602, 9119, 9869,  424,   44, 2362, 5799, 3352],
        [1325, 5232, 4770,   43, 6400, 8307, 7875, 9012, 9590, 3756,   48, 2096,
           43, 8860,  345,  270, 9012,  812, 6142, 2343],
        [1476,   48, 1800, 9010,  889, 9055, 9034, 3908, 7298, 1551, 4039,  413,
         5825, 9012, 9010, 6185, 2947,  555, 4619,    3],
        [1691, 1406, 8582, 5531, 6142, 5755,  373, 9846,  424, 4028, 9012,   44,
         7728, 6919, 5531,  536, 6142, 9842, 1717, 6920],
        [3773, 9474, 3659, 3957, 1283,  280, 3543, 9012, 6532, 6257, 3972, 1315,
         1768, 5993, 3659,   4

# Defining our models

In [None]:
class Model(nn.Module):
    """Word-level recurrent language model: embedding -> LSTM/GRU -> linear.

    Dropout is applied to the embedded inputs and to the recurrent outputs
    (and between stacked recurrent layers through the ``dropout`` argument of
    ``nn.LSTM`` / ``nn.GRU``). Every parameter is initialised from
    U(-0.1, 0.1).

    Args:
        vocab_size: Number of distinct tokens (embedding rows, output logits).
        hidden_size: Width of both the embeddings and the recurrent state.
        num_layers: Number of stacked recurrent layers.
        dropout: Dropout probability.
        rnn_type: ``'LSTM'`` or ``'GRU'``.

    Raises:
        ValueError: if ``rnn_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, dropout, rnn_type='LSTM'):
        super().__init__()
        rnn_classes = {'LSTM': nn.LSTM, 'GRU': nn.GRU}
        if rnn_type not in rnn_classes:
            # Fail here rather than later with a missing ``self.rnn`` attribute.
            raise ValueError(f"rnn_type must be 'LSTM' or 'GRU', got {rnn_type!r}")

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn_type = rnn_type

        # Embedding layer to map input tokens to vectors
        self.embedding = nn.Embedding(vocab_size, hidden_size)

        # Recurrent core (LSTM or GRU depending on rnn_type)
        self.rnn = rnn_classes[rnn_type](hidden_size, hidden_size, num_layers, dropout=dropout)

        # Dropout layer to prevent overfitting
        self.dropout = nn.Dropout(p=dropout)

        # Linear layer to map from hidden state to vocabulary size (for logits)
        self.fc = nn.Linear(hidden_size, vocab_size)

        self.init_parameters()

    def init_parameters(self):
        """Re-initialise every parameter (weights and biases) to U(-0.1, 0.1)."""
        for param in self.parameters():
            nn.init.uniform_(param, -0.1, 0.1)

    def forward(self, x, states):
        """Return ``(logits, new_states)`` for token ids ``x``.

        ``x`` is laid out as ``(seq_length, batch_size)`` because the recurrent
        layer is built without ``batch_first``; the logits gain a trailing
        vocabulary dimension.
        """
        x = self.dropout(self.embedding(x))  # Embedding input, then dropout
        x, states = self.rnn(x, states)  # Pass through RNN (LSTM or GRU)
        x = self.dropout(x)  # Apply dropout after rnn again
        x = self.fc(x)  # Final fully connected layer to get logits
        return x, states

    def state_init(self, batch_size):
        """Return all-zero initial state(s) on the same device as the model.

        LSTM models get an ``(h0, c0)`` tuple, GRU models a single ``h0``;
        each tensor is ``(num_layers, batch_size, hidden_size)``.
        """
        # Follow the model's own device instead of a notebook-level global.
        dev = self.fc.weight.device
        shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(*shape, device=dev)
        if self.rnn_type == 'LSTM':
            c0 = torch.zeros(*shape, device=dev)
            return (h0, c0)
        return h0  # GRU has only one hidden state

    def detach(self, states):
        """Detach state(s) from the graph so backprop stops at the batch boundary."""
        if isinstance(states, tuple):  # LSTM states
            return (states[0].detach(), states[1].detach())
        return states.detach()  # GRU state


In [None]:
# Cross-entropy loss function
def cross_entropy_loss(scores, y):
    """Mean cross-entropy between 3-D logits and their integer targets.

    The two leading dimensions of ``scores`` are flattened together, and ``y``
    is flattened to match, so every position counts as one classification.
    """
    vocab_dim = scores.size(2)
    flat_scores = scores.reshape(-1, vocab_dim)
    flat_targets = y.reshape(-1)
    return nn.functional.cross_entropy(flat_scores, flat_targets)

In [None]:
# Perplexity calculation
def perplexity(data, model, batch_size):
    """Perplexity of ``model`` on ``data``: exp of the mean per-batch loss.

    The model is put in eval mode for the measurement (so dropout is off) and
    its previous train/eval mode is restored afterwards. The recurrent state
    is carried from one minibatch to the next, and no gradients are tracked.

    Args:
        data: Iterable of ``(x, y)`` minibatches.
        model: Module providing ``state_init(batch_size)`` and
            ``forward(x, states)``.
        batch_size: Batch size used to build the initial state.

    Returns:
        The perplexity, or NaN when ``data`` yields no minibatches.
    """
    was_training = model.training
    model.eval()
    losses = []
    try:
        with torch.no_grad():
            states = model.state_init(batch_size)
            for x, y in data:
                x = x.to(device)
                y = y.to(device)
                scores, states = model(x, states)
                losses.append(cross_entropy_loss(scores, y).item())
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    if not losses:
        return float('nan')
    return np.exp(np.mean(losses))

In [None]:
import timeit

def train(data, path, model, epochs, initial_learning_rate, max_grad_norm, epoch_threshold, lr_decay, step_size=6, gamma=1.0/1.65, dropout=False):
    """Train ``model`` with clipped SGD, log each epoch to W&B, save the best model.

    Reads the notebook-level ``batch_size``, ``seq_length`` and ``device``.

    Args:
        data: ``(train, valid, test)`` lists of ``(x, y)`` minibatches.
        path: Destination handed to ``torch.save`` for the final model object.
        model: Model to optimise in place; must expose ``rnn_type``,
            ``hidden_size``, ``num_layers``, ``dropout.p``, ``state_init`` and
            ``detach``.
        epochs: Number of passes over the training minibatches.
        initial_learning_rate: Starting SGD learning rate.
        max_grad_norm: Gradient-norm clipping threshold.
        epoch_threshold: First (1-based) epoch after which the custom decay
            multiplies the learning rate by ``lr_decay`` every epoch.
        lr_decay: Multiplier used by the custom decay.
        step_size: ``StepLR`` period in epochs.
        gamma: ``StepLR`` multiplier.
        dropout: Truthy when the run uses dropout. The custom decay is applied
            only to an LSTM without dropout; every other combination uses
            ``StepLR``.

    Raises:
        ValueError: if the training split contains no minibatches.
    """
    trn, vld, tst = data
    if len(trn) == 0:
        raise ValueError("training split contains no minibatches")

    wandb.init(
        project="dl-ex2",
        name=f'{model.rnn_type}_lr_{initial_learning_rate}_dropout_{model.dropout.p}',
        config={
        "learning_rate": initial_learning_rate,
        "architecture": model.rnn_type,
        "hidden_size": model.hidden_size,
        "layer_num": model.num_layers,
        "epochs": epochs,
        "dropout": model.dropout.p,
        "batch_size": batch_size,
        "seq_length": seq_length,
        "max_grad_norm": max_grad_norm
        }
    )

    try:
        tic = timeit.default_timer()
        optimizer = optim.SGD(model.parameters(), lr=initial_learning_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
        best_val_loss = float('inf')
        best_model = None
        # Report about 10 times per epoch; max() avoids a modulo-by-zero when
        # there are fewer than 10 minibatches.
        log_every = max(1, len(trn) // 10)

        for epoch in range(epochs):
            model.train()
            states = model.state_init(batch_size)
            total_loss = 0.0
            total_words = 0
            epoch_tic = timeit.default_timer()

            for i, (x, y) in enumerate(trn):
                x = x.to(device)
                y = y.to(device)

                # Truncated BPTT: keep the state values, drop their history.
                states = model.detach(states)
                optimizer.zero_grad()

                # Forward pass
                scores, states = model(x, states)

                # Loss and Backpropagation
                loss = cross_entropy_loss(scores, y)
                loss.backward()

                # Gradient clipping
                nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()

                total_loss += loss.item()
                total_words += y.numel()
                if i % log_every == 0:
                    toc = timeit.default_timer()
                    # total_words restarts every epoch, so throughput is
                    # measured against the epoch's own start time.
                    epoch_elapsed = max(toc - epoch_tic, 1e-9)
                    print("batch no = {:d} / {:d}, ".format(i, len(trn)) +
                          "avg train loss per word this batch = {:.3f}, ".format(loss.item()) +
                          "words per second = {:d}, ".format(round(total_words/epoch_elapsed)) +
                          "lr = {:.3f}, ".format(optimizer.param_groups[0]['lr']) +
                          "since beginning = {:d} mins, ".format(round((toc-tic)/60)))

            avg_train_loss = total_loss / len(trn)

            # Switch to eval mode BEFORE any perplexity is measured so that
            # dropout does not distort the training-set number either.
            model.eval()
            train_perp = perplexity(trn, model, batch_size)
            val_perp = perplexity(vld, model, batch_size)
            test_perp = perplexity(tst, model, batch_size)
            print(f"Epoch {epoch + 1}: Start Learning Rate: {initial_learning_rate}, Dropout: {model.dropout.p}")
            print(f"Epoch {epoch + 1}: Train Loss: {avg_train_loss:.3f}")
            print(f"Epoch {epoch + 1}: Train Perplexity: {train_perp:.3f}")
            print(f"Epoch {epoch + 1}: Validation Perplexity: {val_perp:.3f}")
            print(f"Epoch {epoch + 1}: Test Perplexity: {test_perp:.3f}")

            # Wandb Plotting
            wandb.log({"Train Perplexity": train_perp, "Validation Perplexity": val_perp, "Test Perplexity": test_perp, "epoch": epoch, "learning_rate": optimizer.param_groups[0]['lr'],"dropout": model.dropout.p })

            if model.rnn_type == 'LSTM' and not dropout:
                # Custom scheduler from paper -> used only for LSTM with no dropout
                if (epoch + 1) >= epoch_threshold:
                    for param_group in optimizer.param_groups:
                        param_group['lr'] *= lr_decay  # Decay learning rate
            else:
                # Step decay -> (1) GRU no dropout, (2) LSTM with dropout, (3) GRU with dropout
                scheduler.step()

            # Save the best model
            if val_perp < best_val_loss:
                print(f"Saw better model at Epoch {epoch+1}")
                best_val_loss = val_perp
                best_model = {k: v.clone() for k, v in model.state_dict().items()}

        # Test set perplexity of the best checkpoint. best_model stays None when
        # no epoch ran or validation perplexity was never finite; the current
        # weights are kept in that case.
        if best_model is not None:
            model.load_state_dict(best_model)
        model.eval()
        test_perp = perplexity(tst, model, batch_size)
        print(f"Test Set Perplexity: {test_perp:.3f} Model: {model.rnn_type} Dropout: {model.dropout.p} Hidden_size: {model.hidden_size}")

        # The whole model object is pickled (not just its state_dict) because
        # the notebook later reloads it with a bare torch.load().
        torch.save(model, path)
        print("Training complete. Best model saved.")
    finally:
        # Close the W&B run even if training fails, so the next call starts clean.
        wandb.finish()


# Training

In [None]:
# Hyperparameters shared by every experiment in this section
batch_size = 20       # parallel token streams per minibatch
seq_length = 20       # time steps per minibatch
hidden_size = 200     # embedding / recurrent state width
layer_num = 2         # stacked recurrent layers
max_grad_norm = 5     # gradient-clipping threshold

# Custom decay (LSTM without dropout): from this epoch on, lr *= lr_decay
epoch_threshold = 7
lr_decay = 0.5

# Initialize datasets: encode the corpus, then cut every split into minibatches
trn, vld, tst, vocab_size = data_init()
trn, vld, tst = (
    minibatch(split, batch_size, seq_length) for split in (trn, vld, tst)
)

def run_experiments_no_dropout():
    """Train one LSTM and one GRU without dropout for 15 epochs each.

    The LSTM starts at learning rate 2.0 and the GRU at 1.0; each run saves
    its model under the Drive ``model`` folder.
    """
    dropout = 0.0
    total_epochs = 15
    step_size, gamma = 5, 0.5
    start_lr = {'LSTM': 2.0, 'GRU': 1.0}

    for rnn_type in ('LSTM', 'GRU'):
        learning_rate = start_lr[rnn_type]
        model = Model(vocab_size, hidden_size, layer_num, dropout, rnn_type=rnn_type)
        model = model.to(device)
        path = f'/content/drive/MyDrive/Colab Notebooks/model/best_model_{model.rnn_type}_{learning_rate}_{model.dropout.p}.pth'
        train(
            (trn, vld, tst), path, model, total_epochs, learning_rate,
            max_grad_norm, epoch_threshold, lr_decay, step_size, gamma,
            dropout=False,
        )

def run_experiments_with_dropout():
    """Train one LSTM and one GRU with 25% dropout for 25 epochs each.

    Per-architecture settings: the LSTM starts at learning rate 4.0 with
    StepLR(step_size=15, gamma=1/1.15); the GRU starts at 2.0 with
    StepLR(step_size=6, gamma=1/1.65). Each run saves its model under the
    Drive ``model`` folder.
    """
    total_epochs = 25

    for dropout in [0.25]:
        for rnn_type in ['LSTM', 'GRU']:
            model = Model(vocab_size, hidden_size, layer_num, dropout, rnn_type=rnn_type).to(device)
            learning_rate = 4.0 if rnn_type == 'LSTM' else 2.0
            gamma = 1.0/1.65 if rnn_type == 'GRU' else 1.0/1.15
            # step_size is chosen per architecture here; the earlier
            # function-level default was dead code and has been removed.
            step_size = 15 if rnn_type == 'LSTM' else 6
            path = f'/content/drive/MyDrive/Colab Notebooks/model/best_model_{model.rnn_type}_{learning_rate}_{model.dropout.p}.pth'
            train((trn, vld, tst), path, model, total_epochs, learning_rate, max_grad_norm, epoch_threshold, lr_decay, step_size, gamma, dropout=True)

# Run every experiment: the dropout variants first, then the no-dropout
# baselines. Each call trains an LSTM and a GRU.
run_experiments_with_dropout()
run_experiments_no_dropout()

# Table

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Perplexity values are from WandB runs history
data = {
    'Model': ['LSTM No Dropout', 'GRU No Dropout', 'LSTM 25% Dropout', 'GRU 25% Dropout'],
    'Training Perplexity': [72.49, 67.35, 67.97, 59.98],
    'Validation Perplexity': [123.75, 124.40, 102.66, 104.49],
    'Test Perplexity': [119.82, 119.98, 99.08, 100.88]
}
df = pd.DataFrame(data)

# Render the frame as a Plotly table: one header cell and one column per field
table_header = {
    'values': list(df.columns),
    'fill_color': 'coral',
    'align': 'left',
}
table_cells = {
    'values': [df[col] for col in df.columns],
    'fill_color': 'lavender',
    'align': 'left',
}
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.update_layout(title="Perplexities of Various Models", height=500, width=750)

fig.show()

In [None]:
# Reload the saved LSTM (no dropout) model and re-measure its test perplexity.
# The file holds a fully pickled Model object (saved with torch.save(model, path)),
# so weights_only must be False, and the Model class must already be defined.
# Unpickling runs arbitrary code: only load files you created yourself.
model = torch.load(
    '/content/drive/MyDrive/Colab Notebooks/model/best_model_LSTM_2.0_0.0.pth',
    map_location=device,  # also works on a runtime without the GPU it was trained on
    weights_only=False,
)
model.eval()
test_perp = perplexity(tst, model, batch_size)
test_perp

  model = torch.load('/content/drive/MyDrive/Colab Notebooks/model/best_model_LSTM_2.0_0.0.pth')


123.50955553789018