#  <font color='#FFE15D'><b> Project 2: Language Modeling AWD </b></font>


# 🔴 **Environment Setup**

## 🟠 Install Requirements

In [None]:
!pip install -q portalocker>=2.0.0
!pip install -q torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 torchtext==0.17.2 torchmetrics==1.3.1 numpy==2.0.0 torchdata==0.11.0 tqdm==4.67.1
!pip uninstall -q torchtune
!pip install -q wandb



# ⚠️ **Don't forget to restart the runtime!**

# 🔴 **Import Libs**

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torchtext
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split

from torch import optim
from torch.nn import functional as F

import tqdm
import torchmetrics as tm
import wandb

import os
from collections import Counter

import ipywidgets as widgets
from IPython.display import display

In [None]:
!python --version

Python 3.11.13


In [None]:
for lib in [np, torch, torchtext, tqdm]:
  print(lib.__name__, '-->', lib.__version__)

numpy --> 2.0.2
torch --> 2.2.2+cu121
torchtext --> 0.17.2+cpu
tqdm --> 4.67.1


# 🔴 **Utils**

In [None]:
class AverageMeter:
    """Track the latest value and the weighted running mean of a series.

    Attributes:
        val: the most recent value passed to ``update``.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def num_trainable_params(model):
  """Return the number of trainable parameters of ``model``, in millions.

  Only parameters with ``requires_grad`` set are counted.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total/1e6

In [None]:
def set_seed(seed):
  """Seed the NumPy and PyTorch random number generators with ``seed``.

  The current CUDA device is seeded as well when CUDA is available.
  cuDNN deterministic/benchmark flags are not modified here.
  """
  torch.manual_seed(seed)
  np.random.seed(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed(seed)


# 🔴 **Arguments**

In [None]:
seed = 8

batch_size = 80
seq_len = 70

embedding_dim = 300

num_layers = 3
hidden_dim = 1150
dropoute = 0.1
dropouti = 0.65
dropouth = 0.3
dropouto = 0.4
weight_drop = 0.

lr = 30
wd = 1.2e-6
momentum = 0.9

clip = 0.25

wandb_enable = False

In [None]:
wandb_arg_name = input('Please input the WandB argument (run) name:')

Please input the WandB argument (run) name:Base


In [None]:
wandb_arg_name

'Base'

# 🔴 **Dataset**

## 🟠 Load the Dataset

🔰 In this section, you should load the WikiText2 dataset.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip '/content/drive/MyDrive/Datasets/wikitext-2-v1.zip' -d '/content/'

Archive:  /content/drive/MyDrive/Datasets/wikitext-2-v1.zip
   creating: /content/wikitext-2/
  inflating: /content/wikitext-2/wiki.test.tokens  
  inflating: /content/wikitext-2/wiki.valid.tokens  
  inflating: /content/wikitext-2/wiki.train.tokens  


## 🟠 Build vocabulary and save it

🔰 In this section we need to:

*   Define a tokenizer using `basic_english`
*   Tokenize the dataset and collect tokens
*   Build the vocabulary using `build_vocab_from_iterator`
*   Manually insert special tokens and set the default index


In [None]:
def load_data_iterators(base_path):
    """Read the WikiText-2 train/valid/test splits found under ``base_path``.

    Each split is returned as a list of stripped, non-empty lines. Lists are
    used (rather than one-shot generators) because the training split is
    iterated more than once in this notebook: first to build the vocabulary
    and then again to numericalize the text. A generator would already be
    exhausted on the second pass.

    Args:
        base_path: directory containing ``wiki.train.tokens``,
            ``wiki.valid.tokens`` and ``wiki.test.tokens``.

    Returns:
        A ``(train, valid, test)`` tuple of re-iterable lists of strings.

    Raises:
        OSError: if one of the split files cannot be opened.
    """
    import os

    def read_lines(filename):
        # NOTE(review): the `startswith('=')` test runs on the *unstripped* line, so
        # only lines whose very first character is '=' are dropped. Lines with
        # leading whitespace before '=' are kept. Behavior is left unchanged here.
        filepath = os.path.join(base_path, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip() and not line.startswith('=')]

    train_iter = read_lines('wiki.train.tokens')
    valid_iter = read_lines('wiki.valid.tokens')
    test_iter  = read_lines('wiki.test.tokens')

    return train_iter, valid_iter, test_iter


In [None]:
base_path = '/content/wikitext-2'
train_iter, valid_iter, test_iter = load_data_iterators(base_path)

In [None]:
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
torch.save(vocab, 'vocab.pt')

## 🟠 Transform the data

🛑 Make sure to perform the transformations on train, validation and test datasets.

🔰 Reshape the dataset into an `N x B x L` or `M x L` format, where `N` represents the number of batches, `B` is the batch size, `L` is the length of a sample within each batch, and `M` is equal to `N x B`.

In [None]:
def data_process(raw_text_iter, seq_len):
  """Numericalize text lines and cut them into aligned input/target rows.

  Every line is tokenized with the global ``tokenizer`` and mapped to indices
  with the global ``vocab``; the results are concatenated into one stream.
  The stream is split into rows of ``seq_len`` tokens, and each target row is
  the matching input row shifted one token to the right.

  Returns:
      ``(inputs, targets)``, both reshaped to ``(-1, seq_len)``.
  """
  encoded_lines = [torch.LongTensor(vocab(tokenizer(line))) for line in raw_text_iter]
  data = torch.cat(encoded_lines)

  num_rows, remainder = divmod(len(data), seq_len)

  # When the stream length is an exact multiple of seq_len, the shifted targets
  # need one extra token; index 0 is appended for that purpose
  # (presumably '<unk>' for the vocab built in this notebook — TODO confirm).
  if remainder == 0:
    data = torch.cat((data, torch.LongTensor([0])))

  cutoff = num_rows * seq_len
  inputs = data[:cutoff].reshape(-1, seq_len)
  targets = data[1:cutoff + 1].reshape(-1, seq_len)

  return inputs, targets

In [None]:
X_train, y_train = data_process(train_iter, seq_len)
X_valid, y_valid = data_process(valid_iter, seq_len)
X_test, y_test = data_process(test_iter, seq_len)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

(torch.Size([29285, 70]),
 torch.Size([29285, 70]),
 torch.Size([3063, 70]),
 torch.Size([3063, 70]),
 torch.Size([3455, 70]),
 torch.Size([3455, 70]))

## 🟠 Custom dataset

🔰 Write a custom dataset class for LanguageModelDataset.

In [None]:
class LanguageModelDataset(Dataset):
  """Dataset pairing each input row with the target row at the same index."""

  def __init__(self, inputs, targets):
    # Both containers are indexed with the same idx in __getitem__.
    self.inputs, self.targets = inputs, targets

  def __len__(self):
    """Number of samples, taken from the first dimension of ``inputs``."""
    num_samples = self.inputs.shape[0]
    return num_samples

  def __getitem__(self, idx):
    """Return the ``(input, target)`` pair stored at position ``idx``."""
    sample = self.inputs[idx]
    label = self.targets[idx]
    return sample, label

In [None]:
train_set = LanguageModelDataset(X_train, y_train)
valid_set = LanguageModelDataset(X_valid, y_valid)
test_set = LanguageModelDataset(X_test, y_test)

In [None]:
train_set[0]

(tensor([    9,  3849,  3869,   881,     9, 20000,    83,  3849,    88,     0,
          3869,    21,   780, 28780,     2,  6182,     3,  3849,     4,     1,
          5023,    88,    20,     2,  1837,  1018,     7,    14,  3849,  3869,
           881,   629,   976,     2,    23,     8,  5790,   299,    12,   575,
           232,    67,   452,    19, 13722,     5,   757,     3,  2500,    17,
             1,  1767,  5637,     3,   155,     6,   246,   354,     6,   976,
             2,    24,    23,     1,   237,    67,     6,     1,  3849,    93]),
 tensor([ 3849,  3869,   881,     9, 20000,    83,  3849,    88,     0,  3869,
            21,   780, 28780,     2,  6182,     3,  3849,     4,     1,  5023,
            88,    20,     2,  1837,  1018,     7,    14,  3849,  3869,   881,
           629,   976,     2,    23,     8,  5790,   299,    12,   575,   232,
            67,   452,    19, 13722,     5,   757,     3,  2500,    17,     1,
          1767,  5637,     3,   155,     6,   246,

## 🟠 Define a dataloader if needed

🔰 Write dataloaders for the training, validation, and test sets.

In [None]:
set_seed(seed)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [None]:
x_batch, y_batch = next(iter(train_loader))
x_batch.shape, y_batch.shape, x_batch

(torch.Size([80, 70]),
 torch.Size([80, 70]),
 tensor([[ 1985,    13,     1,  ...,  1985,    13,     1],
         [  104,     2,    57,  ..., 16138,  2285,    92],
         [    2,    22,   100,  ...,   116,    22,     2],
         ...,
         [   22,     0,   173,  ...,    37, 12908,     6],
         [    6,    43,  8400,  ...,    93,     3,     1],
         [25828,    65,    46,  ...,     3,   179,  1108]]))

In [None]:
set_seed(seed)

for inputs, targets in train_loader:
  print(inputs[0, 0], targets[0, 0])
  break

tensor(1985) tensor(13)


# 🔴 **Model**

In [None]:
class WeightDrop(torch.nn.Module):
  """Wrap a module and apply dropout to some of its weight tensors on every forward.

  For each name in ``weights`` the original parameter is re-registered on the
  wrapped module as ``<name>_raw``. Before each forward pass ``<name>`` is
  rebuilt from the raw parameter:

  * training mode: ``raw * m`` where ``m`` is a fresh 0/1 mask that zeroes each
    entry with probability ``dropout``;
  * eval mode: ``raw * (1 - dropout)``, i.e. the deterministic expectation of
    the training-time weight.

  Args:
      module: the module to wrap.
      weights: list of parameter names of ``module`` to apply dropout to.
      dropout: probability of zeroing an individual weight entry.
  """

  def __init__(self, module, weights, dropout=0):
    super(WeightDrop, self).__init__()
    self.module = module
    self.weights = weights
    self.dropout = dropout
    self._setup()

  def widget_demagnetizer_y2k_edition(*args, **kwargs):
    # No-op used to replace `flatten_parameters` on wrapped RNN modules.
    return

  def _setup(self):
    # Disable weight flattening on RNN modules (presumably because the dropped
    # weight is rebuilt as a plain tensor on every forward — TODO confirm).
    if issubclass(type(self.module), torch.nn.RNNBase):
      self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition

    # Register the raw parameters for every wrapped module type, not only RNNs;
    # otherwise `_setweights` fails with AttributeError on `<name>_raw`.
    for name_w in self.weights:
      print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
      w = getattr(self.module, name_w)
      del self.module._parameters[name_w]
      self.module.register_parameter(name_w + '_raw', nn.Parameter(w.data))

  def _setweights(self):
    for name_w in self.weights:
      raw_w = getattr(self.module, name_w + '_raw')
      # F.dropout on a tensor of ones yields {0, 1/(1-p)} while training and all
      # ones otherwise; multiplying by (1-p) turns that into a 0/1 mask (train)
      # or a constant (1-p) scale (eval). Following `self.training` keeps
      # evaluation deterministic. The product is always a new non-Parameter
      # tensor, so it can be assigned as a plain attribute.
      mask = torch.nn.functional.dropout(torch.ones_like(raw_w), p=self.dropout, training=self.training) * (1 - self.dropout)
      setattr(self.module, name_w, raw_w * mask)

  def forward(self, *args):
    """Refresh the dropped weights, then run the wrapped module on ``args``."""
    self._setweights()
    return self.module.forward(*args)

In [None]:
def embedded_dropout(embed, words, dropout=0.1, scale=None):
  """Look up ``words`` in ``embed`` after dropping whole embedding rows.

  A Bernoulli keep-mask with one entry per vocabulary row is sampled, so a
  dropped word is zeroed everywhere it occurs in ``words``. Kept rows are
  rescaled by ``1 / (1 - dropout)``.

  Args:
      embed: an ``nn.Embedding`` whose weight and lookup options are used.
      words: integer tensor of indices to look up.
      dropout: probability of dropping a row; a falsy value disables dropout.
      scale: optional tensor multiplied into the (masked) weight matrix.

  Returns:
      The embedded ``words`` with a trailing embedding dimension.
  """
  if dropout:
    keep_mask = embed.weight.new_empty((embed.weight.size(0), 1)).bernoulli_(1 - dropout)
    mask = keep_mask.expand_as(embed.weight) / (1 - dropout)
    masked_embed_weight = mask * embed.weight
  else:
    masked_embed_weight = embed.weight
  # `is not None` rather than truthiness: a multi-element tensor has no
  # unambiguous truth value.
  if scale is not None:
    masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight

  # Pass padding_idx through unchanged. Substituting -1 for None makes
  # F.embedding treat the LAST vocabulary row as padding, which zeroes the
  # gradient of that row.
  embedding = torch.nn.functional.embedding(words, masked_embed_weight,
                                            embed.padding_idx, embed.max_norm, embed.norm_type,
                                            embed.scale_grad_by_freq, embed.sparse)
  return embedding

In [None]:
class LockedDropout(nn.Module):
  """Dropout that reuses one mask along the first dimension of the input.

  A single mask of shape ``(1, x.size(1), x.size(2))`` is sampled and
  broadcast over dimension 0, so the same units are dropped at every index of
  that dimension. Kept units are rescaled by ``1 / (1 - dropout)``.
  """

  def __init__(self):
    super().__init__()

  def forward(self, x, dropout):
    """Apply the shared mask to ``x``; return ``x`` itself in eval mode or when ``dropout`` is falsy."""
    if self.training and dropout:
      keep_prob = 1 - dropout
      shared = x.new_empty((1, x.size(1), x.size(2))).bernoulli_(keep_prob)
      shared = shared.requires_grad_(False) / keep_prob
      return shared.expand_as(x) * x
    return x

🔰 AWD-LSTM Language Model

In [None]:
class LanguageModel(nn.Module):
  """AWD-LSTM style language model with tied input/output embeddings.

  The model is an embedding layer, a stack of ``num_layers`` single-layer
  LSTMs and a linear decoder whose weight is shared with the embedding. The
  first LSTM reads ``embedding_dim`` features and the last one produces
  ``embedding_dim`` features, which the weight tying requires; intermediate
  layers use ``hidden_dim``.

  Args:
      vocab_size: number of rows of the embedding / outputs of the decoder.
      embedding_dim: embedding size (also the output size of the last LSTM).
      hidden_dim: hidden size of all LSTMs except the last.
      num_layers: number of stacked LSTMs (must be >= 1).
      dropoute: row-wise dropout on the embedding matrix (training only).
      dropouti: locked dropout on the embedded input.
      dropouth: locked dropout between LSTM layers.
      dropouto: locked dropout on the output of the last LSTM.
      weight_drop: dropout on each LSTM's ``weight_hh_l0``; 0 disables it.

  Raises:
      ValueError: if ``num_layers`` is smaller than 1.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers,
               dropoute=0.2, dropouti=0.2, dropouth=0.2, dropouto=0.2,
               weight_drop=0.2):
    super().__init__()
    if num_layers < 1:
      raise ValueError('num_layers must be at least 1')
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
    self.embedding_dim = embedding_dim

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.embedding.weight.data.uniform_(-0.1, 0.1)

    # Build exactly `num_layers` LSTMs. For num_layers == 3 this yields the
    # same (emb->hid, hid->hid, hid->emb) stack that used to be hard-coded.
    lstms = []
    for layer_idx in range(num_layers):
      in_dim = embedding_dim if layer_idx == 0 else hidden_dim
      out_dim = embedding_dim if layer_idx == num_layers - 1 else hidden_dim
      lstms.append(nn.LSTM(in_dim, out_dim, num_layers=1, dropout=0, batch_first=False))
    if weight_drop > 0:
      lstms = [WeightDrop(lstm, ['weight_hh_l0'], dropout=weight_drop) for lstm in lstms]
    self.lstms = nn.ModuleList(lstms)

    self.fc = nn.Linear(embedding_dim, vocab_size)

    # Weight tying: the decoder shares its weight matrix with the embedding.
    self.fc.weight = self.embedding.weight

    self.lockdrop = LockedDropout()
    self.dropoute = dropoute
    self.dropouti = dropouti
    self.dropouth = dropouth
    self.dropouto = dropouto

  def forward(self, src):
    """Return vocabulary logits for every position of ``src``.

    ``src`` holds token indices; the LSTMs are built with
    ``batch_first=False``. No hidden state is passed in or returned, so every
    call starts each LSTM from its default initial state.
    """
    embedding = embedded_dropout(self.embedding, src, dropout=self.dropoute if self.training else 0)
    embedding = self.lockdrop(embedding, self.dropouti)

    for l, lstm in enumerate(self.lstms):
      embedding, _ = lstm(embedding)
      # Hidden-to-hidden dropout between layers only; the last layer's output
      # gets `dropouto` below instead.
      if l != self.num_layers-1:
        embedding = self.lockdrop(embedding, self.dropouth)

    embedding = self.lockdrop(embedding, self.dropouto)

    prediction = self.fc(embedding)
    return prediction




In [None]:
set_seed(seed)

model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop)
model



LanguageModel(
  (embedding): Embedding(28782, 300)
  (lstms): ModuleList(
    (0): LSTM(300, 1150)
    (1): LSTM(1150, 1150)
    (2): LSTM(1150, 300)
  )
  (fc): Linear(in_features=300, out_features=28782, bias=True)
  (lockdrop): LockedDropout()
)

In [None]:
type(model.lstms[0])


In [None]:
model.lstms[0].weight_hh_l0, model.lstms[0].weight_ih_l0, model.embedding.weight, model.fc.weight


(Parameter containing:
 tensor([[-0.0269, -0.0171,  0.0099,  ...,  0.0265,  0.0148,  0.0252],
         [-0.0029, -0.0240,  0.0127,  ...,  0.0204, -0.0181,  0.0231],
         [ 0.0243,  0.0071,  0.0120,  ..., -0.0033,  0.0135,  0.0114],
         ...,
         [-0.0184, -0.0187, -0.0229,  ...,  0.0111,  0.0260, -0.0071],
         [ 0.0007, -0.0156, -0.0018,  ..., -0.0201, -0.0130, -0.0003],
         [-0.0117,  0.0278,  0.0266,  ...,  0.0072,  0.0089, -0.0032]],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.0056, -0.0289, -0.0022,  ...,  0.0155,  0.0037,  0.0013],
         [-0.0135,  0.0206,  0.0283,  ...,  0.0168,  0.0229,  0.0231],
         [ 0.0090, -0.0099,  0.0035,  ...,  0.0231, -0.0041,  0.0109],
         ...,
         [-0.0029,  0.0111, -0.0024,  ...,  0.0149, -0.0293, -0.0250],
         [-0.0160,  0.0080,  0.0199,  ..., -0.0099,  0.0070,  0.0180],
         [ 0.0277, -0.0171, -0.0076,  ...,  0.0268, -0.0164, -0.0243]],
        requires_grad=True),
 Parameter con

In [None]:
model(x_batch.t()).shape, x_batch.shape

(torch.Size([70, 80, 28782]), torch.Size([80, 70]))

In [None]:
num_trainable_params(model)

27.674182

In [None]:
num_trainable_params(model.embedding)

8.6346

In [None]:
data_np = model.embedding.weight.cpu().detach().numpy()
unique_rows, indices, counts = np.unique(data_np, axis=0, return_index=True, return_counts=True)
len(unique_rows)

In [None]:
# NOTE(review): `glove_dim` is not assigned in any visible cell of this notebook, so this
# cell raises NameError on a fresh kernel. `glove` is also not referenced by any later
# visible cell — TODO define `glove_dim` or drop this cell.
glove = GloVe(name='6B', dim=glove_dim)
glove

# 🔴 **Config**

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
loss_fn = nn.CrossEntropyLoss()

metric = tm.text.Perplexity().to(device)

In [None]:
key_file = '/content/key'

if os.path.exists(key_file):
    with open(key_file) as f:
        key = f.readline().strip()
    wandb.login(key=key)
else:
    print("Key file does not exist. Please create the key file with your wandb API key.")

Key file does not exist. Please create the key file with your wandb API key.


# 🔴 **Train ➰**

🔰 This is the template for train function, change it if needed.

In [None]:
def train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch=None):
  """Train ``model`` for one pass over ``train_loader``.

  Reads the notebook-level globals ``device`` (where batches are moved) and
  ``clip`` (max gradient norm).

  Args:
      model: network called as ``model(inputs)`` with ``inputs`` transposed
          from the loader's layout.
      train_loader: iterable yielding ``(inputs, targets)`` 2-D batches.
      loss_fn: callable ``loss_fn(logits_2d, targets_1d)``.
      optimizer: optimizer stepping ``model``'s parameters.
      metric: metric object with ``reset`` / ``update`` / ``compute``.
      epoch: optional epoch number shown in the progress bar.

  Returns:
      ``(model, average_loss, metric_value)`` for the epoch.
  """
  model.train()
  loss_train = AverageMeter()
  metric.reset()

  with tqdm.tqdm(train_loader, unit='batch') as tepoch:
    # `is not None` so that epoch 0 is labelled too; set once, not per batch.
    if epoch is not None:
      tepoch.set_description(f'Epoch {epoch}')

    for inputs, targets in tepoch:
      # Transpose the loader's batches before moving them to the device.
      inputs = inputs.t().to(device)
      targets = targets.t().to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

      loss.backward()

      nn.utils.clip_grad.clip_grad_norm_(model.parameters(), max_norm=clip)

      optimizer.step()
      optimizer.zero_grad()

      # Weight by the number of target tokens. After the transpose,
      # len(targets) is the sequence length, so a smaller final batch was
      # previously weighted like a full one (assuming loss_fn returns a
      # per-token mean).
      loss_train.update(loss.item(), n=targets.numel())
      metric.update(outputs, targets)

      tepoch.set_postfix(loss=loss_train.avg, metric=metric.compute().item())

  return model, loss_train.avg, metric.compute().item()

# 🔴 **Evaluation**

🔰 This is the template for evaluation function, change it if needed.

In [None]:
def evaluate(model, test_loader, loss_fn, metric):
  """Evaluate ``model`` on ``test_loader`` without tracking gradients.

  Reads the notebook-level global ``device``.

  Args:
      model: network called as ``model(inputs)`` with ``inputs`` transposed
          from the loader's layout.
      test_loader: iterable yielding ``(inputs, targets)`` 2-D batches.
      loss_fn: callable ``loss_fn(logits_2d, targets_1d)``.
      metric: metric object with ``reset`` / ``update`` / ``compute``.

  Returns:
      ``(average_loss, metric_value)`` over the whole loader.
  """
  model.eval()
  loss_eval = AverageMeter()
  metric.reset()

  with torch.inference_mode():
    for inputs, targets in test_loader:
      inputs = inputs.t().to(device)
      targets = targets.t().to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())
      # Weight by token count: len(targets) is the sequence length after the
      # transpose, which under-corrects for a smaller final batch (assuming
      # loss_fn returns a per-token mean).
      loss_eval.update(loss.item(), n=targets.numel())

      # Only the accumulated state is needed, so use update() as in training.
      metric.update(outputs, targets)

  return loss_eval.avg, metric.compute().item()

# 🔴 **Training Process 〽️**

## 🟠 Finding Hyper-parameters

### 🟡 **Step 1:** Calculate the loss for an untrained model using a few batches.


In [None]:
# Sanity-check the loss of a freshly initialised model on a single batch.
# The constructor is called with the keyword names LanguageModel actually
# accepts; the former `dropout_embd` / `dropout_rnn` keywords raise TypeError.
model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop).to(device)

inputs, targets = next(iter(train_loader))
# The LSTMs use batch_first=False, so transpose exactly as train_one_epoch does.
inputs = inputs.t().to(device)
targets = targets.t().to(device)

with torch.no_grad():
  outputs = model(inputs)
  loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

print(loss)

In [None]:
outputs.reshape(-1, outputs.shape[-1]).shape, targets.flatten().shape

In [None]:
torch.cuda.empty_cache()

### 🟡 **Step 2:** Try to train and overfit the model on a small subset of the dataset.

In [None]:
# Model and optimizer for the small-subset overfitting check. Uses the keyword
# names LanguageModel accepts; `dropout_embd` / `dropout_rnn` are neither
# constructor arguments nor variables defined in this notebook.
model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.9, momentum=0.9)

In [None]:
mini_train_size = 1000
_, mini_train_dataset = random_split(train_set, (len(train_set)-mini_train_size, mini_train_size))
mini_train_loader = DataLoader(mini_train_dataset, 20)

In [None]:
num_epochs = 100
for epoch in range(num_epochs):
  model, _, _ = train_one_epoch(model, mini_train_loader, loss_fn, optimizer, metric, epoch)

100%|██████████| 50/50 [05:12<00:00,  6.24s/batch, loss=8.27, metric=3.9e+3]
Epoch 1: 100%|██████████| 50/50 [05:00<00:00,  6.01s/batch, loss=7.2, metric=1.34e+3]
Epoch 2: 100%|██████████| 50/50 [04:54<00:00,  5.90s/batch, loss=7.04, metric=1.14e+3]
Epoch 3: 100%|██████████| 50/50 [04:54<00:00,  5.90s/batch, loss=6.98, metric=1.08e+3]
Epoch 4: 100%|██████████| 50/50 [04:54<00:00,  5.89s/batch, loss=6.95, metric=1.04e+3]
Epoch 5: 100%|██████████| 50/50 [04:52<00:00,  5.85s/batch, loss=6.92, metric=1.01e+3]
Epoch 6: 100%|██████████| 50/50 [04:54<00:00,  5.89s/batch, loss=6.86, metric=957]
Epoch 7: 100%|██████████| 50/50 [04:56<00:00,  5.93s/batch, loss=6.79, metric=885]
Epoch 8: 100%|██████████| 50/50 [04:54<00:00,  5.89s/batch, loss=6.71, metric=817]
Epoch 9: 100%|██████████| 50/50 [04:55<00:00,  5.91s/batch, loss=6.64, metric=761]
Epoch 10: 100%|██████████| 50/50 [04:55<00:00,  5.90s/batch, loss=6.58, metric=722]
Epoch 11: 100%|██████████| 50/50 [04:53<00:00,  5.87s/batch, loss=6.49, m

### 🟡 **Step 3:** Train the model for a limited number of epochs, experimenting with various learning rates.

In [None]:
num_epochs = 1

# A dedicated loop variable keeps the search from overwriting the global `lr`
# set in the Arguments section.
for lr_candidate in [20, 15, 10, 7.5, 5, 2.5]:
  print(f'LR={lr_candidate}')

  # A fresh model per learning rate. LanguageModel has no `pretrained`
  # argument, so that keyword (which raises TypeError) is not passed.
  model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop).to(device)

  optimizer = optim.SGD(model.parameters(), lr=lr_candidate, weight_decay=wd, momentum=momentum)

  for epoch in range(num_epochs):
    model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch)

  print()

LR=20
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
100%|██████████████████████████████████████████████████| 367/367 [01:31<00:00,  4.02batch/s, loss=8.88, metric=7.16e+3]



LR=15
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:26<00:00,  4.24batch/s, loss=6.75, metric=857]



LR=10
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:20<00:00,  4.56batch/s, loss=6.75, metric=858]



LR=7.5
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:28<00:00,  4.16batch/s, loss=6.79, metric=894]



LR=5
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:28<00:00,  4.16batch/s, loss=6.84, metric=939]



LR=2.5
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


  2%|▉                                                   | 7/367 [00:01<01:40,  3.59batch/s, loss=9.56, metric=1.42e+4]


KeyboardInterrupt: ignored

### 🟡 Step 4: Create a small grid using the weight decay and the best learning rate.





In [None]:
num_epochs = 1

# Dedicated loop variables keep the grid from overwriting the globals `lr`
# and `wd`, which later cells read.
for lr_candidate in [7, 8, 14, 13, 12, 11, 10, 9]:
  for wd_candidate in [1.2e-6]:
    print(f'LR={lr_candidate}, WD={wd_candidate}')

    # LanguageModel has no `pretrained` argument, so that keyword (which
    # raises TypeError) is not passed.
    model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop).to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr_candidate, weight_decay=wd_candidate, momentum=momentum)

    for epoch in range(num_epochs):
      model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch)

    print()

LR=7, WD=1.2e-06
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:28<00:00,  4.15batch/s, loss=6.68, metric=795]



LR=8, WD=1.2e-06
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:29<00:00,  4.11batch/s, loss=6.61, metric=745]



LR=14, WD=1.2e-06
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


  3%|█▌                                                 | 11/367 [00:02<01:32,  3.83batch/s, loss=11.3, metric=8.28e+4]


KeyboardInterrupt: ignored

### 🟡 Step 5: Train model for longer epochs using the best model from step 4.





In [None]:
# Build the model with the keyword names LanguageModel accepts; the former
# `dropout_embd` / `dropout_rnn` keywords raise TypeError.
model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop).to(device)

In [None]:
model = torch.load('/content/model-ppl_133.pt')

In [None]:
lr = 3
wd = 1e-6
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
num_epochs = 30

# Start a single W&B run for the whole session, and only when logging is
# enabled. Calling wandb.init inside the loop opened a new run every epoch
# and ignored the `wandb_enable` switch.
if wandb_enable:
  run = wandb.init(
        project="language-modeling-lstms",
        config={
            "learning_rate": lr,
            "epochs": num_epochs,
        })

for epoch in range(1, num_epochs+1):
  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                    train_loader,
                                                    loss_fn,
                                                    optimizer,
                                                    metric,
                                                    epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                      valid_loader,
                                      loss_fn,
                                      metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  # Keep the checkpoint with the lowest validation loss seen so far.
  if loss_valid < best_loss_valid:
    torch.save(model, 'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  if wandb_enable:
    wandb.log({"metric_train": metric_train, "loss_train": loss_train,
                "metric_valid": metric_valid, "loss_valid": loss_valid})

  epoch_counter += 1

## 🟠 Main Loop

In [None]:
torch.cuda.empty_cache()

🔰 Define train dataloader.

In [None]:
set_seed(seed)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

🔰 Define model.

In [None]:
set_seed(seed)

# LanguageModel has no `pretrained` argument and no `pretrained` variable is
# defined in this notebook, so that keyword is not passed.
model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop).to(device)
model

LanguageModel(
  (embedding): Embedding(28782, 400)
  (lstms): ModuleList(
    (0): LSTM(400, 1150)
    (1): LSTM(1150, 1150)
    (2): LSTM(1150, 400)
  )
  (fc): Linear(in_features=400, out_features=28782, bias=True)
  (lockdrop): LockedDropout()
)

In [None]:
# model = torch.load('model.pt')

🔰 Define optimizer and Set learning rate and weight decay.

In [None]:
set_seed(seed)

lr = 7.5
# wd = 1e-6
# momentum = 0.9

optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=momentum)
# optimizer = optim.SGD([{'params': model.embedding.parameters(), 'lr': 0.1*lr},
#                        {'params': model.lstms.parameters(), 'lr': lr}],
#                       weight_decay=wd, momentum=momentum)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    foreach: None
    lr: 7.5
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 1.2e-06
)

🔰 Initialize `wandb`

In [None]:
if wandb_enable:
  wandb.init(
      project='LM-AWD-LSTM',
      name=wandb_arg_name,
      config={
          'lr': lr,
          'momentum': momentum,
          'batch_size': batch_size,
          'seq_len': seq_len,
          'hidden_dim': hidden_dim,
          'embedding_dim': embedding_dim,
          'num_layers': num_layers,
          'dropout_embed': dropoute,
          'dropout_in_lstm': dropouti,
          'dropout_h_lstm': dropouth,
          'dropout_out_lstm': dropouto,
          'clip': clip,
      }
  )

🔰 Write code to train the model for `num_epochs` epochs.

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
set_seed(seed)
num_epochs = 30

for epoch in range(1, num_epochs+1):
  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                    train_loader,
                                                    loss_fn,
                                                    optimizer,
                                                    metric,
                                                    epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                      valid_loader,
                                      loss_fn,
                                      metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  if wandb_enable:
    wandb.log({"metric_train": metric_train, "loss_train": loss_train,
                "metric_valid": metric_valid, "loss_valid": loss_valid})

  epoch_counter += 1

Epoch 1: 100%|█████████████████████████████████████████████| 733/733 [02:29<00:00,  4.91batch/s, loss=6.31, metric=552]


Model Saved!
Valid: Loss = 5.503, Metric = 245.9



Epoch 2: 100%|█████████████████████████████████████████████| 733/733 [02:29<00:00,  4.89batch/s, loss=5.61, metric=275]


Model Saved!
Valid: Loss = 5.132, Metric = 169.7



Epoch 3: 100%|█████████████████████████████████████████████| 733/733 [02:30<00:00,  4.88batch/s, loss=5.33, metric=207]


Model Saved!
Valid: Loss = 4.936, Metric = 139.4



Epoch 4: 100%|█████████████████████████████████████████████| 733/733 [02:30<00:00,  4.87batch/s, loss=5.16, metric=173]


Model Saved!
Valid: Loss = 4.838, Metric = 126.5



Epoch 5: 100%|█████████████████████████████████████████████| 733/733 [02:30<00:00,  4.88batch/s, loss=5.04, metric=155]


Model Saved!
Valid: Loss = 4.805, Metric = 122.3



Epoch 6: 100%|█████████████████████████████████████████████| 733/733 [02:30<00:00,  4.88batch/s, loss=4.95, metric=141]


Model Saved!
Valid: Loss = 4.726, Metric = 113.0



Epoch 7: 100%|█████████████████████████████████████████████| 733/733 [02:30<00:00,  4.87batch/s, loss=4.88, metric=132]


Model Saved!
Valid: Loss = 4.693, Metric = 109.3



Epoch 8: 100%|█████████████████████████████████████████████| 733/733 [02:31<00:00,  4.83batch/s, loss=4.82, metric=124]


Model Saved!
Valid: Loss = 4.678, Metric = 107.6



Epoch 9: 100%|█████████████████████████████████████████████| 733/733 [02:30<00:00,  4.86batch/s, loss=4.77, metric=118]


Model Saved!
Valid: Loss = 4.662, Metric = 105.9



Epoch 10: 100%|████████████████████████████████████████████| 733/733 [02:30<00:00,  4.87batch/s, loss=4.73, metric=113]


Model Saved!
Valid: Loss = 4.637, Metric = 103.3



Epoch 11: 100%|█████████████████████████████████████████████| 733/733 [02:30<00:00,  4.86batch/s, loss=4.7, metric=109]


Valid: Loss = 4.651, Metric = 104.9



Epoch 12: 100%|████████████████████████████████████████████| 733/733 [02:30<00:00,  4.86batch/s, loss=4.67, metric=106]


Model Saved!
Valid: Loss = 4.617, Metric = 101.3



Epoch 13: 100%|████████████████████████████████████████████| 733/733 [02:30<00:00,  4.86batch/s, loss=4.64, metric=104]


Model Saved!
Valid: Loss = 4.605, Metric = 100.1



Epoch 14: 100%|████████████████████████████████████████████| 733/733 [02:30<00:00,  4.87batch/s, loss=4.61, metric=101]


Model Saved!
Valid: Loss = 4.6, Metric = 99.56



Epoch 15: 100%|███████████████████████████████████████████| 733/733 [02:30<00:00,  4.88batch/s, loss=4.59, metric=98.9]


Model Saved!
Valid: Loss = 4.578, Metric = 97.46



Epoch 16: 100%|███████████████████████████████████████████| 733/733 [02:30<00:00,  4.87batch/s, loss=4.57, metric=96.7]


Valid: Loss = 4.59, Metric = 98.56



Epoch 17: 100%|███████████████████████████████████████████| 733/733 [02:30<00:00,  4.87batch/s, loss=4.56, metric=95.3]


Model Saved!
Valid: Loss = 4.574, Metric = 97.03



Epoch 18: 100%|███████████████████████████████████████████| 733/733 [02:29<00:00,  4.90batch/s, loss=4.54, metric=93.6]


Model Saved!
Valid: Loss = 4.566, Metric = 96.32



Epoch 19: 100%|███████████████████████████████████████████| 733/733 [02:29<00:00,  4.91batch/s, loss=4.52, metric=92.3]


Valid: Loss = 4.573, Metric = 97.0



Epoch 20: 100%|█████████████████████████████████████████████| 733/733 [02:29<00:00,  4.91batch/s, loss=4.51, metric=91]


Valid: Loss = 4.568, Metric = 96.51



Epoch 21: 100%|████████████████████████████████████████████| 733/733 [02:30<00:00,  4.86batch/s, loss=4.5, metric=90.3]


Valid: Loss = 4.568, Metric = 96.5



Epoch 22:  10%|████▌                                       | 75/733 [00:15<02:19,  4.73batch/s, loss=4.46, metric=86.9]

In [None]:
wandb.finish()

## 🟠 Plot

🔰 Plot learning curves

In [None]:
plt.figure(figsize=(8, 6))

plt.plot(range(epoch_counter), loss_train_hist, 'r-', label='Train')
plt.plot(range(epoch_counter), loss_valid_hist, 'b-', label='Validation')

plt.xlabel('Epoch')
plt.ylabel('loss')
plt.grid(True)
plt.legend()

# 🔴 **Test**

🔰 Test your model using data from the test set and text that is not present in the dataset.

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
loss_valid, metric_valid = evaluate(model, valid_loader, loss_fn, metric)
metric_valid

In [None]:
loss_test, metric_test = evaluate(model, test_loader, loss_fn, metric)
metric_test

# 🔴 **Generate**

🔰 Your mission is to write a `generate` function and use a desired sentence to evaluate the model

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
prompt = 'In a galaxy far, far away, there'

indices = vocab(tokenizer(prompt))
itos = vocab.get_itos()

# Look the special indices up once. '<unk>' is the token registered in the
# vocabulary; the former '<ukn>' spelling only worked through the vocab's
# default-index fallback.
unk_idx = vocab['<unk>']
eos_idx = vocab['.']

max_seq_len = 35
temperature = 0.5
for i in range(max_seq_len):
  src = torch.LongTensor(indices).to(device)

  with torch.no_grad():
    prediction = model(src)

  # Temperature sampling from the distribution at the last position
  # (greedy alternative: idx = torch.argmax(prediction[-1])).
  probs = torch.softmax(prediction[-1]/temperature, dim=0)

  # Resample until something other than the unknown token is drawn.
  idx = unk_idx
  while idx == unk_idx:
    idx = torch.multinomial(probs, num_samples=1).item()

  token = itos[idx]
  prompt += ' ' + token

  # Stop at the end of the sentence.
  if idx == eos_idx:
    break

  indices.append(idx)

print(prompt)

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, seed=None):
  """Extend ``prompt`` by sampling tokens from ``model`` until a '.' is produced.

  At every step the whole token history is fed to the model and the next
  token is drawn from the temperature-scaled softmax of the last position.
  The unknown token is never emitted: it is resampled whenever it is drawn.

  Args:
      prompt: text to continue.
      max_seq_len: maximum number of tokens to generate.
      temperature: softmax temperature; lower values sharpen the distribution.
      model: callable mapping a 1-D index tensor to per-position logits.
      tokenizer: callable splitting a string into tokens.
      vocab: object supporting ``vocab(tokens)``, ``vocab[token]`` and
          ``vocab.get_itos()``.
      seed: optional seed passed to ``torch.manual_seed`` for reproducibility.

  Returns:
      The prompt followed by the generated tokens, separated by spaces. The
      closing '.' is included when it was generated.
  """
  if seed is not None:
    torch.manual_seed(seed)

  # Run on whatever device the model lives on instead of relying on a
  # notebook-level `device` variable; fall back to CPU for parameter-free models.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  indices = vocab(tokenizer(prompt))
  itos = vocab.get_itos()

  # '<unk>' is the registered special token; the former '<ukn>' spelling only
  # worked through the vocabulary's default-index fallback.
  unk_idx = vocab['<unk>']
  eos_idx = vocab['.']

  for _ in range(max_seq_len):
    src = torch.LongTensor(indices).to(model_device)

    with torch.no_grad():
      prediction = model(src)

    # Temperature sampling from the distribution at the last position
    # (greedy alternative: idx = torch.argmax(prediction[-1])).
    probs = torch.softmax(prediction[-1]/temperature, dim=0)

    # Resample until something other than the unknown token is drawn.
    idx = unk_idx
    while idx == unk_idx:
      idx = torch.multinomial(probs, num_samples=1).item()

    token = itos[idx]
    prompt += ' ' + token

    if idx == eos_idx:
      return prompt

    indices.append(idx)

  return prompt

In [None]:
prompt = 'In a galaxy far, far away, there'
prompt = 'The sun was setting in the'
prompt = 'Once upon a time, there lived a young princess named'
prompt = 'What is the meaning '

generate(prompt, 35, 0.5, model, tokenizer, vocab)