<a href="https://colab.research.google.com/github/ft-Azad/Language-Modeling/blob/main/LanguageModel(AWD_LSTM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Import Libs

In [None]:
!pip install -q torchmetrics
!pip install -q torchdata==0.6.1
!pip install -q 'portalocker>=2.0.0'
!pip install -q torchtext==0.15.1
!pip install -q comet_ml

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torchtext
from torchtext.datasets import WikiText2
from torchdata.datapipes.iter import IterableWrapper, Mapper
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split

from torch import optim
from torch.nn import functional as F

import os
import tqdm
import torchmetrics as tm

from collections import Counter

from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

In [None]:
for lib in [np, torch, torchtext, tqdm]:
  print(lib.__name__, '-->', lib.__version__)

numpy --> 1.25.2
torch --> 2.0.0+cu117
torchtext --> 0.15.1+cpu
tqdm --> 4.66.4


# Utils

In [None]:
class AverageMeter(object):
    """Keeps the most recent value and a weighted running mean of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val`, observed with weight `n`, and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def num_trainable_params(model):
  """Return the number of trainable parameters of `model`, expressed in millions."""
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total / 1e6

In [None]:
def set_seed(seed):
  """Seed NumPy and PyTorch (CPU, plus the current CUDA device when one is available)."""
  np.random.seed(seed)
  torch.manual_seed(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed(seed)
  # Stricter (currently disabled) options for multi-GPU / deterministic cuDNN runs:
  # torch.cuda.manual_seed_all(seed)
  # torch.backends.cudnn.deterministic = True
  # torch.backends.cudnn.benchmark = False


# Arguments

In [None]:
# Random seed handed to set_seed() before building loaders / models
seed = 8

# Sequences per mini-batch and tokens per sequence
batch_size = 80
seq_len = 70

# Width of the token embeddings (the output layer shares this weight matrix)
embedding_dim = 400

# LSTM stack depth passed to LanguageModel, and the width of the inner LSTM layers
num_layers = 3
hidden_dim = 1150
# dropout_embd = 0.5
# dropout_rnn = 0
# Dropout rates: embedding rows (e), embedded input (i),
# between LSTM layers (h) and on the final LSTM output (o)
dropoute = 0.1
dropouti = 0.65
dropouth = 0.3
dropouto = 0.4


# SGD settings; `lr` and `wd` are reassigned further down before the final optimizers are built
lr = 3
wd = 1e-6
momentum = 0.9

# max_norm used for gradient-norm clipping inside train_one_epoch
clip = 0.25

# Despite the name, this flag gates logging to the Comet experiment
wandb_enable = True

# Dataset

## Load Dataset and Build Vocab

- Loading Iterable Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip -q "/content/drive/MyDrive/LanguageModel/Data/wikitext-2-v1.zip" -d '/content/'

In [None]:
def read_split(path):
    """Return the lines of one WikiText-2 split as a list of strings.

    The file is decoded explicitly as UTF-8 so that the result does not depend
    on the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return file.read().splitlines()


# Wrap each split so it can be consumed like a torchdata iterable datapipe
train_iter = IterableWrapper(read_split('/content/wikitext-2/wiki.train.tokens'))
test_iter = IterableWrapper(read_split('/content/wikitext-2/wiki.test.tokens'))
valid_iter = IterableWrapper(read_split('/content/wikitext-2/wiki.valid.tokens'))

- Build Vocabulary

In [None]:
# torchtext's built-in 'basic_english' tokenizer
tokenizer = get_tokenizer('basic_english')
# Vocabulary is built from the training split only; '<unk>' is the single special token
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# Tokens missing from the vocabulary resolve to the '<unk>' index
vocab.set_default_index(vocab['<unk>'])
# Persist the vocabulary next to the notebook so it can be reloaded later
torch.save(vocab, 'vocab.pt')

## Dataset Preparation

- Create Target Array

In [None]:
def data_process(raw_text_iter, seq_len):
  """Turn raw text lines into (inputs, targets) matrices for next-token prediction.

  Every line is tokenized and numericalized with the module-level `tokenizer`
  and `vocab`, the ids are concatenated into one stream, and the stream is cut
  into `M = len(stream) // seq_len` rows of `seq_len` tokens. `targets` is the
  same stream shifted by one token.

  Args:
    raw_text_iter: iterable of text lines.
    seq_len: number of tokens per row.

  Returns:
    Tuple `(inputs, targets)` of LongTensors, each of shape `(M, seq_len)`.
    `M` may be 0 when the stream is shorter than `seq_len`.
  """
  data = torch.cat([torch.LongTensor(vocab(tokenizer(line))) for line in raw_text_iter])

  M = len(data) // seq_len

  # The targets need one token beyond the last input token. When the stream
  # divides evenly there is none, so pad with index 0 (presumably '<unk>', the
  # only special token in the vocabulary).
  if len(data) % seq_len == 0:
    data = torch.cat((data, torch.LongTensor([0])))

  # Reshape with an explicit row count: reshape(-1, seq_len) is ambiguous and
  # raises when there are zero elements (stream shorter than seq_len).
  inputs = data[:M*seq_len].reshape(M, seq_len)
  targets = data[1:M*seq_len+1].reshape(M, seq_len)

  return inputs, targets

In [None]:
X_train, y_train = data_process(train_iter, seq_len)
X_valid, y_valid = data_process(valid_iter, seq_len)
X_test, y_test = data_process(test_iter, seq_len)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

(torch.Size([29285, 70]),
 torch.Size([29285, 70]),
 torch.Size([3063, 70]),
 torch.Size([3063, 70]),
 torch.Size([3455, 70]),
 torch.Size([3455, 70]))

- Custom Dataset Class Definition

In [None]:
class LanguageModelDataset(Dataset):
  """Map-style dataset that pairs each input sequence with its target sequence."""

  def __init__(self, inputs, targets):
    self.inputs, self.targets = inputs, targets

  def __len__(self):
    # One sample per row of the input matrix
    return self.inputs.shape[0]

  def __getitem__(self, idx):
    sample = self.inputs[idx]
    label = self.targets[idx]
    return sample, label

In [None]:
train_set = LanguageModelDataset(X_train, y_train)
valid_set = LanguageModelDataset(X_valid, y_valid)
test_set = LanguageModelDataset(X_test, y_test)

In [None]:
train_set[0]

(tensor([    9,  3849,  3869,   881,     9, 20000,    83,  3849,    88,     0,
          3869,    21,   780, 28780,     2,  6182,     3,  3849,     4,     1,
          5023,    88,    20,     2,  1837,  1018,     7,    14,  3849,  3869,
           881,   629,   976,     2,    23,     8,  5790,   299,    12,   575,
           232,    67,   452,    19, 13722,     5,   757,     3,  2500,    17,
             1,  1767,  5637,     3,   155,     6,   246,   354,     6,   976,
             2,    24,    23,     1,   237,    67,     6,     1,  3849,    93]),
 tensor([ 3849,  3869,   881,     9, 20000,    83,  3849,    88,     0,  3869,
            21,   780, 28780,     2,  6182,     3,  3849,     4,     1,  5023,
            88,    20,     2,  1837,  1018,     7,    14,  3849,  3869,   881,
           629,   976,     2,    23,     8,  5790,   299,    12,   575,   232,
            67,   452,    19, 13722,     5,   757,     3,  2500,    17,     1,
          1767,  5637,     3,   155,     6,   246,

- Data Loader

In [None]:
set_seed(seed)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [None]:
x_batch, y_batch = next(iter(train_loader))
x_batch.shape, y_batch.shape

(torch.Size([80, 70]), torch.Size([80, 70]))

In [None]:
set_seed(seed)

for inputs, targets in train_loader:
  print(inputs[0, 0], targets[0, 0])
  break

tensor(1985) tensor(13)


# Model

In [None]:
def embedded_dropout(embed, words, dropout=0.1, scale=None):
  """Embedding look-up with dropout applied to whole rows of the embedding matrix.

  Args:
    embed: `nn.Embedding` whose weight matrix is used for the look-up.
    words: LongTensor of token indices.
    dropout: probability of zeroing an entire embedding row; surviving rows
      are rescaled by `1 / (1 - dropout)`. A falsy value disables the mask.
    scale: optional multiplier for the (masked) weight matrix. Either a tensor
      that can be expanded to the weight's shape, or a plain number.

  Returns:
    Tensor of embeddings with shape `words.shape + (embedding_dim,)`.
  """
  weight = embed.weight
  if dropout:
    keep_prob = 1 - dropout
    # One Bernoulli draw per vocabulary row, broadcast over the embedding dimension
    mask = torch.empty((weight.size(0), 1), dtype=weight.dtype,
                       device=weight.device).bernoulli_(keep_prob) / keep_prob
    masked_embed_weight = mask.expand_as(weight) * weight
  else:
    masked_embed_weight = weight

  # `if scale:` would raise for tensors with more than one element
  if scale is not None:
    if torch.is_tensor(scale):
      masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight
    elif scale:
      masked_embed_weight = scale * masked_embed_weight

  # Pass padding_idx through unchanged. Substituting -1 for None makes
  # F.embedding normalise it to the LAST vocabulary row and treat that row as
  # padding, which silently blocks its gradient from the look-up.
  embedding = torch.nn.functional.embedding(words, masked_embed_weight,
                                            embed.padding_idx, embed.max_norm, embed.norm_type,
                                            embed.scale_grad_by_freq, embed.sparse)
  return embedding

In [None]:
class LockedDropout(nn.Module):
  """Dropout that samples one mask and reuses it for every index along dim 0."""

  def __init__(self):
    super().__init__()

  def forward(self, x, dropout):
    """Apply the shared mask to the 3-D tensor `x`; identity in eval mode or when `dropout` is falsy."""
    if self.training and dropout:
      keep_prob = 1 - dropout
      # A single (1, size(1), size(2)) mask, later broadcast over dim 0
      sampled = x.data.new(1, x.size(1), x.size(2)).bernoulli_(keep_prob)
      scaled = sampled.requires_grad_(False) / keep_prob
      return scaled.expand_as(x) * x
    return x

In [None]:
class LanguageModel(nn.Module):
  """LSTM language model with embedding dropout, locked dropout and tied weights.

  The model is a stack of `num_layers` single-layer, time-major LSTMs. The
  first consumes `embedding_dim` features and the last produces
  `embedding_dim` features, so the output projection can share its weight
  matrix with the embedding. Inner layers are `hidden_dim` wide.

  Args:
    vocab_size: number of rows in the embedding / output layer.
    embedding_dim: width of the token embeddings.
    hidden_dim: width of the inner LSTM layers.
    num_layers: number of stacked LSTMs (must be >= 1).
    dropoute: row dropout on the embedding matrix.
    dropouti: locked dropout on the embedded input.
    dropouth: locked dropout between LSTM layers.
    dropouto: locked dropout on the output of the last LSTM.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers,
                dropoute=0.2, dropouti=0.2, dropouth=0.2, dropouto=0.2):
    super().__init__()
    if num_layers < 1:
      raise ValueError(f'num_layers must be >= 1, got {num_layers}')
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
    self.embedding_dim = embedding_dim

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.embedding.weight.data.uniform_(-0.1, 0.1)

    # Build the stack from num_layers instead of hard-coding three LSTMs, so
    # the "last layer" test in forward() matches the real depth. For
    # num_layers == 3 this yields exactly LSTM(E, H), LSTM(H, H), LSTM(H, E).
    self.lstms = nn.ModuleList([
        nn.LSTM(embedding_dim if l == 0 else hidden_dim,
                embedding_dim if l == num_layers - 1 else hidden_dim,
                num_layers=1, dropout=0, batch_first=False)
        for l in range(num_layers)
    ])

    self.fc = nn.Linear(embedding_dim, vocab_size)

    # Weight tying: the output projection reuses the embedding matrix
    self.fc.weight = self.embedding.weight

    self.lockdrop = LockedDropout()
    self.dropoute = dropoute
    self.dropouti = dropouti
    self.dropouth = dropouth
    self.dropouto = dropouto

  def forward(self, src):
    """Return vocabulary logits for every position of the time-major index tensor `src`."""
    # Embedding-row dropout is only active in training mode
    embedding = embedded_dropout(self.embedding, src, dropout=self.dropoute if self.training else 0)
    embedding = self.lockdrop(embedding, self.dropouti)

    # Every LSTM starts from a zero state; hidden states are not carried across batches
    for l, lstm in enumerate(self.lstms):
      embedding, _ = lstm(embedding)
      if l != self.num_layers-1:
        embedding = self.lockdrop(embedding, self.dropouth)

    embedding = self.lockdrop(embedding, self.dropouto)

    prediction = self.fc(embedding)
    return prediction

In [None]:
set_seed(seed)

model = LanguageModel(vocab_size=len(vocab),
                      embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim,
                      num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto)
model


LanguageModel(
  (embedding): Embedding(28782, 400)
  (lstms): ModuleList(
    (0): LSTM(400, 1150)
    (1): LSTM(1150, 1150)
    (2): LSTM(1150, 400)
  )
  (fc): Linear(in_features=400, out_features=28782, bias=True)
  (lockdrop): LockedDropout()
)

# Configuration

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
loss_fn = nn.CrossEntropyLoss()

metric = tm.text.Perplexity().to(device)

In [None]:
key_file = "/content/drive/MyDrive/LanguageModel/key.txt"

# Always define `key`, so later cells do not fail with a NameError when the
# file is missing. NOTE(review): with api_key=None, Comet is expected to fall
# back to its own configuration (e.g. the COMET_API_KEY environment variable) -- confirm.
key = None
if os.path.exists(key_file):
    with open(key_file) as f:
        key = f.readline().strip()
else:
    print("Key file does not exist. Please create the key file with your Comet API key.")

In [None]:
wandb_arg_name = input('Please input the WandB argument (run) name:')

Please input the WandB argument (run) name:embd&lock-drop


In [None]:
# comet_ml.init(project_name="LM_AWD-LSTM")

In [None]:
experiment = Experiment(
  api_key=key,
  project_name="LM_AWD-LSTM",
)
experiment.set_name(wandb_arg_name)

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/ft-azad/lm-awd-lstm/7aeb69dee61b4328a5d0c0800cd9c783



# Train

## Train and Evaluate Functions

In [None]:
# Train Functions
def train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch=None):
  """Train `model` for one pass over `train_loader`.

  Batches arrive batch-major and are transposed to time-major before being fed
  to the model. Gradients are clipped to the module-level `clip` norm and
  tensors are moved to the module-level `device`.

  Args:
    model: the network to optimise (put into training mode here).
    train_loader: iterable of `(inputs, targets)` index tensors.
    loss_fn: criterion applied to flattened logits and targets.
    optimizer: optimizer stepping the model parameters.
    metric: torchmetrics-style metric; reset at the start of the epoch.
    epoch: optional epoch number shown in the progress bar.

  Returns:
    Tuple `(model, mean_loss, metric_value)` for the epoch.
  """
  model.train()
  loss_train = AverageMeter()
  metric.reset()

  with tqdm.tqdm(train_loader, unit='batch') as tepoch:
    # `is not None` so that epoch 0 gets a label as well
    if epoch is not None:
      tepoch.set_description(f'Epoch {epoch}')

    for inputs, targets in tepoch:
      inputs = inputs.t().to(device)
      targets = targets.t().to(device)

      # Clear gradients first, so leftovers from an interrupted step cannot leak in
      optimizer.zero_grad()

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

      loss.backward()

      nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

      optimizer.step()

      # Weight by token count. After the transpose len(targets) is always
      # seq_len, which over-weights the smaller final batch.
      loss_train.update(loss.item(), n=targets.numel())
      metric.update(outputs.detach(), targets)

      tepoch.set_postfix(loss=loss_train.avg, metric=metric.compute().item())

  return model, loss_train.avg, metric.compute().item()

In [None]:
# Evaluate Function
def evaluate(model, test_loader, loss_fn, metric):
  """Evaluate `model` on `test_loader` without tracking gradients.

  Batches are transposed to time-major and moved to the module-level `device`,
  exactly as during training.

  Args:
    model: the network to evaluate (put into eval mode here).
    test_loader: iterable of `(inputs, targets)` index tensors.
    loss_fn: criterion applied to flattened logits and targets.
    metric: torchmetrics-style metric; reset before evaluation.

  Returns:
    Tuple `(mean_loss, metric_value)` over the whole loader.
  """
  model.eval()
  loss_eval = AverageMeter()
  metric.reset()

  with torch.inference_mode():
    for inputs, targets in test_loader:
      inputs = inputs.t().to(device)
      targets = targets.t().to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())
      # Weight by token count. After the transpose len(targets) is always
      # seq_len, which over-weights the smaller final batch.
      loss_eval.update(loss.item(), n=targets.numel())

      # Only accumulate; the per-batch value a forward call would compute is never used
      metric.update(outputs, targets)

  return loss_eval.avg, metric.compute().item()

## Training Process and Tuning

### Finding Hyper-parameters

- Calculating the loss for untrained model using a few batches

In [None]:
# Sanity check: a freshly initialised model should score close to
# ln(len(vocab)), the loss of a uniform prediction over the vocabulary.
for iter_num in range(5):
  model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto).to(device)

  inputs, targets = next(iter(train_loader))
  # The LSTMs are time-major (batch_first=False): transpose to (seq_len, batch)
  # exactly as train_one_epoch / evaluate do.
  inputs = inputs.t().to(device)
  targets = targets.t().to(device)

  with torch.no_grad():
    outputs = model(inputs)
    loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

  print(loss)

tensor(10.2711, device='cuda:0')
tensor(10.2717, device='cuda:0')
tensor(10.2653, device='cuda:0')
tensor(10.2671, device='cuda:0')
tensor(10.2690, device='cuda:0')


- Train and try to overfit the model on a small subset of the dataset.

In [None]:
model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.9, momentum=0.9)

In [None]:
mini_train_size = 1000
_, mini_train_dataset = random_split(train_set, (len(train_set)-mini_train_size, mini_train_size))
mini_train_loader = DataLoader(mini_train_dataset, 20)

In [None]:
num_epochs = 100
for epoch in range(num_epochs):
  model, _, _ = train_one_epoch(model, mini_train_loader, loss_fn, optimizer, metric, epoch)

100%|██████████| 50/50 [00:01<00:00, 26.48batch/s, loss=8.33, metric=4.15e+3]
Epoch 1: 100%|██████████| 50/50 [00:01<00:00, 25.97batch/s, loss=7.18, metric=1.31e+3]
Epoch 2: 100%|██████████| 50/50 [00:01<00:00, 25.92batch/s, loss=7, metric=1.1e+3]
Epoch 3: 100%|██████████| 50/50 [00:01<00:00, 25.80batch/s, loss=6.93, metric=1.02e+3]
Epoch 4: 100%|██████████| 50/50 [00:01<00:00, 25.78batch/s, loss=6.84, metric=937]
Epoch 5: 100%|██████████| 50/50 [00:01<00:00, 25.14batch/s, loss=6.72, metric=832]
Epoch 6: 100%|██████████| 50/50 [00:02<00:00, 24.81batch/s, loss=6.61, metric=740]
Epoch 7: 100%|██████████| 50/50 [00:01<00:00, 25.30batch/s, loss=6.51, metric=672]
Epoch 8: 100%|██████████| 50/50 [00:01<00:00, 25.30batch/s, loss=6.43, metric=619]
Epoch 9: 100%|██████████| 50/50 [00:01<00:00, 25.16batch/s, loss=6.35, metric=572]
Epoch 10: 100%|██████████| 50/50 [00:01<00:00, 25.14batch/s, loss=6.27, metric=529]
Epoch 11: 100%|██████████| 50/50 [00:01<00:00, 25.29batch/s, loss=6.2, metric=493]


KeyboardInterrupt: 

- Train the model for a limited number of epochs, experimenting with various learning rates to find the best value.

In [None]:
num_epochs = 2

# Dedicated loop variable, so the sweep does not overwrite the global `lr` setting
for lr_candidate in [12, 8, 4, 2, 0.9]:
  print(f'LR={lr_candidate}')

  # Fresh model for every candidate so the runs are comparable
  model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                        hidden_dim=hidden_dim, num_layers=num_layers,
                        dropoute=dropoute, dropouti=dropouti,
                        dropouth=dropouth, dropouto=dropouto).to(device)

  optimizer = optim.SGD(model.parameters(), lr=lr_candidate, weight_decay=wd, momentum=momentum)

  for epoch in range(num_epochs):
    model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch)

  print()

- Create a small grid search to find the exact values of lr and weight decay

In [None]:
num_epochs = 1

# Dedicated loop variables, so the grid does not overwrite the global `lr` / `wd` settings
for lr_candidate in [7, 8, 14, 13, 12, 11, 10, 9]:
  for wd_candidate in [1.2e-6]:
    print(f'LR={lr_candidate}, WD={wd_candidate}')

    # Fresh model for every combination so the runs are comparable
    model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto).to(device)

    # Use the shared `momentum` setting rather than a second hard-coded 0.9
    optimizer = optim.SGD(model.parameters(), lr=lr_candidate,
                          weight_decay=wd_candidate, momentum=momentum)

    for epoch in range(num_epochs):
      model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch)

    print()

- Train model for more epochs using the best hyperparameters

In [None]:
model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto).to(device)
model

LanguageModel(
  (embedding): Embedding(28782, 300)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(300, 512, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=512, out_features=28782, bias=True)
)

In [None]:
# Optionally resume from an earlier checkpoint. When the file is absent, keep
# the freshly built model instead of aborting the run with FileNotFoundError.
checkpoint_path = '/content/model-ppl_133.pt'
if os.path.exists(checkpoint_path):
  model = torch.load(checkpoint_path)
else:
  print(f'Checkpoint {checkpoint_path} not found; continuing with the current model.')

FileNotFoundError: [Errno 2] No such file or directory: '/content/model-ppl_133.pt'

In [None]:
lr = 8
wd = 1e-6
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 8
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 1e-06
)

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
num_epochs = 30

# Exploratory training run: no experiment tracking here, results are only
# printed and appended to the in-memory history lists.
for epoch in range(1, num_epochs+1):
  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                    train_loader,
                                                    loss_fn,
                                                    optimizer,
                                                    metric,
                                                    epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                      valid_loader,
                                      loss_fn,
                                      metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  # Keep only the checkpoint with the lowest validation loss seen so far
  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  # Counts epochs completed across repeated executions of this cell
  epoch_counter += 1

### Main Train Loop

In [None]:
torch.cuda.empty_cache()

In [None]:
set_seed(seed)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

In [None]:
set_seed(seed)

model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto).to(device)

# model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
#                       hidden_dim=hidden_dim, num_layers=num_layers,
#                       dropoute=dropoute, dropouti=dropouti,
#                       dropouth=dropouth, dropouto=dropouto,
#                       weight_drop=weight_drop, pretrained=pretrained).to(device)
model

LanguageModel(
  (embedding): Embedding(28782, 400)
  (lstms): ModuleList(
    (0): LSTM(400, 1150)
    (1): LSTM(1150, 1150)
    (2): LSTM(1150, 400)
  )
  (fc): Linear(in_features=400, out_features=28782, bias=True)
  (lockdrop): LockedDropout()
)

In [None]:
model = torch.load('model.pt')
# epoch_counter = best_epoch

In [None]:
import shutil

In [None]:
shutil.copyfile("model.pt", "/content/drive/MyDrive/LanguageModel/model-ppl100.3epoch29.pt")

'/content/drive/MyDrive/LanguageModel/model-ppl100.3epoch29.pt'

In [None]:
set_seed(seed)

lr = 1.25
wd = 1e-6
# momentum = 0.9

optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=momentum)
# optimizer = optim.SGD([{'params': model.embedding.parameters(), 'lr': 0.1*lr},
#                        {'params': model.lstms.parameters(), 'lr': lr}],
#                       weight_decay=wd, momentum=momentum)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 1.25
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 1e-06
)

In [None]:
# Record the run's hyper-parameters in Comet. Weight decay and the seed are
# included because both influence the result and are needed to reproduce it.
if wandb_enable:
  config = {
      'lr': lr,
      'weight_decay': wd,
      'momentum': momentum,
      'seed': seed,
      'batch_size': batch_size,
      'seq_len': seq_len,
      'hidden_dim': hidden_dim,
      'embedding_dim': embedding_dim,
      'num_layers': num_layers,
      'dropout_embed': dropoute,
      'dropout_in_lstm': dropouti,
      'dropout_h_lstm': dropouth,
      'dropout_out_lstm': dropouto,
      'clip': clip,
  }

  experiment.log_parameters(config)

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 1

In [None]:
set_seed(seed)
num_epochs = 15

# Main training run: train, validate, checkpoint on improvement and log to Comet.
for epoch in range(1, num_epochs+1):
  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                    train_loader,
                                                    loss_fn,
                                                    optimizer,
                                                    metric,
                                                    epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                      valid_loader,
                                      loss_fn,
                                      metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  # Checkpoint whenever the validation loss improves: one copy locally and
  # one on the mounted Drive folder.
  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    torch.save(model, f'/content/drive/MyDrive/LanguageModel/model.pt')
    best_loss_valid = loss_valid
    best_epoch = epoch_counter
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  # Metrics are indexed by epoch_counter (not `epoch`), so the x-axis keeps
  # increasing when this cell is executed again.
  if wandb_enable:
    experiment.log_metrics({"metric_train": metric_train, "loss_train": loss_train,
                "metric_valid": metric_valid, "loss_valid": loss_valid}, epoch=epoch_counter)

  epoch_counter += 1

Epoch 1: 100%|██████████| 367/367 [02:33<00:00,  2.39batch/s, loss=4.27, metric=71.7]


Model Saved!
Valid: Loss = 4.522, Metric = 92.45



Epoch 2: 100%|██████████| 367/367 [02:33<00:00,  2.38batch/s, loss=4.21, metric=67.6]


Model Saved!
Valid: Loss = 4.515, Metric = 91.83



Epoch 3: 100%|██████████| 367/367 [02:33<00:00,  2.39batch/s, loss=4.19, metric=66]


Model Saved!
Valid: Loss = 4.513, Metric = 91.66



Epoch 4: 100%|██████████| 367/367 [02:34<00:00,  2.38batch/s, loss=4.17, metric=64.6]


Model Saved!
Valid: Loss = 4.511, Metric = 91.47



Epoch 5: 100%|██████████| 367/367 [02:34<00:00,  2.38batch/s, loss=4.14, metric=62.7]


Model Saved!
Valid: Loss = 4.509, Metric = 91.26



Epoch 6: 100%|██████████| 367/367 [02:34<00:00,  2.38batch/s, loss=4.12, metric=61.7]


Valid: Loss = 4.509, Metric = 91.3



Epoch 7: 100%|██████████| 367/367 [02:33<00:00,  2.38batch/s, loss=4.09, metric=59.9]


Valid: Loss = 4.51, Metric = 91.37



Epoch 8: 100%|██████████| 367/367 [02:33<00:00,  2.39batch/s, loss=4.07, metric=58.5]


Valid: Loss = 4.512, Metric = 91.54



Epoch 9: 100%|██████████| 367/367 [02:33<00:00,  2.39batch/s, loss=4.05, metric=57.7]


Valid: Loss = 4.514, Metric = 91.71



Epoch 10: 100%|██████████| 367/367 [02:33<00:00,  2.38batch/s, loss=4.02, metric=55.9]


Valid: Loss = 4.521, Metric = 92.37



Epoch 11: 100%|██████████| 367/367 [02:33<00:00,  2.38batch/s, loss=3.99, metric=54.3]


Valid: Loss = 4.524, Metric = 92.61



Epoch 12: 100%|██████████| 367/367 [02:33<00:00,  2.38batch/s, loss=3.96, metric=52.8]


Valid: Loss = 4.518, Metric = 92.06



Epoch 13: 100%|██████████| 367/367 [02:34<00:00,  2.38batch/s, loss=3.93, metric=51.2]


Valid: Loss = 4.529, Metric = 93.09



Epoch 14: 100%|██████████| 367/367 [02:34<00:00,  2.38batch/s, loss=3.89, metric=49.3]


Valid: Loss = 4.533, Metric = 93.48



Epoch 15:   6%|▌         | 21/367 [00:09<02:31,  2.28batch/s, loss=3.8, metric=44.9]


KeyboardInterrupt: 

In [None]:
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : embd&lock-drop
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/ft-azad/lm-awd-lstm/7aeb69dee61b4328a5d0c0800cd9c783
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss_train [58]   : (3.8949986699491497, 6.684415862735675)
[1;38;5;39mCOMET INFO:[0m     loss_valid [58]   : (4.509146775954809, 5.797027367811936)
[1;38;5;39mCOMET INFO:[0m     metric_train [58] : (49.26998519897461, 800.5197143554688)
[1;38;5;39mCOMET INFO:[0m     metric_valid [58] : (91.26468658447266, 330.65228271

# Test

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
loss_valid, metric_valid = evaluate(model, valid_loader, loss_fn, metric)
metric_valid

In [None]:
loss_test, metric_test = evaluate(model, test_loader, loss_fn, metric)
metric_test

# Generate

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, seed=None):
  """Continue `prompt` by sampling tokens from `model` one at a time.

  Generation stops as soon as a "." token is produced or after `max_seq_len`
  new tokens, whichever comes first. The '<unk>' token is never emitted.

  Args:
    prompt: text to continue.
    max_seq_len: maximum number of tokens to append.
    temperature: softmax temperature; low values (e.g. 0.1) make sampling
      close to argmax, higher values make it more random.
    model: callable mapping a 1-D LongTensor of indices to per-position logits.
    tokenizer: callable splitting text into tokens.
    vocab: token <-> index mapping (callable, indexable, with `get_itos()`).
    seed: optional seed for torch's RNG, for reproducible samples.

  Returns:
    The prompt followed by the generated tokens, separated by spaces.
  """
  if seed is not None:
    torch.manual_seed(seed)

  indices = vocab(tokenizer(prompt))
  itos = vocab.get_itos()
  unk_idx = vocab["<unk>"]
  stop_idx = vocab["."]

  # Run on the device the model lives on rather than relying on a global
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device('cpu')

  # Dropout must be off while sampling; the previous mode is restored afterwards
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for _ in range(max_seq_len):
        src = torch.LongTensor(indices).to(run_device)
        logits = model(src)[-1] / temperature

        # Excluding '<unk>' up front samples from the same renormalised
        # distribution as redrawing until a non-'<unk>' token appears.
        logits[unk_idx] = float('-inf')
        probs = torch.softmax(logits, dim=0)

        idx = torch.multinomial(probs, num_samples=1).item()
        indices.append(idx)
        prompt += " " + itos[idx]

        if idx == stop_idx:
          break
  finally:
    model.train(was_training)

  # Always return the text, including when no "." was generated
  return prompt

In [None]:
prompt = "as i know about this subject,"
generate(prompt, 40, 0.5, model, tokenizer, vocab, seed=None)