<a href="https://colab.research.google.com/github/ft-Azad/Language-Modeling/blob/main/LanguageModel(AWD_LSTM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Import Libs

In [1]:
!pip install -q torchmetrics
!pip install -q torchdata==0.6.1
!pip install 'portalocker>=2.0.0'
!pip install torchtext==0.15.1
!pip install comet_ml

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [24]:
import numpy as np
import matplotlib.pyplot as plt

import torchtext
from torchtext.datasets import WikiText2
from torchdata.datapipes.iter import IterableWrapper, Mapper
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split

from torch import optim
from torch.nn import functional as F

import os
import tqdm
import torchmetrics as tm

from collections import Counter

from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

In [2]:
for lib in [np, torch, torchtext, tqdm]:
  print(lib.__name__, '-->', lib.__version__)

numpy --> 1.25.2
torch --> 2.0.0+cu117
torchtext --> 0.15.1+cpu
tqdm --> 4.66.4


# Utils

In [3]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [4]:
def num_trainable_params(model):
  nums = sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6
  return nums

In [5]:
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
      torch.cuda.manual_seed(seed)
      # torch.cuda.manual_seed_all(seed)

      # torch.backends.cudnn.deterministic = True
      # torch.backends.cudnn.benchmark = False


# Arguments

In [21]:
seed = 8

batch_size = 20
seq_len = 35

embedding_dim = 300

num_layers = 2
hidden_dim = 512
dropout_embd = 0.5
dropout_rnn = 0.2


lr = 3
wd = 1e-6
momentum = 0.9

clip = 0.25

wandb_enable = True

#Dataset

## Load Dataset and Build Vocab

- Loading Iterable Data

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!unzip -q "/content/drive/MyDrive/LanguageModel/Data/wikitext-2-v1.zip" -d '/content/'

In [9]:
with open('/content/wikitext-2/wiki.train.tokens', 'r') as file:
    train_iter = file.read().splitlines()

with open('/content/wikitext-2/wiki.test.tokens', 'r') as file:
    test_iter = file.read().splitlines()

with open('/content/wikitext-2/wiki.valid.tokens', 'r') as file:
    valid_iter = file.read().splitlines()

train_iter = IterableWrapper(train_iter)
test_iter = IterableWrapper(test_iter)
valid_iter = IterableWrapper(valid_iter)

- Build Vocabulary

In [10]:
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
torch.save(vocab, 'vocab.pt')

## Dataset Prepreation

- Creat Target Array

In [11]:
def data_process(raw_text_iter, seq_len):
  data = torch.cat([torch.LongTensor(vocab(tokenizer(line))) for line in raw_text_iter])

  M = len(data) // seq_len

  r = len(data) % seq_len
  data = torch.cat((data, torch.LongTensor([0]))) if r==0 else data

  inputs = data[:M*seq_len]
  targets = data[1:M*seq_len+1]

  inputs = inputs.reshape(-1, seq_len)
  targets = targets.reshape(-1, seq_len)

  return inputs, targets

In [12]:
X_train, y_train = data_process(train_iter, seq_len)
X_valid, y_valid = data_process(valid_iter, seq_len)
X_test, y_test = data_process(test_iter, seq_len)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

(torch.Size([58571, 35]),
 torch.Size([58571, 35]),
 torch.Size([6126, 35]),
 torch.Size([6126, 35]),
 torch.Size([6910, 35]),
 torch.Size([6910, 35]))

- Custom Dataset Class Definition

In [13]:
class LanguageModelDataset(Dataset):

  def __init__(self, inputs, targets):
    self.inputs = inputs
    self.targets = targets

  def __len__(self):
    return self.inputs.shape[0]

  def __getitem__(self, idx):
    return self.inputs[idx], self.targets[idx]

In [14]:
train_set = LanguageModelDataset(X_train, y_train)
valid_set = LanguageModelDataset(X_valid, y_valid)
test_set = LanguageModelDataset(X_test, y_test)

In [15]:
train_set[0]

(tensor([    9,  3849,  3869,   881,     9, 20000,    83,  3849,    88,     0,
          3869,    21,   780, 28780,     2,  6182,     3,  3849,     4,     1,
          5023,    88,    20,     2,  1837,  1018,     7,    14,  3849,  3869,
           881,   629,   976,     2,    23]),
 tensor([ 3849,  3869,   881,     9, 20000,    83,  3849,    88,     0,  3869,
            21,   780, 28780,     2,  6182,     3,  3849,     4,     1,  5023,
            88,    20,     2,  1837,  1018,     7,    14,  3849,  3869,   881,
           629,   976,     2,    23,     8]))

- Data Loader

In [16]:
set_seed(seed)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [17]:
x_batch, y_batch = next(iter(train_loader))
x_batch.shape, y_batch.shape

(torch.Size([20, 35]), torch.Size([20, 35]))

In [18]:
set_seed(seed)

for inputs, targets in train_loader:
  print(inputs[0, 0], targets[0, 0])
  break

tensor(3444) tensor(201)


# Model

In [19]:
class LanguageModel(nn.Module):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers,
                dropout_embd=0.5, dropout_rnn=0.5):
    super().__init__()
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
    self.embedding_dim = embedding_dim

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.embedding.weight.data.uniform_(-0.1, 0.1)
    self.dropout = nn.Dropout(p=dropout_embd)

    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                        dropout=dropout_rnn, batch_first=True)

    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, src):
    embedding = self.dropout(self.embedding(src))
    output, hidden = self.lstm(embedding)
    prediction = self.fc(output)
    return prediction

In [22]:
set_seed(seed)

model = LanguageModel(vocab_size=len(vocab),
                      embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim,
                      num_layers=num_layers,
                      dropout_embd=dropout_embd,
                      dropout_rnn=dropout_rnn)
model


LanguageModel(
  (embedding): Embedding(28782, 300)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(300, 512, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=512, out_features=28782, bias=True)
)

# Configuration

In [25]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [26]:
loss_fn = nn.CrossEntropyLoss()

metric = tm.text.Perplexity().to(device)

In [87]:
key_file = "/content/drive/MyDrive/LanguageModel/key.txt"

if os.path.exists(key_file):
    with open(key_file) as f:
        key = f.readline().strip()
else:
    print("Key file does not exist. Please create the key file with your wandb API key.")

In [88]:
wandb_arg_name = input('Please input the WandB argument (run) name:')

Please input the WandB argument (run) name:Base-Model


In [None]:
# comet_ml.init(project_name="LM_AWD-LSTM")

In [89]:
experiment = Experiment(
  api_key=key,
  project_name="LM_AWD-LSTM",
)
experiment.set_name(wandb_arg_name)

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : base_model
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/ft-azad/lm-awd-lstm/3458ca9b061e4086a702e226aeb246ed
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss_train [8]   : (4.078268378952146, 5.890382317533928)
[1;38;5;39mCOMET INFO:[0m     loss_valid [8]   : (6.029867288732373, 6.170265618675306)
[1;38;5;39mCOMET INFO:[0m     metric_train [8] : (59.043067932128906, 361.54278564453125)
[1;38;5;39mCOMET INFO:[0m     metric_valid [8] : (415.9185485839844, 478.6871337890625)


# Train

## Train and Evaluate Functions

In [29]:
# Train Functions
def train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch=None):
  model.train()
  loss_train = AverageMeter()
  metric.reset()

  with tqdm.tqdm(train_loader, unit='batch') as tepoch:
    for inputs, targets in tepoch:
      if epoch:
        tepoch.set_description(f'Epoch {epoch}')

      inputs = inputs.to(device)
      targets = targets.to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

      loss.backward()

      nn.utils.clip_grad.clip_grad_norm_(model.parameters(), max_norm=clip)

      optimizer.step()
      optimizer.zero_grad()

      loss_train.update(loss.item(), n=len(targets))
      metric.update(outputs, targets)

      tepoch.set_postfix(loss=loss_train.avg, metric=metric.compute().item())

  return model, loss_train.avg, metric.compute().item()

In [98]:
# Evaluate Function
def evaluate(model, test_loader, loss_fn, metric):
  model.eval()
  loss_eval = AverageMeter()
  metric.reset()

  with torch.inference_mode():
    for inputs, targets in test_loader:
      inputs = inputs.to(device)
      targets = targets.to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())
      loss_eval.update(loss.item(), n=len(targets))

      metric(outputs, targets)

  return loss_eval.avg, metric.compute().item()

## Train Process and Tunning

### Finding Hyper-parameters

- Calculating the loss for untrained model using a few batches

In [67]:
for iter_num in range(5):
  model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropout_embd=dropout_embd, dropout_rnn=dropout_rnn).to(device)

  inputs, targets = next(iter(train_loader))
  inputs = inputs.to(device)
  targets = targets.to(device)

  with torch.no_grad():
    outputs = model(inputs)
    loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

  print(loss)

tensor(10.2711, device='cuda:0')
tensor(10.2717, device='cuda:0')
tensor(10.2653, device='cuda:0')
tensor(10.2671, device='cuda:0')
tensor(10.2690, device='cuda:0')


- Train and try to overfit the model on a small subset of the dataset.

In [68]:
model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropout_embd=dropout_embd, dropout_rnn=dropout_rnn).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.9, momentum=0.9)

In [69]:
mini_train_size = 1000
_, mini_train_dataset = random_split(train_set, (len(train_set)-mini_train_size, mini_train_size))
mini_train_loader = DataLoader(mini_train_dataset, 20)

In [70]:
num_epochs = 100
for epoch in range(num_epochs):
  model, _, _ = train_one_epoch(model, mini_train_loader, loss_fn, optimizer, metric, epoch)

100%|██████████| 50/50 [00:01<00:00, 26.48batch/s, loss=8.33, metric=4.15e+3]
Epoch 1: 100%|██████████| 50/50 [00:01<00:00, 25.97batch/s, loss=7.18, metric=1.31e+3]
Epoch 2: 100%|██████████| 50/50 [00:01<00:00, 25.92batch/s, loss=7, metric=1.1e+3]
Epoch 3: 100%|██████████| 50/50 [00:01<00:00, 25.80batch/s, loss=6.93, metric=1.02e+3]
Epoch 4: 100%|██████████| 50/50 [00:01<00:00, 25.78batch/s, loss=6.84, metric=937]
Epoch 5: 100%|██████████| 50/50 [00:01<00:00, 25.14batch/s, loss=6.72, metric=832]
Epoch 6: 100%|██████████| 50/50 [00:02<00:00, 24.81batch/s, loss=6.61, metric=740]
Epoch 7: 100%|██████████| 50/50 [00:01<00:00, 25.30batch/s, loss=6.51, metric=672]
Epoch 8: 100%|██████████| 50/50 [00:01<00:00, 25.30batch/s, loss=6.43, metric=619]
Epoch 9: 100%|██████████| 50/50 [00:01<00:00, 25.16batch/s, loss=6.35, metric=572]
Epoch 10: 100%|██████████| 50/50 [00:01<00:00, 25.14batch/s, loss=6.27, metric=529]
Epoch 11: 100%|██████████| 50/50 [00:01<00:00, 25.29batch/s, loss=6.2, metric=493]


KeyboardInterrupt: 

-  Train the model for a limited number of epochs, experimenting with various learning rates to find best value

In [81]:
num_epochs = 2

for lr in [12, 8, 4, 2, 0.9]:
  print(f'LR={lr}')

  model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                        hidden_dim=hidden_dim, num_layers=num_layers,
                        dropout_embd=dropout_embd, dropout_rnn=dropout_rnn).to(device)
  # model = torch.load('model.pt')

  optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=momentum)

  for epoch in range(num_epochs):
    model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch)

  print()

LR=12


100%|██████████| 2929/2929 [01:55<00:00, 25.42batch/s, loss=5.98, metric=394]
Epoch 1:  90%|████████▉ | 2624/2929 [01:47<00:12, 24.32batch/s, loss=5.65, metric=284]


KeyboardInterrupt: 

- Creat a small grid search to find exact value of lr and weight decay

In [None]:
num_epochs = 1

for lr in [7, 8, 14, 13, 12, 11, 10, 9]:
  for wd in [1.2e-6]:
    print(f'LR={lr}, WD={wd}')

    model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropout_embd=dropout_embd, dropout_rnn=dropout_rnn).to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)

    for epoch in range(num_epochs):
      model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch)

    print()

- Train model for more epochs using the best hyperparameters

In [82]:
model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropout_embd=dropout_embd, dropout_rnn=dropout_rnn).to(device)
model

LanguageModel(
  (embedding): Embedding(28782, 300)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(300, 512, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=512, out_features=28782, bias=True)
)

In [78]:
model = torch.load('/content/model-ppl_133.pt')

FileNotFoundError: [Errno 2] No such file or directory: '/content/model-ppl_133.pt'

In [84]:
lr = 8
wd = 1e-6
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 8
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 1e-06
)

In [85]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [75]:
num_epochs = 30

for epoch in range(1, num_epochs+1):
  # WandB
  # run = wandb.init(
  #       project="language-modeling-lstms",
  #       config={
  #           "learning_rate": lr,
  #           "epochs": num_epochs,
  #       })

  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                    train_loader,
                                                    loss_fn,
                                                    optimizer,
                                                    metric,
                                                    epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                      valid_loader,
                                      loss_fn,
                                      metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  epoch_counter += 1

Epoch 1: 100%|██████████| 2929/2929 [01:59<00:00, 24.46batch/s, loss=5.88, metric=360]


Model Saved!
Valid: Loss = 6.046, Metric = 422.9



Epoch 2: 100%|██████████| 2929/2929 [01:59<00:00, 24.49batch/s, loss=5.24, metric=188]


Model Saved!
Valid: Loss = 6.027, Metric = 414.9



Epoch 3: 100%|██████████| 2929/2929 [01:59<00:00, 24.48batch/s, loss=4.96, metric=143]


Valid: Loss = 6.042, Metric = 421.1



Epoch 4: 100%|██████████| 2929/2929 [01:59<00:00, 24.52batch/s, loss=4.78, metric=119]


Valid: Loss = 6.064, Metric = 430.6



Epoch 5: 100%|██████████| 2929/2929 [01:59<00:00, 24.45batch/s, loss=4.65, metric=105]


Valid: Loss = 6.068, Metric = 432.1



Epoch 6:  40%|████      | 1177/2929 [00:48<01:11, 24.51batch/s, loss=4.5, metric=90]


KeyboardInterrupt: 

### Main Train Loop

In [99]:
torch.cuda.empty_cache()

In [100]:
set_seed(seed)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

In [101]:
set_seed(seed)

model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropout_embd=dropout_embd, dropout_rnn=dropout_rnn).to(device)

# model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
#                       hidden_dim=hidden_dim, num_layers=num_layers,
#                       dropoute=dropoute, dropouti=dropouti,
#                       dropouth=dropouth, dropouto=dropouto,
#                       weight_drop=weight_drop, pretrained=pretrained).to(device)
model

LanguageModel(
  (embedding): Embedding(28782, 300)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(300, 512, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=512, out_features=28782, bias=True)
)

In [105]:
model = torch.load('model.pt')

In [110]:
import shutil

In [111]:
shutil.copyfile("model-ppl130.8.pt", "/content/drive/MyDrive/LanguageModel/model-ppl130.8.pt")

'/content/drive/MyDrive/LanguageModel/model-ppl130.8.pt'

In [106]:
set_seed(seed)

lr = 0.5
wd = 1e-6
# momentum = 0.9

optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=momentum)
# optimizer = optim.SGD([{'params': model.embedding.parameters(), 'lr': 0.1*lr},
#                        {'params': model.lstms.parameters(), 'lr': lr}],
#                       weight_decay=wd, momentum=momentum)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.5
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 1e-06
)

In [95]:
if wandb_enable:

      config={
          'lr': lr,
          'momentum': momentum,
          'batch_size': batch_size,
          'seq_len': seq_len,
          'hidden_dim': hidden_dim,
          'embedding_dim': embedding_dim,
          'num_layers': num_layers,
          # 'dropout_embed': dropoute,
          # 'dropout_in_lstm': dropouti,
          # 'dropout_h_lstm': dropouth,
          # 'dropout_out_lstm': dropouto,
          'clip': clip,
      }

      experiment.log_parameters(config)

In [103]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [107]:
set_seed(seed)
num_epochs = 10

for epoch in range(1, num_epochs+1):
  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                    train_loader,
                                                    loss_fn,
                                                    optimizer,
                                                    metric,
                                                    epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                      valid_loader,
                                      loss_fn,
                                      metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  if wandb_enable:
    experiment.log_metrics({"metric_train": metric_train, "loss_train": loss_train,
                "metric_valid": metric_valid, "loss_valid": loss_valid}, epoch=epoch)

  epoch_counter += 1

Epoch 1: 100%|██████████| 2929/2929 [01:59<00:00, 24.57batch/s, loss=4.08, metric=59]


Model Saved!
Valid: Loss = 4.877, Metric = 131.2



Epoch 2: 100%|██████████| 2929/2929 [01:59<00:00, 24.50batch/s, loss=3.97, metric=53]


Model Saved!
Valid: Loss = 4.873, Metric = 130.8



Epoch 3: 100%|██████████| 2929/2929 [01:59<00:00, 24.55batch/s, loss=3.91, metric=49.8]


Valid: Loss = 4.877, Metric = 131.3



Epoch 4: 100%|██████████| 2929/2929 [01:59<00:00, 24.53batch/s, loss=3.86, metric=47.4]


Valid: Loss = 4.884, Metric = 132.1



Epoch 5:  86%|████████▋ | 2528/2929 [01:42<00:16, 24.56batch/s, loss=3.81, metric=45]


KeyboardInterrupt: 

In [108]:
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : Base-Model
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/ft-azad/lm-awd-lstm/4919e87f14cd4f18ac925d0c393f074d
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss_train [14]   : (3.8577467093173246, 5.890382317533928)
[1;38;5;39mCOMET INFO:[0m     loss_valid [14]   : (4.8733019167171205, 5.337851555176977)
[1;38;5;39mCOMET INFO:[0m     metric_train [14] : (47.358543395996094, 361.54278564453125)
[1;38;5;39mCOMET INFO:[0m     metric_valid [14] : (130.7518768310547, 208.064727783

# Test

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
loss_valid, metric_valid = evaluate(model, valid_loader, loss_fn, metric)
metric_valid

In [None]:
loss_test, metric_test = evaluate(model, test_loader, loss_fn, metric)
metric_test

# Generate

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, seed=None):

  indices = vocab(tokenizer(prompt))
  itos = vocab.get_itos()

  for i in range(max_seq_len):
    src = torch.LongTensor(indices).to(device)
    with torch.no_grad():
      prediction = model(src)

    # Low values like 0.1 for temperature, Makes softmax like argmax more
    probs = torch.softmax(prediction[-1]/temperature, dim = 0)
    idx = vocab["<ukn>"]
    while idx == vocab["<ukn>"]:
      idx = torch.multinomial(probs, num_samples =1).item()
    indices.append(idx)
    prompt += " " + itos[idx]
    # print(prompt)

    if idx == vocab["."]:
      return prompt

In [None]:
prompt = "as i know about this subject,"
generate(prompt, 40, 0.5, model, tokenizer, vocab, seed=None)