# **HW2- Predicting next word with GRU and LSTM**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from tqdm import trange
from datetime import datetime
from collections import OrderedDict, Counter
from itertools import product
from prettytable import PrettyTable
import os
import random

  import pandas.util.testing as tm


In [0]:
import torch
import torchvision
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.tensorboard import SummaryWriter



In [3]:
# Fix the RNG seeds used by this notebook: torch (model initialisation) and
# Python's `random` (used later to shuffle batch order).
SEED = 0
torch.manual_seed(SEED)
random.seed(SEED)
# Last expression of the cell: displays whether a CUDA device is available.
torch.cuda.is_available()

True

In [4]:
# Colab-only setup: mount Google Drive and make the homework folder the
# working directory. Later cells build the data, checkpoint and plot paths
# from os.getcwd(), so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): hard-coded, user-specific Drive path - adjust when running elsewhere.
ROOT_PATH = '/content/drive/My Drive/DL-Raja/HW2/ex2_304827702_201271509/'
os.chdir(ROOT_PATH)
os.getcwd()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


'/content/drive/My Drive/DL-Raja/HW2/ex2_304827702_201271509'

## **Load PTB dataset**

In [0]:
FOLDER_DATA = os.path.join(os.getcwd(), 'PTB')
def load_dataset(folder_path, file_name):
    """Return the lines of ``folder_path/file_name`` as a list of strings.

    Line terminators are kept. (For a character-level model the whole file
    could be read with ``read()`` instead.)
    """
    with open(os.path.join(folder_path, file_name)) as handle:
        lines = handle.readlines()
    return lines

In [0]:
# Each split is loaded as a list of raw text lines (line terminators included).
training_file = load_dataset(FOLDER_DATA, 'ptb.train.txt')
validation_file = load_dataset(FOLDER_DATA, 'ptb.valid.txt')
test_file = load_dataset(FOLDER_DATA, 'ptb.test.txt')

In [7]:
# One line of a file is counted as one sentence.
print(f'#sentences in train = {len(training_file)}')
print(f'#sentences in validation = {len(validation_file)}')
print(f'#sentences in test = {len(test_file)}')

#sentences in train = 42068
#sentences in validation = 3370
#sentences in test = 3761


## *Tokenizer*- tokenizing datasets (for embeddings layer)

In [0]:
class Tokenizer:
    """Word-level tokenizer mapping every word to an integer id.

    The vocabulary is built from the lines given to the constructor
    (``create_vocab``); ``file2token`` then converts any list of lines into
    one flat list of ids using that vocabulary.

    Ids start at 1 (0 is never assigned), so an embedding table indexed by
    these ids needs at least ``len(vocab) + 1`` rows.
    """

    EOS_TOKEN = '<eos>'
    UNK_TOKEN = '<unk>'

    def __init__(self, file):
        """
        Args:
            file: iterable of text lines used to build the vocabulary.
        """
        self.file = file
        self.vocab_to_int = None  # filled in by create_vocab()

    def get_vocab_to_int(self):
        """Return the word -> id dict (``None`` until ``create_vocab`` ran)."""
        return self.vocab_to_int

    @staticmethod
    def _split_line(line):
        """Split one raw line into words, ending with the ``<eos>`` marker.

        The first character of the line is dropped and the line is split on
        single spaces; the last piece is then overwritten with ``<eos>``.
        This presumably matches lines shaped like " w1 w2 ... wn \\n"
        (leading space, trailing " \\n") - verify against the data files.
        For a line that does not end with " \\n", the last real word is the
        piece that gets replaced.
        """
        words = line[1:].split(' ')
        words[-1] = Tokenizer.EOS_TOKEN
        return words

    def create_vocab(self):
        """Build ``vocab_to_int`` from the constructor's lines.

        Words are ranked by descending frequency (ties keep first-seen
        order) and numbered from 1.
        """
        counts = Counter()
        for row in self.file:
            counts.update(self._split_line(row))
        vocab = sorted(counts, key=counts.get, reverse=True)
        self.vocab_to_int = {word: idx for idx, word in enumerate(vocab, 1)}

    def file2token(self, file):
        """Convert lines into a flat list of word ids.

        Args:
            file: iterable of text lines in the same format as the
                vocabulary source.

        Returns:
            list[int]: ids of all words, with ``<eos>`` closing every line.

        Raises:
            RuntimeError: if ``create_vocab`` has not been called yet.
            KeyError: for a word missing from the vocabulary when the
                vocabulary has no ``<unk>`` entry to fall back on.
        """
        vocab_to_int = self.vocab_to_int
        if vocab_to_int is None:
            raise RuntimeError('create_vocab() must be called before file2token()')
        # Out-of-vocabulary words map to <unk> when the vocabulary has one.
        unk_id = vocab_to_int.get(self.UNK_TOKEN)
        tokens = []
        for sentence in file:
            for word in self._split_line(sentence):
                idx = vocab_to_int.get(word, unk_id)
                if idx is None:
                    raise KeyError(word)
                tokens.append(idx)
        return tokens

In [0]:
# get all words (from training file)
tokenizer = Tokenizer(training_file)
tokenizer.create_vocab()
training_token = tokenizer.file2token(training_file)
validation_token = tokenizer.file2token(validation_file)
test_token = tokenizer.file2token(test_file)


## Create "Dataloader"

In [0]:
BATCH_SIZE = 16  # rows per batch
SEQ_LEN = 35  # tokens per row


def create_dataset(data_token, batch_size=20, seq_len=20):
    """Cut a flat token list into (input, target) batches.

    Consecutive chunks of ``batch_size * seq_len`` tokens are reshaped to
    ``(batch_size, seq_len)``. The target of a chunk is the same window
    shifted one token to the right (the next-word labels). Tokens that do
    not fill a complete chunk are dropped.

    Returns:
        list of ``(x_batch, y_batch)`` tuples of numpy arrays.
    """
    chunk_size = batch_size * seq_len
    # "- 1" keeps one token in reserve for the shifted target of the last chunk.
    n_batch = (len(data_token) - 1) // chunk_size
    dataset = []
    for start in range(0, n_batch * chunk_size, chunk_size):
        inputs = np.array(data_token[start:start + chunk_size])
        targets = np.array(data_token[start + 1:start + 1 + chunk_size])
        dataset.append((inputs.reshape((batch_size, seq_len)),
                        targets.reshape((batch_size, seq_len))))
    return dataset


def create_data_loader(dataset):
    """Yield every batch of ``dataset`` once, in a freshly shuffled order.

    Each numpy ``(x, y)`` pair is converted to a pair of LongTensors. The
    order comes from Python's ``random`` module, so it follows
    ``random.seed``.
    """
    order = list(range(len(dataset)))
    random.shuffle(order)
    for position in order:
        inputs, targets = dataset[position]
        yield torch.from_numpy(inputs).long(), torch.from_numpy(targets).long()

In [0]:
# Pre-build the (input, target) batches for every split; the batch order is
# shuffled later, each time create_data_loader() is iterated.
training_set = create_dataset(training_token, BATCH_SIZE, SEQ_LEN)
validation_set = create_dataset(validation_token, BATCH_SIZE, SEQ_LEN)
test_set = create_dataset(test_token, BATCH_SIZE, SEQ_LEN)

# **Word predictor class**

In [0]:
class WordPredictor(nn.Module):
    """Next-word model: embedding -> dropout -> stacked LSTM/GRU -> classifier.

    Keyword parameters (read from ``**params``; other keys are ignored):
        n_layers (int): number of stacked recurrent layers.
        hidden_dim (int): size of the recurrent hidden state.
        embedding_dim (int): size of the word embeddings.
        rnn_type (str): ``"lstm"`` builds an ``nn.LSTM``; any other value
            builds an ``nn.GRU``.
        vocab_size (int): rows of the embedding table and number of output
            classes.
        dropout (float, optional): dropout probability on the embeddings and
            between recurrent layers. Defaults to 0.
        use_seq (bool, optional): if true the classifier is
            Linear -> ReLU -> Linear, otherwise a single Linear layer.
            Defaults to False.
    """

    def __init__(self, **params):
        super(WordPredictor, self).__init__()
        self.n_layers = params["n_layers"]
        self.hidden_dim = params["hidden_dim"]
        self.embedding_dim = params["embedding_dim"]
        self.rnn_type = params["rnn_type"]
        self.dropout = nn.Dropout(params.get("dropout", 0))
        self.embeddings = nn.Embedding(params["vocab_size"], params["embedding_dim"])
        self.rnn_layers = self.get_rnn_layers(params)
        if params.get("use_seq", False):
            self.classifier = nn.Sequential(
                nn.Linear(params["hidden_dim"], params["hidden_dim"]),
                nn.ReLU(),
                nn.Linear(params["hidden_dim"], params["vocab_size"])
            )
        else:
            self.classifier = nn.Linear(params["hidden_dim"], params["vocab_size"])

    def get_rnn_type(self):
        """Return the ``rnn_type`` string the model was built with."""
        return self.rnn_type

    def get_rnn_layers(self, params):
        """Build the recurrent stack (``batch_first=True``).

        Returns an ``nn.LSTM`` when ``rnn_type == "lstm"`` and an ``nn.GRU``
        for any other value.
        """
        # PyTorch applies RNN dropout only between stacked layers; with a
        # single layer a non-zero value has no effect and only triggers a
        # UserWarning, so pass 0 in that case.
        rnn_dropout = params.get("dropout", 0) if self.n_layers > 1 else 0
        rnn_cls = nn.LSTM if self.rnn_type == "lstm" else nn.GRU
        return rnn_cls(self.embedding_dim, self.hidden_dim, self.n_layers,
                       dropout=rnn_dropout, batch_first=True)

    def init_hidden(self, batch_size, device):
        """Create a zero initial state on ``device``.

        Returns:
            A pair of ``(n_layers, batch_size, hidden_dim)`` tensors
            (hidden state, cell state) for ``"lstm"``, one such tensor for
            ``"gru"``, and ``None`` for any other ``rnn_type``.
        """
        # Use the dtype of the model's parameters for the state tensors.
        dtype = next(self.parameters()).dtype
        shape = (self.n_layers, batch_size, self.hidden_dim)
        if self.rnn_type == 'lstm':
            return (torch.zeros(shape, dtype=dtype, device=device),
                    torch.zeros(shape, dtype=dtype, device=device))
        if self.rnn_type == 'gru':
            return torch.zeros(shape, dtype=dtype, device=device)
        return None

    def init_embedding_layer(self, init_weights=0.1):
        """Re-initialise the embedding weights uniformly in [-init_weights, init_weights]."""
        nn.init.uniform_(self.embeddings.weight, -init_weights, init_weights)

    def forward(self, x, hidden):
        """Run the model on a batch of token ids.

        Args:
            x: integer tensor of shape ``(batch, seq_len)``.
            hidden: recurrent state as produced by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``(batch * seq_len, vocab_size)`` (all time steps stacked) and
            ``hidden`` is the updated recurrent state.
        """
        embeds_out = self.dropout(self.embeddings(x.long()))
        rnn_out, hidden = self.rnn_layers(embeds_out, hidden)
        # Stack the outputs of all time steps into rows for the classifier.
        rnn_out = rnn_out.contiguous().view(-1, self.hidden_dim)
        logits = self.classifier(rnn_out)
        return logits, hidden

## Hyperparameters generator

In [0]:
# optimizers (SGD (nesterov=True))
#lrs=[1, 0.5, 0.4, 0.25] ,dropouts=[0, 0.1, 0.3, 0.5], (w_ds=[1e-3, 1e-4, 1e-5])
class HyperparamsConfig:
    """Expands ``{name: [candidate values]}`` into every combination of values."""

    def __init__(self, hyperparams_dict):
        self.hyperparams_dict = hyperparams_dict
        self.hyperparams_names = list(hyperparams_dict.keys())

    def create_configs(self):
        """Lazily yield one ``{name: value}`` dict per Cartesian-product element.

        Combinations follow ``itertools.product`` order: the last
        hyperparameter varies fastest.
        """
        names = self.hyperparams_names
        for combination in product(*self.hyperparams_dict.values()):
            config = {}
            for position, name in enumerate(names):
                config[name] = combination[position]
            yield config


## Model configuration per run 
* Create a new WordPredictor model according to a specific configuration (when starting at epoch 0)
* Enable loading a saved model together with its optimizer, scheduler and starting epoch

In [0]:
class ModelRun:
    """One training run: a WordPredictor plus its optimizer and LR scheduler.

    Args:
        config (dict): optimisation hyperparameters. Reads ``'optimizer_dict'``
            (with keys ``'optim_name'``, ``'optim_func'``, ``'optim_params'``),
            ``'lr'`` and optionally ``'weight_decay'``, ``'dropout'`` and
            ``'use_seq'``.
        params (dict): keyword arguments for ``WordPredictor``; the optional
            ``'max_grad_norm'`` entry (default 5) is also read here.
        epochs (int): total number of epochs; also the period passed to the
            cosine-annealing scheduler.
        device: device the model is moved to.
        trained_model_path (str): checkpoint to resume from. Ignored when it
            is empty or the file does not exist.
    """

    def __init__(self, config, params, epochs, device, trained_model_path=''):
        self.config = config
        self.epochs = epochs
        self.model_name = ''
        model_params = dict(params)
        model_params['dropout'] = config.get('dropout', 0.0)
        # 'use_seq' may be listed with the optimisation hyperparameters;
        # forward it so it is not silently ignored. An explicit value in
        # `params` takes precedence.
        if 'use_seq' in config:
            model_params.setdefault('use_seq', config['use_seq'])
        self.model = WordPredictor(**model_params)
        self.device = device
        self.max_grad_norm = params.get('max_grad_norm', 5)
        self.start_epoch = 0
        self.optimizer = self.set_optimizer(params.get('rnn_type'))
        self.scheduler = lr_scheduler.CosineAnnealingLR(self.optimizer, epochs)
        self.dict_ppl = None
        if trained_model_path != '' and os.path.exists(trained_model_path):
            self.load_model(trained_model_path)

    def set_optim_params(self, param_groups, params_dict):
        """Write optimizer settings into one optimizer parameter group.

        Args:
            param_groups (dict): a single parameter group (for example
                ``optimizer.param_groups[0]``), updated in place.
            params_dict (dict): extra optimizer settings such as momentum.
                It is NOT modified: the dict comes from the shared
                hyperparameter configuration, and writing ``weight_decay``
                into it would leak that value into later runs.
        """
        settings = dict(params_dict)
        weight_decay = self.config.get('weight_decay', 0.0)
        if weight_decay > 0:
            settings['weight_decay'] = weight_decay
        param_groups.update(settings)

    def set_optimizer(self, rnn_type):
        """Move the model to the device, name the run and build its optimizer."""
        model = self.model
        model.to(self.device)
        optimizer_dict = self.config['optimizer_dict']
        self.set_model_name(optimizer_dict['optim_name'], rnn_type)
        optimizer = optimizer_dict['optim_func'](model.parameters(), lr=self.config['lr'])
        # All parameters live in the single default group, so updating
        # group 0 configures the whole optimizer.
        self.set_optim_params(optimizer.param_groups[0], optimizer_dict['optim_params'])
        return optimizer

    def set_model_name(self, optim_name, rnn_type):
        """Compose the run name from the RNN type and the hyperparameters."""
        config = self.config
        lr = config.get('lr')
        weight_decay = config.get('weight_decay', 0.0)
        dropout = config.get('dropout', 0.0)
        self.model_name = f"WordPredictor_{rnn_type}_optimizer={optim_name}_" \
                          f"lr={lr}_dropout={dropout}_weight_decay={weight_decay}"

    def get_model(self):
        return self.model

    def get_device(self):
        return self.device

    def get_max_grad_norm(self):
        return self.max_grad_norm

    def get_optimizer(self):
        return self.optimizer

    def get_scheduler(self):
        return self.scheduler

    def get_model_name(self):
        return self.model_name

    def get_dict_ppl(self):
        return self.dict_ppl

    def get_epochs(self):
        return self.epochs

    def get_start_epoch(self):
        return self.start_epoch

    @staticmethod
    def _coerce_epoch(value):
        """Return the checkpoint epoch as an int.

        Checkpoints written with ``'epoch': {epoch+1}`` store a one-element
        set instead of a number; unwrap such containers.
        """
        if isinstance(value, (set, frozenset, list, tuple)):
            if len(value) != 1:
                raise ValueError(f"cannot read a single epoch from {value!r}")
            value = next(iter(value))
        return int(value)

    def load_model(self, path_model):
        """Restore model, optimizer, scheduler, perplexity history and epoch.

        The checkpoint must contain the keys ``'model_state_dict'``,
        ``'optimizer_state_dict'``, ``'scheduler_state_dict'``,
        ``'dict_ppl'`` and ``'epoch'``.
        """
        # map_location lets a checkpoint saved on GPU be restored on CPU.
        checkpoint = torch.load(path_model, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.dict_ppl = checkpoint['dict_ppl']
        self.start_epoch = self._coerce_epoch(checkpoint['epoch'])


## Run configuration
* fit and evaluate on validation
* scalar plot (tensorboard) and train-validation plot
* enable saving the model during training

In [0]:
class RunConfig:
    """Trains and evaluates the model of a ``ModelRun`` and tracks perplexity.

    Args:
        model_config: the ``ModelRun`` providing model, optimizer,
            scheduler, device, epochs and (when resuming) the perplexity
            history.
        save_model (bool): whether ``fit`` writes checkpoints.
        folder_checkpoint (str): folder the checkpoints are written to.
    """

    def __init__(self, model_config, save_model=True,
                 folder_checkpoint=''):
        self.model_config = model_config
        self.device = model_config.get_device()
        self.criterion = nn.CrossEntropyLoss()
        self.save_model = save_model
        self.model = model_config.get_model()
        self.optimizer = model_config.get_optimizer()
        self.scheduler = model_config.get_scheduler()
        self.epochs = model_config.get_epochs()
        self.max_grad_norm = model_config.get_max_grad_norm()
        self.start_epoch = model_config.get_start_epoch()
        self.model_name = model_config.get_model_name()
        self.folder_checkpoint = folder_checkpoint
        self.dict_ppl = model_config.get_dict_ppl()
        if self.dict_ppl is None:
            # Fresh run: start an empty per-epoch perplexity history.
            self.dict_ppl = OrderedDict([
                ('train', []),
                ('validation', [])
            ])

    @staticmethod
    def detach_hidden(model, h):
        """Cut the recurrent state off the autograd graph.

        This keeps back-propagation from reaching into earlier batches.
        Returns a tuple of tensors for ``'lstm'``, one tensor for ``'gru'``
        and ``None`` for any other rnn type.
        """
        if model.get_rnn_type() == 'lstm':
            return tuple(el.detach() for el in h)
        elif model.get_rnn_type() == 'gru':
            return h.detach()

    def get_dict_ppl(self):
        return self.dict_ppl

    def get_model_name(self):
        return self.model_name

    def get_epochs(self):
        return self.epochs

    def evaluate(self, eval_loader, batch_size, lst_ppl):
        """Append the perplexity of the model on ``eval_loader`` to ``lst_ppl``.

        Perplexity is ``exp`` of the mean per-batch cross-entropy loss. The
        model is put in eval mode for the pass and always switched back to
        train mode afterwards, even when an error interrupts the pass.

        Raises:
            ValueError: if ``eval_loader`` yields no batches.
        """
        device = self.device
        running_loss = 0.0
        n_batches = 0
        self.model.eval()
        try:
            with torch.no_grad():
                h = self.model.init_hidden(batch_size, device)
                for eval_seq, eval_labels in eval_loader:
                    eval_seq, eval_labels = eval_seq.to(device), eval_labels.to(device)
                    logits, h = self.model(eval_seq, h)
                    h = self.detach_hidden(self.model, h)
                    loss = self.criterion(logits, eval_labels.view(-1))
                    running_loss += loss.item()
                    n_batches += 1
        finally:
            self.model.train()
        if n_batches == 0:
            raise ValueError('eval_loader yielded no batches; cannot compute perplexity')
        # Store a plain float so the history holds no numpy scalars.
        lst_ppl.append(float(np.exp(running_loss / n_batches)))

    def save_checkpoint(self, epoch):
        """Write model/optimizer/scheduler state and the perplexity history.

        Args:
            epoch (int): zero-based index of the epoch just finished; the
                checkpoint stores ``epoch + 1`` (the epoch to resume from)
                and the file name carries the same number.
        """
        model_saved_name = self.model_name + f'_epoch={epoch+1}'
        full_path = os.path.join(self.folder_checkpoint,
                                 f'{model_saved_name}.pth')
        torch.save({'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'scheduler_state_dict': self.scheduler.state_dict(),
                    'dict_ppl': self.dict_ppl,
                    # a plain int (not a one-element set) so it can be used as start epoch
                    'epoch': epoch + 1}, full_path)

    def _train_one_epoch(self, model, training_set, batch_size):
        """Do one optimisation pass over ``training_set``; return the mean batch loss."""
        device = self.device
        h = model.init_hidden(batch_size, device)
        running_loss = 0.0
        n_batches = 0
        # NOTE(review): create_data_loader shuffles the batch order, so the
        # hidden state carried into a batch does not come from the text
        # that precedes it - confirm this is intended.
        for tr_seq, tr_labels in create_data_loader(training_set):
            tr_seq, tr_labels = tr_seq.to(device), tr_labels.to(device)
            self.optimizer.zero_grad()
            logits, h = model(tr_seq, h)
            h = self.detach_hidden(model, h)
            loss = self.criterion(logits, tr_labels.view(-1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            running_loss += loss.item()
            n_batches += 1
        if n_batches == 0:
            raise ValueError('training_set yielded no batches; nothing to train on')
        return running_loss / n_batches

    def fit(self, training_set, validation_set, batch_size):
        """Train from ``start_epoch`` to ``epochs`` and record perplexities.

        After every epoch the model is evaluated on the training and the
        validation batches (appended to ``dict_ppl``) and the scheduler is
        stepped. When ``save_model`` is set, a checkpoint is written after
        the last epoch and, from the third epoch on, whenever validation
        perplexity improved over the previous epoch.
        """
        names = list(self.dict_ppl.keys())
        lst_ppl_train = self.dict_ppl[names[0]]
        lst_ppl_val = self.dict_ppl[names[1]]
        model = self.model.to(self.device)
        start_epoch, epochs = self.start_epoch, self.epochs
        with trange(start_epoch, epochs, desc="Epochs", disable=False) as te:
            model.train()
            for epoch in te:
                mean_loss = self._train_one_epoch(model, training_set, batch_size)
                # Progress bar: perplexity measured while training (dropout active).
                te.set_postfix(PPL=np.exp(mean_loss))
                self.evaluate(create_data_loader(training_set), batch_size, lst_ppl_train)
                self.evaluate(create_data_loader(validation_set), batch_size, lst_ppl_val)

                self.scheduler.step()

                improved = epoch >= 2 and lst_ppl_val[-1] < lst_ppl_val[-2]
                if self.save_model and (improved or epoch == epochs - 1):
                    self.save_checkpoint(epoch)

    def print_ppls(self):
        """Print a table of train/validation perplexity per epoch."""
        names = list(self.dict_ppl.keys())
        lst_ppl_train = self.dict_ppl[names[0]]
        lst_ppl_val = self.dict_ppl[names[1]]
        t = PrettyTable(['Epoch', f'{names[0]} ppl', f'{names[1]} ppl'])
        for i in range(len(lst_ppl_train)):
            t.add_row([i+1, lst_ppl_train[i], lst_ppl_val[i]])
        print(t)

# **Plots**
* write to tensorboard when tuning on validation set
* make plots after choosing best hyperparams and check validation

In [0]:
# %load_ext tensorboard
# %reload_ext tensorboard
# %tensorboard --logdir=runs

In [0]:
# %kill 1135 #(or !kill 1135)
# !kill 438

In [0]:
# writer for tuning hyperparameters (on validation set)
def plot_hyperparams(dict_ppl, model_name): # start_epoch
    """Log the per-epoch perplexities of one run to TensorBoard.

    Args:
        dict_ppl: ordered mapping whose first two entries are the training
            and validation perplexity lists (one value per epoch).
        model_name: run name; the curves are written under the tag
            ``perplexity/<model_name>`` with steps numbered from 1.
    """
    names = list(dict_ppl.keys())
    ppl_train = dict_ppl[names[0]]
    ppl_val = dict_ppl[names[1]]
    writer = SummaryWriter()
    try:
        for step, train_value in enumerate(ppl_train, start=1):
            tag_scalar_dict = {names[0]: train_value, names[1]: ppl_val[step - 1]}
            writer.add_scalars(f'perplexity/{model_name}', tag_scalar_dict, step)
    finally:
        # Close the event file even if logging fails part-way.
        writer.close()

# save plots of best models
def plot_perplexity(dict_ppl, model_name, folder_plot, 
                  y_label='Perplexity', x_label='Epochs'):  # start_epoch
    """Save a train/validation perplexity line plot as a PNG file.

    Args:
        dict_ppl: mapping of series name -> list of per-epoch perplexities
            (all lists of equal length).
        model_name: used for the plot title and the file name.
        folder_plot: existing folder the image is written to.
        y_label, x_label: axis labels.

    The file is named ``<model_name>_<yymmdd_HHMMSS>.png``.
    """
    df_ppl = pd.DataFrame(dict_ppl)
    n_epochs = df_ppl.shape[0]
    # Number the epochs from 1 on the x axis.
    df_ppl = df_ppl.set_index(pd.Index(range(1, n_epochs + 1)))
    # Draw on a dedicated figure instead of pyplot's implicit "current" one,
    # so the plot cannot land on (or close) an unrelated open figure.
    fig, ax = plt.subplots()
    try:
        sns.lineplot(data=df_ppl, ax=ax)
        # Break the long run name before the hyperparameters so the title fits.
        ax.set_title(model_name.replace('_lr=','\nlr='))
        ax.set_ylabel(y_label)
        ax.set_xlim(1, n_epochs)
        ax.set_xlabel(x_label)
        fig.tight_layout()
        fig.savefig(os.path.join(folder_plot,f"{model_name}_{datetime.now().strftime('%y%m%d_%H%M%S')}.png"))
    finally:
        plt.close(fig)



# main

In [0]:
# Run-wide settings used by main(): compute device, number of epochs, and the
# output folders (created under the current working directory if missing).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 30
FOLDER_CHECKPOINT = os.path.join(os.getcwd(), 'checkpoint')
FOLDER_PLOT = os.path.join(os.getcwd(), "plots")
os.makedirs(FOLDER_CHECKPOINT,exist_ok=True)
os.makedirs(FOLDER_PLOT,exist_ok=True)


def main(**args):
    """Train and evaluate one model per (hyperparameter, model) combination.

    Keyword arguments:
        dict_h_params: ``{name: [values]}`` grid of optimisation hyperparameters.
        dict_m_params: ``{name: [values]}`` grid of WordPredictor parameters (required dict).
        training_set, validation_set: batch lists from ``create_dataset``.
        save_model (default True): write checkpoints during training.
        scalar_plot (default False): log the perplexities to TensorBoard.
        tr_val_plot (default False): save a train/validation plot as PNG.
        file_saved_model (default ''): checkpoint name (without ``.pth``)
            inside FOLDER_CHECKPOINT to resume from; used for every combination.
    """
    # Re-seed so every call starts from the same random state.
    torch.manual_seed(SEED)
    random.seed(SEED)

    save_model = args.get('save_model', True)
    scalar_plot = args.get('scalar_plot', False)
    tr_val_plot = args.get('tr_val_plot', False)

    file_saved_model = args.get('file_saved_model', '')
    if file_saved_model == '':
        trained_model_path = ''
    else:
        trained_model_path = os.path.join(os.getcwd(), FOLDER_CHECKPOINT,
                                          f'{file_saved_model}.pth')

    config_h_params = HyperparamsConfig(args.get('dict_h_params'))
    dict_m_params = args.get('dict_m_params', None)
    assert isinstance(dict_m_params, dict) , "model_params should be a dict of params to WordPredictor class"
    config_m_params = HyperparamsConfig(dict_m_params)
    training_set = args.get('training_set')
    validation_set = args.get('validation_set')

    for h_config in config_h_params.create_configs():
        print(f'hyper params config:\n{h_config}')
        for m_config in config_m_params.create_configs():
            print(f'model config:\n{m_config}')
            run = RunConfig(
                ModelRun(h_config, m_config, EPOCHS, DEVICE, trained_model_path),
                save_model,
                FOLDER_CHECKPOINT,
            )
            run.fit(training_set, validation_set, BATCH_SIZE)
            run.print_ppls()
            history = run.get_dict_ppl()
            run_name = run.get_model_name()
            if scalar_plot:
                plot_hyperparams(history, run_name)
            if tr_val_plot:
                plot_perplexity(history, run_name, FOLDER_PLOT)
        


## Find best hyperparameters using validation set for each regularization

In [0]:
# Hyperparameter grids: every value is a list of candidates and main() runs
# all combinations.

# Setting without dropout (weight decay only), SGD with Nesterov momentum.
params_no_reg = OrderedDict([
                             ('optimizer_dict', 
                              [{'optim_name': 'SGD', 'optim_func': optim.SGD, 'optim_params': { 'momentum':0.9, 'nesterov': True}}]),
                             ('lr', [0.5]), #best 0.5
                             ('weight_decay', [8e-5]), #best 8e-5
                             ('dropout', [0.0]),
                             ('use_seq', [False])
                             ])

# Setting with dropout, SGD with plain (non-Nesterov) momentum.
params_dropout = OrderedDict([
                              ('optimizer_dict',
                               [{'optim_name': 'SGD', 'optim_func': optim.SGD, 'optim_params': { 'momentum':0.9, 'nesterov': False}}]),
                              ('lr', [0.5]),
                             ('weight_decay', [6e-5]),
                             ('dropout', [0.2]), 
                             ('use_seq', [False])
                             ])


# Model grid: both RNN types are trained for every hyperparameter setting.
params_model = OrderedDict([
                            ('vocab_size', [10000]), ('hidden_dim', [200]),
                            ('embedding_dim', [200]), ('max_grad_norm', [5]), 
                            ('rnn_type', ['lstm', 'gru']), ('n_layers', [2])
                            ])

# [{'optim_name': 'Adam', 'optim_func': optim.Adam, 'optim_params': {}}]),

In [19]:
# Train LSTM and GRU without dropout; the perplexities are logged to TensorBoard
# (scalar_plot=True) and checkpoints are saved (save_model defaults to True).
main(dict_h_params=params_no_reg,dict_m_params=params_model,
     training_set=training_set, validation_set=validation_set, scalar_plot=True)

hyper params config:
{'optimizer_dict': {'optim_name': 'SGD', 'optim_func': <class 'torch.optim.sgd.SGD'>, 'optim_params': {'momentum': 0.9, 'nesterov': True}}, 'lr': 0.5, 'weight_decay': 8e-05, 'dropout': 0.0, 'use_seq': False}
model config:
{'vocab_size': 10000, 'hidden_dim': 200, 'embedding_dim': 200, 'max_grad_norm': 5, 'rnn_type': 'lstm', 'n_layers': 2}


Epochs: 100%|██████████| 30/30 [11:13<00:00, 22.47s/it, PPL=76.2]


+-------+--------------------+--------------------+
| Epoch |     train ppl      |   validation ppl   |
+-------+--------------------+--------------------+
|   1   | 242.83639222768744 |  255.657237425029  |
|   2   | 196.49533537222047 |  216.80169901546   |
|   3   | 173.21445561826874 | 193.90740066975135 |
|   4   | 154.41425648884325 | 177.52847914918655 |
|   5   | 144.52755444060776 | 169.67785502292259 |
|   6   | 138.5620114201827  | 166.84398077586366 |
|   7   | 133.55747455313735 | 160.19204124142692 |
|   8   | 126.35015039799192 | 156.59096681512244 |
|   9   | 121.6599925734506  | 151.35376905600248 |
|   10  | 119.98617526111435 | 152.69672949419368 |
|   11  |  115.689786535443  | 148.0124090104115  |
|   12  | 111.43327106011913 | 143.81195109394446 |
|   13  | 109.9887209818037  | 142.3001638523316  |
|   14  | 105.78587238072492 | 139.41521957370662 |
|   15  | 104.39539881135778 | 138.38587915389948 |
|   16  | 101.72722373315679 | 136.93636534211305 |
|   17  | 10

Epochs: 100%|██████████| 30/30 [10:47<00:00, 21.58s/it, PPL=52]

+-------+--------------------+--------------------+
| Epoch |     train ppl      |   validation ppl   |
+-------+--------------------+--------------------+
|   1   | 214.76430807732635 | 236.38440097502212 |
|   2   | 168.22118238994702 | 197.66910132999692 |
|   3   | 146.4426587783555  | 181.2688765246167  |
|   4   | 124.98825487506203 | 156.90817319361085 |
|   5   | 114.36954493213622 |  150.27512423402   |
|   6   | 107.27670072456446 | 147.2783974959482  |
|   7   | 111.39265011981804 | 157.56142428215566 |
|   8   | 96.28858795259029  | 138.66098526505235 |
|   9   |  98.3523848030492  | 143.04574825551208 |
|   10  |  91.4629315763579  |  137.621978100531  |
|   11  |  88.9765327069539  | 138.66851139362402 |
|   12  |  85.674565236024   | 135.10610806898623 |
|   13  | 82.40724148120034  | 131.21234442531224 |
|   14  |  78.5638912554054  | 128.84017377804201 |
|   15  | 76.47932743599243  | 127.79410301248983 |
|   16  | 73.86046749647666  | 124.8537051008861  |
|   17  | 71




In [20]:
# Same runs with the dropout setting.
print('Networks with dropout:')
main(dict_h_params=params_dropout,dict_m_params=params_model,
     training_set=training_set, validation_set=validation_set, scalar_plot=True)

Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Networks with dropout:
hyper params config:
{'optimizer_dict': {'optim_name': 'SGD', 'optim_func': <class 'torch.optim.sgd.SGD'>, 'optim_params': {'momentum': 0.9, 'nesterov': False}}, 'lr': 0.5, 'weight_decay': 6e-05, 'dropout': 0.2, 'use_seq': False}
model config:
{'vocab_size': 10000, 'hidden_dim': 200, 'embedding_dim': 200, 'max_grad_norm': 5, 'rnn_type': 'lstm', 'n_layers': 2}


Epochs: 100%|██████████| 30/30 [12:03<00:00, 24.11s/it, PPL=81.9]
Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

+-------+--------------------+--------------------+
| Epoch |     train ppl      |   validation ppl   |
+-------+--------------------+--------------------+
|   1   |  260.382511606011  | 269.7536268506337  |
|   2   | 205.2458361989828  | 222.8421769286301  |
|   3   | 182.96359992032228 | 201.4032558850453  |
|   4   | 161.51175296843118 | 183.69245087141078 |
|   5   | 144.70331879447286 | 168.4434739406028  |
|   6   | 138.11597719631786 | 165.39420411636198 |
|   7   |  130.071930844337  | 157.9722322866992  |
|   8   | 121.83593069592322 | 152.33585189942494 |
|   9   | 116.55887371378026 | 146.0687608336205  |
|   10  | 115.65676271378771 | 148.5413534303513  |
|   11  | 111.6100513547887  | 145.2650170414873  |
|   12  | 107.09976436981458 |  140.746246170145  |
|   13  | 105.06680416132264 | 139.2064206548465  |
|   14  | 101.54182664350047 | 136.17669465684347 |
|   15  | 98.64249537093681  | 133.46512203725192 |
|   16  | 98.67333557538562  | 134.08295878269078 |
|   17  | 94

Epochs: 100%|██████████| 30/30 [11:26<00:00, 22.89s/it, PPL=63.4]

+-------+--------------------+--------------------+
| Epoch |     train ppl      |   validation ppl   |
+-------+--------------------+--------------------+
|   1   | 225.81593939621342 | 247.01578355226283 |
|   2   | 181.13651938899392 |  210.672496103996  |
|   3   | 163.9386439079125  | 199.4482157477377  |
|   4   | 135.95960620615108 | 168.39635807315454 |
|   5   | 124.34327889649965 | 161.07194747222388 |
|   6   | 115.18130738046696 | 155.1700273674772  |
|   7   | 110.69779489017861 | 154.16217569939445 |
|   8   | 100.20441627123438 | 144.1866130738108  |
|   9   | 100.94763394129326 | 147.9198826790933  |
|   10  | 95.31597494472466  | 143.06355387282085 |
|   11  | 90.77878021750382  | 140.85435605804128 |
|   12  |  85.6548826453511  | 135.6659577003743  |
|   13  | 84.50662326167934  | 134.33577703585505 |
|   14  | 80.54160821932379  | 132.07296470813884 |
|   15  | 77.48244273917352  | 127.20258571862426 |
|   16  | 75.07375863265366  | 125.29586237837114 |
|   17  | 73




## Run on test set with best hyperparameters for each regularization

In [0]:
# Selected ("best") settings. The values are the same as in params_no_reg /
# params_dropout defined in the tuning section.
params_no_reg_best = OrderedDict([
                             ('optimizer_dict', 
                              [{'optim_name': 'SGD', 'optim_func': optim.SGD, 'optim_params': { 'momentum':0.9, 'nesterov': True}}]),
                             ('lr', [0.5]), #best 0.5
                             ('weight_decay', [8e-5]), #best 8e-5
                             ('dropout', [0.0]),
                             ('use_seq', [False])
                             ])

params_dropout_best = OrderedDict([
                              ('optimizer_dict',
                               [{'optim_name': 'SGD', 'optim_func': optim.SGD, 'optim_params': { 'momentum':0.9, 'nesterov': False}}]),
                              ('lr', [0.5]),
                             ('weight_decay', [6e-5]),
                             ('dropout', [0.2]), 
                             ('use_seq', [False])
                             ])


# Re-declares params_model with the same contents as the tuning section.
params_model = OrderedDict([
                            ('vocab_size', [10000]), ('hidden_dim', [200]),
                            ('embedding_dim', [200]), ('max_grad_norm', [5]), 
                            ('rnn_type', ['lstm', 'gru']), ('n_layers', [2])
                            ])

# [{'optim_name': 'Adam', 'optim_func': optim.Adam, 'optim_params': {}}]),

In [22]:
# Re-train with the best hyperparameters and save train/validation perplexity
# plots (tr_val_plot=True) without writing checkpoints (save_model=False).
# Only the training and validation sets are passed to these calls; the test
# set is not used here.
main(dict_h_params=params_no_reg_best,dict_m_params=params_model, training_set=training_set,
     validation_set=validation_set, tr_val_plot=True, save_model=False)

main(dict_h_params=params_dropout_best,dict_m_params=params_model, training_set=training_set,
     validation_set=validation_set, tr_val_plot=True, save_model=False)

Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

hyper params config:
{'optimizer_dict': {'optim_name': 'SGD', 'optim_func': <class 'torch.optim.sgd.SGD'>, 'optim_params': {'momentum': 0.9, 'nesterov': True}}, 'lr': 0.5, 'weight_decay': 8e-05, 'dropout': 0.0, 'use_seq': False}
model config:
{'vocab_size': 10000, 'hidden_dim': 200, 'embedding_dim': 200, 'max_grad_norm': 5, 'rnn_type': 'lstm', 'n_layers': 2}


Epochs: 100%|██████████| 30/30 [11:11<00:00, 22.37s/it, PPL=76.2]


+-------+--------------------+--------------------+
| Epoch |     train ppl      |   validation ppl   |
+-------+--------------------+--------------------+
|   1   | 242.83639222768744 |  255.657237425029  |
|   2   | 196.49533537222047 |  216.80169901546   |
|   3   | 173.21445561826874 | 193.90740066975135 |
|   4   | 154.41425648884325 | 177.52847914918655 |
|   5   | 144.52755444060776 | 169.67785502292259 |
|   6   | 138.5620114201827  | 166.84398077586366 |
|   7   | 133.55747455313735 | 160.19204124142692 |
|   8   | 126.35015039799192 | 156.59096681512244 |
|   9   | 121.6599925734506  | 151.35376905600248 |
|   10  | 119.98617526111435 | 152.69672949419368 |
|   11  |  115.689786535443  | 148.0124090104115  |
|   12  | 111.43327106011913 | 143.81195109394446 |
|   13  | 109.9887209818037  | 142.3001638523316  |
|   14  | 105.78587238072492 | 139.41521957370662 |
|   15  | 104.39539881135778 | 138.38587915389948 |
|   16  | 101.72722373315679 | 136.93636534211305 |
|   17  | 10

Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

model config:
{'vocab_size': 10000, 'hidden_dim': 200, 'embedding_dim': 200, 'max_grad_norm': 5, 'rnn_type': 'gru', 'n_layers': 2}


Epochs: 100%|██████████| 30/30 [10:45<00:00, 21.53s/it, PPL=52]


+-------+--------------------+--------------------+
| Epoch |     train ppl      |   validation ppl   |
+-------+--------------------+--------------------+
|   1   | 214.76430807732635 | 236.38440097502212 |
|   2   | 168.22118238994702 | 197.66910132999692 |
|   3   | 146.4426587783555  | 181.2688765246167  |
|   4   | 124.98825487506203 | 156.90817319361085 |
|   5   | 114.36954493213622 |  150.27512423402   |
|   6   | 107.27670072456446 | 147.2783974959482  |
|   7   | 111.39265011981804 | 157.56142428215566 |
|   8   | 96.28858795259029  | 138.66098526505235 |
|   9   |  98.3523848030492  | 143.04574825551208 |
|   10  |  91.4629315763579  |  137.621978100531  |
|   11  |  88.9765327069539  | 138.66851139362402 |
|   12  |  85.674565236024   | 135.10610806898623 |
|   13  | 82.40724148120034  | 131.21234442531224 |
|   14  |  78.5638912554054  | 128.84017377804201 |
|   15  | 76.47932743599243  | 127.79410301248983 |
|   16  | 73.86046749647666  | 124.8537051008861  |
|   17  | 71

Epochs: 100%|██████████| 30/30 [12:00<00:00, 24.01s/it, PPL=81.7]


+-------+--------------------+--------------------+
| Epoch |     train ppl      |   validation ppl   |
+-------+--------------------+--------------------+
|   1   | 260.7934640400719  | 270.2426872968723  |
|   2   | 205.67060212728833 | 223.18379003012893 |
|   3   | 183.9914408173782  | 202.55476110097305 |
|   4   | 161.49613580341494 | 183.78705720664078 |
|   5   | 144.55049294567638 | 168.19058603518266 |
|   6   | 139.22626601342043 | 166.17952513966824 |
|   7   | 129.9499897392572  |   158.0979671734   |
|   8   | 120.60693634115523 | 151.5363615323387  |
|   9   | 115.27690322193814 | 144.94955488032414 |
|   10  | 115.59700658249977 | 148.9242409367939  |
|   11  | 111.10105550232704 | 145.28316606333033 |
|   12  | 107.6052509453132  | 141.58332052092572 |
|   13  | 105.03827676721092 | 139.43575883912618 |
|   14  | 102.01255448449119 | 136.80491278582053 |
|   15  | 98.48208454531223  | 133.92053978921706 |
|   16  | 98.01422437583226  | 133.49321745165798 |
|   17  | 94

Epochs: 100%|██████████| 30/30 [11:25<00:00, 22.84s/it, PPL=63.6]

+-------+--------------------+--------------------+
| Epoch |     train ppl      |   validation ppl   |
+-------+--------------------+--------------------+
|   1   | 221.32402615658307 | 242.11712771366413 |
|   2   | 176.56560650646955 | 206.27054279416555 |
|   3   | 158.4696762912484  | 193.33306781380503 |
|   4   | 134.17017168421475 | 166.63616394139416 |
|   5   | 122.48399250305819 | 159.5985186739808  |
|   6   | 113.80788932391006 | 154.58421074387547 |
|   7   | 108.01605220740352 | 150.48418399175293 |
|   8   | 98.43622708351354  | 141.4513151150974  |
|   9   | 100.16601014929758 | 146.46354841939274 |
|   10  | 95.17395418717469  | 143.0365228238252  |
|   11  |  90.8341760471471  | 140.55523795908692 |
|   12  | 87.25519621910908  | 138.14530932393708 |
|   13  |  84.0078100323804  | 133.99426497180858 |
|   14  | 80.50278935557961  | 131.99364159088148 |
|   15  | 77.46253526045692  | 127.5571432346587  |
|   16  | 75.20934045133171  | 125.44148352510395 |
|   17  | 73




## Test pre-trained model


In [0]:
# DEVICE and EPOCHS are re-declared here with the same values as in the main
# section.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 30
# FOLDER_CHECKPOINT was built from os.getcwd() as well; os.path.join keeps only
# the last absolute component, so this resolves to FOLDER_CHECKPOINT itself.
FOLDER_MODEL = os.path.join(os.getcwd(), FOLDER_CHECKPOINT)

def evaluate_final_model(model, eval_loader, batch_size, criterion, device):
    """Compute the perplexity of ``model`` over every batch of ``eval_loader``.

    The model is switched to evaluation mode (and left in that mode) and
    gradient tracking is disabled. The hidden state is created once with
    ``model.init_hidden`` and carried over from one batch to the next.

    Args:
        model: network exposing ``eval()``, ``init_hidden(batch_size, device)``
            and a forward call ``model(seq, h) -> (logits, h)``.
        eval_loader: iterable yielding ``(sequence, labels)`` batches.
        batch_size: batch size forwarded to ``model.init_hidden``.
        criterion: loss callable, invoked as
            ``criterion(logits, labels.view(-1))``.
        device: device the batches are moved to and the hidden state is
            created for.

    Returns:
        ``np.exp`` of the mean of the per-batch losses (every batch is
        weighted equally, regardless of its size).

    Raises:
        ValueError: if ``eval_loader`` yields no batches, since the mean loss
            is undefined in that case.
    """
    total_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        model.eval()
        h = model.init_hidden(batch_size, device)
        for eval_seq, eval_labels in eval_loader:
            eval_seq, eval_labels = eval_seq.to(device), eval_labels.to(device)
            logits, h = model(eval_seq, h)
            # Keep only the values of the hidden state between batches.
            h = detach_hidden(model, h)
            loss = criterion(logits, eval_labels.view(-1))
            total_loss += loss.item()
            n_batches += 1
    if n_batches == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("eval_loader yielded no batches; cannot compute perplexity")
    return np.exp(total_loss / n_batches)

def detach_hidden(model, h):
    """Return the hidden state ``h`` detached from the autograd graph.

    Args:
        model: object whose ``get_rnn_type()`` returns ``'lstm'`` or ``'gru'``.
        h: the hidden state; an iterable of tensors for ``'lstm'``, a single
            tensor for ``'gru'``.

    Returns:
        A tuple of detached tensors for ``'lstm'``, or one detached tensor for
        ``'gru'``.

    Raises:
        ValueError: if ``model.get_rnn_type()`` is neither ``'lstm'`` nor
            ``'gru'`` (previously this case silently returned ``None``).
    """
    rnn_type = model.get_rnn_type()
    # ``Tensor.detach()`` is used instead of ``.data``: both drop the graph
    # history, but ``detach()`` keeps autograd's in-place-modification checks.
    if rnn_type == 'lstm':
        return tuple(el.detach() for el in h)
    if rnn_type == 'gru':
        return h.detach()
    raise ValueError(f"unsupported rnn type: {rnn_type!r}")

# Evaluate one saved model on every dataset split it is given.
def get_best_model_ppl(dict_best_h_params, model_params_best, dict_dataset,
                       model_best_path):
    """Build the model for one checkpoint and tabulate its perplexity per split.

    Args:
        dict_best_h_params: hyperparameter dictionary handed to
            ``HyperparamsConfig``; only the first configuration it generates
            is used.
        model_params_best: model parameters passed on to ``ModelRun``.
        dict_dataset: mapping of split name -> dataset. One ``'<name> ppl'``
            column is produced per entry, in iteration order.
        model_best_path: path passed on to ``ModelRun`` (presumably the
            checkpoint file to load -- confirm against ``ModelRun``).

    Returns:
        list: the model name followed by one perplexity value per split. The
        same row is also printed as a ``PrettyTable``.
    """
    first_config = next(HyperparamsConfig(dict_best_h_params).create_configs())
    runner = ModelRun(first_config, model_params_best, EPOCHS, DEVICE,
                      model_best_path)
    net = runner.get_model()
    loss_fn = nn.CrossEntropyLoss()

    split_names = list(dict_dataset.keys())
    model_name = runner.get_model_name()
    table = PrettyTable(['Model'] + [f'{split_name} ppl' for split_name in split_names])

    # First cell is the model name, then one perplexity per split.
    row = [model_name]
    row.extend(
        evaluate_final_model(net, create_data_loader(split_data),
                             BATCH_SIZE, loss_fn, DEVICE)
        for split_data in dict_dataset.values()
    )
    table.add_row(row)
    print(table)
    return row

* Load the best hyperparameters for each regularization setting (no regularization and dropout)
* Print the best perplexity on the training/validation/test sets

In [0]:
# Best hyperparameters found without regularization. Every value is a
# single-element list, the form this notebook passes to HyperparamsConfig.
params_no_reg_best = OrderedDict()
params_no_reg_best['optimizer_dict'] = [{
    'optim_name': 'SGD',
    'optim_func': optim.SGD,
    'optim_params': {'momentum': 0.9, 'nesterov': True},
}]
params_no_reg_best['lr'] = [0.5]
params_no_reg_best['weight_decay'] = [8e-5]
params_no_reg_best['dropout'] = [0.0]
params_no_reg_best['use_seq'] = [False]

# Best hyperparameters found with dropout; note nesterov is off here.
params_dropout_best = OrderedDict()
params_dropout_best['optimizer_dict'] = [{
    'optim_name': 'SGD',
    'optim_func': optim.SGD,
    'optim_params': {'momentum': 0.9, 'nesterov': False},
}]
params_dropout_best['lr'] = [0.5]
params_dropout_best['weight_decay'] = [6e-5]
params_dropout_best['dropout'] = [0.2]
params_dropout_best['use_seq'] = [False]

# Dataset splits to evaluate; the insertion order fixes the column order of
# the printed results table.
dict_dataset = OrderedDict(zip(
    ('train', 'validation', 'test'),
    (training_set, validation_set, test_set),
))

# Model parameters of the saved networks; the LSTM and GRU variants differ
# only in 'rnn_type'.
params_model_lstm_best = OrderedDict(
    vocab_size=10000,
    hidden_dim=200,
    embedding_dim=200,
    max_grad_norm=5,
    rnn_type='lstm',
    n_layers=2,
)
params_model_gru_best = OrderedDict(
    vocab_size=10000,
    hidden_dim=200,
    embedding_dim=200,
    max_grad_norm=5,
    rnn_type='gru',
    n_layers=2,
)

# Checkpoint file names (without extension) of the four pre-trained models.
model_lstm_no_reg_filename = 'WordPredictor_lstm_optimizer=SGD_lr=0.5_dropout=0.0_weight_decay=8e-05_epoch=30'
model_lstm_dropout_filename = 'WordPredictor_lstm_optimizer=SGD_lr=0.5_dropout=0.2_weight_decay=6e-05_epoch=30'
model_gru_no_reg_filename = 'WordPredictor_gru_optimizer=SGD_lr=0.5_dropout=0.0_weight_decay=8e-05_epoch=30'
model_gru_dropout_filename = 'WordPredictor_gru_optimizer=SGD_lr=0.5_dropout=0.2_weight_decay=6e-05_epoch=30'

In [46]:

# Resolve the checkpoint path of each pre-trained model up front.
model_lstm_no_reg_best_path = os.path.join(FOLDER_MODEL, model_lstm_no_reg_filename + '.pth')
model_gru_no_reg_best_path = os.path.join(FOLDER_MODEL, model_gru_no_reg_filename + '.pth')
model_lstm_dropout_best_path = os.path.join(FOLDER_MODEL, model_lstm_dropout_filename + '.pth')
model_gru_dropout_best_path = os.path.join(FOLDER_MODEL, model_gru_dropout_filename + '.pth')

# Models trained without regularization: LSTM first, then GRU.
print('pre-trained no regularization (best) perplexity:')
get_best_model_ppl(params_no_reg_best, params_model_lstm_best,
                   dict_dataset, model_lstm_no_reg_best_path)
get_best_model_ppl(params_no_reg_best, params_model_gru_best,
                   dict_dataset, model_gru_no_reg_best_path)

# Models trained with dropout: LSTM first, then GRU. The final call is left
# as the cell's last expression so that its returned row is displayed.
print('pre-trained dropout (best) perplexity:')
get_best_model_ppl(params_dropout_best, params_model_lstm_best,
                   dict_dataset, model_lstm_dropout_best_path)
get_best_model_ppl(params_dropout_best, params_model_gru_best,
                   dict_dataset, model_gru_dropout_best_path)

pre-trained no regularization (best) perplexity:
+------------------------------------------------------------------------+------------------+--------------------+--------------------+
|                                 Model                                  |    train ppl     |   validation ppl   |      test ppl      |
+------------------------------------------------------------------------+------------------+--------------------+--------------------+
| WordPredictor_lstm_optimizer=SGD_lr=0.5_dropout=0.0_weight_decay=8e-05 | 75.9019803664578 | 121.05276656648577 | 117.27697678328056 |
+------------------------------------------------------------------------+------------------+--------------------+--------------------+
+-----------------------------------------------------------------------+-------------------+--------------------+-------------------+
|                                 Model                                 |     train ppl     |   validation ppl   |      test ppl     |
+

['WordPredictor_gru_optimizer=SGD_lr=0.5_dropout=0.2_weight_decay=6e-05',
 57.13682047669609,
 112.33672053903588,
 109.04050665553807]