# Implement and train a LSTM for sentiment analysis

(General Hint on Lab 1/2: Trust whatever you see from the training and report it on PDF. IDMB is far from ideal as it's more like a real-world dataset)

## Step 0: set up the environment

In [2]:
import functools
import sys
import numpy as np
import pandas as pd
import random
import re
import matplotlib.pyplot as plt
import tqdm
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

nltk.download('stopwords')

torch.backends.cudnn.benchmark = True

import os
os.makedirs("resources", exist_ok=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Hyperparameters. Do not directly touch this to mess up settings.

If you want to initalize new hyperparameter sets, use "new_hparams = HyperParams()" and change corresponding fields.

In [3]:
class HyperParams:
    def __init__(self):
        # Constance hyperparameters. They have been tested and don't need to be tuned.
        self.PAD_INDEX = 0
        self.UNK_INDEX = 1
        self.PAD_TOKEN = '<pad>'
        self.UNK_TOKEN = '<unk>'
        self.STOP_WORDS = set(stopwords.words('english'))
        self.MAX_LENGTH = 256
        self.BATCH_SIZE = 96
        self.EMBEDDING_DIM = 1
        self.HIDDEN_DIM = 100
        self.OUTPUT_DIM = 2
        self.N_LAYERS = 1
        self.DROPOUT_RATE = 0.0
        self.LR = 0.01
        self.N_EPOCHS = 5
        self.WD = 0
        self.OPTIM = "sgd"
        self.BIDIRECTIONAL = False
        self.SEED = 2

## Lab 1(a) Implement your own data loader function.  
First, you need to read the data from the dataset file on the local disk. 
Then, split the dataset into three sets: train, validation and test by 7:1:2 ratio.
Finally return x_train, x_valid, x_test, y_train, y_valid, y_test where x represents reviews and y represent labels.  

In [4]:
def load_imdb(base_csv:str = './IMDBDataset.csv'):
    """
    Load the IMDB dataset
    :param base_csv: the path of the dataset file.
    :return: train, validation and test set.
    """
    # Add your code here. 

    imdb_data = pd.read_csv(base_csv)
    X = imdb_data.iloc[:, 0]
    y = imdb_data.iloc[:, 1]

    x_not_test, x_test, y_not_test, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    
    x_train, x_valid, y_train, y_valid = train_test_split(x_not_test, y_not_test, test_size = 0.125, random_state = 0)
    
    
    print(f'shape of train data is {x_train.shape}')
    print(f'shape of test data is {x_test.shape}')
    print(f'shape of valid data is {x_valid.shape}')
    return x_train, x_valid, x_test, y_train, y_valid, y_test

## Lab 1(b): Implement your function to build a vocabulary based on the training corpus.
Implement the build_vocab function to build a vocabulary based on the training corpus.
You should first compute the frequency of all the words in the training corpus. Remove the words
that are in the STOP_WORDS. Then filter the words by their frequency (≥ min_freq) and finally
generate a corpus variable that contains a list of words.

In [5]:
def build_vocab(x_train:list, min_freq: int=5, hparams=None) -> dict:
    """
    build a vocabulary based on the training corpus.
    :param x_train:  List. The training corpus. Each sample in the list is a string of text.
    :param min_freq: Int. The frequency threshold for selecting words.
    :return: dictionary {word:index}
    """
    # Add your code here. Your code should assign corpus with a list of words.

    params = HyperParams()
    stop_words = params.STOP_WORDS

    corpus = {}
    for review in x_train:
      #Should I be removing neighboring punctuation (parentheses, br, comma, semicolon)
      review_words = review.split(' ')

      for current_word in review_words:
        corpus[current_word] = corpus.get(current_word, 0) + 1

    for stop_word in stop_words:
      if stop_word in corpus:
        corpus.pop(stop_word)
    
    # sorting on the basis of most common words
    # corpus_ = sorted(corpus, key=corpus.get, reverse=True)[:1000]
    corpus_ = [word for word, freq in corpus.items() if freq >= min_freq]
    # creating a dict
    vocab = {w:i+2 for i, w in enumerate(corpus_)}
    vocab[hparams.PAD_TOKEN] = hparams.PAD_INDEX
    vocab[hparams.UNK_TOKEN] = hparams.UNK_INDEX
    return vocab


## Lab 1(c): Implement your tokenize function. 
For each word, find its index in the vocabulary. 
Return a list of int that represents the indices of words in the example. 

In [6]:
def tokenize(vocab: dict, example: str)-> list:
    """
    Tokenize the give example string into a list of token indices.
    :param vocab: dict, the vocabulary.
    :param example: a string of text.
    :return: a list of token indices.
    """
    # Your code here.
    return [vocab[current_word] for current_word in example.split(" ") if current_word in vocab]

## Lab 1 (d): Implement the __getitem__ function. Given an index i, you should return the i-th review and label. 
The review is originally a string. Please tokenize it into a sequence of token indices. 
Use the max_length parameter to truncate the sequence so that it contains at most max_length tokens. 
Convert the label string ('positive'/'negative') to a binary index. 'positive' is 1 and 'negative' is 0. 
Return a dictionary containing three keys: 'ids', 'length', 'label' which represent the list of token ids, the length of the sequence, the binary label. 

In [7]:
class IMDB(Dataset):
    def __init__(self, x, y, vocab, max_length=256) -> None:
        """
        :param x: list of reviews
        :param y: list of labels
        :param vocab: vocabulary dictionary {word:index}.
        :param max_length: the maximum sequence length.
        """
        self.x = x
        self.y = y
        self.vocab = vocab
        self.max_length = max_length

    def __getitem__(self, idx: int):
        """
        Return the tokenized review and label by the given index.
        :param idx: index of the sample.
        :return: a dictionary containing three keys: 'ids', 'length', 'label' which represent the list of token ids, the length of the sequence, the binary label.
        """
        # Add your code here.

        current_review = self.x.iloc[idx]

        my_item = {}
        my_item['ids'] = tokenize(self.vocab, current_review)[:self.max_length]
        my_item['length'] = len(my_item['ids'])
        my_item['label'] = int(self.y.iloc[idx] == 'positive')
        
        return my_item
    

    def __len__(self) -> int:
        return len(self.x)

def collate(batch, pad_index):
    batch_ids = [torch.LongTensor(i['ids']) for i in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
    batch_length = torch.Tensor([i['length'] for i in batch])
    batch_label = torch.LongTensor([i['label'] for i in batch])
    batch = {'ids': batch_ids, 'length': batch_length, 'label': batch_label}
    return batch

collate_fn = collate

## Lab 1 (e): Implement the LSTM model for sentiment analysis.
Q(a): Implement the initialization function.
Your task is to create the model by stacking several necessary layers including an embedding layer, a lstm cell, a linear layer, and a dropout layer.
You can call functions from Pytorch's nn library. For example, nn.Embedding, nn.LSTM, nn.Linear.<br>
Q(b): Implement the forward function.
    Decide where to apply dropout. 
    The sequences in the batch have different lengths. Write/call a function to pad the sequences into the same length. 
    Apply a fully-connected (fc) layer to the output of the LSTM layer. 
    Return the output features which is of size [batch size, output dim]. 

In [8]:
from torch.nn.utils.rnn import pack_padded_sequence

In [9]:
def init_weights(m):
    if isinstance(m, nn.Embedding):
        nn.init.xavier_normal_(m.weight)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM) or isinstance(m, nn.GRU):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
                
class LSTM(nn.Module):
    def __init__(
        self, 
        vocab_size: int, 
        embedding_dim: int, 
        hidden_dim: int, 
        output_dim: int, 
        n_layers: int, 
        dropout_rate: float, 
        pad_index: int,
        bidirectional: bool,
        **kwargs):
        """
        Create a LSTM model for classification.
        :param vocab_size: size of the vocabulary
        :param embedding_dim: dimension of embeddings
        :param hidden_dim: dimension of hidden features
        :param output_dim: dimension of the output layer which equals to the number of labels.
        :param n_layers: number of layers.
        :param dropout_rate: dropout rate.
        :param pad_index: index of the padding token.we
        """
        super().__init__()
        # Add your code here. Initializing each layer by the given arguments.
        

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout_rate = dropout_rate
        self.pad_index = pad_index
        self.bidirectional = bidirectional

        self.emb1 = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx = self.pad_index)
        self.pack_padded = pack_padded_sequence
        self.lstm_cell1 = nn.LSTM(self.embedding_dim, self.hidden_dim,
                                  self.n_layers, bidirectional = self.bidirectional)
        self.fc1 = nn.Linear(self.hidden_dim, self.output_dim)
        self.dropout = nn.Dropout(p=self.dropout_rate)
        
        # Weight initialization. DO NOT CHANGE!
        if "weight_init_fn" not in kwargs:
            self.apply(init_weights)
        else:
            self.apply(kwargs["weight_init_fn"])


    def forward(self, ids:torch.Tensor, length:torch.Tensor):
        """
        Feed the given token ids to the model.
        :param ids: [batch size, seq len] batch of token ids.
        :param length: [batch size] batch of length of the token ids.
        :return: prediction of size [batch size, output dim].
        """
        lstm_ids = self.pack_padded(self.emb1(ids), length, batch_first = True, enforce_sorted=False)
        
        #features, hidden state, cell state
        output, (h_n, c_n) = self.lstm_cell1(lstm_ids)
        prediction = self.fc1(self.dropout(h_n[-1]))
        
        return prediction

In [10]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def train(dataloader, model, criterion, optimizer, scheduler, device):
    model.train()
    epoch_losses = []
    epoch_accs = []

    for batch in tqdm.tqdm(dataloader, desc='training...', file=sys.stdout):
        ids = batch['ids'].to(device)
        length = batch['length']
        label = batch['label'].to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
        scheduler.step()

    return epoch_losses, epoch_accs

def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids'].to(device)
            length = batch['length']
            label = batch['label'].to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs

def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

def predict_sentiment(text, model, vocab, device):
    tokens = tokenize(vocab, text)
    ids = [vocab[t] if t in vocab else UNK_INDEX for t in tokens]
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor, length).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

### Lab 1 (g) Implement GRU.

In [11]:
class GRU(nn.Module):
    def __init__(
        self, 
        vocab_size: int, 
        embedding_dim: int, 
        hidden_dim: int, 
        output_dim: int, 
        n_layers: int, 
        dropout_rate: float, 
        pad_index: int,
        bidirectional: bool,
        **kwargs):
        """
        Create a LSTM model for classification.
        :param vocab_size: size of the vocabulary
        :param embedding_dim: dimension of embeddings
        :param hidden_dim: dimension of hidden features
        :param output_dim: dimension of the output layer which equals to the number of labels.
        :param n_layers: number of layers.
        :param dropout_rate: dropout rate.
        :param pad_index: index of the padding token.we
        """
        super().__init__()
        # Add your code here. Initializing each layer by the given arguments.

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout_rate = dropout_rate
        self.pad_index = pad_index
        self.bidirectional = bidirectional

        self.pack_padded = pack_padded_sequence
        self.emb1 = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx = self.pad_index)
        self.gru_cell1 = nn.GRU(self.embedding_dim, self.hidden_dim, self.n_layers,
                                 bidirectional = self.bidirectional)
        self.fc1 = nn.Linear(self.hidden_dim, self.output_dim)
        self.dropout = nn.Dropout(p=self.dropout_rate)

        # Weight Initialization. DO NOT CHANGE!
        if "weight_init_fn" not in kwargs:
            self.apply(init_weights)
        else:
            self.apply(kwargs["weight_init_fn"])


    def forward(self, ids:torch.Tensor, length:torch.Tensor):
        """
        Feed the given token ids to the model.
        :param ids: [batch size, seq len] batch of token ids.
        :param length: [batch size] batch of length of the token ids.
        :return: prediction of size [batch size, output dim].
        """
        # Add your code here.
        gru_ids = self.pack_padded(self.emb1(ids), length, batch_first = True, enforce_sorted=False)
        
        #features, hidden state, cell state
        output, h_n = self.gru_cell1(gru_ids)
        prediction = self.fc1(self.dropout(h_n[-1]))
        
        return prediction

### Learning rate warmup. DO NOT TOUCH!

In [12]:
class ConstantWithWarmup(torch.optim.lr_scheduler._LRScheduler):
    def __init__(
        self,
        optimizer,
        num_warmup_steps: int,
    ):
        self.num_warmup_steps = num_warmup_steps
        super().__init__(optimizer)

    def get_lr(self):
        if self._step_count <= self.num_warmup_steps:
            # warmup
            scale = 1.0 - (self.num_warmup_steps - self._step_count) / self.num_warmup_steps
            lr = [base_lr * scale for base_lr in self.base_lrs]
            self.last_lr = lr
        else:
            lr = self.base_lrs
        return lr

### Implement the training / validation iteration here.

In [13]:
def train_and_test_model_with_hparams(hparams, model_type="lstm", **kwargs):
    # Seeding. DO NOT TOUCH! DO NOT TOUCH hparams.SEED!
    # Set the random seeds.
    torch.manual_seed(hparams.SEED)
    random.seed(hparams.SEED)
    np.random.seed(hparams.SEED)

    x_train, x_valid, x_test, y_train, y_valid, y_test = load_imdb()
    vocab = build_vocab(x_train, hparams=hparams)
    vocab_size = len(vocab)
    print(f'Length of vocabulary is {vocab_size}')

    train_data = IMDB(x_train, y_train, vocab, hparams.MAX_LENGTH)
    valid_data = IMDB(x_valid, y_valid, vocab, hparams.MAX_LENGTH)
    test_data = IMDB(x_test, y_test, vocab, hparams.MAX_LENGTH)

    collate = functools.partial(collate_fn, pad_index=hparams.PAD_INDEX)

    train_dataloader = torch.utils.data.DataLoader(
        train_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate, shuffle=True)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate)
    test_dataloader = torch.utils.data.DataLoader(
        test_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate)
    
    # Model
    if "override_models_with_gru" in kwargs and kwargs["override_models_with_gru"]:
        model = GRU(
            vocab_size, 
            hparams.EMBEDDING_DIM, 
            hparams.HIDDEN_DIM, 
            hparams.OUTPUT_DIM,
            hparams.N_LAYERS,
            hparams.DROPOUT_RATE, 
            hparams.PAD_INDEX,
            hparams.BIDIRECTIONAL,
            **kwargs)
    else:
        model = LSTM(
            vocab_size, 
            hparams.EMBEDDING_DIM, 
            hparams.HIDDEN_DIM, 
            hparams.OUTPUT_DIM,
            hparams.N_LAYERS,
            hparams.DROPOUT_RATE, 
            hparams.PAD_INDEX,
            hparams.BIDIRECTIONAL,
            **kwargs)
    num_params = count_parameters(model)
    print(f'The model has {num_params:,} trainable parameters')


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Optimization. Lab 2 (a)(b) should choose one of them.
    # DO NOT TOUCH optimizer-specific hyperparameters! (e.g., eps, momentum)
    # DO NOT change optimizer implementations!
    if hparams.OPTIM == "sgd":
        optimizer = optim.SGD(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, momentum=.9)        
    elif hparams.OPTIM == "adagrad":
        optimizer = optim.Adagrad(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, eps=1e-6)
    elif hparams.OPTIM == "adam":
        optimizer = optim.Adam(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, eps=1e-6)
    elif hparams.OPTIM == "rmsprop":
        optimizer = optim.RMSprop(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, eps=1e-6, momentum=.9)
    else:
        raise NotImplementedError("Optimizer not implemented!")

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # Start training
    best_valid_loss = float('inf')
    train_losses = []
    train_accs = []
    valid_losses = []
    valid_accs = []
    
    # Warmup Scheduler. DO NOT TOUCH!
    WARMUP_STEPS = 200
    lr_scheduler = ConstantWithWarmup(optimizer, WARMUP_STEPS)

    best_path = 'BestModel.pth'

    for epoch in range(hparams.N_EPOCHS):
        
        # Your code: implement the training process and save the best model.
        
        train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, lr_scheduler, device)
        valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)
        
        
        epoch_train_loss = np.mean(train_loss)
        epoch_train_acc = np.mean(train_acc)
        epoch_valid_loss = np.mean(valid_loss)
        epoch_valid_acc = np.mean(valid_acc)

        # Save the model that achieves the smallest validation loss.
        if epoch_valid_loss < best_valid_loss:
            # Your code: save the best model somewhere (no need to submit it to Sakai)
            torch.save(model.state_dict(), best_path)

        print(f'epoch: {epoch+1}')
        print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
        print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')


    # Your Code: Load the best model's weights.
    best_state_dict = torch.load(best_path)
    model.load_state_dict(best_state_dict)

    # Your Code: evaluate test loss on testing dataset (NOT Validation)
    test_loss, test_acc = evaluate(test_dataloader, model, criterion, device)

    epoch_test_loss = np.mean(test_loss)
    epoch_test_acc = np.mean(test_acc)
    print(f'test_loss: {epoch_test_loss:.3f}, test_acc: {epoch_test_acc:.3f}')
    
    # Free memory for later usage.
    del model
    torch.cuda.empty_cache()
    return {
        'num_params': num_params,
        "test_loss": epoch_test_loss,
        "test_acc": epoch_test_acc,
    }

### Lab 1 (f): Train model with original hyperparameters, for LSTM.

Train the model with default hyperparameter settings.

In [13]:
org_hyperparams = HyperParams()
_ = train_and_test_model_with_hparams(org_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 102,235 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 40.61it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.42it/s]
epoch: 1
train_loss: 0.693, train_acc: 0.496
valid_loss: 0.694, valid_acc: 0.496
training...: 100%|██████████| 365/365 [00:07<00:00, 51.61it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 89.39it/s]
epoch: 2
train_loss: 0.693, train_acc: 0.498
valid_loss: 0.693, valid_acc: 0.504
training...: 100%|██████████| 365/365 [00:06<00:00, 52.61it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 92.07it/s]
epoch: 3
train_loss: 0.693, train_acc: 0.501
valid_loss: 0.693, valid_acc: 0.496
training...: 100%|██████████| 365/365 [00:07<00:00, 51.85it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.51it/s]
epoch: 4
train_loss: 0.693, train_acc: 0.500
valid_loss: 0.694, valid_acc: 0.496
train

### Lab 1 (h) Train GRU with vanilla hyperparameters.

In [14]:
org_hyperparams = HyperParams()
_ = train_and_test_model_with_hparams(org_hyperparams, "gru_1layer_base_sgd_e32_h100", override_models_with_gru=True)

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 91,935 trainable parameters
training...: 100%|██████████| 365/365 [00:06<00:00, 53.84it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.43it/s]
epoch: 1
train_loss: 0.694, train_acc: 0.498
valid_loss: 0.694, valid_acc: 0.496
training...: 100%|██████████| 365/365 [00:06<00:00, 52.24it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 28.47it/s]
epoch: 2
train_loss: 0.694, train_acc: 0.496
valid_loss: 0.696, valid_acc: 0.504
training...: 100%|██████████| 365/365 [00:07<00:00, 50.40it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.70it/s]
epoch: 3
train_loss: 0.694, train_acc: 0.499
valid_loss: 0.694, valid_acc: 0.496
training...: 100%|██████████| 365/365 [00:06<00:00, 55.00it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 93.13it/s]
epoch: 4
train_loss: 0.694, train_acc: 0.496
valid_loss: 0.696, valid_acc: 0.496
traini

### Lab 2 (a) Study of LSTM Optimizers. Hint: For adaptive optimizers, we recommend using a learning rate of 0.001 (instead of 0.01).

In [15]:
adagrad_optimizer_hyperparams = HyperParams()
adagrad_optimizer_hyperparams.OPTIM = "adagrad"
adagrad_optimizer_hyperparams.LR = 0.001
_ = train_and_test_model_with_hparams(adagrad_optimizer_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 102,235 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 50.22it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.41it/s]
epoch: 1
train_loss: 0.693, train_acc: 0.504
valid_loss: 0.693, valid_acc: 0.496
training...: 100%|██████████| 365/365 [00:06<00:00, 52.58it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 92.89it/s]
epoch: 2
train_loss: 0.666, train_acc: 0.618
valid_loss: 0.573, valid_acc: 0.809
training...: 100%|██████████| 365/365 [00:06<00:00, 52.97it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 94.40it/s]
epoch: 3
train_loss: 0.548, train_acc: 0.810
valid_loss: 0.537, valid_acc: 0.814
training...: 100%|██████████| 365/365 [00:07<00:00, 51.16it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 89.44it/s]
epoch: 4
train_loss: 0.502, train_acc: 0.831
valid_loss: 0.566, valid_acc: 0.783
train

In [16]:
adam_optimizer_hyperparams = HyperParams()
adam_optimizer_hyperparams.OPTIM = "adam"
adam_optimizer_hyperparams.LR = 0.001
_ = train_and_test_model_with_hparams(adam_optimizer_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 102,235 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 50.24it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.04it/s]
epoch: 1
train_loss: 0.665, train_acc: 0.578
valid_loss: 0.610, valid_acc: 0.688
training...: 100%|██████████| 365/365 [00:07<00:00, 50.06it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.10it/s]
epoch: 2
train_loss: 0.330, train_acc: 0.864
valid_loss: 0.309, valid_acc: 0.884
training...: 100%|██████████| 365/365 [00:07<00:00, 51.15it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 85.45it/s]
epoch: 3
train_loss: 0.144, train_acc: 0.950
valid_loss: 0.306, valid_acc: 0.887
training...: 100%|██████████| 365/365 [00:07<00:00, 51.23it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.53it/s]
epoch: 4
train_loss: 0.076, train_acc: 0.976
valid_loss: 0.339, valid_acc: 0.869
train

In [17]:
rmsprop_optimizer_hyperparams = HyperParams()
rmsprop_optimizer_hyperparams.OPTIM = "rmsprop"
rmsprop_optimizer_hyperparams.LR = 0.001
_ = train_and_test_model_with_hparams(rmsprop_optimizer_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 102,235 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 50.91it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.81it/s]
epoch: 1
train_loss: 0.554, train_acc: 0.706
valid_loss: 0.398, valid_acc: 0.835
training...: 100%|██████████| 365/365 [00:07<00:00, 51.59it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.75it/s]
epoch: 2
train_loss: 0.280, train_acc: 0.893
valid_loss: 0.301, valid_acc: 0.881
training...: 100%|██████████| 365/365 [00:07<00:00, 45.83it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.30it/s]
epoch: 3
train_loss: 0.147, train_acc: 0.949
valid_loss: 0.441, valid_acc: 0.859
training...: 100%|██████████| 365/365 [00:07<00:00, 51.69it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 93.29it/s]
epoch: 4
train_loss: 0.079, train_acc: 0.974
valid_loss: 0.428, valid_acc: 0.861
train

### Lab 2 (b): Study of GRU Optimizers. Hint: For adaptive optimizers, we recommend using a learning rate of 0.001 (instead of 0.01).

In [18]:
_ = train_and_test_model_with_hparams(adagrad_optimizer_hyperparams, "gru_1layer_base_sgd_e32_h100", override_models_with_gru=True)

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 91,935 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 51.94it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.22it/s]
epoch: 1
train_loss: 0.693, train_acc: 0.514
valid_loss: 0.692, valid_acc: 0.584
training...: 100%|██████████| 365/365 [00:07<00:00, 50.29it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.30it/s]
epoch: 2
train_loss: 0.613, train_acc: 0.721
valid_loss: 0.562, valid_acc: 0.770
training...: 100%|██████████| 365/365 [00:06<00:00, 52.72it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.68it/s]
epoch: 3
train_loss: 0.464, train_acc: 0.854
valid_loss: 0.465, valid_acc: 0.848
training...: 100%|██████████| 365/365 [00:06<00:00, 52.84it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.38it/s]
epoch: 4
train_loss: 0.398, train_acc: 0.879
valid_loss: 0.450, valid_acc: 0.853
traini

In [19]:
_ = train_and_test_model_with_hparams(adam_optimizer_hyperparams, "gru_1layer_base_sgd_e32_h100", override_models_with_gru=True)

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 91,935 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 50.76it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.69it/s]
epoch: 1
train_loss: 0.647, train_acc: 0.610
valid_loss: 0.466, valid_acc: 0.814
training...: 100%|██████████| 365/365 [00:07<00:00, 52.08it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.64it/s]
epoch: 2
train_loss: 0.252, train_acc: 0.901
valid_loss: 0.293, valid_acc: 0.877
training...: 100%|██████████| 365/365 [00:07<00:00, 50.32it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.17it/s]
epoch: 3
train_loss: 0.112, train_acc: 0.963
valid_loss: 0.326, valid_acc: 0.888
training...: 100%|██████████| 365/365 [00:07<00:00, 51.60it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.68it/s]
epoch: 4
train_loss: 0.048, train_acc: 0.987
valid_loss: 0.398, valid_acc: 0.884
traini

In [20]:
_ = train_and_test_model_with_hparams(rmsprop_optimizer_hyperparams, "gru_1layer_base_sgd_e32_h100", override_models_with_gru=True)

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 91,935 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 50.72it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.13it/s]
epoch: 1
train_loss: 0.536, train_acc: 0.711
valid_loss: 0.283, valid_acc: 0.881
training...: 100%|██████████| 365/365 [00:06<00:00, 53.19it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 89.98it/s]
epoch: 2
train_loss: 0.188, train_acc: 0.926
valid_loss: 0.274, valid_acc: 0.883
training...: 100%|██████████| 365/365 [00:07<00:00, 47.08it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 92.48it/s]
epoch: 3
train_loss: 0.079, train_acc: 0.972
valid_loss: 0.376, valid_acc: 0.873
training...: 100%|██████████| 365/365 [00:07<00:00, 51.50it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.58it/s]
epoch: 4
train_loss: 0.033, train_acc: 0.990
valid_loss: 0.476, valid_acc: 0.865
traini

### Lab 2 (c) Deeper LSTMs

In [21]:
adam_depth_1_hyperparams = HyperParams()
adam_depth_1_hyperparams.OPTIM = "adam"
adam_depth_1_hyperparams.LR = 0.001
adam_depth_1_hyperparams.N_LAYERS = 1

In [22]:
adam_depth_2_hyperparams = HyperParams()
adam_depth_2_hyperparams.OPTIM = "adam"
adam_depth_2_hyperparams.LR = 0.001
adam_depth_2_hyperparams.N_LAYERS = 2
_ = train_and_test_model_with_hparams(adam_depth_2_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 183,035 trainable parameters
training...: 100%|██████████| 365/365 [00:11<00:00, 32.57it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 64.54it/s]
epoch: 1
train_loss: 0.683, train_acc: 0.573
valid_loss: 0.561, valid_acc: 0.753
training...: 100%|██████████| 365/365 [00:11<00:00, 30.95it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 65.54it/s]
epoch: 2
train_loss: 0.343, train_acc: 0.855
valid_loss: 0.304, valid_acc: 0.876
training...: 100%|██████████| 365/365 [00:11<00:00, 32.66it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 68.51it/s]
epoch: 3
train_loss: 0.168, train_acc: 0.939
valid_loss: 0.308, valid_acc: 0.881
training...: 100%|██████████| 365/365 [00:11<00:00, 32.61it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 63.85it/s]
epoch: 4
train_loss: 0.087, train_acc: 0.972
valid_loss: 0.343, valid_acc: 0.872
train

In [23]:
adam_depth_3_hyperparams = HyperParams()
adam_depth_3_hyperparams.OPTIM = "adam"
adam_depth_3_hyperparams.LR = 0.001
adam_depth_3_hyperparams.N_LAYERS = 3
_ = train_and_test_model_with_hparams(adam_depth_3_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 263,835 trainable parameters
training...: 100%|██████████| 365/365 [00:14<00:00, 25.36it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 57.86it/s]
epoch: 1
train_loss: 0.671, train_acc: 0.570
valid_loss: 0.376, valid_acc: 0.839
training...: 100%|██████████| 365/365 [00:15<00:00, 24.23it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 56.96it/s]
epoch: 2
train_loss: 0.294, train_acc: 0.881
valid_loss: 0.324, valid_acc: 0.871
training...: 100%|██████████| 365/365 [00:14<00:00, 25.56it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 57.69it/s]
epoch: 3
train_loss: 0.146, train_acc: 0.949
valid_loss: 0.318, valid_acc: 0.886
training...: 100%|██████████| 365/365 [00:14<00:00, 25.92it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 57.02it/s]
epoch: 4
train_loss: 0.075, train_acc: 0.977
valid_loss: 0.419, valid_acc: 0.883
train

In [24]:
adam_depth_4_hyperparams = HyperParams()
adam_depth_4_hyperparams.OPTIM = "adam"
adam_depth_4_hyperparams.LR = 0.001
adam_depth_4_hyperparams.N_LAYERS = 4
_ = train_and_test_model_with_hparams(adam_depth_4_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 344,635 trainable parameters
training...: 100%|██████████| 365/365 [00:17<00:00, 21.13it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 51.22it/s]
epoch: 1
train_loss: 0.693, train_acc: 0.503
valid_loss: 0.693, valid_acc: 0.504
training...: 100%|██████████| 365/365 [00:17<00:00, 20.84it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 48.96it/s]
epoch: 2
train_loss: 0.593, train_acc: 0.648
valid_loss: 0.449, valid_acc: 0.815
training...: 100%|██████████| 365/365 [00:17<00:00, 21.07it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 48.04it/s]
epoch: 3
train_loss: 0.265, train_acc: 0.897
valid_loss: 0.296, valid_acc: 0.877
training...: 100%|██████████| 365/365 [00:17<00:00, 21.08it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 32.93it/s]
epoch: 4
train_loss: 0.131, train_acc: 0.955
valid_loss: 0.326, valid_acc: 0.877
train

### Lab 2 (d) Wider LSTMs

In [25]:
adam_width_25_hyperparams = HyperParams()
adam_width_25_hyperparams.OPTIM = "adam"
adam_width_25_hyperparams.LR = 0.001
adam_width_25_hyperparams.N_LAYERS = 1
#adam_width_25_hyperparams.DROPOUT_RATE = 0.5

adam_width_25_hyperparams.HIDDEN_DIM = 25
_ = train_and_test_model_with_hparams(adam_width_25_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 63,685 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 43.56it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 85.76it/s]
epoch: 1
train_loss: 0.657, train_acc: 0.577
valid_loss: 0.581, valid_acc: 0.763
training...: 100%|██████████| 365/365 [00:07<00:00, 47.41it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.48it/s]
epoch: 2
train_loss: 0.297, train_acc: 0.882
valid_loss: 0.271, valid_acc: 0.892
training...: 100%|██████████| 365/365 [00:08<00:00, 41.18it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 33.79it/s]
epoch: 3
train_loss: 0.147, train_acc: 0.950
valid_loss: 0.275, valid_acc: 0.891
training...: 100%|██████████| 365/365 [00:07<00:00, 45.91it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 55.59it/s]
epoch: 4
train_loss: 0.079, train_acc: 0.976
valid_loss: 0.383, valid_acc: 0.883
traini

In [27]:
adam_width_50_hyperparams = HyperParams()
adam_width_50_hyperparams.OPTIM = "adam"
adam_width_50_hyperparams.LR = 0.001
adam_width_50_hyperparams.N_LAYERS = 1
#adam_width_50_hyperparams.DROPOUT_RATE = 0.5

adam_width_50_hyperparams.HIDDEN_DIM = 50
_ = train_and_test_model_with_hparams(adam_width_50_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 71,535 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 45.50it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.14it/s]
epoch: 1
train_loss: 0.621, train_acc: 0.609
valid_loss: 0.332, valid_acc: 0.863
training...: 100%|██████████| 365/365 [00:06<00:00, 52.16it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.56it/s]
epoch: 2
train_loss: 0.265, train_acc: 0.897
valid_loss: 0.315, valid_acc: 0.870
training...: 100%|██████████| 365/365 [00:08<00:00, 42.75it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.57it/s]
epoch: 3
train_loss: 0.133, train_acc: 0.954
valid_loss: 0.288, valid_acc: 0.891
training...: 100%|██████████| 365/365 [00:08<00:00, 44.95it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.68it/s]
epoch: 4
train_loss: 0.066, train_acc: 0.981
valid_loss: 0.337, valid_acc: 0.883
traini

In [28]:
adam_width_75_hyperparams = HyperParams()
adam_width_75_hyperparams.OPTIM = "adam"
adam_width_75_hyperparams.LR = 0.001
adam_width_75_hyperparams.N_LAYERS = 1
#adam_width_75_hyperparams.DROPOUT_RATE = 0.5

adam_width_75_hyperparams.HIDDEN_DIM = 75
_ = train_and_test_model_with_hparams(adam_width_75_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 84,385 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 44.01it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 61.47it/s]
epoch: 1
train_loss: 0.660, train_acc: 0.581
valid_loss: 0.417, valid_acc: 0.833
training...: 100%|██████████| 365/365 [00:07<00:00, 49.25it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 83.64it/s]
epoch: 2
train_loss: 0.280, train_acc: 0.891
valid_loss: 0.281, valid_acc: 0.888
training...: 100%|██████████| 365/365 [00:07<00:00, 50.21it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 86.75it/s]
epoch: 3
train_loss: 0.141, train_acc: 0.951
valid_loss: 0.312, valid_acc: 0.870
training...: 100%|██████████| 365/365 [00:07<00:00, 50.96it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.91it/s]
epoch: 4
train_loss: 0.076, train_acc: 0.977
valid_loss: 0.341, valid_acc: 0.881
traini

In [29]:
adam_width_100_hyperparams = HyperParams()
adam_width_100_hyperparams.OPTIM = "adam"
adam_width_100_hyperparams.LR = 0.001
adam_width_100_hyperparams.N_LAYERS = 1
#adam_width_100_hyperparams.DROPOUT_RATE = 0.5

In [30]:
adam_width_125_hyperparams = HyperParams()
adam_width_125_hyperparams.OPTIM = "adam"
adam_width_125_hyperparams.LR = 0.001
adam_width_125_hyperparams.N_LAYERS = 1
#adam_width_125_hyperparams.DROPOUT_RATE = 0.5

adam_width_125_hyperparams.HIDDEN_DIM = 125
_ = train_and_test_model_with_hparams(adam_width_125_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 125,085 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 44.00it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 83.21it/s]
epoch: 1
train_loss: 0.677, train_acc: 0.561
valid_loss: 0.621, valid_acc: 0.703
training...: 100%|██████████| 365/365 [00:07<00:00, 47.06it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 83.49it/s]
epoch: 2
train_loss: 0.334, train_acc: 0.866
valid_loss: 0.362, valid_acc: 0.849
training...: 100%|██████████| 365/365 [00:07<00:00, 45.87it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.77it/s]
epoch: 3
train_loss: 0.163, train_acc: 0.942
valid_loss: 0.344, valid_acc: 0.854
training...: 100%|██████████| 365/365 [00:07<00:00, 46.35it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.46it/s]
epoch: 4
train_loss: 0.091, train_acc: 0.972
valid_loss: 0.374, valid_acc: 0.874
train

In [31]:
adam_width_150_hyperparams = HyperParams()
adam_width_150_hyperparams.OPTIM = "adam"
adam_width_150_hyperparams.LR = 0.001
adam_width_150_hyperparams.N_LAYERS = 1
#adam_width_150_hyperparams.DROPOUT_RATE = 0.5

adam_width_150_hyperparams.HIDDEN_DIM = 150
_ = train_and_test_model_with_hparams(adam_width_150_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 152,935 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 43.57it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 82.49it/s]
epoch: 1
train_loss: 0.681, train_acc: 0.573
valid_loss: 0.543, valid_acc: 0.824
training...: 100%|██████████| 365/365 [00:07<00:00, 45.77it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 82.58it/s]
epoch: 2
train_loss: 0.302, train_acc: 0.880
valid_loss: 0.308, valid_acc: 0.869
training...: 100%|██████████| 365/365 [00:08<00:00, 42.02it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 83.00it/s]
epoch: 3
train_loss: 0.153, train_acc: 0.948
valid_loss: 0.293, valid_acc: 0.878
training...: 100%|██████████| 365/365 [00:08<00:00, 45.23it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.99it/s]
epoch: 4
train_loss: 0.083, train_acc: 0.975
valid_loss: 0.342, valid_acc: 0.880
train

In [32]:
adam_width_175_hyperparams = HyperParams()
adam_width_175_hyperparams.OPTIM = "adam"
adam_width_175_hyperparams.LR = 0.001
adam_width_175_hyperparams.N_LAYERS = 1
#adam_width_175_hyperparams.DROPOUT_RATE = 0.5

adam_width_175_hyperparams.HIDDEN_DIM = 175
_ = train_and_test_model_with_hparams(adam_width_175_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 185,785 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 44.04it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 78.57it/s]
epoch: 1
train_loss: 0.676, train_acc: 0.563
valid_loss: 0.633, valid_acc: 0.673
training...: 100%|██████████| 365/365 [00:08<00:00, 44.89it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 79.04it/s]
epoch: 2
train_loss: 0.419, train_acc: 0.811
valid_loss: 0.578, valid_acc: 0.696
training...: 100%|██████████| 365/365 [00:08<00:00, 45.39it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 79.32it/s]
epoch: 3
train_loss: 0.236, train_acc: 0.910
valid_loss: 0.296, valid_acc: 0.884
training...: 100%|██████████| 365/365 [00:08<00:00, 44.70it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 80.49it/s]
epoch: 4
train_loss: 0.124, train_acc: 0.957
valid_loss: 0.320, valid_acc: 0.878
train

In [33]:
adam_width_200_hyperparams = HyperParams()
adam_width_200_hyperparams.OPTIM = "adam"
adam_width_200_hyperparams.LR = 0.001
adam_width_200_hyperparams.N_LAYERS = 1
#adam_width_200_hyperparams.DROPOUT_RATE = 0.5

adam_width_200_hyperparams.HIDDEN_DIM = 200
_ = train_and_test_model_with_hparams(adam_width_200_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 223,635 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 43.31it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.83it/s]
epoch: 1
train_loss: 0.688, train_acc: 0.565
valid_loss: 0.658, valid_acc: 0.578
training...: 100%|██████████| 365/365 [00:08<00:00, 43.96it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.79it/s]
epoch: 2
train_loss: 0.372, train_acc: 0.839
valid_loss: 0.306, valid_acc: 0.872
training...: 100%|██████████| 365/365 [00:09<00:00, 39.54it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 75.62it/s]
epoch: 3
train_loss: 0.165, train_acc: 0.941
valid_loss: 0.286, valid_acc: 0.885
training...: 100%|██████████| 365/365 [00:08<00:00, 43.92it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.48it/s]
epoch: 4
train_loss: 0.086, train_acc: 0.973
valid_loss: 0.325, valid_acc: 0.881
train

In [34]:
adam_width_225_hyperparams = HyperParams()
adam_width_225_hyperparams.OPTIM = "adam"
adam_width_225_hyperparams.LR = 0.001
adam_width_225_hyperparams.N_LAYERS = 1
#adam_width_225_hyperparams.DROPOUT_RATE = 0.5

adam_width_225_hyperparams.HIDDEN_DIM = 225
_ = train_and_test_model_with_hparams(adam_width_225_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 266,485 trainable parameters
training...: 100%|██████████| 365/365 [00:09<00:00, 39.55it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 71.11it/s]
epoch: 1
train_loss: 0.685, train_acc: 0.549
valid_loss: 0.633, valid_acc: 0.684
training...: 100%|██████████| 365/365 [00:09<00:00, 40.25it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 69.00it/s]
epoch: 2
train_loss: 0.492, train_acc: 0.768
valid_loss: 0.455, valid_acc: 0.788
training...: 100%|██████████| 365/365 [00:09<00:00, 39.84it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 69.61it/s]
epoch: 3
train_loss: 0.227, train_acc: 0.911
valid_loss: 0.325, valid_acc: 0.873
training...: 100%|██████████| 365/365 [00:09<00:00, 40.05it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 69.87it/s]
epoch: 4
train_loss: 0.117, train_acc: 0.961
valid_loss: 0.324, valid_acc: 0.879
train

In [35]:
adam_width_250_hyperparams = HyperParams()
adam_width_250_hyperparams.OPTIM = "adam"
adam_width_250_hyperparams.LR = 0.001
adam_width_250_hyperparams.N_LAYERS = 1
#adam_width_250_hyperparams.DROPOUT_RATE = 0.5

adam_width_250_hyperparams.HIDDEN_DIM = 250
_ = train_and_test_model_with_hparams(adam_width_250_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 314,335 trainable parameters
training...: 100%|██████████| 365/365 [00:09<00:00, 37.73it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 69.38it/s]
epoch: 1
train_loss: 0.704, train_acc: 0.549
valid_loss: 0.658, valid_acc: 0.605
training...: 100%|██████████| 365/365 [00:09<00:00, 38.17it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 67.00it/s]
epoch: 2
train_loss: 0.505, train_acc: 0.763
valid_loss: 0.378, valid_acc: 0.852
training...: 100%|██████████| 365/365 [00:09<00:00, 38.26it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 67.95it/s]
epoch: 3
train_loss: 0.227, train_acc: 0.911
valid_loss: 0.307, valid_acc: 0.887
training...: 100%|██████████| 365/365 [00:09<00:00, 38.21it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 68.30it/s]
epoch: 4
train_loss: 0.121, train_acc: 0.960
valid_loss: 0.324, valid_acc: 0.866
train

In [36]:
adam_width_275_hyperparams = HyperParams()
adam_width_275_hyperparams.OPTIM = "adam"
adam_width_275_hyperparams.LR = 0.001
adam_width_275_hyperparams.N_LAYERS = 1
#adam_width_275_hyperparams.DROPOUT_RATE = 0.5

adam_width_275_hyperparams.HIDDEN_DIM = 275
_ = train_and_test_model_with_hparams(adam_width_275_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 367,185 trainable parameters
training...: 100%|██████████| 365/365 [00:10<00:00, 34.87it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 67.30it/s]
epoch: 1
train_loss: 0.687, train_acc: 0.550
valid_loss: 0.633, valid_acc: 0.699
training...: 100%|██████████| 365/365 [00:10<00:00, 35.09it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 66.00it/s]
epoch: 2
train_loss: 0.456, train_acc: 0.802
valid_loss: 0.332, valid_acc: 0.865
training...: 100%|██████████| 365/365 [00:10<00:00, 35.16it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 64.73it/s]
epoch: 3
train_loss: 0.243, train_acc: 0.907
valid_loss: 0.354, valid_acc: 0.872
training...: 100%|██████████| 365/365 [00:10<00:00, 35.06it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 66.25it/s]
epoch: 4
train_loss: 0.134, train_acc: 0.954
valid_loss: 0.334, valid_acc: 0.866
train

In [37]:
adam_width_300_hyperparams = HyperParams()
adam_width_300_hyperparams.OPTIM = "adam"
adam_width_300_hyperparams.LR = 0.001
adam_width_300_hyperparams.N_LAYERS = 1
#adam_width_300_hyperparams.DROPOUT_RATE = 0.5

adam_width_300_hyperparams.HIDDEN_DIM = 300
_ = train_and_test_model_with_hparams(adam_width_300_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 425,035 trainable parameters
training...: 100%|██████████| 365/365 [00:10<00:00, 33.42it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 63.94it/s]
epoch: 1
train_loss: 0.682, train_acc: 0.562
valid_loss: 0.592, valid_acc: 0.733
training...: 100%|██████████| 365/365 [00:10<00:00, 33.95it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 63.60it/s]
epoch: 2
train_loss: 0.372, train_acc: 0.847
valid_loss: 0.317, valid_acc: 0.868
training...: 100%|██████████| 365/365 [00:10<00:00, 33.94it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 65.01it/s]
epoch: 3
train_loss: 0.177, train_acc: 0.936
valid_loss: 0.304, valid_acc: 0.883
training...: 100%|██████████| 365/365 [00:10<00:00, 33.36it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 63.04it/s]
epoch: 4
train_loss: 0.100, train_acc: 0.967
valid_loss: 0.415, valid_acc: 0.876
train

In [38]:
adam_width_320_hyperparams = HyperParams()
adam_width_320_hyperparams.OPTIM = "adam"
adam_width_320_hyperparams.LR = 0.001
adam_width_320_hyperparams.N_LAYERS = 1
#adam_width_320_hyperparams.DROPOUT_RATE = 0.5

adam_width_320_hyperparams.HIDDEN_DIM = 320
_ = train_and_test_model_with_hparams(adam_width_320_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 474,915 trainable parameters
training...: 100%|██████████| 365/365 [00:10<00:00, 33.20it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 63.97it/s]
epoch: 1
train_loss: 0.686, train_acc: 0.555
valid_loss: 0.651, valid_acc: 0.696
training...: 100%|██████████| 365/365 [00:10<00:00, 33.51it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 65.09it/s]
epoch: 2
train_loss: 0.406, train_acc: 0.819
valid_loss: 0.304, valid_acc: 0.882
training...: 100%|██████████| 365/365 [00:10<00:00, 33.47it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 63.98it/s]
epoch: 3
train_loss: 0.175, train_acc: 0.936
valid_loss: 0.337, valid_acc: 0.854
training...: 100%|██████████| 365/365 [00:12<00:00, 29.80it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 61.92it/s]
epoch: 4
train_loss: 0.095, train_acc: 0.969
valid_loss: 0.344, valid_acc: 0.886
train

### Lab 2 (e) Larger Embedding Table

In [39]:
adam_embed_1_hyperparams =  HyperParams()
adam_embed_1_hyperparams.OPTIM = "adam"
adam_embed_1_hyperparams.LR = 0.001
adam_embed_1_hyperparams.N_LAYERS = 1
#adam_embed_1_hyperparams.DROPOUT_RATE = 0.5
adam_embed_1_hyperparams.HIDDEN_DIM = 175

In [41]:
adam_embed_2_hyperparams = HyperParams()
adam_embed_2_hyperparams.OPTIM = "adam"
adam_embed_2_hyperparams.LR = 0.001
adam_embed_2_hyperparams.N_LAYERS = 1
#adam_embed_2_hyperparams.DROPOUT_RATE = 0.5
adam_embed_2_hyperparams.HIDDEN_DIM = 175

adam_embed_2_hyperparams.EMBEDDING_DIM = 2
_ = train_and_test_model_with_hparams(adam_embed_2_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 247,318 trainable parameters
training...: 100%|██████████| 365/365 [00:10<00:00, 35.04it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 80.93it/s]
epoch: 1
train_loss: 0.660, train_acc: 0.587
valid_loss: 0.557, valid_acc: 0.729
training...: 100%|██████████| 365/365 [00:08<00:00, 41.30it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 53.96it/s]
epoch: 2
train_loss: 0.300, train_acc: 0.878
valid_loss: 0.283, valid_acc: 0.891
training...: 100%|██████████| 365/365 [00:09<00:00, 40.51it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 79.58it/s]
epoch: 3
train_loss: 0.132, train_acc: 0.955
valid_loss: 0.312, valid_acc: 0.887
training...: 100%|██████████| 365/365 [00:08<00:00, 43.64it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 45.03it/s]
epoch: 4
train_loss: 0.063, train_acc: 0.981
valid_loss: 0.384, valid_acc: 0.876
train

In [42]:
adam_embed_4_hyperparams = HyperParams()
adam_embed_4_hyperparams.OPTIM = "adam"
adam_embed_4_hyperparams.LR = 0.001
adam_embed_4_hyperparams.N_LAYERS = 1
#adam_embed_4_hyperparams.DROPOUT_RATE = 0.5
adam_embed_4_hyperparams.HIDDEN_DIM = 175

adam_embed_4_hyperparams.EMBEDDING_DIM = 4
_ = train_and_test_model_with_hparams(adam_embed_4_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 370,384 trainable parameters
training...: 100%|██████████| 365/365 [00:09<00:00, 36.92it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 73.66it/s]
epoch: 1
train_loss: 0.614, train_acc: 0.629
valid_loss: 0.360, valid_acc: 0.846
training...: 100%|██████████| 365/365 [00:10<00:00, 35.47it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 56.08it/s]
epoch: 2
train_loss: 0.268, train_acc: 0.895
valid_loss: 0.315, valid_acc: 0.879
training...: 100%|██████████| 365/365 [00:09<00:00, 38.29it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.80it/s]
epoch: 3
train_loss: 0.128, train_acc: 0.957
valid_loss: 0.339, valid_acc: 0.862
training...: 100%|██████████| 365/365 [00:08<00:00, 40.82it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.85it/s]
epoch: 4
train_loss: 0.068, train_acc: 0.979
valid_loss: 0.423, valid_acc: 0.869
train

In [43]:
adam_embed_8_hyperparams = HyperParams()
adam_embed_8_hyperparams.OPTIM = "adam"
adam_embed_8_hyperparams.LR = 0.001
adam_embed_8_hyperparams.N_LAYERS = 1
#adam_embed_8_hyperparams.DROPOUT_RATE = 0.5
adam_embed_8_hyperparams.HIDDEN_DIM = 175

adam_embed_8_hyperparams.EMBEDDING_DIM = 8
_ = train_and_test_model_with_hparams(adam_embed_8_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 616,516 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 42.41it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.66it/s]
epoch: 1
train_loss: 0.573, train_acc: 0.667
valid_loss: 0.322, valid_acc: 0.867
training...: 100%|██████████| 365/365 [00:08<00:00, 40.84it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 79.10it/s]
epoch: 2
train_loss: 0.223, train_acc: 0.914
valid_loss: 0.309, valid_acc: 0.873
training...: 100%|██████████| 365/365 [00:08<00:00, 45.42it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 80.30it/s]
epoch: 3
train_loss: 0.099, train_acc: 0.968
valid_loss: 0.375, valid_acc: 0.869
training...: 100%|██████████| 365/365 [00:08<00:00, 44.30it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.42it/s]
epoch: 4
train_loss: 0.049, train_acc: 0.986
valid_loss: 0.469, valid_acc: 0.860
train

In [44]:
adam_embed_16_hyperparams = HyperParams()
adam_embed_16_hyperparams.OPTIM = "adam"
adam_embed_16_hyperparams.LR = 0.001
adam_embed_16_hyperparams.N_LAYERS = 1
#adam_embed_16_hyperparams.DROPOUT_RATE = 0.5
adam_embed_16_hyperparams.HIDDEN_DIM = 175

adam_embed_16_hyperparams.EMBEDDING_DIM = 16
_ = train_and_test_model_with_hparams(adam_embed_16_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 1,108,780 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 43.46it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 78.45it/s]
epoch: 1
train_loss: 0.487, train_acc: 0.729
valid_loss: 0.507, valid_acc: 0.742
training...: 100%|██████████| 365/365 [00:08<00:00, 43.41it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 78.74it/s]
epoch: 2
train_loss: 0.208, train_acc: 0.923
valid_loss: 0.350, valid_acc: 0.851
training...: 100%|██████████| 365/365 [00:08<00:00, 44.61it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.48it/s]
epoch: 3
train_loss: 0.101, train_acc: 0.968
valid_loss: 0.395, valid_acc: 0.873
training...: 100%|██████████| 365/365 [00:08<00:00, 44.35it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.94it/s]
epoch: 4
train_loss: 0.056, train_acc: 0.982
valid_loss: 0.477, valid_acc: 0.853
tra

In [45]:
adam_embed_32_hyperparams = HyperParams()
adam_embed_32_hyperparams.OPTIM = "adam"
adam_embed_32_hyperparams.LR = 0.001
adam_embed_32_hyperparams.N_LAYERS = 1
#adam_embed_32_hyperparams.DROPOUT_RATE = 0.5
adam_embed_32_hyperparams.HIDDEN_DIM = 175

adam_embed_32_hyperparams.EMBEDDING_DIM = 32
_ = train_and_test_model_with_hparams(adam_embed_32_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 2,093,308 trainable parameters
training...: 100%|██████████| 365/365 [00:09<00:00, 39.91it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.31it/s]
epoch: 1
train_loss: 0.513, train_acc: 0.711
valid_loss: 0.305, valid_acc: 0.872
training...: 100%|██████████| 365/365 [00:08<00:00, 44.01it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.51it/s]
epoch: 2
train_loss: 0.206, train_acc: 0.923
valid_loss: 0.329, valid_acc: 0.872
training...: 100%|██████████| 365/365 [00:08<00:00, 44.65it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.76it/s]
epoch: 3
train_loss: 0.093, train_acc: 0.970
valid_loss: 0.376, valid_acc: 0.863
training...: 100%|██████████| 365/365 [00:08<00:00, 44.33it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 71.13it/s]
epoch: 4
train_loss: 0.045, train_acc: 0.986
valid_loss: 0.563, valid_acc: 0.868
tra

In [46]:
adam_embed_64_hyperparams = HyperParams()
adam_embed_64_hyperparams.OPTIM = "adam"
adam_embed_64_hyperparams.LR = 0.001
adam_embed_64_hyperparams.N_LAYERS = 1
#adam_embed_64_hyperparams.DROPOUT_RATE = 0.5
adam_embed_64_hyperparams.HIDDEN_DIM = 175

adam_embed_64_hyperparams.EMBEDDING_DIM = 64
_ = train_and_test_model_with_hparams(adam_embed_64_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 4,062,364 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 41.64it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.15it/s]
epoch: 1
train_loss: 0.454, train_acc: 0.767
valid_loss: 0.346, valid_acc: 0.855
training...: 100%|██████████| 365/365 [00:08<00:00, 41.82it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.67it/s]
epoch: 2
train_loss: 0.196, train_acc: 0.930
valid_loss: 0.338, valid_acc: 0.870
training...: 100%|██████████| 365/365 [00:08<00:00, 42.15it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.50it/s]
epoch: 3
train_loss: 0.103, train_acc: 0.965
valid_loss: 0.451, valid_acc: 0.849
training...: 100%|██████████| 365/365 [00:08<00:00, 42.19it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 72.42it/s]
epoch: 4
train_loss: 0.054, train_acc: 0.982
valid_loss: 0.532, valid_acc: 0.853
tra

In [47]:
adam_embed_128_hyperparams = HyperParams()
adam_embed_128_hyperparams.OPTIM = "adam"
adam_embed_128_hyperparams.LR = 0.001
adam_embed_128_hyperparams.N_LAYERS = 1
#adam_embed_128_hyperparams.DROPOUT_RATE = 0.5
adam_embed_128_hyperparams.HIDDEN_DIM = 175

adam_embed_128_hyperparams.EMBEDDING_DIM = 128
_ = train_and_test_model_with_hparams(adam_embed_128_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 8,000,476 trainable parameters
training...: 100%|██████████| 365/365 [00:09<00:00, 37.74it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 75.98it/s]
epoch: 1
train_loss: 0.472, train_acc: 0.752
valid_loss: 0.320, valid_acc: 0.868
training...: 100%|██████████| 365/365 [00:09<00:00, 38.18it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.13it/s]
epoch: 2
train_loss: 0.201, train_acc: 0.926
valid_loss: 0.334, valid_acc: 0.864
training...: 100%|██████████| 365/365 [00:09<00:00, 38.40it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.06it/s]
epoch: 3
train_loss: 0.088, train_acc: 0.970
valid_loss: 0.365, valid_acc: 0.864
training...: 100%|██████████| 365/365 [00:09<00:00, 38.10it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.18it/s]
epoch: 4
train_loss: 0.055, train_acc: 0.981
valid_loss: 0.522, valid_acc: 0.869
tra

In [48]:
adam_embed_256_hyperparams = HyperParams()
adam_embed_256_hyperparams.OPTIM = "adam"
adam_embed_256_hyperparams.LR = 0.001
adam_embed_256_hyperparams.N_LAYERS = 1
#adam_embed_256_hyperparams.DROPOUT_RATE = 0.5
adam_embed_256_hyperparams.HIDDEN_DIM = 175

adam_embed_256_hyperparams.EMBEDDING_DIM = 256
_ = train_and_test_model_with_hparams(adam_embed_256_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 15,876,700 trainable parameters
training...: 100%|██████████| 365/365 [00:12<00:00, 30.27it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 69.80it/s]
epoch: 1
train_loss: 0.444, train_acc: 0.774
valid_loss: 0.304, valid_acc: 0.875
training...: 100%|██████████| 365/365 [00:11<00:00, 31.30it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 73.56it/s]
epoch: 2
train_loss: 0.200, train_acc: 0.927
valid_loss: 0.411, valid_acc: 0.845
training...: 100%|██████████| 365/365 [00:11<00:00, 31.76it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 73.74it/s]
epoch: 3
train_loss: 0.093, train_acc: 0.969
valid_loss: 0.377, valid_acc: 0.874
training...: 100%|██████████| 365/365 [00:11<00:00, 31.54it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 75.11it/s]
epoch: 4
train_loss: 0.053, train_acc: 0.984
valid_loss: 0.484, valid_acc: 0.853
tr

### Lab 2(f) Compound scaling of embedding_dim, hidden_dim, layers

EfficientNet:

```
params_dict = {
      # (width_coefficient, depth_coefficient, resolution, dropout_rate)
      'efficientnet-b0': (1.0, 1.0, 224, 0.2),
      'efficientnet-b1': (1.0, 1.1, 240, 0.2),
      'efficientnet-b2': (1.1, 1.2, 260, 0.3),
      'efficientnet-b3': (1.2, 1.4, 300, 0.3),
      'efficientnet-b4': (1.4, 1.8, 380, 0.4),
      'efficientnet-b5': (1.6, 2.2, 456, 0.4),
      'efficientnet-b6': (1.8, 2.6, 528, 0.5),
      'efficientnet-b7': (2.0, 3.1, 600, 0.5),
}
```




In [49]:
#Perform gri
compounding_hyperparams = adam_optimizer_hyperparams
compounding_hyperparams.OPTIM = "adam"
compounding_hyperparams.LR = 0.001
compounding_hyperparams.N_LAYERS = 1
#adam_embed_256_hyperparams.DROPOUT_RATE = 0.5
compounding_hyperparams.HIDDEN_DIM = 25
compounding_hyperparams.EMBEDDING_DIM = 1

#Numbers used in EfficientNet paper
#depth coefficient * width coefficient^2 * embed_coefficient ^2 = 2
depth_coefficient = 1.4
width_coefficient = 1.1
resolution_coefficient = 1.1

current_n_layers= compounding_hyperparams.N_LAYERS
current_hidden_dim = compounding_hyperparams.HIDDEN_DIM
current_embedding_dim = compounding_hyperparams.EMBEDDING_DIM

for i in range(1,5):
  current_n_layers = current_n_layers * depth_coefficient
  current_hidden_dim = current_hidden_dim * width_coefficient
  current_embedding_dim = current_embedding_dim * resolution_coefficient
  compounding_hyperparams.N_LAYERS = round(current_n_layers)
  compounding_hyperparams.HIDDEN_DIM = round(current_hidden_dim)
  compounding_hyperparams.EMBEDDING_DIM = round(current_embedding_dim)
  print(f'N_LAYERS: {compounding_hyperparams.N_LAYERS }')
  print(f'HIDDEN_DIM: {compounding_hyperparams.HIDDEN_DIM }')
  print(f'EMBEDDING_DIM: {compounding_hyperparams.EMBEDDING_DIM }')
  _ = train_and_test_model_with_hparams(compounding_hyperparams, "lstm_1layer_base_sgd_e32_h100")
  print()

N_LAYERS: 1
HIDDEN_DIM: 28
EMBEDDING_DIM: 1
shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 64,363 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 50.26it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.90it/s]
epoch: 1
train_loss: 0.596, train_acc: 0.634
valid_loss: 0.339, valid_acc: 0.858
training...: 100%|██████████| 365/365 [00:07<00:00, 52.11it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.38it/s]
epoch: 2
train_loss: 0.232, train_acc: 0.911
valid_loss: 0.278, valid_acc: 0.887
training...: 100%|██████████| 365/365 [00:06<00:00, 52.64it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 89.15it/s]
epoch: 3
train_loss: 0.113, train_acc: 0.964
valid_loss: 0.289, valid_acc: 0.891
training...: 100%|██████████| 365/365 [00:07<00:00, 51.17it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.22it/s]
epoch: 4
train_loss: 0.055, train_acc: 0.98

In [52]:
#Perform gri
compounding_hyperparams = HyperParams()
compounding_hyperparams.OPTIM = "adam"
compounding_hyperparams.LR = 0.001
compounding_hyperparams.N_LAYERS = 1
#adam_embed_256_hyperparams.DROPOUT_RATE = 0.5
compounding_hyperparams.HIDDEN_DIM = 100
compounding_hyperparams.EMBEDDING_DIM = 1

#Numbers used in EfficientNet paper
#depth coefficient * width coefficient^2 * embed_coefficient ^2 = 2
depth_coefficient = 1.2
width_coefficient = 1.1
resolution_coefficient = 1.15

current_n_layers= compounding_hyperparams.N_LAYERS
current_hidden_dim = compounding_hyperparams.HIDDEN_DIM
current_embedding_dim = compounding_hyperparams.EMBEDDING_DIM

for i in range(1,5):
  current_n_layers = current_n_layers * depth_coefficient
  current_hidden_dim = current_hidden_dim * width_coefficient
  current_embedding_dim = current_embedding_dim * resolution_coefficient
  compounding_hyperparams.N_LAYERS = round(current_n_layers)
  compounding_hyperparams.HIDDEN_DIM = round(current_hidden_dim)
  compounding_hyperparams.EMBEDDING_DIM = round(current_embedding_dim)
  print(f'N_LAYERS: {compounding_hyperparams.N_LAYERS }')
  print(f'HIDDEN_DIM: {compounding_hyperparams.HIDDEN_DIM }')
  print(f'EMBEDDING_DIM: {compounding_hyperparams.EMBEDDING_DIM }')
  _ = train_and_test_model_with_hparams(compounding_hyperparams, "lstm_1layer_base_sgd_e32_h100")
  print()

N_LAYERS: 1
HIDDEN_DIM: 110
EMBEDDING_DIM: 1
shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 110,775 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 45.80it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.72it/s]
epoch: 1
train_loss: 0.642, train_acc: 0.603
valid_loss: 0.337, valid_acc: 0.858
training...: 100%|██████████| 365/365 [00:07<00:00, 47.16it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 71.44it/s]
epoch: 2
train_loss: 0.257, train_acc: 0.899
valid_loss: 0.268, valid_acc: 0.889
training...: 100%|██████████| 365/365 [00:11<00:00, 32.52it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 82.43it/s]
epoch: 3
train_loss: 0.136, train_acc: 0.955
valid_loss: 0.287, valid_acc: 0.886
training...: 100%|██████████| 365/365 [00:07<00:00, 47.62it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 80.31it/s]
epoch: 4
train_loss: 0.076, train_acc: 0.

In [53]:
#Perform gri
compounding_hyperparams = HyperParams()
compounding_hyperparams.OPTIM = "adam"
compounding_hyperparams.LR = 0.001
compounding_hyperparams.N_LAYERS = 1
#adam_embed_256_hyperparams.DROPOUT_RATE = 0.5
compounding_hyperparams.HIDDEN_DIM = 100
compounding_hyperparams.EMBEDDING_DIM = 1

#Numbers used in EfficientNet paper
#depth coefficient * width coefficient^2 * embed_coefficient ^2 = 2
depth_coefficient = 1.4
width_coefficient = 1.1
resolution_coefficient = 1.1

current_n_layers= compounding_hyperparams.N_LAYERS
current_hidden_dim = compounding_hyperparams.HIDDEN_DIM
current_embedding_dim = compounding_hyperparams.EMBEDDING_DIM

for i in range(1,5):
  current_n_layers = current_n_layers * depth_coefficient
  current_hidden_dim = current_hidden_dim * width_coefficient
  current_embedding_dim = current_embedding_dim * resolution_coefficient
  compounding_hyperparams.N_LAYERS = round(current_n_layers)
  compounding_hyperparams.HIDDEN_DIM = round(current_hidden_dim)
  compounding_hyperparams.EMBEDDING_DIM = round(current_embedding_dim)
  print(f'N_LAYERS: {compounding_hyperparams.N_LAYERS }')
  print(f'HIDDEN_DIM: {compounding_hyperparams.HIDDEN_DIM }')
  print(f'EMBEDDING_DIM: {compounding_hyperparams.EMBEDDING_DIM }')
  _ = train_and_test_model_with_hparams(compounding_hyperparams, "lstm_1layer_base_sgd_e32_h100")
  print()

N_LAYERS: 1
HIDDEN_DIM: 110
EMBEDDING_DIM: 1
shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 110,775 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 42.67it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.06it/s]
epoch: 1
train_loss: 0.642, train_acc: 0.603
valid_loss: 0.337, valid_acc: 0.858
training...: 100%|██████████| 365/365 [00:08<00:00, 43.22it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 38.94it/s]
epoch: 2
train_loss: 0.257, train_acc: 0.899
valid_loss: 0.268, valid_acc: 0.889
training...: 100%|██████████| 365/365 [00:08<00:00, 44.63it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 83.74it/s]
epoch: 3
train_loss: 0.136, train_acc: 0.955
valid_loss: 0.287, valid_acc: 0.886
training...: 100%|██████████| 365/365 [00:07<00:00, 46.81it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.35it/s]
epoch: 4
train_loss: 0.076, train_acc: 0.

In [59]:
#Perform gri
compounding_hyperparams = HyperParams()
compounding_hyperparams.OPTIM = "adam"
compounding_hyperparams.LR = 0.001
compounding_hyperparams.N_LAYERS = 1
#adam_embed_256_hyperparams.DROPOUT_RATE = 0.5
compounding_hyperparams.HIDDEN_DIM = 150
compounding_hyperparams.EMBEDDING_DIM = 1

#Numbers used in EfficientNet paper
#depth coefficient * width coefficient^2 * embed_coefficient ^2 = 2
depth_coefficient = 1.4
width_coefficient = 1.1
resolution_coefficient = 1.1

current_n_layers= compounding_hyperparams.N_LAYERS
current_hidden_dim = compounding_hyperparams.HIDDEN_DIM
current_embedding_dim = compounding_hyperparams.EMBEDDING_DIM

for i in range(1,5):
  current_n_layers = current_n_layers * depth_coefficient
  current_hidden_dim = current_hidden_dim * width_coefficient
  current_embedding_dim = current_embedding_dim * resolution_coefficient
  compounding_hyperparams.N_LAYERS = round(current_n_layers)
  compounding_hyperparams.HIDDEN_DIM = round(current_hidden_dim)
  compounding_hyperparams.EMBEDDING_DIM = round(current_embedding_dim)
  print(f'N_LAYERS: {compounding_hyperparams.N_LAYERS }')
  print(f'HIDDEN_DIM: {compounding_hyperparams.HIDDEN_DIM }')
  print(f'EMBEDDING_DIM: {compounding_hyperparams.EMBEDDING_DIM }')
  _ = train_and_test_model_with_hparams(compounding_hyperparams, "lstm_1layer_base_sgd_e32_h100")
  print()

N_LAYERS: 1
HIDDEN_DIM: 165
EMBEDDING_DIM: 1
shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 172,045 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 44.21it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 77.53it/s]
epoch: 1
train_loss: 0.643, train_acc: 0.597
valid_loss: 0.363, valid_acc: 0.839
training...: 100%|██████████| 365/365 [00:08<00:00, 45.59it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 79.34it/s]
epoch: 2
train_loss: 0.283, train_acc: 0.888
valid_loss: 0.287, valid_acc: 0.883
training...: 100%|██████████| 365/365 [00:08<00:00, 44.50it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 75.84it/s]
epoch: 3
train_loss: 0.145, train_acc: 0.949
valid_loss: 0.320, valid_acc: 0.889
training...: 100%|██████████| 365/365 [00:08<00:00, 42.06it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 75.00it/s]
epoch: 4
train_loss: 0.085, train_acc: 0.

In [60]:
#Perform gri
compounding_hyperparams = HyperParams()
compounding_hyperparams.OPTIM = "adam"
compounding_hyperparams.LR = 0.001
compounding_hyperparams.N_LAYERS = 1
#adam_embed_256_hyperparams.DROPOUT_RATE = 0.5
compounding_hyperparams.HIDDEN_DIM = 20
compounding_hyperparams.EMBEDDING_DIM = 1

#Numbers used in EfficientNet paper
#depth coefficient * width coefficient^2 * embed_coefficient ^2 = 2
depth_coefficient = 1.2
width_coefficient = 1.3
resolution_coefficient = 1

current_n_layers= compounding_hyperparams.N_LAYERS
current_hidden_dim = compounding_hyperparams.HIDDEN_DIM
current_embedding_dim = compounding_hyperparams.EMBEDDING_DIM

for i in range(1,5):
  current_n_layers = current_n_layers * depth_coefficient
  current_hidden_dim = current_hidden_dim * width_coefficient
  current_embedding_dim = current_embedding_dim * resolution_coefficient
  compounding_hyperparams.N_LAYERS = round(current_n_layers)
  compounding_hyperparams.HIDDEN_DIM = round(current_hidden_dim)
  compounding_hyperparams.EMBEDDING_DIM = round(current_embedding_dim)
  print(f'N_LAYERS: {compounding_hyperparams.N_LAYERS }')
  print(f'HIDDEN_DIM: {compounding_hyperparams.HIDDEN_DIM }')
  print(f'EMBEDDING_DIM: {compounding_hyperparams.EMBEDDING_DIM }')
  _ = train_and_test_model_with_hparams(compounding_hyperparams, "lstm_1layer_base_sgd_e32_h100")
  print()

N_LAYERS: 1
HIDDEN_DIM: 26
EMBEDDING_DIM: 1
shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 63,903 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 45.06it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.08it/s]
epoch: 1
train_loss: 0.590, train_acc: 0.634
valid_loss: 0.343, valid_acc: 0.859
training...: 100%|██████████| 365/365 [00:07<00:00, 49.97it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 89.20it/s]
epoch: 2
train_loss: 0.234, train_acc: 0.909
valid_loss: 0.307, valid_acc: 0.877
training...: 100%|██████████| 365/365 [00:07<00:00, 49.16it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 86.58it/s]
epoch: 3
train_loss: 0.119, train_acc: 0.961
valid_loss: 0.274, valid_acc: 0.895
training...: 100%|██████████| 365/365 [00:07<00:00, 50.95it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.45it/s]
epoch: 4
train_loss: 0.061, train_acc: 0.98

In [70]:
#Perform gri
compounding_hyperparams = HyperParams()
compounding_hyperparams.OPTIM = "adam"
compounding_hyperparams.LR = 0.001
compounding_hyperparams.N_LAYERS = 1
compounding_hyperparams.DROPOUT_RATE = 0.5
compounding_hyperparams.HIDDEN_DIM = 100
compounding_hyperparams.EMBEDDING_DIM = 1

#Numbers used in EfficientNet paper
#depth coefficient * width coefficient^2 * embed_coefficient ^2 = 2
depth_coefficient = 1.15
width_coefficient = 1.1
resolution_coefficient = 1.35

current_n_layers= compounding_hyperparams.N_LAYERS
current_hidden_dim = compounding_hyperparams.HIDDEN_DIM
current_embedding_dim = compounding_hyperparams.EMBEDDING_DIM

for i in range(1,10):
  current_n_layers = current_n_layers * depth_coefficient
  current_hidden_dim = current_hidden_dim * width_coefficient
  current_embedding_dim = current_embedding_dim * resolution_coefficient
  compounding_hyperparams.N_LAYERS = round(current_n_layers)
  compounding_hyperparams.HIDDEN_DIM = round(current_hidden_dim)
  compounding_hyperparams.EMBEDDING_DIM = round(current_embedding_dim)
  print(f'N_LAYERS: {compounding_hyperparams.N_LAYERS }')
  print(f'HIDDEN_DIM: {compounding_hyperparams.HIDDEN_DIM }')
  print(f'EMBEDDING_DIM: {compounding_hyperparams.EMBEDDING_DIM }')
  _ = train_and_test_model_with_hparams(compounding_hyperparams, "lstm_1layer_base_sgd_e32_h100")
  print()

N_LAYERS: 1
HIDDEN_DIM: 110
EMBEDDING_DIM: 1
shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 110,775 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 45.21it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 86.25it/s]
epoch: 1
train_loss: 0.687, train_acc: 0.546
valid_loss: 0.658, valid_acc: 0.647
training...: 100%|██████████| 365/365 [00:07<00:00, 46.82it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 85.46it/s]
epoch: 2
train_loss: 0.389, train_acc: 0.839
valid_loss: 0.294, valid_acc: 0.877
training...: 100%|██████████| 365/365 [00:07<00:00, 46.69it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.42it/s]
epoch: 3
train_loss: 0.172, train_acc: 0.939
valid_loss: 0.297, valid_acc: 0.875
training...: 100%|██████████| 365/365 [00:08<00:00, 41.83it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 85.35it/s]
epoch: 4
train_loss: 0.091, train_acc: 0.

In [73]:
#Perform gri
compounding_hyperparams = HyperParams()
compounding_hyperparams.OPTIM = "adam"
compounding_hyperparams.LR = 0.001
compounding_hyperparams.N_LAYERS = 1
compounding_hyperparams.HIDDEN_DIM = 175
compounding_hyperparams.EMBEDDING_DIM = 1

#Numbers used in EfficientNet paper
#depth coefficient * width coefficient^2 * embed_coefficient ^2 = 2
depth_coefficient = 1.1
width_coefficient = 1.1
resolution_coefficient = 1.25

current_n_layers= compounding_hyperparams.N_LAYERS
current_hidden_dim = compounding_hyperparams.HIDDEN_DIM
current_embedding_dim = compounding_hyperparams.EMBEDDING_DIM

for i in range(1,10):
  current_n_layers = current_n_layers * depth_coefficient
  current_hidden_dim = current_hidden_dim * width_coefficient
  current_embedding_dim = current_embedding_dim * resolution_coefficient
  compounding_hyperparams.N_LAYERS = round(current_n_layers)
  compounding_hyperparams.HIDDEN_DIM = round(current_hidden_dim)
  compounding_hyperparams.EMBEDDING_DIM = round(current_embedding_dim)
  print(f'N_LAYERS: {compounding_hyperparams.N_LAYERS }')
  print(f'HIDDEN_DIM: {compounding_hyperparams.HIDDEN_DIM }')
  print(f'EMBEDDING_DIM: {compounding_hyperparams.EMBEDDING_DIM }')
  _ = train_and_test_model_with_hparams(compounding_hyperparams, "lstm_1layer_base_sgd_e32_h100")
  print()

N_LAYERS: 1
HIDDEN_DIM: 193
EMBEDDING_DIM: 1
shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 212,533 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 43.01it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 74.83it/s]
epoch: 1
train_loss: 0.626, train_acc: 0.612
valid_loss: 0.381, valid_acc: 0.851
training...: 100%|██████████| 365/365 [00:09<00:00, 39.76it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.36it/s]
epoch: 2
train_loss: 0.289, train_acc: 0.884
valid_loss: 0.310, valid_acc: 0.867
training...: 100%|██████████| 365/365 [00:08<00:00, 44.14it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.66it/s]
epoch: 3
train_loss: 0.162, train_acc: 0.945
valid_loss: 0.330, valid_acc: 0.871
training...: 100%|██████████| 365/365 [00:08<00:00, 42.98it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 76.77it/s]
epoch: 4
train_loss: 0.092, train_acc: 0.

### Lab 2 (g) Bi-Directional LSTM, using best architecture from (f)

In [24]:
adam_birectional_hyperparams = HyperParams()
adam_birectional_hyperparams.OPTIM = "adam"
adam_birectional_hyperparams.LR = 0.001
adam_birectional_hyperparams.N_LAYERS = 1
adam_birectional_hyperparams.HIDDEN_DIM = 30
adam_birectional_hyperparams.EMBEDDING_DIM = 1

adam_birectional_hyperparams.BIDIRECTIONAL = True
_ = train_and_test_model_with_hparams(adam_birectional_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60833
The model has 68,815 trainable parameters
training...: 100%|██████████| 365/365 [00:09<00:00, 40.43it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 78.28it/s]
epoch: 1
train_loss: 0.620, train_acc: 0.608
valid_loss: 0.361, valid_acc: 0.851
training...: 100%|██████████| 365/365 [00:08<00:00, 40.79it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 79.21it/s]
epoch: 2
train_loss: 0.271, train_acc: 0.897
valid_loss: 0.273, valid_acc: 0.890
training...: 100%|██████████| 365/365 [00:08<00:00, 41.14it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 79.17it/s]
epoch: 3
train_loss: 0.151, train_acc: 0.950
valid_loss: 0.291, valid_acc: 0.889
training...: 100%|██████████| 365/365 [00:08<00:00, 40.76it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 78.77it/s]
epoch: 4
train_loss: 0.090, train_acc: 0.973
valid_loss: 0.322, valid_acc: 0.894
traini