### Importing essential libraries

In [1]:
import time
import pickle
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 700) 

from matplotlib import pyplot as plt
plt.ioff()
%matplotlib agg
# !pip install seaborn
import seaborn as sns
import random, os

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch import nn
from torch.nn import Sequential, Module, ModuleList, Linear, Dropout
from torch.nn import ReLU, LeakyReLU, Sigmoid, GELU
from torch.optim import SGD, Adam, Adamax
from torch.optim.lr_scheduler import ReduceLROnPlateau, CyclicLR, LinearLR
# !pip install torchmetrics
from torchmetrics import F1Score, Precision, Recall, Accuracy, ConfusionMatrix

# !pip install torchtext
import torchtext

# !pip install colorama
from colorama import init, Fore, Style
init(autoreset=True)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

[nltk_data] Downloading package stopwords to /home/giorgo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/giorgo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/giorgo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/giorgo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


### Google Drive and Relative Paths

In [2]:
# from google.colab import drive

# drive.mount('/content/gdrive')

# ROOT = 'gdrive/My Drive/ai2_data/2/'  
ROOT = './'

DATASETS     = ROOT + 'datasets/'
VECTORIZERS  = ROOT + 'vectorizers/'
GLOVE        = ROOT + 'glove/'
SAVED_MODELS = ROOT + 'saved_models/'
PLOTS        = ROOT + 'plots/'

### Helpers

In [3]:
MAX_VECTORS = 50000
def write_data(path, obj):
    with open(path, 'wb') as file:
        pickle.dump(obj, file)

def load_data(path):
    return pd.read_pickle(path)

def load_csv(path, sep='\t'):
    return pd.read_csv(path, sep=sep)

def prepare_emb(dim=300):
    glove = torchtext.vocab.GloVe(name="6B", dim=300, max_vectors=MAX_VECTORS)
    glove.vectors = torch.cat([glove.vectors, torch.zeros(1,300)], dim=0)
    return glove

### Reviews Cleanup and Train-Test split

In [4]:
def clean_df(df, glove_path):
    df['new_rating'] = df['rating'] >= 7

    sw = stopwords.words('english') 

    sw.append('movie')
    sw.append('film')
    sw.append('br')
    
    sw.remove('not')
    sw.remove('no')

    sw = set(sw)

    df['clean_review'] = df['review'].str.lower()
    df['clean_review'] = df['clean_review'].str.replace('n\'t', ' not')
    df['clean_review'] = df['clean_review'].str.replace(r'(@\S+)|(#\S+)|(http\S+)|(www.\S+)', ' ', regex=True)
    df['clean_review'] = df['clean_review'].str.replace(r'[^a-z]', ' ', regex=True)
    df['clean_review'] = df['clean_review'].map(lambda x: ' '.join([word for word in word_tokenize(x) if not word in sw]))

    glove = pd.read_pickle(glove_path)
    
    def emb(input):
        return np.array([glove.stoi[word] for word in input.split() if word in glove.stoi])
    
    df['emb_review'] = df['clean_review'].apply(emb)

    return df

### Dataset

In [5]:
class ClassifierData(Dataset):
    def __init__(self, dataset, x_label='emb_review', y_label='new_rating', device='cpu', load=True):
        df = load_data(dataset) if load else dataset

        self.inputs = [torch.from_numpy(x).to(device) for x in df[x_label].values]

        self.labels = [1. if i == True else 0. for i in df[y_label].values]
        self.labels = torch.tensor(self.labels).to(device)
        self.labels = self.labels.unsqueeze(1)


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.inputs[index], self.labels[index]

In [6]:
def get_datasets(path, device='cpu'):
    train = ClassifierData(path + 'train.pkl', device=device)
    val   = ClassifierData(path + 'val.pkl', device=device)
    test  = ClassifierData(path + 'test.pkl', device=device)
    return train, val, test

In [7]:
device = 'cuda'

def collate(batch):
    (xs, ys) = zip(*batch)
    return (pad_sequence(xs, batch_first=True, padding_value=MAX_VECTORS), [len(x) for x in xs]), torch.stack(list(ys)).to(device)

def get_dataloaders(train_data, val_data, test_data, batch_size, device='cuda'):
    device = device
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
    val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate)
    return train_dataloader, val_dataloader, test_dataloader

### Train & Test 

In [8]:
def test(dataloader, model, loss_fn, device):
    model.eval()

    samples = len(dataloader)
    loss = 0.0
    predicted, target = [], []
    
    with torch.no_grad():
        for (X, lens), y in dataloader:
            pred = model(X, lens)
            # loss += loss_fn(pred, y).item()
            loss += loss_fn(pred, y).item()
       
            predicted.append(pred)             
            target.append(y)             
    
    predicted = torch.cat(predicted, dim=0).to(device)
    target = torch.cat(target, dim=0).to(device)

    p = Precision(task="binary", num_classes=2).to(device)
    r = Recall(task="binary", num_classes=2).to(device)
    f1 = F1Score(task="binary", num_classes=2).to(device)
    acc = Accuracy(task="binary", num_classes=2).to(device)

    p  =  p(predicted, target).item()
    r  =  r(predicted, target).item()
    f1 = f1(predicted, target).item()

    return loss / samples, p, r, f1, acc(predicted, target).item(), predicted.detach().cpu().numpy(), target.detach().cpu().numpy()

In [None]:
def epoch(model, t, dataloader, loss_fn, optimizer, clip, show_epochs, scheduler, pass_loss, val_dataloader):
    model.train()
    batches = len(dataloader)
    running_loss = 0.0

    if show_epochs:
        print(f"\nEpoch {t+1}\n-------------------------------")

    predicted, target = [], []

    for (X, lens), y in dataloader:
        pred = model(X, lens)   
    
        predicted.append(pred)             
        target.append(y)         

        # loss = loss_fn(pred, y)
        loss = loss_fn(pred, y)
        running_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()

        if clip:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # Clip gradients

        optimizer.step()

    if scheduler:
        if pass_loss:
            scheduler.step(loss)
        else:
            scheduler.step()
    
    epoch_loss = running_loss / batches  
    
    predicted = torch.cat(predicted, dim=0).to(device)
    target = torch.cat(target, dim=0).to(device)
    f1 = F1Score(task="binary", num_classes=2).to(device)
    
    if show_epochs and not val_dataloader:
        print(f"\nEpoch Average Loss: {epoch_loss:>7f}\n")
    
    
    return epoch_loss, f1(predicted, target).item()

In [9]:
def train(dataloader, model, epochs, loss_fn, optimizer, val_dataloader=None, show_epochs=False, device='cpu', **kwargs):
    scheduler = kwargs['scheduler'] if 'scheduler' in kwargs else None
    pass_loss = kwargs['pass_loss'] if 'pass_loss' in kwargs else False
    patience = kwargs['patience'] if 'patience' in kwargs else None 
    clip = kwargs['clip'] if 'clip' in kwargs else None

    if not patience:
        patience = epochs
    else:
        assert(val_dataloader)

    min_loss = float('inf')
    last_change = 0

    metrics = {'loss' : {'train' : [], 'test' : []}, 
               'f1' : {'train' : [], 'test' : []}}

    for t in range(epochs):
        epoch_loss, f1_score = epoch(model, t, dataloader, loss_fn, optimizer, clip, show_epochs, scheduler, pass_loss, val_dataloader)
        metrics['loss']['train'].append(epoch_loss)
        metrics['f1']['train'].append(f1_score)

        if val_dataloader:
            val_loss, _, _, f1, _, _, _ = test(val_dataloader, model, loss_fn, device)
            
            if show_epochs:
                print(f'Loss on train set     : {epoch_loss}')
                print(f'Loss on validation set: {val_loss}')
                print(f'F1-score on validation set: {f1*100:>.2f}')

            last_change += 1

            metrics['loss']['test'].append(val_loss)
            metrics['f1']['test'].append(f1)

            if val_loss < min_loss:
                write_data(f'{SAVED_MODELS}model_{device}.pth', model)
                min_loss = val_loss
                last_change = 0
            
            if patience < last_change:
                if show_epochs:
                    print(f'No improvement on the validation loss for {last_change} > {patience} (patience) epochs.')
                    print(f'Stopping training after [{t+1:>2d}/{epochs:>2d}] epochs.')
                break

    return metrics
            

### Complete pass

#### Reproducibility

In [10]:
def torch_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

#### Learning Curve

In [11]:
# Based on: https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

def helper(train, test, xlabel, ylabel, f1=False, id=None):
    fig, axes = plt.subplots(1, 1, figsize=(8,5))

    axes.set_xlabel('Epoch')
    axes.set_ylabel('F1-Score' if f1 else 'Loss')

    axes.grid()

    epoch_step = len(train) // 20
    if not epoch_step:
        epoch_step = 1

    xvalues = np.arange(1, len(train) + 1, epoch_step)

    axes.plot(xvalues, train[::epoch_step], "o-", color="r", label=xlabel)
    axes.plot(xvalues, test[::epoch_step], "o-", color="g", label=ylabel)

    axes.legend(loc="best")

    plt.xticks(np.arange(1, len(train) + epoch_step + 1, epoch_step))

    max_stat = max(max(train), max(test)) 
    step = 0.05 if f1 else max_stat / 10
    low = 0.50 if f1 else 0.
    high = 1.01 if f1 else max_stat + step

    plt.yticks(np.arange(low, high, step))

    if id:
        name = 'f1' if f1 else 'loss'
        fig.savefig(f'{PLOTS}{name}/{id}_{name}.png', bbox_inches='tight')
        plt.close()
    else:
        plt.show()
    

def learning_curve(metrics, id=None):
    helper(np.array(metrics['f1']['train']), np.array(metrics['f1']['test']), 'Training F1-Score', 'Validation F1-Score', f1=True, id=id)
    helper(np.array(metrics['loss']['train']), np.array(metrics['loss']['test']), 'Training loss', 'Validation loss', id=id)

#### ROC curve

In [12]:
def roc(target, pred, id=None):
    fpr, tpr, _ = roc_curve(target, pred)
    roc_auc = auc(fpr, tpr)

    fig, axes = plt.subplots(1, 1, figsize=(8,5))

    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')

    axes.grid()

    axes.plot(fpr, tpr, '.', color='r', label=f'ROC curve (AUC = {100 * roc_auc : .2f}%)')

    axes.legend(loc="best")

    ticks = np.arange(0, 1.05, 0.2)
    plt.xticks(ticks)
    plt.yticks(ticks)

    if id:
        fig.savefig(f'{PLOTS}/roc/{id}_roc.png', bbox_inches='tight')
        plt.close()
    else:
        plt.show()

#### Pass

In [13]:
def complete_pass(data_path, model, **kwargs):
    # Data 
    batch_size = kwargs['batch_size'] if 'batch_size' in kwargs else 16

    # Architecture
    model_params = kwargs['model_params'] if 'model_params' in kwargs else {}

    # Hyperparameters
    device = kwargs['device'] if 'device' in kwargs else 'cpu'
    epochs = kwargs['epochs'] if 'epochs' in kwargs else 30
    
    loss_fn = kwargs['loss_fn'] if 'loss_fn' in kwargs else nn.BCELoss()
    lr = kwargs['lr'] if 'lr' in kwargs else 0.002

    optimizer = kwargs['optimizer'] if 'optimizer' in kwargs else torch.optim.Adam
    kwargs.pop('optimizer', None)
    optimizer_params = kwargs['optimizer_params'] if 'optimizer_params' in kwargs else {}

    scheduler = kwargs['scheduler'] if 'scheduler' in kwargs else None
    scheduler_params = kwargs['scheduler_params'] if 'scheduler_params' in kwargs else {}
    pass_loss = scheduler_params['pass_loss'] if 'pass_loss' in scheduler_params else False
    patience = kwargs['patience'] if 'patience' in kwargs else None    


    clip = kwargs['clip'] if 'clip' in kwargs else None

    # Results
    show_epochs = kwargs['show_epochs'] if 'show_epochs' in kwargs else False
    cmatrix = kwargs['cmatrix'] if 'cmatrix' in kwargs else False
    lc = kwargs['lc'] if 'lc' in kwargs else False
    auroc = kwargs['auroc'] if 'auroc' in kwargs else False

    validate = kwargs['validate'] if 'validate' in kwargs else False
    reproducibility = kwargs['reproducibility'] if 'reproducibility' in kwargs else False

    id = kwargs['id'] if 'id' in kwargs else None


    if reproducibility:
          torch_seed(seed=5)


    train_data, val_data, test_data = get_datasets(data_path, device=device)
    train_dataloader, val_dataloader, test_dataloader = get_dataloaders(train_data, val_data, test_data, batch_size)
    
    model = model(**model_params).to(device)

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    if not id:
        print(f'The model has {count_parameters(model):,} trainable parameters')

    optimizer = optimizer(model.parameters(), lr=lr, **optimizer_params)

    if scheduler:
        if pass_loss:
            scheduler_params = dict(scheduler_params)
            del scheduler_params['pass_loss']

        scheduler = scheduler(optimizer, **scheduler_params)

    start_time = time.perf_counter()
    metrics = train(train_dataloader, model, epochs, loss_fn, optimizer, val_dataloader=val_dataloader if validate else None,
                    show_epochs=show_epochs, device=device, pass_loss=pass_loss, patience=patience, scheduler=scheduler, clip=clip)
    end_time = time.perf_counter()

    if validate:
        model = load_data(f'{SAVED_MODELS}model_{device}.pth')

    loss, p, r, f1, acc, pred, target = test(test_dataloader, model, loss_fn, device)

    res1 = f'\nTime taken: {end_time - start_time:>0.2f}'
    res2 = f'Avg loss: {loss:>8f}\tAccuracy: {(100*acc):>0.2f}%'
    res3 = f'Precision: {(100*p):>0.2f}%\tRecall: {(100*r):>0.2f}%\t\tF1-Score: {(100*f1):>0.2f}%'

    if id:
        with open('results.txt', 'a') as file:
            desc = model.get_desc()
            desc += f'Clip range           : ({-clip}, {clip})\n'
            desc += f'Trainable Parameters : {count_parameters(model):,}\n'
            desc += f'Batch size           : {batch_size}\n'
            desc += f'Optimizer            : Adam\n'
            desc += f'Optimizer Parameters : {optimizer_params}\n'
            desc += f'Scheduler            : {scheduler}\n'
            desc += f'Scheduler Parameters : {scheduler_params}\n'
            desc += f'Patience             : {patience}\n'

            file.write(f'Model ID : {id}\n{desc}\nResults:\n{res1}\t{res2}\n{res3}\n\n\n')
    
    print(f'Model ID : {id}{res1}\t{res2}\n{res3}')

    if lc:
        learning_curve(metrics, id)

    if auroc:
        roc(target, pred, id)

### Neural Networks

#### Feed Forward

In [14]:
class NeuralNetwork(Module):
    def __init__(self, in_dim, **kwargs):
        super(NeuralNetwork, self).__init__()

        linear_layers = [in_dim] + (kwargs['layers'] if 'layers' in kwargs else []) + [1]
        num_of_layers = len(linear_layers) - 1

        activation = kwargs['activation'] if 'activation' in kwargs else None
        dropout = kwargs['dropout'] if 'dropout' in kwargs else None

        if dropout:
            if type(dropout) != list:
                dropout = [dropout] * num_of_layers

            assert(len(dropout) == num_of_layers)

        if activation:
            if type(activation) != list:
                activation = [activation] * num_of_layers

            assert(len(activation) == num_of_layers)


        self.layers = nn.ModuleList()

        for i in range(num_of_layers):
            if dropout:
                self.layers.append(Dropout(dropout[i]))

            self.layers.append(Linear(linear_layers[i], linear_layers[i + 1]))

            if activation and i != num_of_layers - 1:
                self.layers.append(activation[i]())   

        self.layers.append(Sigmoid())     

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
            
        return x

#### Attention Layer

In [15]:
import torch.nn.functional as F

NEG_INF = -1000000

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.dk = np.sqrt(hidden_size)
    
    def forward(self, Q, K, V, mask=None):

        # Q : N x 2h
        # K : N x E x 2h
        # V : N x E x 2h

        Q = Q.unsqueeze(1)      # N x  1 x 2h
        K = K.transpose(1,2)    # N x 2h x  E

        attention = torch.bmm(Q, K) / self.dk   # N x 1 x E

        if mask != None:
            attention.masked_fill_(mask.view(attention.size()), NEG_INF)

        attention = F.softmax(attention, -1)

        context = torch.bmm(attention, V).squeeze(1) # N x 1 x 2h -> N x 2h

        return context

#### Recurrent

In [29]:
class RNN(nn.Module):

    cells = { "LSTM" : nn.LSTM, "GRU"  : nn.GRU }

    def __init__(self, glove_path, cell_type, emb_dim=300, hidden_size=50, stacked_rnns=2, dropout=0, 
                skip_connections=False, bidirectional=True, attention=False):

        super(RNN, self).__init__()

        glove = pd.read_pickle(glove_path)

        self.dropout = Dropout(dropout)
        
        assert(cell_type in self.cells)

        self.embedding = nn.Embedding.from_pretrained(glove.vectors)
        self.embedding.weight.requires_grad = True

        self.emb_dim          = emb_dim
        self.cell_type        = cell_type
        self.device           = device
        self.stacked_rnns     = stacked_rnns
        self.bidirectional    = bidirectional
        self.skip_connections = skip_connections
        self.dropout          = Dropout(dropout)

        self.rnns = nn.ModuleList()

        self.h_size = hidden_size * (int(bidirectional) + 1)
        
        for i in range(stacked_rnns):
            
            dim = self.h_size if i > 0 else emb_dim

            rnn = self.cells[cell_type](         
                input_size=dim,                 # features for each time step
                hidden_size=hidden_size,        # hidden units
                num_layers = 1,                 # layers on our stacked RNN 
                bidirectional = bidirectional,  # bidirectional rnn
                batch_first=True,               # (batch, time_step, input_size)
            )

            self.rnns.append(rnn)

        self.out        = NeuralNetwork(self.h_size, dropout=dropout)
        self.attention  = Attention(self.h_size) if attention else None
    
    def get_desc(self):
        desc = f'Single {self.cell_type}{" bidirectional" if self.bidirectional else ""} cell'
        
        if self.stacked_rnns > 1:
            desc = f'{self.stacked_rnns} stacked {self.cell_type}{" bidirectional" if self.bidirectional else ""} cells'
        
        desc += f' with {self.h_size} hidden layers\n'

        desc += f'Dropout              : {self.dropout}\n'
        desc += f'Attention            : {True if self.attention else False}\n'
        desc += f'Skip Connections     : {self.skip_connections}\n'
        return desc

    def pack_padded(self, x, lens):
        return pack_padded_sequence(x, lens, batch_first=True, enforce_sorted=False)
    
    def pad_packed(self, r_out):
        out, _ = pad_packed_sequence(r_out, batch_first=True)
        return out

    def forward(self, x, lens):

        embedded = self.embedding(x)
        
        r_out = embedded
        h_n = None

        for i, rnn in enumerate(self.rnns):

            if i + 1 < self.stacked_rnns:
                r_out = self.dropout(r_out)

            inp = self.pack_padded(r_out, lens)

            rnn.flatten_parameters()
            if self.cell_type == 'LSTM':
                r, (h_n, c_n) = rnn(inp) 
            else:
                r, h_n = rnn(inp)

            r = self.pad_packed(r)

            r_out = (r_out + r) / 2 if i > 0 and self.skip_connections else r
        ffn_input = r_out[:,-1,:]
        
        if self.attention:              
            ffn_input = torch.cat((h_n[-1,:,:], h_n[-2,:,:]), dim = 1).to(self.device) if self.bidirectional else h_n.squeeze(0)
            ffn_input = self.attention(Q=ffn_input, K=r_out, V=r_out, mask=x == 0) # self attention         

        return self.out(ffn_input)

In [32]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='LT',
              
              device='cuda', lr=2.5e-4, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 15, 'gamma' : 0.1}
              )

### Experimenting

#### Data Preperation

In [None]:
write_data(GLOVE + 'emb.pkl', prepare_emb())

In [83]:
df = load_csv(DATASETS + 'imdb-reviews.csv')
df['movie_id'] = df['url'].rank(method='dense', ascending=False).astype(int)
df = clean_df(df, GLOVE + 'emb.pkl')

write_data(DATASETS + 'imdb-reviews_clean.pkl', df)

In [84]:
random.seed(10)

df = load_data(DATASETS + 'imdb-reviews_clean.pkl')

ids = random.sample(range(max(df['movie_id']) + 1), 300)

df_test = df[df['movie_id'].isin(ids)] 
df = df[~df['movie_id'].isin(ids)] 
write_data(DATASETS + 'um_train.pkl', df)
write_data(DATASETS + 'um_test.pkl', df_test)

In [30]:
df = load_data(DATASETS + 'imdb-reviews_clean.pkl')

train_data, test_data = train_test_split(df, test_size=0.2, random_state=1, stratify=df['new_rating'].values)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=1, stratify=train_data['new_rating'].values)

write_data(DATASETS + 'train.pkl', train_data)
write_data(DATASETS + 'val.pkl', val_data)
write_data(DATASETS + 'test.pkl', test_data)

#### Experiments

In [28]:
# Clean results.txt file
open('./results.txt', 'w').close()

##### LSTM cell

In [29]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L01',
              
              device='cuda', lr=1e-3, epochs=10, patience=2, batch_size=64,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : False, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 64, 'stacked_rnns' : 1, 'dropout' : 0.4
                                      },
              clip=10
              )

Model ID : L01
Time taken: 73.52	Avg loss: 0.288406	Accuracy: 88.79%
Precision: 89.50%	Recall: 87.89%		F1-Score: 88.69%


In [30]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L02',
              
              device='cuda', lr=1e-3, epochs=15, patience=2, batch_size=64,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 64, 'stacked_rnns' : 1, 'dropout' : 0.4
                                      },
              clip=10
              )

Model ID : L02
Time taken: 73.53	Avg loss: 0.258037	Accuracy: 89.48%
Precision: 88.42%	Recall: 90.87%		F1-Score: 89.63%


Observing the curves, we notice overfitting happening, which is the first problem we must tackle.

In [31]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L03',
              
              device='cuda', lr=1e-3, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 64, 'stacked_rnns' : 1, 'dropout' : 0.4
                                      },
              clip=10
              )

Model ID : L03
Time taken: 41.93	Avg loss: 0.263777	Accuracy: 89.41%
Precision: 88.89%	Recall: 90.09%		F1-Score: 89.49%


In [32]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L04',
              
              device='cuda', lr=1e-3, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 64, 'stacked_rnns' : 1, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L04
Time taken: 33.68	Avg loss: 0.272143	Accuracy: 88.78%
Precision: 89.52%	Recall: 87.85%		F1-Score: 88.68%


In [33]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L05',
              
              device='cuda', lr=1e-3, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 1, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L05
Time taken: 50.83	Avg loss: 0.271012	Accuracy: 88.88%
Precision: 88.74%	Recall: 89.07%		F1-Score: 88.90%


In [34]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L06',
              
              device='cuda', lr=1e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 1, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L06
Time taken: 156.74	Avg loss: 0.276541	Accuracy: 88.99%
Precision: 89.73%	Recall: 88.07%		F1-Score: 88.89%


In [35]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L07',
              
              device='cuda', lr=5e-5, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 1, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L07
Time taken: 158.09	Avg loss: 0.313614	Accuracy: 87.15%
Precision: 87.06%	Recall: 87.27%		F1-Score: 87.17%


Initially, we try increasing the batch size, dropout, and hidden size while decreasing the learning rate to address the overfitting. 

With a learning rate of `1e-4`, we observe a reduction in overfitting, but it comes with a decrease in performance. 

Therefore, we try more complex models to improve performance while avoiding overfitting.

In [36]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L08',
              
              device='cuda', lr=5e-5, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L08
Time taken: 695.59	Avg loss: 0.332228	Accuracy: 86.48%
Precision: 86.92%	Recall: 85.90%		F1-Score: 86.40%


In [38]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L09',
              
              device='cuda', lr=5e-5, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 3, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L09
Time taken: 1227.00	Avg loss: 0.373562	Accuracy: 84.80%
Precision: 81.59%	Recall: 89.89%		F1-Score: 85.54%


We stack `2` and then `3` RNNs, which eliminates overfitting but also results in poor performance. 

We need to find a better learning rate to improve performance without overfitting.

In [39]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L10',
              
              device='cuda', lr=1e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L10
Time taken: 691.37	Avg loss: 0.295468	Accuracy: 88.29%
Precision: 88.76%	Recall: 87.69%		F1-Score: 88.22%


In [40]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L11',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L11
Time taken: 455.74	Avg loss: 0.263240	Accuracy: 89.56%
Precision: 86.74%	Recall: 93.40%		F1-Score: 89.95%


In [43]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L10_1',
              
              device='cuda', lr=2e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L10_1
Time taken: 503.88	Avg loss: 0.297660	Accuracy: 88.10%
Precision: 85.19%	Recall: 92.25%		F1-Score: 88.58%


We try various learning rates and finally find a good balance with a learning rate of `1e-5`, which produces a very nice curve (`L10`). However, the performance is still underwhelming, so we increase the learning rate, which results in overfitting (`L11`).

We then attempt to balance the performance and overfitting by adjusting the learning rate, but it doesn't work well (`L10_1`).

In [41]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L12',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 3, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L12
Time taken: 644.08	Avg loss: 0.364870	Accuracy: 87.79%
Precision: 84.41%	Recall: 92.71%		F1-Score: 88.37%


In [42]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L13',
              
              device='cuda', lr=1e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 3, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L13
Time taken: 979.57	Avg loss: 0.356474	Accuracy: 86.14%
Precision: 83.69%	Recall: 89.78%		F1-Score: 86.63%


In [44]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L14',
              
              device='cuda', lr=1e-4, epochs=15, patience=4, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 3, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L14
Time taken: 1137.85	Avg loss: 0.356474	Accuracy: 86.14%
Precision: 83.69%	Recall: 89.78%		F1-Score: 86.63%


In [45]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L15',
              
              device='cuda', lr=1e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : True, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 3, 'dropout' : 0.8
                                      },
              clip=10
              )

Model ID : L15
Time taken: 910.72	Avg loss: 0.357312	Accuracy: 85.76%
Precision: 83.91%	Recall: 88.49%		F1-Score: 86.14%


Having `3` stacked RNNs didn't seem to work, even with skip connections.

Although we achieved no overfitting, the performance was disappointing compared to our best model yet (`L10`).

We will try fixing `L11` using a learning rate optimizer

We choose `StepLR` to decrease the learning rate after giving the model a chance to learn

In [50]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L16',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 6, 'gamma' : 0.1}
              )

Model ID : L16
Time taken: 415.02	Avg loss: 0.275130	Accuracy: 89.60%
Precision: 87.49%	Recall: 92.43%		F1-Score: 89.89%


In [51]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L17',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 6, 'gamma' : 0.05}
              )

Model ID : L17
Time taken: 411.64	Avg loss: 0.275130	Accuracy: 89.60%
Precision: 87.49%	Recall: 92.43%		F1-Score: 89.89%


In [55]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L18',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 4, 'gamma' : 0.2}
              )

Model ID : L18
Time taken: 509.27	Avg loss: 0.281484	Accuracy: 89.27%
Precision: 87.10%	Recall: 92.20%		F1-Score: 89.58%


Resutls were dissapointing, could not stop the overffiting

We will now turn to the mechanism of self (scaled dot product) attention and experiment to see how well we can do by utilizing it.

In [44]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L19',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 4, 'gamma' : 0.2}
              )

Model ID : L19
Time taken: 820.27	Avg loss: 0.451516	Accuracy: 87.30%
Precision: 84.90%	Recall: 90.76%		F1-Score: 87.73%


In [72]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L20',
              
              device='cuda', lr=1e-3, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 4, 'gamma' : 0.2}
              )

Model ID : L20
Time taken: 529.59	Avg loss: 0.324400	Accuracy: 89.42%
Precision: 87.09%	Recall: 92.58%		F1-Score: 89.75%


In [73]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L21',
              
              device='cuda', lr=1e-3, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 1, 'gamma' : 0.1}
              )

Model ID : L21
Time taken: 303.16	Avg loss: 0.422349	Accuracy: 85.26%
Precision: 83.48%	Recall: 87.92%		F1-Score: 85.64%


In the beginning, the model showed signs of overfitting, so we attempted to address this issue. 

We tried using a scheduler to decrease the learning rate, but we eventually realized that this approach was not effective. 

Instead, we decided to simply decrease the initial learning rate.

In [19]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L22',
              
              device='cuda', lr=5e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                                      },
              clip=10,
              )

Model ID : L22
Time taken: 1367.23	Avg loss: 0.284269	Accuracy: 88.31%
Precision: 89.31%	Recall: 87.05%		F1-Score: 88.17%


In [20]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L23',
              
              device='cuda', lr=7.5e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                                      },
              clip=10,
              )

Model ID : L23
Time taken: 1363.71	Avg loss: 0.263074	Accuracy: 89.20%
Precision: 89.33%	Recall: 89.05%		F1-Score: 89.19%


In [45]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L22_5',
              
              device='cuda', lr=6e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                                      },
              clip=10,
              )

Model ID : L22_5
Time taken: 1046.56	Avg loss: 0.287097	Accuracy: 88.27%
Precision: 86.72%	Recall: 90.38%		F1-Score: 88.51%


With `L22`, we observe a similar behavior to that of `L10`. However, we wanted to see if we could do even better. 

By slightly increasing the learning rate in `L23`, we see some overfitting occurring, but early stopping helped prevent it from worsening. 

As a result, we obtain a very good model that performs approximately `0.5%` better than `L22` and exhibits minimal overfitting. Therefore, `L23` represents the best model thus far.

In [22]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L24',
              
              device='cuda', lr=7.5e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=10,
              )

Model ID : L24
Time taken: 1683.43	Avg loss: 0.333761	Accuracy: 87.05%
Precision: 87.82%	Recall: 86.03%		F1-Score: 86.92%


The attempt to prevent overfitting in `L23` by increasing dropout did not yield any improvement.

We will now test models with different clip rates and see if we can further improve performance 

In [23]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L25',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=5,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 6, 'gamma' : 0.05}
              )

Model ID : L25
Time taken: 428.30	Avg loss: 0.275130	Accuracy: 89.60%
Precision: 87.49%	Recall: 92.43%		F1-Score: 89.89%


We get the same results, which means that the gradients were already within the range of `(-5, 5)` and therefore clipping before did not have any impact.

In [25]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L26',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 6, 'gamma' : 0.05}
              )

Model ID : L26
Time taken: 432.84	Avg loss: 0.274066	Accuracy: 89.40%
Precision: 86.91%	Recall: 92.78%		F1-Score: 89.75%


In [27]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L27',
              
              device='cuda', lr=5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              )

Model ID : L27
Time taken: 530.50	Avg loss: 0.257111	Accuracy: 89.70%
Precision: 87.00%	Recall: 93.36%		F1-Score: 90.07%


In [30]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L28',
              
              device='cuda', lr=2.5e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              )

Model ID : L28
Time taken: 722.04	Avg loss: 0.262367	Accuracy: 89.99%
Precision: 87.93%	Recall: 92.71%		F1-Score: 90.26%


In [36]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L29',
              
              device='cuda', lr=2.5e-4, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              )

Model ID : L29
Time taken: 834.12	Avg loss: 0.262367	Accuracy: 89.99%
Precision: 87.93%	Recall: 92.71%		F1-Score: 90.26%


In [37]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L30',
              
              device='cuda', lr=2.5e-4, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=0.5,
              )

Model ID : L30
Time taken: 600.90	Avg loss: 0.271238	Accuracy: 89.16%
Precision: 86.73%	Recall: 92.47%		F1-Score: 89.51%


In [35]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L31',
              
              device='cuda', lr=2.5e-4, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 8, 'gamma' : 0.2}
              )

Model ID : L31
Time taken: 467.41	Avg loss: 0.309736	Accuracy: 87.45%
Precision: 83.22%	Recall: 93.82%		F1-Score: 88.20%


In [38]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L32',
              
              device='cuda', lr=2.5e-4, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 15, 'gamma' : 0.1}
              )

Model ID : L32
Time taken: 827.70	Avg loss: 0.262367	Accuracy: 89.99%
Precision: 87.93%	Recall: 92.71%		F1-Score: 90.26%


In [30]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L32_5',
              
              device='cuda', lr=2.5e-4, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              optimizer_params={'weight_decay' : 1e-4}
              )

Model ID : L32_5
Time taken: 454.61	Avg loss: 0.295778	Accuracy: 87.85%
Precision: 84.59%	Recall: 92.56%		F1-Score: 88.40%


In [40]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L33',
              
              device='cuda', lr=7.5e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                                      },
              clip=1,
              )

Model ID : L33
Time taken: 1052.47	Avg loss: 0.277182	Accuracy: 88.87%
Precision: 88.29%	Recall: 89.63%		F1-Score: 88.96%


In [41]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L34',
              
              device='cuda', lr=2.5e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                                      },
              clip=1,
              )

Model ID : L33
Time taken: 1856.36	Avg loss: 0.290034	Accuracy: 88.01%
Precision: 87.99%	Recall: 88.05%		F1-Score: 88.02%


In [42]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L35',
              
              device='cuda', lr=5e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                                      },
              clip=1,
              )

Model ID : L35
Time taken: 1455.85	Avg loss: 0.273719	Accuracy: 88.80%
Precision: 88.16%	Recall: 89.65%		F1-Score: 88.90%


In [19]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='L36',
              
              device='cuda', lr=5e-5, epochs=30, patience=2, batch_size=64,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True, 
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              optimizer_params={'weight_decay' : 1e-4}
              )

Model ID : L36
Time taken: 932.55	Avg loss: 0.312180	Accuracy: 88.28%
Precision: 86.17%	Recall: 91.20%		F1-Score: 88.62%


We see that without attention, the model is able to learn quickly but early stopping is essential to avoid overfitting.

With attention, we have managed to develop models that do not overfit and have excellent performance. Therefore, they should be preferred over their non-attention counterparts.

After the effect weight decay (L2 loss) had with the GRU cells, I tried experimenting with it here, but unfortunately it didn't work just as well

##### GRU cell

In [22]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='G01',
              
              device='cuda', lr=1e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'GRU', 
                                        'bidirectional' : False, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 64, 'stacked_rnns' : 1, 'dropout' : 0.8
                                      },
              clip=1
              )

Model ID : G01
Time taken: 81.82	Avg loss: 0.297458	Accuracy: 88.40%
Precision: 88.61%	Recall: 88.14%		F1-Score: 88.37%


In [23]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='G02',
              
              device='cuda', lr=1e-4, epochs=15, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'GRU', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 64, 'stacked_rnns' : 1, 'dropout' : 0.8
                                      },
              clip=1
              )

Model ID : G02
Time taken: 95.84	Avg loss: 0.292253	Accuracy: 88.77%
Precision: 88.00%	Recall: 89.78%		F1-Score: 88.88%


With the same amount of overfitting, the bidirectional model achieves quite the better performance and thus we will continue with it

In [27]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='G03',
              
              device='cuda', lr=1e-4, epochs=15, patience=2, batch_size=64,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'GRU', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              optimizer_params={'weight_decay' : 1e-4}
              )

Model ID : LOOOOO
Time taken: 524.49	Avg loss: 0.262575	Accuracy: 89.34%
Precision: 86.53%	Recall: 93.18%		F1-Score: 89.73%


In [32]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='G04',
              
              device='cuda', lr=1e-4, epochs=15, patience=2, batch_size=128,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'GRU', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              optimizer_params={'weight_decay' : 1e-4}
              )

Model ID : LOOOOO
Time taken: 486.61	Avg loss: 0.285353	Accuracy: 88.29%
Precision: 88.50%	Recall: 88.03%		F1-Score: 88.26%


In [28]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='G05',
              
              device='cuda', lr=5e-5, epochs=15, patience=2, batch_size=64,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'GRU', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True, 
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              optimizer_params={'weight_decay' : 1e-4}
              )

Model ID : G05
Time taken: 671.49	Avg loss: 0.286509	Accuracy: 88.26%
Precision: 86.07%	Recall: 91.29%		F1-Score: 88.61%


In [29]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='G06',
              
              device='cuda', lr=5e-5, epochs=30, patience=2, batch_size=64,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'GRU', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True, 
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              optimizer_params={'weight_decay' : 1e-4}
              )

Model ID : G06
Time taken: 1071.73	Avg loss: 0.269809	Accuracy: 89.41%
Precision: 87.91%	Recall: 91.40%		F1-Score: 89.62%


We will stop here because `G06` is an excellent model

## Testing

### Unknown movies dataset

In [23]:
ROOT = './'

DATASETS     = ROOT + 'datasets/'
GLOVE        = ROOT + 'glove/'
SAVED_MODELS = ROOT + 'saved_models/'
PLOTS        = ROOT + 'plots/'

TRAIN_PATH = DATASETS + 'um_train.pkl' # Should be the .csv file given to us
TEST_PATH  = DATASETS + 'um_test.pkl' # Should be your hidden .csv file

In [24]:
df_train = load_data(TRAIN_PATH)
df_test  = load_data(TEST_PATH)

train_data, val_data = train_test_split(df_train, test_size=0.2, random_state=1, stratify=df_train['new_rating'].values)

write_data(DATASETS + 'train.pkl', train_data)
write_data(DATASETS + 'val.pkl', val_data)
write_data(DATASETS + 'test.pkl', df_test)

In [25]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='UM1',
              
              device='cuda', lr=2.5e-4, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : False,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,

              scheduler=torch.optim.lr_scheduler.StepLR, scheduler_params={'step_size' : 15, 'gamma' : 0.1}
              )

Model ID : UM1
Time taken: 534.28	Avg loss: 0.290639	Accuracy: 88.05%
Precision: 88.04%	Recall: 88.48%		F1-Score: 88.26%


In [26]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='UM2',
              
              device='cuda', lr=2.5e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                                      },
              clip=1,
              )

Model ID : UM2
Time taken: 1064.92	Avg loss: 0.337337	Accuracy: 85.31%
Precision: 85.73%	Recall: 85.24%		F1-Score: 85.49%


In [27]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='UM3',
              
              device='cuda', lr=5e-5, epochs=30, patience=2, batch_size=256,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                                      },
              clip=1,
              )

Model ID : UM3
Time taken: 2106.41	Avg loss: 0.278387	Accuracy: 88.85%
Precision: 88.66%	Recall: 89.49%		F1-Score: 89.07%


In [28]:
complete_pass(data_path=DATASETS, validate=True, reproducibility=True, lc=True, auroc=True, id='UM4',
              
              device='cuda', lr=5e-5, epochs=30, patience=2, batch_size=64,

              model=RNN, model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'GRU', 
                                        'bidirectional' : True, 'skip_connections' : False, 'attention' : True, 
                                        'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                                      },
              clip=1,
              optimizer_params={'weight_decay' : 1e-4}
              )

Model ID : UM4
Time taken: 1329.18	Avg loss: 0.272945	Accuracy: 89.14%
Precision: 87.14%	Recall: 92.22%		F1-Score: 89.61%


Out of the models I have tested, all of them show minimal overfitting. However, I recommend using the models with attention for testing on your dataset. 

Based on my experiments, I expect that the model with the GRU cells and attention will perform the best. 

Overall, it seems that attention has proven to be a beneficial addition to the models.

### Your dataset

#### Pre-training models

In [47]:
def train_best(data_path, model, id, **kwargs):

    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam

    # Architecture
    model_params = kwargs['model_params'] if 'model_params' in kwargs else {}

    # Hyperparameters
    device = kwargs['device'] if 'device' in kwargs else 'cuda'
    batch_size = kwargs['batch_size'] if 'batch_size' in kwargs else 16
    epochs = kwargs['epochs'] if 'epochs' in kwargs else 30
    lr = kwargs['lr'] if 'lr' in kwargs else 0.002
    clip = kwargs['clip'] if 'clip' in kwargs else None
    patience = kwargs['patience'] if 'patience' in kwargs else None

    scheduler = kwargs['scheduler'] if 'scheduler' in kwargs else None
    scheduler_params = kwargs['scheduler_params'] if 'scheduler_params' in kwargs else {}  

    optimizer_params = kwargs['optimizer_params'] if 'optimizer_params' in kwargs else {}

    torch_seed(seed=5)


    df = load_data(data_path)
    train_data, val_data = train_test_split(df, test_size=0.2, random_state=1, stratify=df['new_rating'].values)

    train_data, val_data = ClassifierData(train_data, load=False, device=device), ClassifierData(val_data, load=False, device=device)
    
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
    val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
    
    
    model = model(**model_params).to(device)

    optimizer = optimizer(model.parameters(), lr=lr, **optimizer_params)

    if scheduler:
        scheduler = scheduler(optimizer, **scheduler_params)

    metrics = train(train_dataloader, model, epochs, loss_fn, optimizer, device=device, scheduler=scheduler, clip=clip, 
                                                                   val_dataloader=val_dataloader, patience=patience)
    
    write_data(f'{SAVED_MODELS}{id}.pkl', load_data(f'{SAVED_MODELS}model_{device}.pth'))

    learning_curve(metrics, id)


In [48]:
train_best( data_path=DATASETS + 'imdb-reviews_clean.pkl', model=RNN, id='lstm',
              
            device='cuda', lr=5e-5, epochs=30, patience=2, batch_size=256,

            model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'LSTM', 
                           'bidirectional' : True, 'skip_connections' : False, 'attention' : True,
                           'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.4
                        },
            clip=1,
          )

In [49]:
train_best( data_path=DATASETS + 'imdb-reviews_clean.pkl', model=RNN, id='gru',
              
            device='cuda', lr=5e-5, epochs=30, patience=2, batch_size=64,

            model_params={ 'glove_path' : GLOVE + 'emb.pkl', 'emb_dim' : 300, 'cell_type' : 'GRU', 
                           'bidirectional' : True, 'skip_connections' : False, 'attention' : True, 
                           'hidden_size' : 128, 'stacked_rnns' : 2, 'dropout' : 0.8
                        },
              clip=1,
              optimizer_params={'weight_decay' : 1e-4}
              )

#### Testing

In [22]:
ROOT = './'

DATASETS = ROOT + 'datasets/'
VECTORIZERS = ROOT + ''
GLOVE = ROOT + ''
SAVED_MODELS = ROOT + 'saved_models/'

GRU  = SAVED_MODELS + 'gru.pkl'    # Should be the .pkl file of one provided pre-trained model
LSTM = SAVED_MODELS + 'lstm.pkl'   # Should be the .pkl file of one provided pre-trained model
TEST_PATH  = DATASETS + 'imdb-reviews.csv' # Should be your hidden .csv file

write_data(GLOVE + 'emb.pkl', prepare_emb())


df_test  = load_csv(TEST_PATH)

df_test  = clean_df(df_test, GLOVE + 'emb.pkl')

data = ClassifierData(df_test, device='cuda', load=False)
loader = DataLoader(data, batch_size=64, shuffle=False, collate_fn=collate)


In [23]:
model = load_data(LSTM)

loss, p, r, f1, acc, _, _ = test(loader, model, nn.BCELoss(), 'cuda')

print(f'Avg loss: {loss:>8f}\n{"Accuracy":10s}: {(100*acc):>0.2f}%\t{"F1-Score":10s}: {(100*f1):>0.2f}%')
print(f'{"Precision":10s}: {(100*p):>0.2f}%\t{"Recall":10s}: {(100*r):>0.2f}%')

Avg loss: 0.212785
Accuracy  : 91.61%	F1-Score  : 91.66%
Precision : 91.15%	Recall    : 92.17%


In [None]:
model = load_data(GRU)

loss, p, r, f1, acc, _, _ = test(loader, model, nn.BCELoss(), 'cuda')

print(f'Avg loss: {loss:>8f}\n{"Accuracy":10s}: {(100*acc):>0.2f}%\t{"F1-Score":10s}: {(100*f1):>0.2f}%')
print(f'{"Precision":10s}: {(100*p):>0.2f}%\t{"Recall":10s}: {(100*r):>0.2f}%')