# HW2

# Importing stuff

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
import os
import pickle
from collections import OrderedDict, defaultdict
from copy import deepcopy
from operator import itemgetter
from random import shuffle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from redditscore.tokenizer import CrazyTokenizer
from tqdm import tqdm_notebook

import data_utils
import models
import training_utils
from sklearn import metrics
from sklearn.model_selection import ParameterGrid, train_test_split

In [3]:
import matplotlib.pyplot as plt
import seaborn
from jupyterthemes import jtplot

jtplot.style()

In [4]:
SEED = 24

if torch.cuda.is_available and torch.has_cudnn:
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device("cpu")

# Preparing data

In [23]:
train = pd.read_table('data/snli_train.tsv')
val = pd.read_table('data/snli_val.tsv')

In [25]:
train_tokens1, train_tokens2 = data_utils.tokenize_dataset(train)
train_labels = list(train.label)

val_tokens1, val_tokens2 = data_utils.tokenize_dataset(val)
val_labels = list(val.label)

In [26]:
MAX_VOCAB_SIZE = 30000
EMBED_SIZE = 300
PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 64
MAX_SENT_LENGTH = 70
NUM_CLASSES = 3

In [28]:
ft_embeddings = KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M.vec')

In [29]:
token2id, id2token = data_utils.build_vocab(train_tokens1 + train_tokens2, ft_embeddings)

# Training utilities

In [34]:
init_embeddings = np.zeros((len(token2id), EMBED_SIZE))

In [35]:
for token, id_ in token2id.items():
    if token in ft_embeddings:
        init_embeddings[id_] = ft_embeddings[token]
    elif token == '<unk>':
        init_embeddings[id_] = np.random.normal(size=(300, ))
    elif token == '<pad>':
        init_embeddings[id_] = np.zeros(300)

In [36]:
#with open('init_embeddings.npy', 'rb') as fin:
#    init_embeddings = np.load(fin)

In [37]:
train_dataset = data_utils.TextDataset(train_tokens1, train_tokens2, train_labels, token2id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=data_utils.text_collate_func,
                                           shuffle=True)

val_dataset = data_utils.TextDataset(val_tokens1, val_tokens2, val_labels, token2id)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=data_utils.text_collate_func,
                                           shuffle=True)

In [38]:
del ft_embeddings

# Training model

In [39]:
cnn_encoder = models.CNNEncoder(EMBED_SIZE, 128, len(token2id), PAD_IDX, 3,
                                init_embeddings).to(DEVICE)
rnn_encoder = models.RNNEncoder(EMBED_SIZE, 128, len(token2id), PAD_IDX,
                                init_embeddings).to(DEVICE)
cnn_model = models.InferenceModel(cnn_encoder, NUM_CLASSES, 256).to(DEVICE)
rnn_model = models.InferenceModel(rnn_encoder, NUM_CLASSES, 256).to(DEVICE)

In [40]:
lr = 1e-4
num_epochs = 10000
patience = 5
criterion = torch.nn.CrossEntropyLoss()
cnn_optimizer = torch.optim.Adam(cnn_model.parameters(), lr=lr)
rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr=lr)

In [20]:
losses_train = []
losses_val = []
acc_val = []
best_val_loss = np.inf
counter = 0
for epoch in tqdm_notebook(range(100), desc='epoch'):
    if counter > patience:
        print('Early stopping')
        break
    loss_train = training_utils.train_model(cnn_model, cnn_optimizer, train_loader, criterion)
    losses_train.append(loss_train)
    loss_val, ys_hat, ys = training_utils.eval_model(cnn_model, val_loader, criterion)
    losses_val.append(loss_val)
    acc = metrics.accuracy_score(ys, ys_hat.argmax(axis=1))
    acc_val.append(acc)
    
    if loss_val < best_val_loss:
        best_val_loss = loss_val
        counter = 0
    else:
        counter += 1
    
    print(losses_train, losses_val, acc_val)

HBox(children=(IntProgress(value=0, description='epoch', style=ProgressStyle(description_width='initial')), HT…

[0.9307395218276976] [0.8733707156181335] [0.601]
[0.9307395218276976, 0.8320279796981812] [0.8733707156181335, 0.8431928548812865] [0.601, 0.628]
[0.9307395218276976, 0.8320279796981812, 0.8001272690963727] [0.8733707156181335, 0.8431928548812865, 0.8252369451522827] [0.601, 0.628, 0.635]
[0.9307395218276976, 0.8320279796981812, 0.8001272690963727, 0.7738109193229671] [0.8733707156181335, 0.8431928548812865, 0.8252369451522827, 0.813183289051056] [0.601, 0.628, 0.635, 0.643]
[0.9307395218276976, 0.8320279796981812, 0.8001272690963727, 0.7738109193229671, 0.7496317548942555] [0.8733707156181335, 0.8431928548812865, 0.8252369451522827, 0.813183289051056, 0.8218447961807249] [0.601, 0.628, 0.635, 0.643, 0.626]
[0.9307395218276976, 0.8320279796981812, 0.8001272690963727, 0.7738109193229671, 0.7496317548942555, 0.7271123797607412] [0.8733707156181335, 0.8431928548812865, 0.8252369451522827, 0.813183289051056, 0.8218447961807249, 0.8053658156394957] [0.601, 0.628, 0.635, 0.643, 0.626, 0.639

In [21]:
losses_train = []
losses_val = []
acc_val = []
best_val_loss = np.inf
counter = 0
for epoch in tqdm_notebook(range(100), desc='epoch'):
    if counter > patience:
        print('Early stopping')
        break
    loss_train = training_utils.train_model(rnn_model, rnn_optimizer, train_loader, criterion)
    losses_train.append(loss_train)
    loss_val, ys_hat, ys = training_utils.eval_model(rnn_model, val_loader, criterion)
    losses_val.append(loss_val)
    acc = metrics.accuracy_score(ys, ys_hat.argmax(axis=1))
    acc_val.append(acc)
    
    if loss_val < best_val_loss:
        best_val_loss = loss_val
        counter = 0
    else:
        counter += 1
    
    print(losses_train, losses_val, acc_val)

HBox(children=(IntProgress(value=0, description='epoch', style=ProgressStyle(description_width='initial')), HT…

[0.9832183313369744] [0.9257792291641235] [0.569]
[0.9832183313369744, 0.8971054028129591] [0.9257792291641235, 0.9024311680793762] [0.569, 0.577]
[0.9832183313369744, 0.8971054028129591, 0.8739359276390077] [0.9257792291641235, 0.9024311680793762, 0.8913311462402346] [0.569, 0.577, 0.582]
[0.9832183313369744, 0.8971054028129591, 0.8739359276390077, 0.8559651419448845] [0.9257792291641235, 0.9024311680793762, 0.8913311462402346, 0.8735835337638854] [0.569, 0.577, 0.582, 0.601]
[0.9832183313369744, 0.8971054028129591, 0.8739359276390077, 0.8559651419448845, 0.8398001293945316] [0.9257792291641235, 0.9024311680793762, 0.8913311462402346, 0.8735835337638854, 0.8526374406814575] [0.569, 0.577, 0.582, 0.601, 0.618]
[0.9832183313369744, 0.8971054028129591, 0.8739359276390077, 0.8559651419448845, 0.8398001293945316, 0.8264003778266912] [0.9257792291641235, 0.9024311680793762, 0.8913311462402346, 0.8735835337638854, 0.8526374406814575, 0.8528770031929016] [0.569, 0.577, 0.582, 0.601, 0.618, 0.

[0.9832183313369744, 0.8971054028129591, 0.8739359276390077, 0.8559651419448845, 0.8398001293945316, 0.8264003778266912, 0.8139372484207169, 0.7997757298660283, 0.7846933247756959, 0.7725759757423413, 0.7607841845512375, 0.7503967479133605, 0.7415834348869328, 0.7329282188224797, 0.7244444179344173, 0.716878231906891, 0.7088032433509828, 0.7009190119743344, 0.6929791067123406, 0.6851150783538809] [0.9257792291641235, 0.9024311680793762, 0.8913311462402346, 0.8735835337638854, 0.8526374406814575, 0.8528770031929016, 0.8398826608657838, 0.821358754634857, 0.8191867895126342, 0.8094690351486207, 0.7938524212837219, 0.8011828269958496, 0.7920800127983093, 0.7837953147888184, 0.7730073256492614, 0.7709213757514953, 0.7650820546150207, 0.7769924850463866, 0.7787912626266481, 0.7683150057792663] [0.569, 0.577, 0.582, 0.601, 0.618, 0.626, 0.624, 0.636, 0.639, 0.654, 0.655, 0.654, 0.666, 0.677, 0.677, 0.68, 0.683, 0.661, 0.663, 0.668]
[0.9832183313369744, 0.8971054028129591, 0.8739359276390077,

[0.9832183313369744, 0.8971054028129591, 0.8739359276390077, 0.8559651419448845, 0.8398001293945316, 0.8264003778266912, 0.8139372484207169, 0.7997757298660283, 0.7846933247756959, 0.7725759757423413, 0.7607841845512375, 0.7503967479133605, 0.7415834348869328, 0.7329282188224797, 0.7244444179344173, 0.716878231906891, 0.7088032433509828, 0.7009190119743344, 0.6929791067123406, 0.6851150783538809, 0.6764866884803783, 0.6685443989562972, 0.6607514595603937, 0.6529385019874574, 0.6451916887664798, 0.6371373191070543, 0.6289793743324287, 0.622269766407013] [0.9257792291641235, 0.9024311680793762, 0.8913311462402346, 0.8735835337638854, 0.8526374406814575, 0.8528770031929016, 0.8398826608657838, 0.821358754634857, 0.8191867895126342, 0.8094690351486207, 0.7938524212837219, 0.8011828269958496, 0.7920800127983093, 0.7837953147888184, 0.7730073256492614, 0.7709213757514953, 0.7650820546150207, 0.7769924850463866, 0.7787912626266481, 0.7683150057792663, 0.76301873254776, 0.7478510184288024, 0.7

[0.9832183313369744, 0.8971054028129591, 0.8739359276390077, 0.8559651419448845, 0.8398001293945316, 0.8264003778266912, 0.8139372484207169, 0.7997757298660283, 0.7846933247756959, 0.7725759757423413, 0.7607841845512375, 0.7503967479133605, 0.7415834348869328, 0.7329282188224797, 0.7244444179344173, 0.716878231906891, 0.7088032433509828, 0.7009190119743344, 0.6929791067123406, 0.6851150783538809, 0.6764866884803783, 0.6685443989562972, 0.6607514595603937, 0.6529385019874574, 0.6451916887664798, 0.6371373191070543, 0.6289793743324287, 0.622269766407013, 0.6147352697181706, 0.608011977634431, 0.60081620546341, 0.5940004160118109, 0.5867547268867496, 0.5792577898025516] [0.9257792291641235, 0.9024311680793762, 0.8913311462402346, 0.8735835337638854, 0.8526374406814575, 0.8528770031929016, 0.8398826608657838, 0.821358754634857, 0.8191867895126342, 0.8094690351486207, 0.7938524212837219, 0.8011828269958496, 0.7920800127983093, 0.7837953147888184, 0.7730073256492614, 0.7709213757514953, 0.76

[0.9832183313369744, 0.8971054028129591, 0.8739359276390077, 0.8559651419448845, 0.8398001293945316, 0.8264003778266912, 0.8139372484207169, 0.7997757298660283, 0.7846933247756959, 0.7725759757423413, 0.7607841845512375, 0.7503967479133605, 0.7415834348869328, 0.7329282188224797, 0.7244444179344173, 0.716878231906891, 0.7088032433509828, 0.7009190119743344, 0.6929791067123406, 0.6851150783538809, 0.6764866884803783, 0.6685443989562972, 0.6607514595603937, 0.6529385019874574, 0.6451916887664798, 0.6371373191070543, 0.6289793743324287, 0.622269766407013, 0.6147352697181706, 0.608011977634431, 0.60081620546341, 0.5940004160118109, 0.5867547268867496, 0.5792577898025516, 0.573341314258576, 0.566331647014617, 0.5601899190616608, 0.5519148110008242, 0.5456832761573799] [0.9257792291641235, 0.9024311680793762, 0.8913311462402346, 0.8735835337638854, 0.8526374406814575, 0.8528770031929016, 0.8398826608657838, 0.821358754634857, 0.8191867895126342, 0.8094690351486207, 0.7938524212837219, 0.8011

KeyboardInterrupt: 

## CNN

In [41]:
pars = {'kernel_size': [3, 5, 7],
        'hidden_size': [64, 128, 256, 512],
        'dropout': [0.0, 0.5]}
param_grid = ParameterGrid(pars)

In [None]:
best_losses_cnn = pd.DataFrame(columns=['parameters', 'val_loss', 'val_acc'])
best_losses_cnn['parameters'] = list(param_grid)
for i, params in enumerate(tqdm_notebook(param_grid, desc='grid_search')):
    cnn_encoder = models.CNNEncoder(
        EMBED_SIZE,
        params['hidden_size'],
        len(token2id),
        PAD_IDX,
        params['kernel_size'],
        init_embeddings).to(DEVICE)
    cnn_model = models.InferenceModel(
        cnn_encoder,
        NUM_CLASSES,
        params['hidden_size'] * 2,
        dropout=params['dropout']).to(DEVICE)
    cnn_optimizer = torch.optim.Adam(cnn_model.parameters(), lr=lr)

    best_val_loss = np.inf
    counter = 0
    for epoch in tqdm_notebook(range(100000), desc='epoch'):
        if counter > patience:
            print('Early stopping')
            break
        loss_train = training_utils.train_model(cnn_model, cnn_optimizer,
                                                train_loader, criterion)
        loss_val, ys_hat, ys = training_utils.eval_model(
            cnn_model, val_loader, criterion)
        acc = metrics.accuracy_score(ys, ys_hat.argmax(axis=1))
        if loss_val < best_val_loss:
            best_val_loss = loss_val
            best_acc = acc
            counter = 0
        else:
            counter += 1
    best_losses_cnn.loc[i, 'val_loss'] = best_val_loss
    best_losses_cnn.loc[i, 'val_acc'] = best_acc

    print(best_losses_cnn.head(i + 1))

HBox(children=(IntProgress(value=0, description='grid_search', max=24, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='epoch', max=100000, style=ProgressStyle(description_width='in…

Early stopping
                                          parameters  val_loss val_acc
0  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.792253   0.653


HBox(children=(IntProgress(value=0, description='epoch', max=100000, style=ProgressStyle(description_width='in…

Early stopping
                                          parameters  val_loss val_acc
0  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.792253   0.653
1  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.781188    0.66


HBox(children=(IntProgress(value=0, description='epoch', max=100000, style=ProgressStyle(description_width='in…

Early stopping
                                          parameters  val_loss val_acc
0  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.792253   0.653
1  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.781188    0.66
2  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.782115   0.657


HBox(children=(IntProgress(value=0, description='epoch', max=100000, style=ProgressStyle(description_width='in…

Early stopping
                                          parameters  val_loss val_acc
0  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.792253   0.653
1  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.781188    0.66
2  {'dropout': 0.0, 'hidden_size': 64, 'kernel_si...  0.782115   0.657
3  {'dropout': 0.0, 'hidden_size': 128, 'kernel_s...  0.781131   0.658


HBox(children=(IntProgress(value=0, description='epoch', max=100000, style=ProgressStyle(description_width='in…

## RNN

In [None]:
pars = {'hidden_size': [64, 128, 256, 512],
        'dropout': [0.0, 0.5]}
param_grid = ParameterGrid(pars)

In [None]:
best_losses_rnn = pd.DataFrame(columns=['parameters', 'val_loss', 'val_acc'])
best_losses_rnn['parameters'] = list(param_grid)
for i, params in enumerate(tqdm_notebook(param_grid, desc='grid_search')):
    rnn_encoder = models.RNNEncoder(
        EMBED_SIZE,
        params['hidden_size'],
        len(token2id),
        PAD_IDX,
        init_embeddings).to(DEVICE)
    rnn_model = models.InferenceModel(
        rnn_encoder,
        NUM_CLASSES,
        params['hidden_size'] * 2,
        dropout=params['dropout']).to(DEVICE)
    rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr=lr)

    best_val_loss = np.inf
    counter = 0
    for epoch in tqdm_notebook(range(100000), desc='epoch'):
        if counter > patience:
            print('Early stopping')
            break
        loss_train = training_utils.train_model(rnn_model, rnn_optimizer,
                                                train_loader, criterion)
        loss_val, ys_hat, ys = training_utils.eval_model(
            rnn_model, val_loader, criterion)
        acc = metrics.accuracy_score(ys, ys_hat.argmax(axis=1))
        if loss_val < best_val_loss:
            best_val_loss = loss_val
            best_acc = acc
            counter = 0
        else:
            counter += 1
    best_losses_rnn.loc[i, 'val_loss'] = best_val_loss
    best_losses_rnn.loc[i, 'val_acc'] = best_acc

    print(best_losses_rnn.head(i + 1))