## Analysis

<ul>
    <li>Find the best hyperparameters.</li>
    <li>Show per-class metric tables for the best models.</li>
</ul>

### Load aggregated results to find best model

In [1]:
import numpy as np
import pandas as pd

In [2]:
# (dictionary key, CSV path) for every hyperparameter-tuning table to compare.
# The list order is the insertion order of `results`.
_TUNING_CSVS = [
    ("results_1_layer", "results/results_tuning_1_layer.csv"),
    ("results_2_3_layers", "results/results_tuning_2_3_layers.csv"),
    ("results_1_layer_maxlen_500", "results/results_tuning_1_layer_maxlen_500.csv"),
    ("results_2_3_layers_maxlen_500", "results/results_tuning_2_3_layers_maxlen_500.csv"),
]

# Read each table, using the first CSV column as the DataFrame index.
results = {
    table_key: pd.read_csv(csv_path, index_col=0)
    for table_key, csv_path in _TUNING_CSVS
}

In [3]:
# Record the `max_num_tokens` setting on every tuning table so it is still
# available after the tables are concatenated: 500 for the "maxlen_500" tables,
# None for the tables whose file name carries no maxlen suffix.
for _table_key, _token_cap in (
    ("results_1_layer", None),
    ("results_2_3_layers", None),
    ("results_1_layer_maxlen_500", 500),
    ("results_2_3_layers_maxlen_500", 500),
):
    results[_table_key]["max_num_tokens"] = _token_cap

In [4]:
# Stack all tuning tables into a single frame; the per-table row labels are
# dropped in favour of one fresh consecutive index.
df_results = pd.concat(
    list(results.values()),
    ignore_index=True,
)

In [5]:
# qgrid shows a DataFrame as a grid widget (QgridWidget).
# One-time setup, run from a shell rather than from this notebook:
#   jupyter nbextension enable --py --sys-prefix qgrid
#   jupyter nbextension enable --py --sys-prefix widgetsnbextension
# (the second command is only required if the ipywidgets nbextension has not
# been enabled yet)
import qgrid

# The widget is the last expression of the cell, so the notebook displays it.
results_grid = qgrid.show_grid(df_results)
results_grid

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

### Load best model, show per class tables for it

In [23]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

import importlib

import utils
# importlib.reload(utils)
from preprocess import create_lookups_for_vocab, pad_collate_fn

import model
# importlib.reload(model)
from model import FinalModel
from torchcontrib.optim import SWA

In [24]:
# This notebook is pinned to the CPU.
# To pick a GPU when one is available, replace the assignment with:
#   device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"
device

'cpu'

In [25]:
# The three folders share one root directory; every constant keeps its trailing
# "/" because later cells build file paths by plain string concatenation.
_WIKI_ROOT = "/scratch/mz2476/wiki/"

PATH_TO_EMBEDDINGS_FOLDER = _WIKI_ROOT + "embeddings/"
PATH_TO_DATA_FOLDER = _WIKI_ROOT + "data/"
PATH_TO_MODELS_FOLDER = _WIKI_ROOT + "models/"

In [15]:
# LOAD vocab, tensor dataset, classes
# NOTE(review): torch.load unpickles the file contents, so these files must
# come from a trusted source.
vocab = torch.load(PATH_TO_DATA_FOLDER + "vocab_all_en.pt")
print("Vocab size is:", len(vocab))
index_to_word, word_to_index = create_lookups_for_vocab(vocab)

# Iterated below with .items(), i.e. a mapping from split name to dataset.
wiki_tensor_dataset = torch.load(PATH_TO_DATA_FOLDER + "wiki_tensor_dataset_vocab_all_en.pt")

classes = torch.load(PATH_TO_DATA_FOLDER + "classes_list.pt")
# `classes` has to be passed by keyword: MultiLabelBinarizer's parameters are
# keyword-only in current scikit-learn, so the positional form raises TypeError.
mlb = MultiLabelBinarizer(classes=classes)

# create dataloader: one DataLoader per split, keyed by the split name
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    # NOTE(review): shuffle=True is applied to every split, evaluation splits
    # included — confirm nothing downstream relies on a stable sample order.
    wiki_loaders[split] = DataLoader(
        wiki_dataset,
        batch_size=batch_size,
        shuffle=True,
        # pad_collate_fn receives word_to_index in addition to the batch.
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

Vocab size is: 682850


In [16]:
# Aligned fastText vectors (about 2.5 million entries).
aligned_vectors_path = PATH_TO_EMBEDDINGS_FOLDER + "wiki.en.align.vec"
embeddings = utils.load_vectors(aligned_vectors_path)

# Build the weight matrix of pretrained word embeddings for the vocabulary
# described by word_to_index.
weights_matrix_ve = utils.create_embeddings_matrix(word_to_index, embeddings)

2519370it [03:10, 13228.62it/s]


Total words in vocab: 682850
No. of words from vocab found in embeddings: 528314


In [37]:
def _build_model_options(num_layers, mid_features, file_name):
    """Return the option dict passed to FinalModel for one tuned configuration.

    Only the values that differ between the two best models are parameters;
    everything else is read from the vocabulary, the embedding matrix and the
    class list loaded in earlier cells.

    Args:
        num_layers: value stored under "num_layers".
        mid_features: value stored under "mid_features".
        file_name: checkpoint file name stored under "file_name".

    Returns:
        A new dict; each call creates its own nn.ReLU() instance.
    """
    return {
        "VOCAB_SIZE": len(index_to_word),
        "dim_e": weights_matrix_ve.shape[1],
        "pretrained_embeddings": weights_matrix_ve,
        "num_layers": num_layers,
        "num_classes": len(classes),
        "mid_features": mid_features,
        "dropout_rate": 0.2,
        "activation": nn.ReLU(),
        "file_name": file_name,
    }


# Best 1-layer configuration.
# NOTE(review): "dropout_rate" is 0.2 here while the checkpoint name says
# "dropout_rate_0" — confirm which value the checkpoint was trained with.
options_best_1_layer = _build_model_options(
    num_layers=1,
    mid_features=150,
    file_name="en_optimizer_SWA_num_hidden_1_dim_hidden_150_dropout_rate_0_learning_rate_0.01_num_epochs_10.pth",
)

# Best 2-layer configuration.
options_best = _build_model_options(
    num_layers=2,
    mid_features=200,
    file_name="en_optimizer_SWA_num_hidden_2_dim_hidden_200_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth",
)

In [44]:
# Choose which tuned configuration to inspect: options_best_1_layer or
# options_best (2 layers).
options = options_best_1_layer

# NOTE(review): this rebinds the name `model`, which the import cell bound to
# the `model` module; re-run `import model` before reloading that module.
model = FinalModel(options)

file_name = options["file_name"]
# PATH_TO_MODELS_FOLDER already ends with "/", so it is concatenated directly,
# like the other folder constants in this notebook, instead of inserting a
# second separator. map_location follows `device`, so changing `device` is the
# only edit needed to load the checkpoint somewhere else.
model.load_state_dict(torch.load(
    PATH_TO_MODELS_FOLDER + file_name,
    map_location=device
))
model.to(device)
# The model is only evaluated in this notebook: switch it to evaluation mode so
# layers such as dropout are inactive. eval() returns the module itself, so the
# cell still displays the model architecture.
model.eval()

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(682850, 300)
  )
  (layer_out): Linear(in_features=300, out_features=44, bias=True)
)

In [None]:
# ls $PATH_TO_MODELS_FOLDER

#### Per-class validation metrics: best 2-layer model

In [39]:
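# Kept for reference: the commands that produced the per-class CSV for the 2-layer model.
# NOTE(review): presumably run with `model` built from `options_best` (2 layers) — confirm before re-running.
# metrics_dict = utils.test_model(wiki_loaders["val"], model, device, threshold=0.5)

# df_per_class_metrics = utils.create_per_class_tables(wiki_loaders["val"], model, device, classes, threshold=0.5)
# df_per_class_metrics.to_csv("results/per_class_metrics_val_best_2_layers_model.csv")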

  'precision', 'predicted', average, warn_for)


In [56]:
# Per-class validation metrics saved for the best 2-layer model; the first CSV
# column is used as the DataFrame index.
per_class_csv_2_layers = "results/per_class_metrics_val_best_2_layers_model.csv"
df_per_class_metrics = pd.read_csv(per_class_csv_2_layers, index_col=0)

# Alternative grid view: qgrid.show_grid(df_per_class_metrics)
df_per_class_metrics

Unnamed: 0,class_name,count,TN,FN,TP,FP,precision,recall,f1
0,Culture.Arts,19.0,9976,11,8,1,0.888889,0.421053,0.571429
1,Culture.Broadcasting,217.0,9716,56,161,63,0.71875,0.741935,0.730159
2,Culture.Crafts and hobbies,14.0,9982,11,3,0,1.0,0.214286,0.352941
3,Culture.Entertainment,295.0,9598,57,238,103,0.697947,0.80678,0.748428
4,Culture.Food and drink,67.0,9919,24,43,10,0.811321,0.641791,0.716667
5,Culture.Games and toys,109.0,9879,19,90,8,0.918367,0.825688,0.869565
6,Culture.Internet culture,6.0,9990,6,0,0,0.0,0.0,0.0
7,Culture.Language and literature,3631.0,6078,239,3392,287,0.92199,0.934178,0.928044
8,Culture.Media,3.0,9993,3,0,0,0.0,0.0,0.0
9,Culture.Music,435.0,9489,90,345,72,0.827338,0.793103,0.809859


#### Per-class validation metrics: best 1-layer model

In [46]:
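# Kept for reference: the commands that produced the per-class CSV for the 1-layer model.
# NOTE(review): presumably run with `model` built from `options_best_1_layer` — confirm before re-running.
# metrics_dict = utils.test_model(wiki_loaders["val"], model, device, threshold=0.5)

# df_per_class_metrics = utils.create_per_class_tables(wiki_loaders["val"], model, device, classes, threshold=0.5)
# df_per_class_metrics.to_csv("results/per_class_metrics_val_best_1_layer_model.csv")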

In [52]:
# Per-class validation metrics saved for the best 1-layer model; the first CSV
# column is used as the DataFrame index.
per_class_csv_1_layer = "results/per_class_metrics_val_best_1_layer_model.csv"
df_per_class_metrics = pd.read_csv(per_class_csv_1_layer, index_col=0)
df_per_class_metrics

Unnamed: 0,class_name,count,TN,FN,TP,FP,precision,recall,f1
0,Culture.Arts,19.0,9977,15,4,0,1.0,0.210526,0.347826
1,Culture.Broadcasting,217.0,9739,104,113,40,0.738562,0.520737,0.610811
2,Culture.Crafts and hobbies,14.0,9982,14,0,0,0.0,0.0,0.0
3,Culture.Entertainment,295.0,9659,131,164,42,0.796117,0.555932,0.654691
4,Culture.Food and drink,67.0,9925,32,35,4,0.897436,0.522388,0.660377
5,Culture.Games and toys,109.0,9882,41,68,5,0.931507,0.623853,0.747253
6,Culture.Internet culture,6.0,9990,6,0,0,0.0,0.0,0.0
7,Culture.Language and literature,3631.0,5993,421,3210,372,0.896147,0.884054,0.89006
8,Culture.Media,3.0,9993,3,0,0,0.0,0.0,0.0
9,Culture.Music,435.0,9472,92,343,89,0.793981,0.788506,0.791234
