# Model analysis and heat map

In [1]:
import numpy as np
import torch
import json
import spacy
from prettytable import PrettyTable
import os
import sys
import yaml
from yaml import Loader
from util import Dictionary
from torch import nn
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from train import package

In [2]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_, token.vector_norm)# token.vector

Apple PROPN nsubj 7.9169617
is AUX aux 8.597497
looking VERB ROOT 8.68382
at ADP prep 7.737509
buying VERB pcomp 9.099992
U.K. PROPN dobj 7.7217307
startup NOUN advcl 5.9194016
for ADP prep 7.6531463
$ SYM quantmod 9.727607
1 NUM compound 10.960313
billion NUM pobj 8.402847


## Download our models:

#### Git lfs is used to store our models, therefore we must install it:

In [3]:
!git lfs install

Updated git hooks.
Git LFS initialized.


#### If you do not have a file named models, it downloads ours:

In [4]:
models_exists = os.path.exists("models/")
if(not models_exists):
    #!mkdir model_test
    #%cd model_test
    !git clone https://github.com/Learningchipmunk/models.git
else:
    print("You already have models of your own !")

You already have models of your own !


## Count the model parameters:

In [5]:
def count_parameters(model):
    """Function that prints a the summary table of a network. 
        It also counts how many parameters require gradient.

    Args:
        model (nn.Module): A neural network with multiple paramters requiring gradient.

    Returns:
        int: Returns the total number of parameters that requires gradient. 
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
            
        ## Splitting the name into processable data:
        l        = name.split(".")

        Net_name = l[0]
        fun      = l[1]
        
                    
        param = parameter.numel()
        table.add_row([name, param])
        total_params+=param
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params


In [6]:
model_bestval = torch.load("models/test.pkl")

In [7]:
model_bestval.parameters

<bound method Module.parameters of Classifier(
  (encoder): SelfAttentiveEncoder(
    (bilstm): BiLSTM(
      (drop): Dropout(p=0.5, inplace=False)
      (encoder): Embedding(28199, 200)
      (bilstm): LSTM(200, 300, num_layers=2, dropout=0.5, bidirectional=True)
    )
    (drop): Dropout(p=0.5, inplace=False)
    (ws1): Linear(in_features=600, out_features=350, bias=False)
    (ws2): Linear(in_features=350, out_features=4, bias=False)
    (tanh): Tanh()
    (softmax): Softmax(dim=1)
  )
  (fc): Linear(in_features=2400, out_features=300, bias=True)
  (drop): Dropout(p=0.5, inplace=False)
  (tanh): Tanh()
  (pred): Linear(in_features=300, out_features=5, bias=True)
)>

In [8]:
count_parameters(model_bestval)

+--------------------------------------------+------------+
|                  Modules                   | Parameters |
+--------------------------------------------+------------+
|       encoder.bilstm.encoder.weight        |  5639800   |
|     encoder.bilstm.bilstm.weight_ih_l0     |   240000   |
|     encoder.bilstm.bilstm.weight_hh_l0     |   360000   |
|      encoder.bilstm.bilstm.bias_ih_l0      |    1200    |
|      encoder.bilstm.bilstm.bias_hh_l0      |    1200    |
| encoder.bilstm.bilstm.weight_ih_l0_reverse |   240000   |
| encoder.bilstm.bilstm.weight_hh_l0_reverse |   360000   |
|  encoder.bilstm.bilstm.bias_ih_l0_reverse  |    1200    |
|  encoder.bilstm.bilstm.bias_hh_l0_reverse  |    1200    |
|     encoder.bilstm.bilstm.weight_ih_l1     |   720000   |
|     encoder.bilstm.bilstm.weight_hh_l1     |   360000   |
|      encoder.bilstm.bilstm.bias_ih_l1      |    1200    |
|      encoder.bilstm.bilstm.bias_hh_l1      |    1200    |
| encoder.bilstm.bilstm.weight_ih_l1_rev

9942605

In [9]:
model_bestacc = torch.load("models/test..best_acc.pt")

In [10]:
model_bestacc.parameters

<bound method Module.parameters of Classifier(
  (encoder): SelfAttentiveEncoder(
    (bilstm): BiLSTM(
      (drop): Dropout(p=0.5, inplace=False)
      (encoder): Embedding(28199, 200)
      (bilstm): LSTM(200, 300, num_layers=2, dropout=0.5, bidirectional=True)
    )
    (drop): Dropout(p=0.5, inplace=False)
    (ws1): Linear(in_features=600, out_features=350, bias=False)
    (ws2): Linear(in_features=350, out_features=4, bias=False)
    (tanh): Tanh()
    (softmax): Softmax(dim=1)
  )
  (fc): Linear(in_features=2400, out_features=300, bias=True)
  (drop): Dropout(p=0.5, inplace=False)
  (tanh): Tanh()
  (pred): Linear(in_features=300, out_features=5, bias=True)
)>

## Fetching the data and the sentiment matrix attached to it:

#### getting the config file with the data paths:

In [11]:
with open("config_small.yaml", "r") as ymlfile:
    cfg = yaml.load(ymlfile, Loader= Loader)

cfg

{'cuda': True,
 'seed': 1111,
 'class_number': 5,
 'model': {'emsize': 200,
  'nhid': 300,
  'nlayers': 2,
  'pooling': 'all',
  'attention_unit': 350,
  'attention_hops': 4,
  'dropout': 0.5,
  'clip': 0.5,
  'nfc': 300},
 'training': {'lr': 0.001,
  'optimizer': 'Adam',
  'scheduler': {'using_scheduler': True,
   'name': 'ReduceLROnPlateau',
   'factor': 0.5,
   'patience': 1,
   'step_size': 1},
  'epochs': 20,
  'log_interval': 20,
  'batch_size': 50,
  'penalization_coeff': 1},
 'data': {'save': '/Data/pls_do_not_delete/Project-self-attentive-sentence-embedding/models/with_sched3.pkl',
  'dictionary': '/Data/pls_do_not_delete/Project-self-attentive-sentence-embedding/small/dict_review_short.json',
  'word_vector': '/Data/pls_do_not_delete/Project-self-attentive-sentence-embedding/content/glove.6B.200d.txt.pt',
  'train_data': '/Data/pls_do_not_delete/Project-self-attentive-sentence-embedding/small/train_tok.json',
  'val_data': '/Data/pls_do_not_delete/Project-self-attentive-sente

#### Downloading the data if not present:

In [12]:
small_exists = os.path.exists("small/")

if(not small_exists):
    !wget https://www.di.ens.fr/~lelarge/small_yelp.zip
    !unzip small_yelp.zip
else:
    print("You already have the data.")

You already have the data.


#### Loading the data:

In [13]:
dictionary = dictionary = Dictionary(path=cfg["data"]["dictionary"])

global dictionary

In [14]:
dictionary.word2idx

{'<pad>': 0,
 'ok': 1,
 'if': 2,
 'you': 3,
 'are': 4,
 'looking': 5,
 'for': 6,
 'amazing': 7,
 'home': 8,
 'cooked': 9,
 'italian': 10,
 'food': 11,
 'in': 12,
 'the': 13,
 'valley': 14,
 'then': 15,
 'this': 16,
 'is': 17,
 'place': 18,
 'to': 19,
 'go': 20,
 '!': 21,
 'simple': 22,
 'and': 23,
 'i': 24,
 'personally': 25,
 'love': 26,
 'eggplant': 27,
 'with': 28,
 'noodles': 29,
 'their': 30,
 'pizzas': 31,
 'which': 32,
 'stunning': 33,
 'it': 34,
 "'s": 35,
 'a': 36,
 'small': 37,
 'expect': 38,
 'wait': 39,
 'quality': 40,
 'but': 41,
 'they': 42,
 'do': 43,
 'accept': 44,
 'phone': 45,
 'orders': 46,
 '-': 47,
 'so': 48,
 'call': 49,
 'ahead': 50,
 "n't": 51,
 'look': 52,
 'any': 53,
 'further': 54,
 ',': 55,
 '.': 56,
 'while': 57,
 'was': 58,
 'making': 59,
 'an': 60,
 'appointment': 61,
 'heard': 62,
 'stefanie': 63,
 'at': 64,
 'front': 65,
 'desk': 66,
 'on': 67,
 'patient': 68,
 'that': 69,
 'needed': 70,
 'results': 71,
 'before': 72,
 'vacation': 73,
 'she': 74,
 'assu

In [15]:
data_test = open(cfg["data"]["val_data"]).readlines()

In [16]:
print(data_test[5])
print(len(data_test))

{"label": 4.0, "text": ["i", "went", "out", "on", "a", "limb", "and", "it", "worked", "out", "great", "!", "i", "read", "the", "reviews", ",", "the", "location", "was", "ideal", "and", "when", "i", "walked", "in", "i", "was", "greeted", "with", "a", "smile", ",", "friendliness", "and", "willingness", "to", "answer", "my", "questions", "and", "the", "same", "thing", "happened", "when", "i", "picked", "up", "my", "ipad", ".", "the", "screen", "had", "been", "smashed", "from", "blunt", "force", ",", "not", "just", "a", "drop", ".", "i", "originally", "thought", ",", "this", "was", "it", "..", "new", "ipad", "needed", "based", "on", "how", "damaged", "it", "was", ".", "nope", "!", "great", "price", ",", "great", "service", ",", "and", "finished", "in", "a", "couple", "of", "hours", "and", "i", "was", "able", "to", "take", "advantage", "of", "the", "yelp", "deal", "!", "i", "won", "all", "they", "way", "around", ".", "nothing", "fancy", "here", ",", "just", "four", "walls", "and", "good", "

#### Extracting the useful data for attention heatmap:

In [17]:
def package(data, dictionary):
    """Package data for training / evaluation."""
    data = list(map(lambda x: json.loads(x), data))
    text = list(map(lambda x: x["text"], data))
    dat = list(map(lambda x: list(map(lambda y: dictionary.word2idx[y], x['text'])), data))
    maxlen = 0
    for item in dat:
        maxlen = max(maxlen, len(item))
    targets = list(map(lambda x: x['label'], data))
    maxlen = min(maxlen, 500)
    for i in range(len(data)):
        if maxlen < len(dat[i]):
            dat[i] = dat[i][:maxlen]
        else:
            for j in range(maxlen - len(dat[i])):
                dat[i].append(dictionary.word2idx['<pad>'])
    dat = torch.LongTensor(dat)
    targets = torch.LongTensor(targets)
    return dat.t(), targets, text


In [18]:
model_bestval.eval()
@torch.no_grad()
def evaluate(cfg, model, data_eval):
    model.eval()  # turn on the eval() switch to disable dropout
    total_loss = 0
    total_correct = 0
    
    useful_data = []
    
    for batch, i in enumerate(range(0, len(data_eval), cfg["training"]["batch_size"])):
        data, targets, text = package(data_eval[i:min(len(data_eval), i+cfg["training"]["batch_size"])], dictionary)
        if cfg["cuda"]:
            data = data.cuda()
            targets = targets.cuda()
        hidden = model.init_hidden(data.size(1))
        output, attention, _ = model.forward(data, hidden)
        output_flat = output.view(data.size(1), -1)
        total_loss += criterion(output_flat, targets).data
        prediction = torch.max(output_flat, 1)[1]
        total_correct += torch.sum((prediction == targets).float())
        useful_data.append({'percentage_correct_pred': torch.sum((prediction == targets).float()).item()/cfg["training"]["batch_size"], 
                            'prediction':prediction.cpu().numpy(), 'label':targets.cpu().numpy(), 'attention':attention.cpu().numpy(), 'text':text})
    return total_loss.item() / (len(data_eval) // cfg["training"]["batch_size"]), total_correct.data.item() / len(data_eval), useful_data

In [19]:
criterion = nn.CrossEntropyLoss()
tot_loss, tot_correct, useful_data = evaluate(cfg, model_bestval, data_test)
print(tot_loss, tot_correct)

  targets = torch.LongTensor(targets)


0.8841343879699707 0.63


In [20]:
evaluate(cfg, model_bestacc, data_test)
tot_loss_2, tot_correct_2, useful_data_2 = evaluate(cfg, model_bestval, data_test)
print(tot_loss_2, tot_correct_2)

  targets = torch.LongTensor(targets)


0.8841343879699707 0.63


In [21]:
percentage = list(map(lambda x: x["percentage_correct_pred"], useful_data))

In [22]:
arg_max_pred = np.argmax(percentage)

In [23]:
useful_data[arg_max_pred]

{'percentage_correct_pred': 0.86,
 'prediction': array([4, 4, 0, 4, 3, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 2, 0,
        4, 4, 3, 4, 2, 4, 4, 4, 4, 1, 3, 3, 3, 3, 4, 4, 4, 0, 4, 4, 2, 4,
        4, 3, 4, 4, 4, 2]),
 'label': array([4, 0, 1, 4, 3, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 3, 4, 2, 0,
        4, 4, 3, 4, 2, 4, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 0, 3, 2, 1, 4,
        4, 3, 4, 4, 4, 2]),
 'attention': array([[[6.9479982e-17, 3.7688202e-19, 2.4956808e-18, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
         [1.1756009e-10, 3.4919123e-11, 1.5651160e-10, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
         [8.4452915e-07, 7.5302398e-07, 4.4834234e-08, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
         [5.3873210e-18, 6.6278105e-17, 6.9929129e-18, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00]],
 
        [[5.4496979e-23, 1.4840667e-24, 2.5505597e-24, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+

#### We store the useful data:

The useful data is a list of dictionnaries with 5 fields:

| Name Of Field | percentage_correct_pred                        | label                           | prediction                                   | attention                                       | text                    |
|---------------|------------------------------------------------|---------------------------------|----------------------------------------------|-------------------------------------------------|-------------------------|
| Type          | float                                          | list of ints                    | list of ints                                 | list of floats                                  | list of list of strings |
| Desc          | Percentage of correct prediction on this batch | The true rating for each review | The prediction of the rating for each review | The value of the attention matrix for the batch | The reviews in text     |


In [24]:
np.save("content/useful_data.npy", useful_data)

## Creating the heatmap:

#### You can directly load the data if you already created it:

In [25]:
# If dir content does not exist, we create it
dir_name = "content/"
if(not os.path.exists(dir_name)):
    os.makedirs(dir_name)
    print("Created directory named {}".format(dir_name))
else:
    print("You already got a content directory!")
    

useful_data_path = dir_name + "useful_data.npy"
useful_data_exists = os.path.exists(useful_data_path)

if(useful_data_exists):
    useful_data = np.load(useful_data_path, allow_pickle=True)
    arg_max_pred = np.argmax(percentage)
    print("We successfully loaded the data!")
else:
    print("You do not have the data, you must execute the cells above!")

You already got a content directory!
We successfully loaded the data!


In [26]:
useful_data[arg_max_pred]

{'percentage_correct_pred': 0.86,
 'prediction': array([4, 4, 0, 4, 3, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 2, 0,
        4, 4, 3, 4, 2, 4, 4, 4, 4, 1, 3, 3, 3, 3, 4, 4, 4, 0, 4, 4, 2, 4,
        4, 3, 4, 4, 4, 2]),
 'label': array([4, 0, 1, 4, 3, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 3, 4, 2, 0,
        4, 4, 3, 4, 2, 4, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 0, 3, 2, 1, 4,
        4, 3, 4, 4, 4, 2]),
 'attention': array([[[6.9479982e-17, 3.7688202e-19, 2.4956808e-18, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
         [1.1756009e-10, 3.4919123e-11, 1.5651160e-10, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
         [8.4452915e-07, 7.5302398e-07, 4.4834234e-08, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
         [5.3873210e-18, 6.6278105e-17, 6.9929129e-18, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00]],
 
        [[5.4496979e-23, 1.4840667e-24, 2.5505597e-24, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+

#### To be continued by Tom Sander...