## Instruções

* Rodar o notebook em ordem
* Arquivos necessários:
    * preprocessed_CETEN_v2.pkl: gerado pelo notebook 1_Data_Prep.ipynb
    * model_lstm_dicts.pkl: gerado pelo notebook 2_Train-LSTM.ipynb
    * model_96acc_bs1_state_dict.model: gerado pelo notebook 2_Train-LSTM.ipynb
    
    

In [3]:
import pickle
from sklearn.model_selection import KFold
import numpy as np
import logging
import datetime
import time

#### Leitura da base pré-processada

In [4]:
# Load the preprocessed corpus produced by 1_Data_Prep.ipynb.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load a
# preprocessed_CETEN_v2.pkl that you generated yourself.
# The handle is named `f` (not `input`) so the builtin input() is not shadowed
# for the rest of the notebook.
with open('preprocessed_CETEN_v2.pkl', 'rb') as f:
    phrases = pickle.load(f)

#### Definição de funções auxiliares

In [6]:
def collect0(lst):
    """Return a list holding the element at index 0 of every item in ``lst``."""
    return [item[0] for item in lst]
def collect1(lst):
    """Return a list holding the element at index 1 of every item in ``lst``."""
    return [item[1] for item in lst]
def batch_idx_loader(data, shuffle=True, batch_size=None):
    """Return a generator of index batches covering ``data``.

    Args:
        data: any sized collection; only ``len(data)`` is used.
        shuffle: when true the indices are a random permutation, otherwise
            they run from 0 to ``len(data) - 1`` in order.
        batch_size: number of indices per batch. Defaults to the module-level
            ``BATCH_SIZE`` when omitted.

    Returns:
        A generator yielding 1-D integer tensors of at most ``batch_size``
        indices each; the last batch is shorter when ``len(data)`` is not a
        multiple of ``batch_size``.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    n_items = len(data)
    order = torch.randperm(n_items) if shuffle else torch.arange(n_items)
    return (order[start:start + batch_size] for start in range(0, n_items, batch_size))
def get_data_sorted(data, idx):
    """Pick the items of ``data`` at ``idx`` and order them longest sentence first.

    Only the requested items are touched; the dataset itself is never
    converted to a NumPy array (doing so on every call is linear in the size
    of the dataset and fails on sentences of different lengths with recent
    NumPy versions).

    Args:
        data: indexable collection of ``(sentence, tags)`` pairs.
        idx: iterable of integer positions into ``data`` (a 1-D tensor, a
            NumPy array or a plain list).

    Returns:
        ``(sentences, tags)``: two 1-D object arrays of equal length whose
        elements are the original sentence / tag sequences, sorted by
        decreasing sentence length. Sentences of equal length keep the order
        given by ``idx``.
    """
    selected = [data[int(position)] for position in idx]
    # list.sort is stable even with reverse=True, so ties keep their idx order.
    selected.sort(key=lambda pair: len(pair[0]), reverse=True)

    # Fill the object arrays element by element so that NumPy never tries to
    # build a multi-dimensional array out of the nested sequences.
    sentences = np.empty(len(selected), dtype=object)
    tags = np.empty(len(selected), dtype=object)
    for row, pair in enumerate(selected):
        sentences[row] = pair[0]
        tags[row] = pair[1]

    return sentences, tags
def prepare_batch_sequence(sentences, to_ix):
    """Encode a batch of token sequences as one right-padded index tensor.

    Args:
        sentences: sequence of token sequences.
        to_ix: mapping from token to integer id; must contain ``'<PAD>'``.

    Returns:
        A ``torch.long`` tensor with one row per sentence and as many columns
        as the longest sentence; positions past the end of a sentence hold
        ``to_ix['<PAD>']``.
    """
    lengths = [len(seq) for seq in sentences]
    pad_id = to_ix['<PAD>']

    # Start from a matrix made entirely of padding, then overwrite the prefix
    # of each row with the ids of the real tokens.
    padded = np.full((len(sentences), max(lengths)), pad_id, dtype=float)
    for row, seq in enumerate(sentences):
        padded[row, :lengths[row]] = [to_ix[token] for token in seq]

    return torch.tensor(padded, dtype=torch.long)

def get_index_of_max(input):
    """Return the position of the largest element of ``input``.

    When several elements share the maximum, the earliest position wins.
    An empty ``input`` yields 0.
    """
    return max(range(len(input)), key=lambda pos: input[pos], default=0)

def get_max_prob_result(input, ix_to_tag):
    """Look up in ``ix_to_tag`` the tag at the position of the largest score in ``input``."""
    best_position = get_index_of_max(input)
    return ix_to_tag[best_position]

### LOAD LSTM MODEL

In [7]:
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Important for reproducibility: seed every random number generator in use.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)

In [8]:
# Number of sentences per batch. batch_idx_loader slices index batches of this
# size, LSTMTagger.init_hidden allocates the recurrent state for it, and
# evaluate_lstm only scores a multiple of it.
BATCH_SIZE=2753

In [9]:
## Backup of the working bidirectional LSTM tagger (GloVe-100 embeddings)
class LSTMTagger(nn.Module):
    """Single-layer bidirectional LSTM that scores a tag for every token.

    Pipeline: embedding lookup -> packed bidirectional LSTM -> linear
    projection onto the tag set -> log-softmax.

    The recurrent state lives in ``self.hidden`` and is fed back as the
    initial state of the next ``forward`` call. Assign
    ``model.hidden = model.init_hidden()`` to start again from zeros.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, word_to_ix, embedding_weights=None):
        """Build the embedding, LSTM and output layers.

        Args:
            embedding_dim: size of each word embedding.
            hidden_dim: size of the LSTM state per direction.
            vocab_size: number of rows of the embedding table.
            tagset_size: number of output tags.
            word_to_ix: vocabulary mapping; only ``word_to_ix['<PAD>']`` is
                read, as the padding index of the embedding.
            embedding_weights: optional NumPy array copied into the embedding
                table (for example pretrained vectors).
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.tagset_size = tagset_size
        self.num_layers = 1

        padding_idx = word_to_ix['<PAD>']
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        # Compare against None: the truth value of a multi-element array is
        # ambiguous and raises ValueError.
        if embedding_weights is not None:
            self.word_embeddings.weight.data.copy_(torch.from_numpy(embedding_weights))

        # The LSTM takes word embeddings as inputs and outputs hidden states
        # with dimensionality hidden_dim for each of the two directions.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, self.num_layers, batch_first=False, bidirectional=True)

        # The linear layer maps the concatenated forward/backward states to tag space.
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        """Return a zeroed ``(h_0, c_0)`` pair for the LSTM.

        Args:
            batch_size: number of sequences the state is allocated for.
                Defaults to the module-level ``BATCH_SIZE``.

        Returns:
            Two tensors of shape ``(num_layers * 2, batch_size, hidden_dim)``
            placed on the same device as the embedding weights.
        """
        if batch_size is None:
            batch_size = BATCH_SIZE
        device = self.word_embeddings.weight.device
        shape = (self.num_layers * 2, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))

    def forward(self, sentence, s_lengths, debug=False):
        """Score every tag for every token position of a padded batch.

        Args:
            sentence: 2-D tensor of word indices, ``(batch_size, seq_len)``.
            s_lengths: true length of each row of ``sentence``, passed to
                ``pack_padded_sequence`` (which by default expects them in
                decreasing order).
            debug: when true, print the size of each intermediate tensor.

        Returns:
            Log-probabilities of shape ``(batch_size * max_len, tagset_size)``
            where ``max_len`` is the padded length restored by
            ``pad_packed_sequence``; rows are grouped sentence by sentence.
        """
        batch_size, _ = sentence.size()
        if debug: print("sentences input:", sentence.size())

        # A state allocated for another batch size (or sitting on another
        # device) cannot be fed to the LSTM; start from zeros in that case.
        previous_h = self.hidden[0]
        if previous_h.size(1) != batch_size or previous_h.device != self.word_embeddings.weight.device:
            self.hidden = self.init_hidden(batch_size)

        # 1. Embed the input: (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        embeds = self.word_embeddings(sentence)
        if debug: print("embeds:", embeds.size())

        # 2. Run through the LSTM. It was built with batch_first=False, so
        #    the sequence axis goes first. Packing keeps the padded positions
        #    away from the LSTM.
        embeds = embeds.transpose(0, 1)
        lstm_input = torch.nn.utils.rnn.pack_padded_sequence(embeds, s_lengths, batch_first=False)
        if debug:
            print("lstm_input:", lstm_input.data.size())
            print("hidden0 (ht):", self.hidden[0].size())
            print("hidden1 (hc):", self.hidden[1].size())

        # NOTE(review): the final state of this batch becomes the initial
        # state of the next call. Confirm this carry-over is intended before
        # changing it; resetting here would alter the model's outputs.
        lstm_out, self.hidden = self.lstm(lstm_input, self.hidden)
        if debug:
            print("lstm_out:", lstm_out.data.size())
            print("hidden0 (ht):", self.hidden[0].size())
            print("hidden1 (hc):", self.hidden[1].size())

        # Undo the packing operation.
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=False)

        # 3. Project to tag space. Put the batch axis first again and flatten
        #    batch and time into one axis so the linear layer sees one row per
        #    token position.
        lstm_out = lstm_out.transpose(0, 1).contiguous()
        if debug: print("lstm_out reshaped:", lstm_out.size())
        linear_input = lstm_out.view(-1, lstm_out.shape[2])
        if debug: print("linear_input:", linear_input.size())

        tag_space = self.hidden2tag(linear_input)
        if debug: print("hidden out :", tag_space.size())
        return F.log_softmax(tag_space, dim=1)

#### LOAD MODEL

In [10]:
# Token that evaluate_lstm substitutes for words missing from word_to_ix.
RARE_WORD = '__RARE__'

# Lookup tables written by 2_Train-LSTM.ipynb alongside the model weights.
with open('model_lstm_dicts.pkl', 'rb') as f:
    model_dicts = pickle.load(f)

word_to_ix, tag_to_ix, ix_to_tag = (
    model_dicts['word_to_ix'],
    model_dicts['tag_to_ix'],
    model_dicts['ix_to_tag'],
)

In [15]:
# Layer sizes must match the saved checkpoint. Change EMBEDDING_DIM to 50 if
# the network was trained with GloVe-50 instead of GloVe-100.
EMBEDDING_DIM = 100
HIDDEN_DIM = 200

lstm_model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
    word_to_ix=word_to_ix,
)
# map_location='cpu' lets the checkpoint be restored on a machine without a GPU.
lstm_model.load_state_dict(
    torch.load('./model_96acc_bs1_state_dict.model', map_location='cpu')
)
# Switch to inference mode; kept as the last expression so the notebook shows
# the model summary.
lstm_model.eval()

LSTMTagger(
  (word_embeddings): Embedding(59709, 100, padding_idx=0)
  (lstm): LSTM(100, 200, bidirectional=True)
  (hidden2tag): Linear(in_features=400, out_features=17, bias=True)
)

#### PREPARE TEST DATA

In [20]:
def evaluate_lstm(phrases_to_test):
    """Tag ``phrases_to_test`` with ``lstm_model`` and measure token accuracy.

    Only whole batches are evaluated: the input is truncated to the largest
    multiple of ``BATCH_SIZE``, and the remaining phrases at the end are
    ignored. Words that are not in ``word_to_ix`` are replaced by
    ``RARE_WORD`` before being fed to the model.

    Args:
        phrases_to_test: sliceable sequence of phrases, each phrase being a
            sequence of ``(word, tag)`` pairs.

    Returns:
        ``(correct, total, accuracy, seconds)``: number of correctly tagged
        tokens, number of tokens scored, accuracy as a percentage, and the
        wall-clock time spent tagging. When nothing could be scored (fewer
        than ``BATCH_SIZE`` phrases) the accuracy is reported as ``0.0``
        instead of dividing by zero.
    """
    progress_every = 27530  # print the running sentence count at multiples of this

    # Keep a whole number of batches.
    usable = (len(phrases_to_test) // BATCH_SIZE) * BATCH_SIZE

    # Split every phrase into (words, tags), mapping unknown words to RARE_WORD.
    dataset = []
    for phrase in phrases_to_test[:usable]:
        words = [tk[0] if tk[0] in word_to_ix else RARE_WORD for tk in phrase]
        gold = [tk[1] for tk in phrase]
        dataset.append((words, gold))

    n_tags = len(tag_to_ix)
    correct = 0
    total = 0
    seen = 0
    since = time.time()

    with torch.no_grad():
        for indices in batch_idx_loader(dataset, shuffle=False):
            sentences, tags = get_data_sorted(dataset, indices)

            seen += len(sentences)
            if seen % progress_every == 0:
                print(seen)

            s_lengths = [len(s) for s in sentences]
            sentence_in = prepare_batch_sequence(sentences, word_to_ix)

            tag_scores = lstm_model(sentence_in, s_lengths)
            # Sentences arrive longest first, so s_lengths[0] is the padded length.
            tag_scores = tag_scores.view(len(sentences), s_lengths[0], n_tags)

            # One argmax for the whole batch instead of a Python loop per token.
            best_tags = tag_scores.argmax(dim=2).tolist()

            for gold_tags, predicted, length in zip(tags, best_tags, s_lengths):
                # predicted[length:] belongs to padding and is not scored.
                for gold_tag, tag_ix in zip(gold_tags, predicted[:length]):
                    total += 1
                    if gold_tag == ix_to_tag[tag_ix]:
                        correct += 1

    time_elapsed = time.time() - since

    accuracy = correct / total * 100 if total else 0.0
    return correct, total, accuracy, time_elapsed


#### Test code

In [25]:
# Quick check on the first 5000 phrases. evaluate_lstm only scores whole
# batches, so with BATCH_SIZE=2753 just the first 2753 phrases are evaluated.
evaluate_lstm(phrases[:5000])

(43150, 44585, 96.7814287316362, 3.1840274333953857)

#### Avaliação na base completa

In [35]:
# Send root-logger records of level INFO and above to the file evaluation.log.
logging.basicConfig(filename='evaluation.log', level=logging.INFO)
def log(msg):
    """Log ``str(msg)`` on the root logger at INFO level, prefixed with the current local time."""
    stamp = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
    logging.info('{}: {!s}'.format(stamp, msg))

In [36]:
# 5-fold splitter; samples are shuffled with a fixed random_state so the folds
# are the same on every run.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

In [38]:
def _as_object_array(items):
    """Return a 1-D object array with one element per item of ``items``."""
    # Filled element by element: np.array(items) on phrases of different
    # lengths raises ValueError on NumPy >= 1.24, and on equally long phrases
    # it would build a multi-dimensional array instead.
    out = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        out[position] = item
    return out


# One phrase per element, so the index arrays produced by KFold can select
# the phrases of each fold directly.
phrases_data = _as_object_array(phrases)

# The counter is called cv_round (not round) so the builtin round() stays usable.
for cv_round, (train_index, test_index) in enumerate(kf.split(phrases_data), start=1):
    header = 'CV Round {}'.format(cv_round)
    print(header)
    log(header)

    split_msg = 'Train test split: {},{}'.format(len(train_index), len(test_index))
    print(split_msg)
    log(split_msg)

    # NOTE(review): phrases_train is not used below. lstm_model is loaded
    # from a checkpoint and is not retrained per fold, so only the test part
    # of each split is scored. Whether the checkpoint has already seen these
    # phrases is not visible here; confirm.
    phrases_train = phrases_data[train_index]
    phrases_test = phrases_data[test_index]

    print('LSTM running...')
    log('LSTM running...')
    corretas_lstm, totais, acc, time_elapsed = evaluate_lstm(phrases_test)

    print(corretas_lstm, totais, acc, time_elapsed)
    log(str([corretas_lstm, totais, acc, time_elapsed]))

    print("")
    log("")

# Flush and close the log handlers once all folds are done.
logging.shutdown()

CV Round 1
Train test split: 1354480,338621
LSTM running...
27530
55060
82590
110120
137650
165180
192710
220240
247770
275300
302830
330360
2.8733291625976562
2.8733291625976562
2.8733291625976562
5317961 5502766 96.64159806177474 233.8405246734619

CV Round 2
Train test split: 1354481,338620
LSTM running...
27530
55060
82590
110120
137650
165180
192710
220240
247770
275300
302830
330360
2.8733291625976562
2.8733291625976562
2.8733291625976562
5325037 5498998 96.83649639443404 239.1606993675232

CV Round 3
Train test split: 1354481,338620
LSTM running...
27530
55060
82590
110120
137650
165180
192710
220240
247770
275300
302830
330360
2.8733291625976562
2.8733291625976562
2.8733291625976562
5338313 5523072 96.65477835523419 234.00328469276428

CV Round 4
Train test split: 1354481,338620
LSTM running...
27530
55060
82590
110120
137650
165180
192710
220240
247770
275300
302830
330360
2.8733291625976562
2.8733291625976562
2.8733291625976562
5333106 5517859 96.65172669326998 234.3783137798