# Tarea 1: Author Profiling

## Aprendizaje Sutomático II


### Esteban Reyes Saldaña

In [1]:
import pandas as pd
import pickle
import numpy as np
import nltk
nltk.download('punkt')
from tqdm.auto import tqdm
import copy

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F

from sklearn.metrics import f1_score, accuracy_score

[nltk_data] Downloading package punkt to /home/esteban/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1) **[10pts]** Haga el preprocesamiento que usted considere para limpiar y extraer la información de los documentos.

Para el preprocesamiento tomaremos cada uno de los archivos y concatenaremos todos los tweets de casa usuario para la tarea de clasificación

In [2]:
import os
import glob
import re

In [168]:
class tweet_dataset(Dataset):
    def __init__(self, directory):
        super(Dataset, self).__init__()
        self.load_data(directory)
        self.vocab, self.emb_mat = self.load_vocab_embeddings()
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        '''Método principal para cargar una observación del dataset.
           label: categoría a la que pertenece la observación.
           word_ids: lista de índices de las palbras en el vocabulario.
        '''
        label = self.data.iloc[index]['Gender']
        lenght = self.data.iloc[index]['Sentence']
        words, word_ids = self.preprocessed_text(index)
        return word_ids, label, words, lenght
        
    def preprocessed_text(self, index):
        '''Preprocess text and '''

        text = self.data.iloc[index]['Text']
        words = nltk.word_tokenize(text)
        word_ids = [self.vocab[word] if word in self.vocab.keys() else self.emb_mat.shape[0]-1\
                        for word in words]
        return words, word_ids

    def load_data(self, directory):
        '''Método para cargar datos.
           El texto está en la columna "text" y las categorías en la columna "target".
        '''
        input_list = sorted([os.path.join(directory, name) for name in os.listdir(directory)])
        # Usamos el archivo de etiquetas para crear dataframe
        df = pd.read_csv(input_list[-1], sep = ':::', header = None, names = ['Id', 'Gender'], usecols=[0,1])
        # Cambiamos etiquetas a binario
        df['Gender'] = df['Gender'].map({'female': 1, 'male': 0})
        text_dictionary, sentence_dictionary = self.get_tweets(input_list[:-1]) # Cargamos diccionario de texto
        df['Text']                           = df['Id'].map(text_dictionary)    # Agregamos Columna de textos a dataframe
        df['Sentence']                       = df['Id'].map(sentence_dictionary)    # Agregamos Columna de textos a dataframe

        self.data       = df                               # Caramos dataframe a clase

    def load_vocab_embeddings(self):
        '''Embeddings preentrenados en twitter.
           emb_mat: Matriz de embeddings. Un vector de tamaño 200 para cada palabra del vocabulario.
           vocab: Diccionario, asigna a cada palabra su renglón correspondiente en la matriz de embeddings.
        '''
        embeddings_list = []
        self.vocab_dict = {}
        vocab = {}
        with open('./word2vec_col.txt', 'r') as f:
            for i, line in enumerate(f):
                if i!=0:
                    values = line.split()
                    self.vocab_dict[i+2] = values[0]
                    vocab[values[0]] = i+1
                    vector = np.asarray(values[1:], "float32")
                    embeddings_list.append(vector)
        embeddings_list.insert(0,np.mean(np.vstack(embeddings_list), axis=0))
        embeddings_list.insert(0,np.zeros(100))
        self.vocab_dict[0] = '[PAD]'
        self.vocab_dict[1] = '[UNK]'
        vocab['[PAD]'] = 0
        vocab['[UNK]'] = 1
        emb_mat = np.vstack(embeddings_list)

        return vocab, emb_mat

    def get_weights(self):
        '''Devuelve pesos inversos para cada categoría. Mayor peso para la categoría con menos observaciones.'''

        cat_0 = len(self.data[self.data['Gender']==0])
        cat_1 = len(self.data[self.data['Gender']==1])
        maxi = max(cat_0, cat_1)
        return torch.tensor([maxi/cat_0, maxi/cat_1])

    def get_tweets(self, input_list) :
        # Creamos diccionario para tweets
        text_dict     = {}
        sentence_dict = {}
        for i, document in enumerate(input_list) :
            name = input_list[i].split('/')[-1][:-4]
            with open(document, 'r') as f :
                doc = f.read()
            # Ignoramos primeras dos líneas
            doc = doc.split('\n')[2:]
            sentence_dict[name] = len(doc)
            # Concatenamos tweets
            full_tweet = ''
            for tweet in doc :
                # Remove fist 21 characters
                tweet = tweet[21:].split(']')[0].lower()
                # Remove links and replace @user
                tweet = re.sub(r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])', '', tweet)
                tweet = re.sub('@([a-z][A-Z][0-9]*)', '@user', tweet)
                full_tweet += tweet
            text_dict[name] = full_tweet + ' '
        return text_dict, sentence_dict

    def collate_fn(self, batch):
        '''Función que ejecuta el dataloader para formar batches de datos.'''

        zipped_batch = list(zip(*batch))
        word_ids     = [torch.tensor(t) for t in zipped_batch[0]]
        word_ids     = torch.cat(word_ids, dim=0)
        lengths      = torch.tensor([len(t) for t in zipped_batch[0]])
        labels       = torch.tensor(zipped_batch[1])
        words        = zipped_batch[2]
        lenghts      = zipped_batch[3]
        return word_ids, lengths, labels, words, lenghts

## Dataset

In [169]:
# Cargamos directorios
dataset_train_dir = './author_profiling_pan/es_train'
dataset_test_dir  = './author_profiling_pan/es_test'

In [170]:
train_dataset = tweet_dataset(dataset_train_dir)
test_dataset  = tweet_dataset(dataset_test_dir)

  return func(*args, **kwargs)


## Ejemplo

In [171]:
train_dataset.data

Unnamed: 0,Id,Gender,Text,Sentence
0,74bcc9b0882c8440716ff370494aea09,1,tiene que valer la pena que esté despierta a e...,103
1,4639c055f34ca1f944d0137a5aeb7914,1,eres lo mas importante para mi. 💜d💜lo que no t...,117
2,92ffa98bade702b86417b118e8aca319,1,@enriquepenalosa no sólo la infraestructura ta...,102
3,4560c6567afcccef265f048ed117d04d,1,@bogoinge @youtube en el cielo.tu novio el que...,105
4,393866dfaa80d414c9896cf8723932b7,1,no puedo con canada. esta muy gorda. si quiere...,103
...,...,...,...,...
4195,b93801b0abc44cb3d45e5f9cd838bcbd,0,creo que nunca va existir nada mejor que abraz...,140
4196,737923304de3e95ff95f8e0be537a2d4,0,te necesito aquí conmigo.@felipejstonem estamo...,106
4197,acb8ad6e2b8b5408526469f1fa37219a,0,pues ya no sé si voy a ir a monterrey o no...m...,103
4198,458afd67845a7747ba98b540404f87d9,0,"""fulanito"" ha aceptado tu solicitud de amistad...",105


In [172]:
train_dataset.data.iloc[100]

Id                            a2f1e821b8ebde3b561b4e2e02d8046
Gender                                                      1
Text        esa parte de mi vida,esta pequeña parte ,se ll...
Sentence                                                  102
Name: 100, dtype: object

In [173]:
word_ids, label, words, _ = train_dataset.__getitem__(100)

In [174]:
words[:20]

['esa',
 'parte',
 'de',
 'mi',
 'vida',
 ',',
 'esta',
 'pequeña',
 'parte',
 ',',
 'se',
 'llama',
 'felicidad',
 'me',
 'he',
 'dado',
 'cuenta',
 ',',
 'que',
 'la']

## DataLoader

Según los experimentos realizados, se encontró que con un batch más pequeño, la red de atención no se sobreentrena tanto como con un batch más grande

In [10]:
batch_size       = 4

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn = train_dataset.collate_fn, shuffle=True)
test_dataloader  = DataLoader(test_dataset, batch_size=batch_size, collate_fn = test_dataset.collate_fn, shuffle=False)

# 2) **[20pts]** Diseñe un clasificador con alguna RNN que codifique cada perfil de usuario con todos sus tweets y clasifique.

In [12]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size=100, hidden_size=128, num_layers=1,
                 bidirectional=False, emb_mat=None, dense_hidden_size=256):
        '''Constructor, aquí definimos las capas.
        input:
            input_size: Tamaño de los embeddings de las palabras.
            hidden_size: Tamaño de la capa oculta de la GRU.
            num_layers: Número de capas de la GRU.
            bidirectional: True si se quiere una GRu bidireccional.
            emb_mat: Matriz de embeddings del vocabulario.
            dense_hidden_size: Tamaño de la capa ocula del clasificador.
        '''
        super(SimpleRNN, self).__init__()
        # Matriz entrenable de embeddings, tamaño vocab_size x 100
        self.embeddings = nn.Embedding.from_pretrained(\
                            torch.FloatTensor(emb_mat), freeze=False)
        # Gated Recurrent Unit
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, 
                          num_layers=num_layers, bidirectional=bidirectional)
        # Número de direcciones de la GRU
        directions = 2 if bidirectional else 1
        # Clasificador MLP
        self.classifier = nn.Sequential(\
                            nn.Linear(hidden_size*directions, dense_hidden_size),            
                            nn.BatchNorm1d(dense_hidden_size),
                            nn.ReLU(),
                            nn.Linear(dense_hidden_size, 2))
    
    def forward(self, input_seq, lengths):
        '''Función feed-forward de la red.
        input:
            input_seq: Lista de ids para cada palabra.
            lengths: Número de palabras en cada una de las observaciones del batch.
        output:
            x: vectores para clasificar.
            return None for consistency with the next model
        '''
        # Calcula el embedding para cada palabra.
        x = self.embeddings(input_seq)
        # Forma las secuencias de palabras que entraran a la GRU.
        x = x.split(lengths.tolist())
        # Añade pading y empaqueta las secuencias (mayor velocidad de cómputo).
        x = pad_sequence(x)
        x = pack_padded_sequence(x, lengths, enforce_sorted=False)
        output, hn = self.gru(x)
        hn = torch.cat([h for h in hn], dim=-1)
        x = self.classifier(hn)
        return x, None

In [249]:
def eval_model(model, dataloader, criterion, device):
    '''Función para evaluar el modelo.'''
    with torch.no_grad():
        model.eval()
        losses = []
        preds = torch.empty(0).long()
        targets = torch.empty(0).long()
        scores_list = []
        words_list = []
        pred_list = []
        for data in tqdm(dataloader):
            torch.cuda.empty_cache()
            seq, seq_len, labels, words, _ = data
            seq, labels = seq.to(device), labels.to(device)
            output, scores = model(seq, seq_len)
            output = F.log_softmax(output, dim=1)
            loss = criterion(output, labels)
            losses.append(loss.item())
            predictions = F.log_softmax(output, dim=1).argmax(1)
            preds = torch.cat([preds, predictions.cpu()], dim=0)
            targets = torch.cat([targets, labels.cpu()], dim=0)
            if scores is not None:
                pred_list += predictions.tolist()
                scores = scores.cpu().squeeze(2).tolist()
                scores_list += scores
                words_list += words

        model.train()
        preds   = preds.numpy()
        targets = targets.numpy()
        f1      = f1_score(targets, preds, average = 'binary')
        acc     = accuracy_score(targets, preds)
        return np.mean(losses), f1, scores_list, words_list, pred_list, acc

In [14]:
# Parámetros
epochs = 30
device = torch.device('cuda')


torch.cuda.set_device(1)

In [16]:
model     = SimpleRNN(emb_mat=train_dataset.emb_mat, bidirectional=False).to(device)
optimizer = optim.Adam(model.parameters(), )
weight    = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight = weight)

## Entrenamiento

In [17]:
best_val_f1 = 0
for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        # Limpia basura de la memoria GPU
        torch.cuda.empty_cache()
        # Reiniciamos el cálculo del gradiente
        optimizer.zero_grad()
        # Desempaca los datos que salen del dataloader
        seq, seq_len, labels, _, _ = data
        # Mueve los datos al mismo device en el que este el modelo
        seq, labels = seq.to(device), labels.to(device)
        output, _   = model(seq, seq_len)
        output      = F.log_softmax(output, dim=1)
        loss        = criterion(output, labels)
        # Calcula el gradiente de la pérdida
        loss.backward()
        # Realiza un paso de la optimización
        optimizer.step()
    
    #Evalúa los modelos en los conjuntos de entrenamiento y valuación
    train_loss, train_f1, _, _, _, train_accuracy = eval_model(model, train_dataloader, criterion, device)
    test_loss, test_f1, _, _, _, test_accuracy    = eval_model(model, test_dataloader, criterion, device)
    
    print('train_loss: %5f | train_f1: %5f | train_ac: %5f' %(train_loss, train_f1, train_accuracy)) 
    print('test_loss: %5f | test_f1: %5f | test_ac: %5f'%(test_loss, test_f1, test_accuracy)) 
    
    if test_f1 > best_val_f1:
        best_val_f1     = test_f1
        best_state_dict = copy.deepcopy(model.state_dict())

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.667094 | train_f1: 0.623646 | train_ac: 0.594524
test_loss: 0.691772 | test_f1: 0.563540 | test_ac: 0.531429


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.599198 | train_f1: 0.607390 | train_ac: 0.676190
test_loss: 0.675359 | test_f1: 0.484461 | test_ac: 0.597143


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.519695 | train_f1: 0.697891 | train_ac: 0.737381
test_loss: 0.634416 | test_f1: 0.604341 | test_ac: 0.661429


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.378234 | train_f1: 0.854697 | train_ac: 0.854524
test_loss: 0.607972 | test_f1: 0.678099 | test_ac: 0.678214


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.214150 | train_f1: 0.935771 | train_ac: 0.934762
test_loss: 0.570985 | test_f1: 0.741681 | test_ac: 0.731071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.144035 | train_f1: 0.962768 | train_ac: 0.962857
test_loss: 0.656016 | test_f1: 0.696312 | test_ac: 0.700000


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.111606 | train_f1: 0.974359 | train_ac: 0.974524
test_loss: 0.641195 | test_f1: 0.715782 | test_ac: 0.728571


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.057173 | train_f1: 0.985576 | train_ac: 0.985476
test_loss: 0.687732 | test_f1: 0.757730 | test_ac: 0.745357


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.058482 | train_f1: 0.984353 | train_ac: 0.984286
test_loss: 0.757795 | test_f1: 0.746340 | test_ac: 0.733929


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.042264 | train_f1: 0.991889 | train_ac: 0.991905
test_loss: 0.745499 | test_f1: 0.721739 | test_ac: 0.725714


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.053991 | train_f1: 0.983583 | train_ac: 0.983810
test_loss: 0.800575 | test_f1: 0.694478 | test_ac: 0.725357


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.032408 | train_f1: 0.991465 | train_ac: 0.991429
test_loss: 0.991483 | test_f1: 0.740431 | test_ac: 0.733571


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.047194 | train_f1: 0.986730 | train_ac: 0.986667
test_loss: 1.005353 | test_f1: 0.728358 | test_ac: 0.707500


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.050974 | train_f1: 0.987252 | train_ac: 0.987143
test_loss: 1.213806 | test_f1: 0.738611 | test_ac: 0.711071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.037217 | train_f1: 0.991416 | train_ac: 0.991429
test_loss: 1.027335 | test_f1: 0.744666 | test_ac: 0.735000


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.066336 | train_f1: 0.981396 | train_ac: 0.981667
test_loss: 1.226640 | test_f1: 0.651026 | test_ac: 0.702500


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.062104 | train_f1: 0.985329 | train_ac: 0.985238
test_loss: 0.934615 | test_f1: 0.727646 | test_ac: 0.716071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.093951 | train_f1: 0.972419 | train_ac: 0.973095
test_loss: 0.923742 | test_f1: 0.645548 | test_ac: 0.704286


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.084784 | train_f1: 0.973972 | train_ac: 0.974524
test_loss: 1.012821 | test_f1: 0.623433 | test_ac: 0.688929


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.059694 | train_f1: 0.984209 | train_ac: 0.984048
test_loss: 1.139313 | test_f1: 0.733578 | test_ac: 0.714643


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.027216 | train_f1: 0.993327 | train_ac: 0.993333
test_loss: 1.120214 | test_f1: 0.720089 | test_ac: 0.731786


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.050136 | train_f1: 0.992628 | train_ac: 0.992619
test_loss: 1.134742 | test_f1: 0.697894 | test_ac: 0.702857


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.028838 | train_f1: 0.993801 | train_ac: 0.993810
test_loss: 1.134412 | test_f1: 0.714180 | test_ac: 0.723571


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.026720 | train_f1: 0.992578 | train_ac: 0.992619
test_loss: 1.270448 | test_f1: 0.685385 | test_ac: 0.707857


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.024662 | train_f1: 0.994043 | train_ac: 0.994048
test_loss: 1.296973 | test_f1: 0.729505 | test_ac: 0.732500


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.031957 | train_f1: 0.991445 | train_ac: 0.991429
test_loss: 1.412261 | test_f1: 0.742323 | test_ac: 0.724286


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.046430 | train_f1: 0.988191 | train_ac: 0.988095
test_loss: 1.396388 | test_f1: 0.733622 | test_ac: 0.713929


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.017813 | train_f1: 0.995953 | train_ac: 0.995952
test_loss: 1.354229 | test_f1: 0.708242 | test_ac: 0.714286


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.033827 | train_f1: 0.990918 | train_ac: 0.990952
test_loss: 1.405029 | test_f1: 0.690881 | test_ac: 0.714286


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.026914 | train_f1: 0.994058 | train_ac: 0.994048
test_loss: 1.476221 | test_f1: 0.728383 | test_ac: 0.713929


In [18]:
model.load_state_dict(best_state_dict)
train_loss, train_f1, _, _, _, train_accuracy = eval_model(model, train_dataloader, criterion, device)
test_loss, test_f1, _, _, _, test_accuracy    = eval_model(model, test_dataloader, criterion, device)
print('train_loss: %5f | train_f1: %5f | train_ac: %5f' %(train_loss, train_f1, train_accuracy)) 
print('test_loss: %5f | test_f1: %5f | test_ac: %5f'%(test_loss, test_f1, test_accuracy)) 

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.057173 | train_f1: 0.985576 | train_ac: 0.985476
test_loss: 0.687732 | test_f1: 0.757730 | test_ac: 0.745357


# 3) [20 pts] Diseñe un clasificador con alguna RNN con atención que codifique cada perfil de usuario con todos sus tweets y clasifique.

Usaremos la imprementación de la GRU y añadiremos un módulo de atención. Este toma los vectores $h_t$ de la GRU y calcula un vector de ponderación $ s $ 

$$ s = \sum_t \alpha_t h_t,$$

donde

\begin{align*}
    u_{t} &= \tanh(Wh_{t}+b),\\
    \alpha_{t} &= \frac{\exp(u_t^Tu)}{\sum_i\exp(u_{i}^Tu)}.
\end{align*}

# 3) **[20pts]** Diseñe un clasificador con alguna RNN con atención que codifique cada perfil de usuario con todos sus tweets y clasifique.

In [19]:
class AttnModule(nn.Module):
    def __init__(self, input_size, attn_hidden_size=128):
        '''
        input:
            input_size: tamaño de la capa oculta de la GRU.
            attn_hidden_size: tamaño de la capa oculta.
        '''
        super(AttnModule, self).__init__()
        self.fc1 = nn.Linear(input_size, attn_hidden_size)
        self.fc2 = nn.Linear(attn_hidden_size, 1, bias=False)

    def forward(self, seq, lengths):
        '''
        input:
            seq: secuencia de vectores ocultos de la GRU.
            lengths: número de palabras en cada observación.
        '''
        x = pad_packed_sequence(seq)[0]
        seq_len, batch_size, nhid = x.size()
        u = self.fc1(x.view(batch_size*seq_len, nhid))
        u = torch.tanh(u)
        scores = self.fc2(u)
        scores = scores.view(seq_len, batch_size, 1)
        # Asigna -100 a las posiciones con padding para que no sean consideados en la atención.
        scores = nn.utils.rnn.pack_padded_sequence(scores, lengths=lengths,enforce_sorted=False)
        scores = nn.utils.rnn.pad_packed_sequence(scores, padding_value=-100)[0]
        scores = F.softmax(scores, dim=0)
        scores = scores.transpose(0,1)
        x = x.transpose(0,1).transpose(1,2)
        x = torch.bmm(x, scores)
        return x.squeeze(2), scores

In [20]:
class AttnRNN(nn.Module):
    def __init__(self, input_size=100, hidden_size=128, num_layers=1,
                 bidirectional=False, emb_mat=None, dense_hidden_size=256,
                 attn_hidden_size=128):
        super(AttnRNN, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(\
                            torch.FloatTensor(emb_mat), freeze=False)
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, 
                          num_layers=num_layers, bidirectional=bidirectional)
        directions = 2 if bidirectional else 1
        self.attn = AttnModule(input_size=hidden_size*directions)
        self.classifier = nn.Sequential(\
                            nn.Linear(hidden_size*directions, dense_hidden_size),
                            nn.BatchNorm1d(dense_hidden_size),
                            nn.ReLU(),
                            nn.Linear(dense_hidden_size, 2))
        
    def forward(self, input_seq, lengths):
        x = self.embeddings(input_seq)
        x = x.split(lengths.tolist())
        x = pad_sequence(x)
        x = pack_padded_sequence(x, lengths, enforce_sorted=False)
        output, hn = self.gru(x)
        x, scores = self.attn(output, lengths)
        x = self.classifier(x)
        return x, scores.detach()

In [21]:
# Parámetros
epochs = 30
device = torch.device('cuda')

In [22]:
model     = AttnRNN(emb_mat=train_dataset.emb_mat, bidirectional=False).to(device)
optimizer = optim.Adam(model.parameters(),)
weight    = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight = weight)

## Entrenamiento

In [23]:
best_val_f1 = 0
for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        torch.cuda.empty_cache()
        optimizer.zero_grad()
        seq, seq_len, labels, _, _ = data
        seq, labels                = seq.to(device), labels.to(device)
        output, _                  = model(seq, seq_len)
        output                     = F.log_softmax(output, dim=1)
        loss                       = criterion(output, labels)
        loss.backward()
        optimizer.step()
    
    #Evalúa los modelos en los conjuntos de entrenamiento y valuación
    train_loss, train_f1, _, _, _, train_accuracy = eval_model(model, train_dataloader, criterion, device)
    test_loss, test_f1, _, _, _, test_accuracy    = eval_model(model, test_dataloader, criterion, device)
    print('train_loss: %5f | train_f1: %5f | train_ac: %5f' %(train_loss, train_f1, train_accuracy)) 
    print('test_loss: %5f | test_f1: %5f | test_ac: %5f'%(test_loss, test_f1, test_accuracy)) 
    if test_f1 > best_val_f1:
        best_val_f1     = test_f1
        best_state_dict = copy.deepcopy(model.state_dict())

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.514290 | train_f1: 0.729656 | train_ac: 0.764286
test_loss: 0.543653 | test_f1: 0.692496 | test_ac: 0.730714


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.349957 | train_f1: 0.846975 | train_ac: 0.856667
test_loss: 0.487639 | test_f1: 0.771019 | test_ac: 0.788929


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.202531 | train_f1: 0.946990 | train_ac: 0.947381
test_loss: 0.443787 | test_f1: 0.799853 | test_ac: 0.806071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.112764 | train_f1: 0.977328 | train_ac: 0.977619
test_loss: 0.511497 | test_f1: 0.762505 | test_ac: 0.784643


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.059262 | train_f1: 0.989832 | train_ac: 0.989762
test_loss: 0.549763 | test_f1: 0.807552 | test_ac: 0.792500


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.035011 | train_f1: 0.997383 | train_ac: 0.997381
test_loss: 0.492689 | test_f1: 0.805928 | test_ac: 0.803571


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.028651 | train_f1: 0.997616 | train_ac: 0.997619
test_loss: 0.545602 | test_f1: 0.769770 | test_ac: 0.788929


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.025061 | train_f1: 0.997144 | train_ac: 0.997143
test_loss: 0.518219 | test_f1: 0.795296 | test_ac: 0.801071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.029495 | train_f1: 0.996201 | train_ac: 0.996190
test_loss: 0.514751 | test_f1: 0.802073 | test_ac: 0.795357


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.022122 | train_f1: 0.995490 | train_ac: 0.995476
test_loss: 0.577119 | test_f1: 0.799463 | test_ac: 0.786786


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.020253 | train_f1: 0.997383 | train_ac: 0.997381
test_loss: 0.550569 | test_f1: 0.776445 | test_ac: 0.780357


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.019141 | train_f1: 0.995696 | train_ac: 0.995714
test_loss: 0.630728 | test_f1: 0.759177 | test_ac: 0.786786


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.018564 | train_f1: 0.997144 | train_ac: 0.997143
test_loss: 0.568791 | test_f1: 0.801483 | test_ac: 0.789643


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.017414 | train_f1: 0.996904 | train_ac: 0.996905
test_loss: 0.604337 | test_f1: 0.772503 | test_ac: 0.786071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.014954 | train_f1: 0.996904 | train_ac: 0.996905
test_loss: 0.569407 | test_f1: 0.794096 | test_ac: 0.795714


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.014249 | train_f1: 0.998335 | train_ac: 0.998333
test_loss: 0.590807 | test_f1: 0.778434 | test_ac: 0.785714


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.014333 | train_f1: 0.997860 | train_ac: 0.997857
test_loss: 0.611380 | test_f1: 0.796546 | test_ac: 0.789643


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.012077 | train_f1: 0.998809 | train_ac: 0.998810
test_loss: 0.580952 | test_f1: 0.788469 | test_ac: 0.798214


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.017766 | train_f1: 0.998808 | train_ac: 0.998810
test_loss: 0.564385 | test_f1: 0.786716 | test_ac: 0.793571


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.012969 | train_f1: 0.998572 | train_ac: 0.998571
test_loss: 0.564307 | test_f1: 0.802024 | test_ac: 0.790357


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.011719 | train_f1: 0.998094 | train_ac: 0.998095
test_loss: 0.646228 | test_f1: 0.760343 | test_ac: 0.780714


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.010884 | train_f1: 0.997860 | train_ac: 0.997857
test_loss: 0.606287 | test_f1: 0.795772 | test_ac: 0.786071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.009116 | train_f1: 0.998809 | train_ac: 0.998810
test_loss: 0.624194 | test_f1: 0.780077 | test_ac: 0.795000


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.009107 | train_f1: 0.998810 | train_ac: 0.998810
test_loss: 0.632968 | test_f1: 0.776453 | test_ac: 0.785714


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.010676 | train_f1: 0.999286 | train_ac: 0.999286
test_loss: 0.602737 | test_f1: 0.793377 | test_ac: 0.795000


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.007991 | train_f1: 0.999048 | train_ac: 0.999048
test_loss: 0.562102 | test_f1: 0.798455 | test_ac: 0.795000


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.014505 | train_f1: 0.996417 | train_ac: 0.996429
test_loss: 0.777445 | test_f1: 0.682243 | test_ac: 0.732857


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.007894 | train_f1: 0.999285 | train_ac: 0.999286
test_loss: 0.602421 | test_f1: 0.792598 | test_ac: 0.787857


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.007773 | train_f1: 0.998571 | train_ac: 0.998571
test_loss: 0.650577 | test_f1: 0.779934 | test_ac: 0.785357


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.008416 | train_f1: 0.999524 | train_ac: 0.999524
test_loss: 0.652185 | test_f1: 0.789564 | test_ac: 0.781071


# 4) **[20pts]** Diseñe un clasificador con alguna RNN con atención en jerárquia (como en el paper adjunto) que codifique cada perfil de usuario con todos sus tweets y clasifique. El primer nivel intuitivamente recibe embeddings de palabras, el segundo de tweets.

In [242]:
class AttnModule(nn.Module):
    def __init__(self, input_size, attn_hidden_size=128, output_size = 1):
        '''
        input:
            input_size: tamaño de la capa oculta de la GRU.
            attn_hidden_size: tamaño de la capa oculta.
        '''
        super(AttnModule, self).__init__()
        self.fc1 = nn.Linear(input_size, attn_hidden_size)
        self.fc2 = nn.Linear(attn_hidden_size, output_size, bias=False)

    def forward(self, seq, lengths):
        '''
        input:
            seq: secuencia de vectores ocultos de la GRU.
            lengths: número de palabras en cada observación.
        '''
        x      = pad_packed_sequence(seq)[0]
        seq_len, batch_size, nhid = x.size()
        u      = self.fc1(x.view(batch_size*seq_len, nhid))
        u      = torch.tanh(u)
        scores = self.fc2(u)
        scores = scores.view(seq_len, batch_size, 1)
        # Asigna -100 a las posiciones con padding para que no sean consideados en la atención.
        scores = nn.utils.rnn.pack_padded_sequence(scores, lengths=lengths,enforce_sorted=False)
        scores = nn.utils.rnn.pad_packed_sequence(scores, padding_value=-100)[0]
        scores = F.softmax(scores, dim=0)
        scores = scores.transpose(0,1)
        x      = x.transpose(0,1).transpose(1,2)
        x      = torch.bmm(x, scores)
        return x.squeeze(2), scores

In [243]:
class Hierarchical_Attn(nn.Module):
    def __init__(self, input_size=100, hidden_size=128, num_layers=1,
                 bidirectional=False, emb_mat=None, dense_hidden_size=256,
                 attn_hidden_size=128):
        super(Hierarchical_Attn, self).__init__()
        # Embed Matrix
        self.embeddings = nn.Embedding.from_pretrained(\
                            torch.FloatTensor(emb_mat), freeze=False)
        # Words RNN
        self.words_gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, 
                          num_layers=num_layers, bidirectional=bidirectional)
        directions = 2 if bidirectional else 1
        # Word Attention
        self.words_attn = AttnModule(input_size=hidden_size*directions)

        # Sentence RNN
        self.att_dim1 = 128
        self.sentence_gru = nn.GRU(input_size=self.att_dim1, hidden_size=hidden_size, 
                          num_layers=num_layers, bidirectional=bidirectional, batch_first=True)
        directions = 2 if bidirectional else 1
        # Sentence Attention
        self.sentence_attn = AttnModule(input_size=hidden_size*directions, output_size = 128)
        # Proyection layer
        self.project_dim2 = hidden_size * num_layers
        self.att_dim2 = 128
        self.attn_proj2   = nn.Linear(self.project_dim2, self.att_dim2)
        self.query2       = nn.Linear(self.att_dim1, 1, bias=False) 
        
        self.fc = nn.Sequential(nn.Linear(hidden_size, 2))

        
    def forward(self, input_seq, lengths):
        # Embbed words into w2v
        x = self.embeddings(input_seq)
        x = x.split(lengths.tolist())
        x = pad_sequence(x)
        x = pack_padded_sequence(x, lengths, enforce_sorted=False)
        # Use word rnn
        output, hn = self.words_gru(x)
        # attention
        x, scores1 = self.words_attn(output, lengths)
        x          = x.unsqueeze(0)
        
        # Second Level
        x, hn = self.sentence_gru(x)
        # attention
        seq_len, batch_size, nhid = x.size()
        u = self.attn_proj2(x.view(batch_size*seq_len, nhid))
        u = torch.tanh(u)
        scores = self.query2(u)
        scores = scores.view(seq_len, batch_size, 1)
        scores = F.softmax(scores, dim=0)
        scores = scores.transpose(0,1)
        x = x.transpose(0,1).transpose(1,2)
        x = torch.bmm(x, scores)
        x = x.squeeze(2)
        x = self.fc(x)        
        return x, scores1.cpu().detach().numpy(), scores.cpu().detach().numpy()

In [254]:
epochs = 20
device = torch.device('cuda')

torch.cuda.set_device(1)

In [256]:
model     = Hierarchical_Attn(emb_mat=train_dataset.emb_mat, bidirectional=False).to(device)
optimizer = optim.Adam(model.parameters(),)
weight    = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight = weight)

## Entrenamiento

In [257]:
def eval_model(model, dataloader, criterion, device):
    '''Función para evaluar el modelo.'''
    with torch.no_grad():
        model.eval()
        losses      = []
        preds       = torch.empty(0).long()
        targets     = torch.empty(0).long()
        scores_list = []
        words_list  = []
        pred_list   = []
        for data in tqdm(dataloader):
            torch.cuda.empty_cache()
            seq, seq_len, labels, words, _ = data
            seq, labels                    = seq.to(device), labels.to(device)
            output, _, _                   = model(seq, seq_len)
            output                         = F.log_softmax(output, dim=1)
            loss                           = criterion(output, labels)
            losses.append(loss.item())
            predictions                    = F.log_softmax(output, dim=1).argmax(1)
            preds                          = torch.cat([preds, predictions.cpu()], dim=0)
            targets                        = torch.cat([targets, labels.cpu()], dim=0)

        model.train()
        preds   = preds.numpy()
        targets = targets.numpy()
        f1      = f1_score(targets, preds, average='binary')
        acc     = accuracy_score(targets, preds)
        return np.mean(losses), f1, scores_list, words_list, pred_list, acc

In [258]:
best_val_f1 = 0
for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        torch.cuda.empty_cache()
        optimizer.zero_grad()
        seq, seq_len, labels, _, _ = data
        seq, labels                = seq.to(device), labels.to(device)
        output, _, _               = model(seq, seq_len)
        output                     = F.log_softmax(output, dim=1)
        loss                       = criterion(output, labels)
        loss.backward()
        optimizer.step()
    
    #Evalúa los modelos en los conjuntos de entrenamiento y valuación
    train_loss, train_f1, _, _, _, train_accuracy = eval_model(model, train_dataloader, criterion, device)
    test_loss, test_f1, _, _, _, test_accuracy    = eval_model(model, test_dataloader, criterion, device)
    print('train_loss: %5f | train_f1: %5f | train_ac: %5f' %(train_loss, train_f1, train_accuracy)) 
    print('test_loss: %5f | test_f1: %5f | test_ac: %5f'%(test_loss, test_f1, test_accuracy)) 
    if test_f1 > best_val_f1:
        best_val_f1     = test_f1
        best_state_dict = copy.deepcopy(model.state_dict())

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.475743 | train_f1: 0.781235 | train_ac: 0.779048
test_loss: 0.619292 | test_f1: 0.705554 | test_ac: 0.698929


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.251680 | train_f1: 0.906760 | train_ac: 0.904762
test_loss: 0.420712 | test_f1: 0.811184 | test_ac: 0.804643


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.099809 | train_f1: 0.975657 | train_ac: 0.975952
test_loss: 0.535952 | test_f1: 0.739113 | test_ac: 0.771071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.024068 | train_f1: 0.993551 | train_ac: 0.993571
test_loss: 0.712027 | test_f1: 0.771384 | test_ac: 0.790000


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.007952 | train_f1: 0.998810 | train_ac: 0.998810
test_loss: 0.648821 | test_f1: 0.800827 | test_ac: 0.793571


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.004633 | train_f1: 0.999048 | train_ac: 0.999048
test_loss: 0.805682 | test_f1: 0.801480 | test_ac: 0.789286


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.004733 | train_f1: 0.999286 | train_ac: 0.999286
test_loss: 0.975912 | test_f1: 0.792907 | test_ac: 0.778929


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.005854 | train_f1: 0.998573 | train_ac: 0.998571
test_loss: 0.999842 | test_f1: 0.779456 | test_ac: 0.791429


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.009369 | train_f1: 0.998098 | train_ac: 0.998095
test_loss: 0.923748 | test_f1: 0.776988 | test_ac: 0.764643


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.006087 | train_f1: 0.998809 | train_ac: 0.998810
test_loss: 0.853653 | test_f1: 0.777582 | test_ac: 0.774643


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.002540 | train_f1: 0.999524 | train_ac: 0.999524
test_loss: 1.112040 | test_f1: 0.716713 | test_ac: 0.743929


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.002253 | train_f1: 0.999762 | train_ac: 0.999762
test_loss: 1.146676 | test_f1: 0.754912 | test_ac: 0.746071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.001525 | train_f1: 0.999762 | train_ac: 0.999762
test_loss: 1.196235 | test_f1: 0.749206 | test_ac: 0.746071


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.007921 | train_f1: 0.998092 | train_ac: 0.998095
test_loss: 1.203976 | test_f1: 0.743707 | test_ac: 0.760000


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.004677 | train_f1: 0.999286 | train_ac: 0.999286
test_loss: 1.101786 | test_f1: 0.750360 | test_ac: 0.752143


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.000785 | train_f1: 0.999762 | train_ac: 0.999762
test_loss: 1.106868 | test_f1: 0.762097 | test_ac: 0.747143


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.000900 | train_f1: 0.999762 | train_ac: 0.999762
test_loss: 1.310757 | test_f1: 0.737686 | test_ac: 0.754643


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.000812 | train_f1: 0.999762 | train_ac: 0.999762
test_loss: 1.240177 | test_f1: 0.747563 | test_ac: 0.731786


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.000279 | train_f1: 1.000000 | train_ac: 1.000000
test_loss: 1.617425 | test_f1: 0.660243 | test_ac: 0.710357


  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/1050 [00:00<?, ?it/s]

  0%|          | 0/700 [00:00<?, ?it/s]

train_loss: 0.000055 | train_f1: 1.000000 | train_ac: 1.000000
test_loss: 1.508177 | test_f1: 0.690363 | test_ac: 0.720000


# 5) **[10pts]** Compare y discuta el rendimiento de los modelos anteriores. Para el punto 4, construya visualizaciones como en la Figura 6 del paper de atención jerárquica y discuta.

In [2]:
import pandas as pd

In [4]:
rnn          = ['RNN'         , 0.985576 , 0.985476, 0.757730, 0.745357]
atention     = ['Atention'    , 0.997383 , 0.997381, 0.805928, 0.803571]
hierarchical = ['Hierarchical', 0.993551 , 0.993571, 0.771384, 0.790000]

data = [rnn, atention, hierarchical]
0.790000
# Create the pandas DataFrame
pd.DataFrame(data, columns = ['Model', 'train_f1', 'train_ac', 'test_f1', 'test_ac'])

Unnamed: 0,Model,train_f1,train_ac,test_f1,test_ac
0,RNN,0.985576,0.985476,0.75773,0.745357
1,Atention,0.997383,0.997381,0.805928,0.803571
2,Hierarchical,0.993551,0.993571,0.771384,0.79


En los resultados observamos que el mejor rendimiento se obtiene cuando se agrega un módulo de atención simple.  Se observó también que el tamaño del batch tiene un papel importante para las métricas resultantes. Tanto la RNN simple como la atención jerárquica mostraron un sobre-entrenamiento importante comparado con la RNN con atención. 

## Visualización de Atención Jerárquica

In [None]:
from IPython.display import display, HTML
import matplotlib
import matplotlib.pyplot as plt

In [343]:
#Función para colorear tweets y palabras tomada de 
#https://stackoverflow.com/questions/59220488/to-visualize-attention-color-tokens-using-attention-weights
def colorize(words, color_array, color = 'default'):
    '''
        Función para visuzalizar la atención, tomada de https://gist.github.com/ihsgnef/f13c35cd46624c8f458a4d23589ac768,
    '''
    # words is a list of words
    # color_array is an array of numbers between 0 and 1 of length equal to words
    if color =='blue':
        cmap=matplotlib.cm.Blues
    else:
        cmap=matplotlib.cm.Reds
    template       = '<span class="barcode"; style="color: black; background-color: {}">{}</span>'
    colored_string = ''
    for word, color in zip(words, color_array):
        color = matplotlib.colors.rgb2hex(cmap(color)[:3])
        colored_string += template.format(color, '&nbsp' + word + '&nbsp')
    return colored_string

In [344]:
def show_attn(tweets, att_lv1, att_lv2):
    # Removemos dimensiones de batch
    att_lv2 = np.squeeze(att_lv2, axis=2)
    att_lv1 = np.squeeze(att_lv1, axis=2)
    att_lv1 = np.squeeze(att_lv1, axis=1)
    
    colored_tweets = []
    for i, t in enumerate(tweets):
        n_color   = colorize([str(i+1)], [att_lv1[i]], color = 'blue')
        col_tweet = colorize(t, att_lv2[i]*1e2)
        display(HTML(n_color+col_tweet))

In [345]:
model.load_state_dict(best_state_dict)

<All keys matched successfully>

In [346]:
# Tomamos un batch del conjunto de prueba
for b in test_dataloader:
    seq, seq_len, labels, words, _ = b
    break

In [347]:
seq.shape, len(words)

(torch.Size([4980]), 4)

In [348]:
# Obtenemos vectores de atención y predicciones
out, att_lv1, att_lv2 = model(seq.to(device), seq_len)
col_tweet = show_attn(words, ps, pw)

In [352]:
# Tomamos un batch del conjunto de prueba
I = 0
for b in test_dataloader:
    seq, seq_len, labels, words, _ = b
    I += 1
    if I == 2 :
        break

In [353]:
# Obtenemos vectores de atención y predicciones
out, att_lv1, att_lv2 = model(seq.to(device), seq_len)
col_tweet = show_attn(words, ps, pw)

# 6) **[20pts]** Eché un ojo a los resultados que el profesor obtuvo sin deep learning ( http://ceur-ws.org/Vol-1866/paper_109.pdf ) y discuta lo siguiente respecto a sus modelos profundos:

En este artículo se usa una representación muy específica para el problema con la premisa de separar en clases la infomación respecto a la similitud de los usuarios en la representación usuario-documento. Los modelos profundos usados en los puntos anteriores son una estrategia general para encontrar similitudes en un espacio de que no disponemos de interpretabilidad directa. 

## 6.1) ¿Le ganó a la mejor propuesta de representaciones distribucionales según lo reportado por el profesor? Si no fue así: ¿Le ganó a la Bolsa de Términos con SVM?

Los modelos de deep learning no le ganaron a la mejor propuesta de representaciones distribucionales. El mejor resultado para el conjunto de prueba se obtuvo con la RNN con atención y fue de $0.8$ mientras que el accuracy del paper obtuvo $0.8014$.

Podríamos decir que los modelos de redes recurrentes quedaron con métricas similares a la BoT para el conjunto de prueba. En cuanto al de entrenamiento, podríamos decir que las redes se sobre entrenarona  aprtir de la época 10. Por lo que quedaron a la par con la BoT.

## 6.2) ¿Por que cree que NO o SI le ganó? Discuta.

No le ganó. Las RNN buscan similitudes entre los datos en espacios de alta dimencionalidad de las cuales conocemos la interpretabilidad de las dimensiones. Podemos visualizar la atención de los vectores aprendidos apra cada oración pero no sabemos qué sentido les da la red de manera interna. En este sentido, las RNN son una estrategia mpas general y los métodos usados en el paper son representaciones (aunque tal vez de dimensión alta) más sencillas de los datos donde podemos entender qué significa cada dimensión. 

La estrategia USR es muy específica pra esta tarea de clasificación y tal vez por eso se obtienen mejores resultados.

## 6.3) ¿Qué usó el primer lugar de la competencia? (Basile, et al.) ( Consulte: http://ceur-ws.org/Vol-1866/ )

Ellos usaron una representación de los tewwts con enegramas de tamaño $[3,4,5]$ a nivel caracter y $k$-gramas de tamaño $[1,2]$ para nivel palabras. Luego aplicaron un SVM para reducir la dimensionalidad.

## 6.4) ¿Cuántos y que competidores usaron deep learning? En una o dos oraciones escriba qué hicieron

Cinco equipos competidores usaron deep learning

1) [http://ceur-ws.org/Vol-1866/paper_68.pdf] Usaron modelos de enconder-decoder clásicos de LSTM.

2) [http://ceur-ws.org/Vol-1866/paper_77.pdf] Usaron una RNN bidirecional con GRUs combinada con mecanismos de atención.

3) [http://ceur-ws.org/Vol-1866/paper_80.pdf] Combinaron TF-IDF y modelos de redes convolucionales con métodos de limpieza de texto básicos con matrices de bigramas.

4) [http://ceur-ws.org/Vol-1866/paper_90.pdf] Combinaron información a nivel caracter y a nivel palabras con una arquitectura NN-FT. Que combina embeddings a nivel palabra y caracter junto con redes recurrentes y redes convolucionaes seguido de un mecanismo de atención.

5) [http://ceur-ws.org/Vol-1866/paper_192.pdf] Usaron embeddings de los textos a nivel caracter usando $n$-gramas y una arquitectura llamada Deep Averaging Networks.