## Imports

In [1]:
import os
import ast
import random
import numpy as np
import pandas as pd

from random import shuffle
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.cluster import SpectralClustering, KMeans

import spacy

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset, Iterator

import torchvision.transforms as transforms
import pytorch_lightning as ptl
from test_tube import Experiment

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset

import matplotlib.pyplot as plt
from millenlp.embeddings import FastTextVec

from bokeh.plotting import figure, output_notebook, show, ColumnDataSource, output_file
from bokeh.palettes import Blues9,Spectral11,Category10,Set1,Set2,Category20
from bokeh.io import reset_output
from bokeh.models import BoxSelectTool,CustomJS, ColumnDataSource, Legend

In [2]:
output_file("Cluster.html")

## Functions

In [3]:
def preping_bokeh_clustering(original_data,features,column,cluster):
    
    color_dic = {}
    L = cluster['ClusterNumber'].unique().shape[0]
    nums = [x for x in range(L)]
    shuffle(nums)
    if L <= 20:
        for k,value in enumerate(cluster['ClusterNumber'].unique()):
            if L >10:
                color_dic[value] = Category20[20][nums[k]]
            else:
                color_dic[value] = Category10[10][nums[k]]
    else:
            
        hexa = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']
        color_dic = {}
        for k,value in enumerate(cluster['ClusterNumber'].unique()):
            color_dic[value] = ''.join([choice(hexa) if i != 0 else '#' for i in range(7)])

    df_bokeh = pd.DataFrame(np.concatenate((features,
                                            original_data[column].values.reshape(-1,1),
                                            original_data['lemma'].values.reshape(-1,1),
                                            cluster['ClusterNumber'].values.reshape(-1,1)),axis=1),

                            columns=['x','y','Message','lemma','Label'])
    
    df_bokeh['color'] = cluster.ClusterNumber.apply(lambda x: color_dic[x])
    
    TOOLTIPS = [("Index", "$index"),
            ("(x,y)", "(@x, @y)"),
            ("Message", "@{Message}"),
            ("Lemmas", "@{lemma}"),
            ("Label", "@{Label}")]

    return df_bokeh, TOOLTIPS

def scatter(source,TOOLTIPS,classes):
    p = figure(title="Fasttext and TSNE", 
           x_axis_label='x', y_axis_label='y',
           plot_width=950, plot_height=500, 
           tools = 'lasso_select,box_zoom,pan,poly_select,tap,wheel_zoom,save,zoom_out,crosshair,hover,reset,help',
           tooltips=TOOLTIPS)
    
    legend_it = []
    cluster_size = []
    for label in range(0,classes):
        cluster_size.append((label,source[source['Label']==label].shape[0]))
    cluster_size = sorted(cluster_size, key = lambda x: x[1], reverse=True)
    
    for label in list(np.array(cluster_size)[:,0]):
        c = p.scatter(x = "x",y = "y",size=5,
                      line_color="black", color="color", 
                      alpha=0.7, source=ColumnDataSource(source[source['Label']==label]))
        legend_it.append((str(label), [c]))
    
    legend = Legend(items=legend_it, location=(0, -30), spacing = 1)
    legend.click_policy="hide"
    p.add_layout(legend, 'right')
    
    show(p)

## AutoEncoder with pytorch-lightning

In [4]:
DEVICE = torch.device("cuda: 0" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda', index=0)

In [5]:
torch.cuda.is_available()

True

In [6]:
SEED = 1234

In [7]:
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### Creating Data Splits

In [None]:
data = pd.read_excel('dataset/data.xlsx')

In [None]:
data_train, data_test = train_test_split(data, test_size=0.30)
data_val, data_test = train_test_split(data_test, test_size=0.5)
data_train.shape, data_val.shape, data_test.shape

In [None]:
data_train.to_csv('dataset/data_train.csv')
data_val.to_csv('dataset/data_val.csv')
data_test.to_csv('dataset/data_test.csv')

### Creating Dataset Clases

In [8]:
spacy_es = spacy.load('es_core_news_md')

In [9]:
def tokenizer(text):
    """
    Tokenizes Spanish text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_es.tokenizer(text)][::-1]

In [10]:
def get_dataset(fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        # pretrain vectors only supports all lower cases
        lower = True
        
    comment = Field(
        sequential=True,
        init_token = '<sos>',
        eos_token = '<eos>',
        fix_length=fix_length,
        tokenize=tokenizer,
        pad_first=False,
        batch_first = True,
        lower=lower
    )
    train, val = TabularDataset.splits(
        path='dataset/', format='csv', skip_header=True,
        train='data_train.csv', validation='data_val.csv',
        fields=[
            ('mensaje', None),
            ('lemma', comment),
            ('cluster', None),
            ('cluster_2', None),
            ('output', comment)
        ])
    test = TabularDataset(
        path='dataset/data_test.csv', format='csv', 
        skip_header=True,
        fields=[
            ('mensaje', None),
            ('lemma', comment),
            ('cluster', None),
            ('cluster_2', None)
        ])

    comment.build_vocab(
        train, val, test,
        max_size=7000,
        min_freq=10,
        vectors=vectors
    )
    return train, val, test, comment

In [11]:
sequence_length = 50
train_dataset, val_dataset, test_dataset, message_field = get_dataset(fix_length=sequence_length)

In [12]:
generator = BucketIterator(train_dataset, batch_size=256)

In [13]:
examples = generator.data()

### Creating Autoencoder Model

In [12]:
class Encoder(ptl.LightningModule):
    def __init__(self, 
                 layers, 
                 hidden_size, 
                 input_size, 
                 message_field, 
                 dropout = 0, 
                 bidir = False, 
                 batch_size = 256):
        super(Encoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.layers = layers
        self.batch_size = batch_size
        self.dropout = dropout
        self.bidir = bidir
        self.message_field = message_field
        self.embedding_dim = input_size
        
        self.word_embedding = nn.Embedding(num_embeddings=len(self.message_field.vocab.itos),
                                           embedding_dim=self.embedding_dim,
                                           padding_idx=self.message_field.vocab.stoi['<pad>']).to(DEVICE)
        
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                                  hidden_size=self.hidden_size,
                                  num_layers=self.layers,
                                  batch_first=True,
                                  dropout = self.dropout if self.dropout and self.layers > 1 else 0,
                                  bidirectional = self.bidir).to(DEVICE)
        
        self.init_weigths()
        
    def init_hidden(self, batch_size):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden = torch.randn(self.layers*2 if self.bidir else self.layers, batch_size, self.hidden_size).to(DEVICE)
        cell = torch.randn(self.layers*2 if self.bidir else self.layers, batch_size, self.hidden_size).to(DEVICE)

        return (hidden, cell)
    
    def init_weigths(self):
        
        for param in self.lstm.named_parameters():
            if 'weight' in param[0]:
                torch.nn.init.xavier_normal_(param[1])
#         torch.nn.init.xavier_normal_(self.dense.weight)
        print('weigths initializer: done!')
        
    def forward(self, x):
        
        x = self.word_embedding(x)
        
        batch_size = x.shape[0]
        self.hidden, self.cell = self.init_hidden(batch_size)
        x, (self.hidden, self.cell) = self.lstm(x, (self.hidden, self.cell))
        
        return self.hidden, self.cell

In [13]:
class Decoder(ptl.LightningModule):
    def __init__(self, 
                 layers, 
                 hidden_size, 
                 input_size,
                 out_dim,
                 message_field,
                 dropout = 0, 
                 bidir = False, 
                 batch_size = 256,
                 sequence_length = 100):
        super(Decoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.layers = layers
        self.input_size = input_size
        self.output_dim = out_dim
        self.batch_size = batch_size
        self.dropout = dropout
        self.bidir = bidir
        self.sequence_length = sequence_length
        self.message_field = message_field
        
        self.word_embedding = nn.Embedding(num_embeddings = self.output_dim, 
                                           embedding_dim = self.input_size,
                                           padding_idx=self.message_field.vocab.stoi['<pad>'])
        
        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=self.hidden_size,
                            num_layers=self.layers,
                            batch_first=True,
                            dropout = self.dropout if self.dropout and self.layers > 1 else 0,
                            bidirectional = self.bidir).to(DEVICE)
        
        self.linear1 = torch.nn.Linear(self.hidden_size*2 if self.bidir else self.hidden_size, 
                                       1024).to(DEVICE)
        
        self.bn = torch.nn.BatchNorm1d(num_features=1024).to(DEVICE)
        self.dropout = torch.nn.Dropout(self.dropout)
        
        self.linear2 = torch.nn.Linear(1024, self.output_dim).to(DEVICE)
        
        self.softmax = nn.Softmax(dim=1)
        
        self.init_weigths()
        
    def init_hidden(self, batch_size):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden = torch.randn(self.layers*2 if self.bidir else self.layers, batch_size, self.hidden_size).to(DEVICE)
        cell = torch.randn(self.layers*2 if self.bidir else self.layers, batch_size, self.hidden_size).to(DEVICE)

        return (hidden, cell)
    
    def init_weigths(self):
        
        for param in self.lstm.named_parameters():
            if 'weight' in param[0]:
                torch.nn.init.xavier_normal_(param[1])
        torch.nn.init.xavier_normal_(self.linear1.weight)
        torch.nn.init.xavier_normal_(self.linear2.weight)
        print('weigths initializer: done!')
    
    def outReshape(self, last_hidden):
        
        last_hidden = last_hidden.view(self.layers,2,-1,self.hidden_size)[-1] if self.bidir else last_hidden[-1]
        last_hidden = last_hidden.contiguous()
        last_hidden = last_hidden.view(-1, self.hidden_size*2 if self.bidir else self.hidden_size)
        
        return last_hidden
        
    def forward(self, x, encoder_hidden, encoder_cell):
        
        x = self.word_embedding(x)
        x = x.unsqueeze(1)
        
        x, (decoder_hidden, decoder_cell) = self.lstm(x, (encoder_hidden, encoder_cell))
        
        x = x.view(-1,self.sequence_length,2,self.hidden_size) if self.bidir else x
        
        decoder_hidden = self.outReshape(decoder_hidden)
        decoder_cell = self.outReshape(decoder_cell)
        
        output = self.dropout(F.relu(self.linear1(x)))
        output = self.softmax(self.linear2(output))
        
        return output, (decoder_hidden, decoder_cell)

In [14]:
class AutoEncoder(ptl.LightningModule):
    def __init__(self, 
                 layers, 
                 hidden_size, 
                 input_size, 
                 message_field, 
                 dropout = 0, 
                 bidir = False, 
                 batch_size = 256,
                 sequence_length = 100):
        super(AutoEncoder, self).__init__()
        
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.message_field = message_field
        self.output_dim = len(self.message_field.vocab.itos)

        self.encoder = Encoder(layers = layers, 
                               hidden_size = self.hidden_size, 
                               input_size = input_size,
                               message_field =  self.message_field,
                               dropout = dropout, 
                               bidir = bidir, 
                               batch_size = batch_size).to(DEVICE)
        
        self.decoder = Decoder(layers = layers, 
                               hidden_size = self.hidden_size, 
                               input_size = input_size,
                               out_dim = self.output_dim,
                               message_field =  self.message_field,
                               dropout = dropout, 
                               bidir = bidir, 
                               batch_size = batch_size,
                               sequence_length = self.sequence_length).to(DEVICE)
        
        self.loss = torch.nn.CrossEntropyLoss(ignore_index = self.message_field.vocab.stoi['<pad>'],
                                             reduction = 'mean')
        
    def forward(self, x, y, teacher_forcing_ratio = 0.5):
        
        batch_size = x.shape[0]
        
        ## The padding index is 1 with the token <pad>
        outputs = torch.ones(batch_size, self.sequence_length, self.output_dim).to(DEVICE)
        
        encoder_hidden, encoder_cell = self.encoder(x)
        
        decoder_input = torch.ones((batch_size,), dtype=torch.long, device=DEVICE)
        decoder_input.new_full((batch_size, ), self.message_field.vocab.stoi['<sos>'])
        
        for step in range(self.sequence_length):
            output, (hidden, cell) = self.decoder(decoder_input, encoder_hidden, encoder_cell)
            output = output.squeeze(1)
            outputs[:, step, :] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top_word = output.max(1)[1]
            decoder_input = (y[:, step] if teacher_force else top_word)
#             if teacher_force:
#                 print('True - {}'.format(self.message_field.vocab.itos[decoder_input[0]]))
#             else:
#                 print('False - {}'.format(self.message_field.vocab.itos[decoder_input[0]]))
                
        return outputs
    
    def my_loss(self, y_hat, y):
        y = y[:,1:].contiguous().view(-1)
        y_hat = y_hat[:,1:,:].contiguous().view(-1, y_hat.shape[-1])
        return self.loss(y_hat, y)
    
    def training_step(self, batch, batch_nb):
        (x, y), _ = batch
        y_hat = self.forward(x, y)
        return {'loss': self.my_loss(y_hat, y)}
    
    def validation_step(self, batch, batch_nb):
        (x, y), _ = batch
        y_hat = self.forward(x, y)
        return {'val_loss': self.my_loss(y_hat, y)}
    
    def validation_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'avg_val_loss': avg_loss}

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=0.1, weight_decay=5e-4, amsgrad = True)]
    
    @ptl.data_loader
    def tng_dataloader(self):
        return BucketIterator(train_dataset, batch_size=self.batch_size, device=DEVICE)

    @ptl.data_loader
    def val_dataloader(self):
        return BucketIterator(val_dataset, batch_size=self.batch_size, device=DEVICE)
    
    @ptl.data_loader
    def test_dataloader(self):
        return BucketIterator(test_dataset, batch_size=self.batch_size, device=DEVICE)

### Training Model

In [16]:
model = AutoEncoder(layers = 2,
                    hidden_size = 400,
                    input_size = 100,
                    message_field = message_field,
                    dropout = 0.2,
                    bidir = False,
                    batch_size=256,
                    sequence_length = sequence_length)

exp = Experiment(save_dir=os.getcwd())
trainer = ptl.Trainer(experiment=exp, max_nb_epochs=100, train_percent_check=0.1, gpus=[0],track_grad_norm=2)

weigths initializer: done!
weigths initializer: done!
VISIBLE GPUS: '0'
gpu available: True, used: True


In [17]:
trainer.fit(model)

# view tensorflow logs 
print(f'View tensorboard logs by running\ntensorboard --logdir {os.getcwd()}')
print('and going to http://localhost:6006 on your browser')

  0%|          | 0/5 [00:00<?, ?it/s]

                      Name              Type   Params
0                  encoder           Encoder  2679100
1   encoder.word_embedding         Embedding   592700
2             encoder.lstm              LSTM  2086400
3                  decoder           Decoder  9166947
4   decoder.word_embedding         Embedding   592700
5             decoder.lstm              LSTM  2086400
6          decoder.linear1            Linear   410624
7               decoder.bn       BatchNorm1d     2048
8          decoder.dropout           Dropout        0
9          decoder.linear2            Linear  6075175
10         decoder.softmax           Softmax        0
11                    loss  CrossEntropyLoss        0


119it [00:39,  3.88it/s, avg_val_loss=8.69, batch_nb=37, epoch=99, gpu=0, loss=8.687, v_nb=139]                         

View tensorboard logs by running
tensorboard --logdir /home/daniel/Deep_RNN/AutoEncoders
and going to http://localhost:6006 on your browser


### Plotting Dimentionality Reduction with the Autoencoder

#### Function to Extract Features from encoder Model

In [15]:
def getEncoderFeatures(text):
    model.eval()
    out = model.encoder(torch.LongTensor([[message_field.vocab.stoi[word] for word in tokenizer(text)]]).to(DEVICE))
    return out.cpu().detach().numpy()

In [16]:
def getEmbeddingFeatures(text):
    model.eval()
    out = model.encoder.word_embedding(torch.LongTensor([[message_field.vocab.stoi[word] for word in tokenizer(text)]]).to(DEVICE))
    return out.cpu().detach().numpy()

#### Loading Dataset

In [17]:
data2plot = pd.read_excel('dataset/data.xlsx')

#### Extracting Features from Encoder Model

In [19]:
features_AE = np.array([getEncoderFeatures(text) for text in data2plot.lemma.tolist()]).squeeze(1)
features_AE.shape

(136915, 2)

#### Extracting Features from Encoder Word Embedding to Cluster

In [37]:
features = np.array([getEmbeddingFeatures(text).squeeze(0).mean(0) for text in data2plot.lemma])
features.shape

(136915, 300)

In [38]:
#BGM = BayesianGaussianMixture(n_components=20, covariance_type='full', max_iter=100).fit(features)
kmeans = KMeans(n_clusters=20).fit(features)

In [39]:
cluster = pd.DataFrame(kmeans.predict(features), columns = ['ClusterNumber'])

In [40]:
source, TOOLTIPS = preping_bokeh_clustering(data2plot,
                                            features_AE,
                                            'mensaje',
                                            cluster)
scatter(source,TOOLTIPS,len(cluster.ClusterNumber.unique()))

In [None]:
# data2cluster = data2cluster.drop(columns = ['cluster'])
data2cluster['cluster'] = cluster.ClusterNumber.tolist()

data2cluster.to_excel('Conversaciones_Chat_Codensa_2019_onlyUser_cluster_fix3_kmeans.xlsx')