## Imports

In [37]:
import os
import ast
import random
import pandas as pd

from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

import spacy

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset, Iterator

import torchvision.transforms as transforms
import pytorch_lightning as ptl
from test_tube import Experiment

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset

import matplotlib.pyplot as plt
from millenlp.embeddings import FastTextVec

## AutoEncoder with pytorch-lightning

In [2]:
DEVICE = torch.device("cuda: 0" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda', index=0)

In [3]:
SEED = 1234

In [4]:
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
data = pd.read_excel('dataset/data.xlsx')

In [None]:
data_train, data_test = train_test_split(data, test_size=0.30)
data_val, data_test = train_test_split(data_test, test_size=0.5)
data_train.shape, data_val.shape, data_test.shape

In [None]:
data_train.to_csv('dataset/data_train.csv')
data_val.to_csv('dataset/data_val.csv')
data_test.to_csv('dataset/data_test.csv')

In [5]:
spacy_es = spacy.load('es_core_news_md')

In [6]:
def tokenizer(text):
    """
    Tokenizes Spanish text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_es.tokenizer(text)][::-1]

In [49]:
def get_dataset(fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        # pretrain vectors only supports all lower cases
        lower = True
        
    comment = Field(
        sequential=True,
        fix_length=fix_length,
        tokenize=tokenizer,
        pad_first=True,
        batch_first = True,
        lower=lower
    )
    train, val = TabularDataset.splits(
        path='dataset/', format='csv', skip_header=True,
        train='data_train.csv', validation='data_val.csv',
        fields=[
            ('mensaje', None),
            ('lemma', comment),
            ('cluster', None),
            ('cluster_2', None),
            ('output', None)
        ])
    test = TabularDataset(
        path='dataset/data_test.csv', format='csv', 
        skip_header=True,
        fields=[
            ('mensaje', None),
            ('lemma', comment),
            ('cluster', None),
            ('cluster_2', None)
        ])

    comment.build_vocab(
        train, val, test,
        max_size=20000,
        min_freq=50,
        vectors=vectors
    )
    return train, val, test, comment

In [50]:
sequence_length = 100
train_dataset, val_dataset, test_dataset, message_field = get_dataset(fix_length=sequence_length)

In [57]:
class Encoder(ptl.LightningModule):
    def __init__(self, 
                 layers, 
                 hidden_size, 
                 input_size, 
                 message_field, 
                 dropout = 0, 
                 bidir = False, 
                 batch_size = 256):
        super(Encoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.layers = layers
        self.batch_size = batch_size
        self.dropout = dropout
        self.bidir = bidir
        self.message_field = message_field
        self.embedding_dim = input_size
        
        self.word_embedding = nn.Embedding(num_embeddings=len(self.message_field.vocab.itos),
                                           embedding_dim=self.embedding_dim,
                                           padding_idx=self.message_field.vocab.stoi['<pad>']).to(DEVICE)
        
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                                  hidden_size=self.hidden_size,
                                  num_layers=self.layers,
                                  batch_first=True,
                                  dropout = self.dropout if self.dropout and self.layers > 1 else 0,
                                  bidirectional = self.bidir).to(DEVICE)
        
#         self.dense1 = torch.nn.Linear(self.units*2 if self.bidir else self.units, self.dim_reduction[0]).to(DEVICE)
#         self.bn1 = torch.nn.BatchNorm1d(num_features=self.dim_reduction[0]).to(DEVICE)
#         self.dense2 = torch.nn.Linear(self.dim_reduction[0]*2 if self.bidir else self.dim_reduction[0], 
#                                      self.dim_reduction[1]).to(DEVICE)
        
        self.init_weigths()
        
    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden = torch.randn(self.layers*2 if self.bidir else self.layers, self.batch_size, self.hidden_size).to(DEVICE)
        cell = torch.randn(self.layers*2 if self.bidir else self.layers, self.batch_size, self.hidden_size).to(DEVICE)

        return (hidden, cell)
    
    def init_weigths(self):
        
        for param in self.lstm.named_parameters():
            if 'weight' in param[0]:
                torch.nn.init.orthogonal_(param[1])
#         torch.nn.init.xavier_normal_(self.dense.weight)
        print('weigths initializer: done!')
        
    def forward(self, x):
        
        x = self.word_embedding(x)
        
        self.hidden, self.cell = self.init_hidden()
        x, (self.hidden, self.cell) = self.lstm(x, (self.hidden, self.cell))
        
        last_hidden = self.hidden.view(self.layers,2,-1,self.hidden_size)[-1] if self.bidir else self.hidden[-1]
        last_hidden = last_hidden.contiguous()
        output = last_hidden.view(-1, self.hidden_size*2 if self.bidir else self.hidden_size)
        
#         output = self.dropout(F.relu(self.bn1(self.dense1(last_hidden))))
#         output = self.dense2(output)
        
        return output

In [58]:
class Decoder(ptl.LightningModule):
    def __init__(self, 
                 layers, 
                 hidden_size, 
                 input_size,
                 dropout = 0, 
                 bidir = False, 
                 batch_size = 256):
        super(Decoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.layers = layers
        self.input_size = input_size
        self.batch_size = batch_size
        self.dropout = dropout
        self.bidir = bidir
        
#         self.dense1 = torch.nn.Linear(self.dim_reduction[0]*2 if self.bidir else self.dim_reduction[0], 
#                                       self.dim_reduction[1]).to(DEVICE)
#         self.bn1 = torch.nn.BatchNorm1d(num_features=self.dim_reduction[1]).to(DEVICE)
        
#         self.dense2 = torch.nn.Linear(self.dim_reduction[1]*2 if self.bidir else self.dim_reduction[1], 
#                                      self.units).to(DEVICE)
#         self.bn2 = torch.nn.BatchNorm1d(num_features=self.units).to(DEVICE)
        
        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=hidden_size,
                            num_layers=self.layers,
                            batch_first=True,
                            dropout = self.dropout if self.dropout and self.layers > 1 else 0,
                            bidirectional = self.bidir).to(DEVICE)
        
        self.softmax = nn.Softmax(dim=1)
        
        self.init_weigths()
        
    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden = torch.randn(self.layers*2 if self.bidir else self.layers, self.batch_size, self.hidden_size).to(DEVICE)
        cell = torch.randn(self.layers*2 if self.bidir else self.layers, self.batch_size, self.hidden_size).to(DEVICE)

        return (hidden, cell)
    
    def init_weigths(self):
        
        for param in self.lstm.named_parameters():
            if 'weight' in param[0]:
                torch.nn.init.orthogonal_(param[1])
#         torch.nn.init.xavier_normal_(self.dense.weight)
        print('weigths initializer: done!')
        
    def forward(self, x):
        
#         x = self.dropout(F.relu(self.bn1(self.dense1(x))))
#         x = self.dropout(F.relu(self.bn2(self.dense2(x))))
        
        self.hidden, self.cell = self.init_hidden()
        x, (self.hidden, self.cell) = self.lstm(x, (self.hidden, self.cell))
        last_hidden = self.hidden.view(self.layers,2,-1,self.hidden_size)[-1] if self.bidir else self.hidden[-1]
        last_hidden = last_hidden.contiguous()
        last_hidden = last_hidden.view(-1,self.hidden_size*2 if self.bidir else self.hidden_size)
        
        output = self.softmax(last_hidden)
        
        return output

In [182]:
class AutoEncoder(ptl.LightningModule):
    def __init__(self, 
                 layers, 
                 hidden_size, 
                 input_size, 
                 message_field, 
                 dropout = 0, 
                 bidir = False, 
                 batch_size = 256,
                 sequence_length = 100):
        super(AutoEncoder, self).__init__()
        self.batch_size = batch_size
        self.sequence_length = sequence_length

        self.encoder = Encoder(layers = layers, 
                               hidden_size = hidden_size, 
                               input_size = input_size,
                               message_field = message_field,
                               dropout = dropout, 
                               bidir = bidir, 
                               batch_size = batch_size).to(DEVICE)
        
        self.decoder = Decoder(layers = layers, 
                               hidden_size = input_size, 
                               input_size = hidden_size,
                               dropout = dropout, 
                               bidir = bidir, 
                               batch_size = batch_size).to(DEVICE)
        
        self.loss = torch.nn.MSELoss()
        
    def forward(self, x):
        
        x = self.encoder(x).expand(-1, self.sequence_length, -1)
        out = self.decoder(x)
        
        return out
    
    def my_loss(self, y_hat, y):
        return self.loss(y_hat, y)
    
    def training_step(self, batch, batch_nb):
        x = batch
        y = self.encoder.word_embedding(x)
        y_hat = self.forward(x)
        return {'loss': self.my_loss(y_hat, y)}
    
    def validation_step(self, batch, batch_nb):
        x = batch
        y_hat = self.forward(x)
        y = self.encoder.word_embedding(x)
        return {'val_loss': self.my_loss(y_hat, y)}
    
    def validation_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'avg_val_loss': avg_loss, 'avg_val_acc':avg_val}

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=0.001, weight_decay=5e-4, amsgrad = True)]
    
    @ptl.data_loader
    def tng_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.batch_size)

    @ptl.data_loader
    def val_dataloader(self):
        return DataLoader(val_dataset, batch_size=self.batch_size)
    
    @ptl.data_loader
    def test_dataloader(self):
        return DataLoader(test_dataset, batch_size=self.batch_size)

In [179]:
train_iter = BucketIterator(train_dataset, batch_size=512, device = DEVICE, train = True)

In [183]:
model = AutoEncoder(layers = 2,
                    hidden_size = 2,
                    input_size = 300,
                    message_field = message_field,
                    dropout = 0.2,
                    bidir = False,
                    batch_size=256)

exp = Experiment(save_dir=os.getcwd())
trainer = ptl.Trainer(experiment=exp, max_nb_epochs=300, train_percent_check=1, gpus=[0],track_grad_norm=2)

weigths initializer: done!
weigths initializer: done!
VISIBLE GPUS: '0'
gpu available: True, used: True


In [192]:
train_iter = DataLoader(train_dataset, batch_size=512)

In [193]:
len(train_iter)

188

In [184]:
trainer.fit(model)

# view tensorflow logs 
print(f'View tensorboard logs by running\ntensorboard --logdir {os.getcwd()}')
print('and going to http://localhost:6006 on your browser')

                     Name       Type   Params
0                 encoder    Encoder   489980
1  encoder.word_embedding  Embedding   487500
2            encoder.lstm       LSTM     2480
3                 decoder    Decoder  1087200
4            decoder.lstm       LSTM  1087200
5         decoder.softmax    Softmax        0
6                    loss    MSELoss        0


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'torchtext.data.example.Example'>

In [191]:
train_dataset.examples

[<torchtext.data.example.Example at 0x7f7aa411d080>,
 <torchtext.data.example.Example at 0x7f7a9c635d68>,
 <torchtext.data.example.Example at 0x7f7a9c635ef0>,
 <torchtext.data.example.Example at 0x7f7a9c635a58>,
 <torchtext.data.example.Example at 0x7f7a9c635fd0>,
 <torchtext.data.example.Example at 0x7f7a9c6357b8>,
 <torchtext.data.example.Example at 0x7f7a9c635a20>,
 <torchtext.data.example.Example at 0x7f7a9c6356a0>,
 <torchtext.data.example.Example at 0x7f7a9c66d6a0>,
 <torchtext.data.example.Example at 0x7f7a9c441940>,
 <torchtext.data.example.Example at 0x7f7a9c441d68>,
 <torchtext.data.example.Example at 0x7f7a9c441588>,
 <torchtext.data.example.Example at 0x7f7a9c441a90>,
 <torchtext.data.example.Example at 0x7f7a9c441668>,
 <torchtext.data.example.Example at 0x7f7aa4e06cf8>,
 <torchtext.data.example.Example at 0x7f7aa4e06b00>,
 <torchtext.data.example.Example at 0x7f7a9c639d68>,
 <torchtext.data.example.Example at 0x7f7a9c6395c0>,
 <torchtext.data.example.Example at 0x7f7a9c63