# Clasificador binario de Reviews

In [None]:
import torch

## Data cleansing

In [None]:
# `reviews` and `labels` each hold a whole file as one string; individual
# instances inside each string are separated by '\n'.

with open('data/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()

with open('data/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [None]:
from string import punctuation
from collections import Counter

# Normalise the corpus: lowercase it and delete every punctuation character.
reviews = reviews.lower().translate(str.maketrans('', '', punctuation))

# One review / one label per line.
sequences_str = reviews.split('\n')
labels_str = labels.split('\n')
print('Número de secuencias:', len(sequences_str))

# Word frequencies over the whole corpus, as {word: count}.
vocab_freq = Counter(word for seq in sequences_str for word in seq.split())
# Words (str) ordered from most to least frequent.
vocab = [word for word, _ in vocab_freq.most_common()]
print('Tamaño vocabulario:', len(vocab))

# Map each word to an integer. Numbering starts at 1, so 0 never denotes a word.
vocab2idx = dict(zip(vocab, range(1, len(vocab) + 1)))

# Encode labels as 1 ('positive') or 0 (anything else).
labels_int = [int(label == 'positive') for label in labels_str]
# Encode every review as a variable-length list of word indices.
sequences_int = [[vocab2idx[word] for word in seq.split()] for seq in sequences_str]

In [None]:
import numpy as np

# Discard empty reviews; the same indices are applied to the labels so both
# lists stay aligned.
non_zero_idx = [idx for idx, seq in enumerate(sequences_int) if seq]
sequences_int = [sequences_int[idx] for idx in non_zero_idx]
labels_int = [labels_int[idx] for idx in non_zero_idx]

# Fixed-length sequences:
def padding_and_clipping(sequences_list, seq_length):
    """Return a (len(sequences_list), seq_length) int64 tensor of the sequences.

    Sequences shorter than ``seq_length`` are left-padded with zeros; longer
    ones keep only their first ``seq_length`` elements. An empty sequence
    yields an all-zero row.

    Args:
        sequences_list: list of integer sequences of arbitrary length.
        seq_length: number of columns of the returned tensor.

    Returns:
        torch.Tensor of dtype int64 and shape (len(sequences_list), seq_length).
    """
    sequences = torch.zeros((len(sequences_list), seq_length), dtype=torch.long)
    for i, seq in enumerate(sequences_list):
        clipped = seq[:seq_length]
        if len(clipped) == 0:
            # Nothing to copy; note that a `-0:` slice would select the whole row.
            continue
        # Build the row directly as int64: going through a float tensor would
        # silently corrupt indices above 2**24.
        sequences[i, seq_length - len(clipped):] = torch.as_tensor(clipped, dtype=torch.long)
    return sequences

# The median review length (truncated to an int) becomes the common length.
seq_length = int(np.median(list(map(len, sequences_int))))
sequences_int = padding_and_clipping(sequences_list=sequences_int, seq_length=seq_length)

print('Tamaño mediano de las secuencias:', seq_length)
print('Tamaño final de la data:', sequences_int.shape)

## Datasets and dataloader

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader

TRAIN_RATIO = 0.8
DEV_RATIO = 0.1
BATCH_SIZE = 64

# Pair every padded sequence with its integer (int64) label.
dataset = TensorDataset(sequences_int, torch.tensor(labels_int, dtype=torch.long))

# Data partitioning:
def split_dataset(dataset, train_ratio, dev_ratio):
    """Randomly split ``dataset`` into train, dev and test subsets.

    The train and dev sizes are the truncated products of the dataset size and
    their ratios; the test subset receives every remaining sample.

    Returns:
        The three subsets produced by ``random_split``, in the order
        train, dev, test.
    """
    total = len(dataset)
    train_size = int(total * train_ratio)
    dev_size = int(total * dev_ratio)
    test_size = total - train_size - dev_size
    return random_split(dataset, [train_size, dev_size, test_size])

dataset_split = split_dataset(dataset, train_ratio=TRAIN_RATIO, dev_ratio=DEV_RATIO)
print('\nTrain/dev/test datasets size:', [len(subset) for subset in dataset_split])

# One shuffled DataLoader per partition, keyed by its mode name.
dataloaders = {
    mode: DataLoader(subset, batch_size=BATCH_SIZE, shuffle=True)
    for mode, subset in zip(('train', 'dev', 'test'), dataset_split)
}

## Red neuronal

In [None]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """Sequence classifier: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        vocab_size: number of distinct words. The embedding table gets one
            extra row so that index 0 (the padding value) is a valid input.
        embedding_dim: size of each word embedding vector.
        hidden_size: size of the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits per sequence.
        dropout: dropout probability applied between LSTM layers and before
            the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes=2, dropout=0.5):
        super().__init__()

        # Stored so init_hidden can build correctly shaped initial states.
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size + 1,  # vocabulary plus the padding index.
            embedding_dim=embedding_dim,
        )

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Dropout only acts between stacked layers; with a single layer
            # PyTorch emits a warning, so it is disabled in that case.
            dropout=dropout if num_layers > 1 else 0.0,
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, initial_states=None):
        """Compute class logits for a batch of index sequences.

        Args:
            x: integer tensor of shape (batch_size, seq_length); each entry is
                a word index.
            initial_states: optional tuple (h_0, c_0), each of shape
                (num_layers, batch_size, hidden_size). When omitted, the LSTM
                starts from zero states.

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalised logits.
        """
        # (batch_size, seq_length) -> (batch_size, seq_length, embedding_dim).
        x = self.embedding(x)

        # output: (batch_size, seq_length, hidden_size), the hidden states of
        # the last layer at every time step. The final states are not needed.
        output, _ = self.lstm(x, initial_states)

        # Only the output of the last time step is used for classification.
        output = output[:, -1, :]  # shape: (batch_size, hidden_size).

        # Classification head.
        output = self.dropout(output)
        return self.fc(output)

    def init_hidden(self, batch_size, device):
        """Return zero-valued (h_0, c_0) states allocated directly on ``device``."""
        shape = (self.num_layers, batch_size, self.hidden_size)
        h_0 = torch.zeros(shape, device=device)
        c_0 = torch.zeros(shape, device=device)
        return h_0, c_0

In [None]:
vocab_size = len(vocab)

EMBEDDING_DIM = 400  # size of each word embedding vector.
HIDDEN_SIZE = 256  # size of the LSTM hidden and cell states.
NUM_LAYERS = 2  # number of stacked LSTM layers.
LR = 0.001

net = SentimentRNN(vocab_size, EMBEDDING_DIM, HIDDEN_SIZE, NUM_LAYERS)
optimizer = torch.optim.Adam(net.parameters(), lr=LR)

# Smoke test: push one training batch through the untrained network.
x, y = next(iter(dataloaders['train']))
# Size the states from the batch itself: a batch can hold fewer than
# BATCH_SIZE samples, and the LSTM rejects states of a different batch size.
initial_states = net.init_hidden(x.shape[0], 'cpu')
print('Tamaño batch:', x.shape)
print('Tamaño estados iniciales:', initial_states[0].shape)

output = net(x, initial_states)
print('\nTamaño salida:', output.shape)

## Entrenamiento

In [None]:
def train_model(net, dataloaders, optimizer, epochs=5, clip=5):
    """Train ``net`` and report loss and accuracy on the train and dev sets.

    Each epoch runs one pass over ``dataloaders['train']`` (with parameter
    updates) followed by one pass over ``dataloaders['dev']`` (evaluation
    only), printing the sample-weighted mean loss and the accuracy of each.

    Args:
        net: model exposing ``init_hidden(batch_size, device)`` and callable
            as ``net(x, initial_states)``, returning class logits.
        dataloaders: mapping with 'train' and 'dev' DataLoaders yielding
            ``(x, y)`` batches.
        optimizer: optimizer that updates the parameters of ``net``.
        epochs: number of passes over the data.
        clip: maximum gradient norm used for gradient clipping.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(device)
    criterion = nn.CrossEntropyLoss()  # stateless, so build it only once.

    for epoch in range(epochs):

        print(f'Epoch {epoch + 1}/{epochs}')

        for mode in ('train', 'dev'):

            is_training = mode == 'train'
            net.train(is_training)
            loss_sum, n_correct = 0.0, 0

            with torch.set_grad_enabled(is_training):
                for x, y in dataloaders[mode]:
                    x, y = x.to(device), y.to(device)
                    initial_states = net.init_hidden(x.shape[0], device)

                    output = net(x, initial_states)
                    loss = criterion(output, y)

                    if is_training:
                        net.zero_grad()
                        loss.backward()
                        nn.utils.clip_grad_norm_(net.parameters(), clip)
                        optimizer.step()

                    # Accumulate plain Python numbers: summing the loss tensor
                    # itself would keep every batch attached to the autograd
                    # graph for the whole epoch.
                    loss_sum += loss.item() * x.shape[0]
                    n_correct += (output.argmax(dim=1) == y).sum().item()

            n_samples = len(dataloaders[mode].dataset)
            epoch_loss = loss_sum / n_samples
            epoch_accuracy = 100 * n_correct / n_samples

            print(f'{mode:5} | loss: {epoch_loss:.2f} - accuracy: {epoch_accuracy:.2f}%')
        print()

In [None]:
train_model(net, dataloaders, optimizer, epochs=5)

# Checkpoint the model weights together with the optimizer state.
states = dict(net=net.state_dict(), optimizer=optimizer.state_dict())
torch.save(states, 'model.pt')

## Testing

In [None]:
def test_model(net, dataloader):
    """Evaluate ``net`` on ``dataloader`` and print mean loss and accuracy.

    Args:
        net: model exposing ``init_hidden(batch_size, device)`` and callable
            as ``net(x, initial_states)``, returning class logits.
        dataloader: DataLoader yielding ``(x, y)`` batches.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(device)

    net.eval()
    criterion = nn.CrossEntropyLoss()
    loss_sum, n_correct = 0.0, 0

    with torch.no_grad():
        for x, y in dataloader:

            x, y = x.to(device), y.to(device)
            initial_states = net.init_hidden(x.shape[0], device)

            # Do not squeeze the logits: a final batch holding one sample
            # would collapse to shape (num_classes,), which breaks both the
            # loss and argmax(dim=1).
            output = net(x, initial_states)
            loss = criterion(output, y)
            preds = output.argmax(dim=1)

            loss_sum += loss.item() * x.shape[0]
            n_correct += (preds == y).sum().item()

    n_samples = len(dataloader.dataset)
    total_loss = loss_sum / n_samples
    total_accuracy = 100 * n_correct / n_samples

    print(f'- Loss: {total_loss:.2f}')
    print(f'- Accuracy: {total_accuracy:.2f}%')


# The `test_` prefix would otherwise make pytest try to collect this helper
# as a test case and fail on its arguments.
test_model.__test__ = False

# Report the final metrics on the held-out test partition.
print('Desempeño en el test set:')
test_model(net, dataloader=dataloaders['test'])

## Inferencia sobre nueva data

In [None]:
def tokenize_review(review_str, vocab=None):
    """Convert a raw review into a list of word indices.

    The text is lowercased, punctuation is removed and the result is split on
    whitespace. Words missing from the vocabulary are skipped, so unseen text
    can be tokenized without raising ``KeyError``.

    Args:
        review_str: raw review text.
        vocab: optional {word: index} mapping; defaults to the module-level
            ``vocab2idx``.

    Returns:
        List of integer indices, one per known word, in order of appearance.
    """
    if vocab is None:
        vocab = vocab2idx
    cleaned = review_str.lower().translate(str.maketrans('', '', punctuation))
    return [vocab[word] for word in cleaned.split() if word in vocab]

# To decode a sequence: text = ' '.join(vocab[i - 1] for i in seq if i != 0)  (index 0 is padding).

def predict(net, review, seq_length=seq_length):
    """Print the predicted sentiment of ``review`` and the model's confidence.

    Args:
        net: trained model exposing ``init_hidden(batch_size, device)`` and
            callable as ``net(x, initial_states)``, returning class logits.
        review: raw review text.
        seq_length: length the tokenized review is padded or clipped to.

    Raises:
        ValueError: if tokenizing ``review`` yields no tokens.
    """
    tokens = tokenize_review(review)
    if not tokens:
        raise ValueError('review contains no tokens to classify')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(device)
    net.eval()
    review_batch = padding_and_clipping(sequences_list=[tokens],
                                        seq_length=seq_length)
    initial_states = net.init_hidden(batch_size=1, device=device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = net(review_batch.to(device), initial_states)

    confidence, class_idx = torch.softmax(output, dim=1).max(dim=1)
    confidence = round(confidence.item() * 100, 2)
    # Index with a plain int rather than relying on tensor-to-index coercion.
    pred = ('negativo', 'positivo')[class_idx.item()]

    print(f'Sentimiento: {pred} (confianza: {confidence}%)')

In [81]:
# Try the trained network on a few hand-written sentences.
for review in (
    'My cats are called Yuki and Simba. They are both very pretty although Yuki is a bit bored. However, I love them very much.',
    'This is the most boring year of my career.',
    'Natural language processing is easier than reinforcement learning.',
    'This net is as good at detecting sarcasm as a communist is at economics.',
):
    predict(net, review)

Sentimiento: positivo (confianza: 67.3%)
Sentimiento: negativo (confianza: 85.82%)
Sentimiento: positivo (confianza: 77.23%)
Sentimiento: positivo (confianza: 91.84%)
