# Data Processing

In [1]:
# Locations of the preprocessed CSV splits, given relative to this notebook's working directory.
path_train = u"../src/data/processed/train.730000.csv"
path_val = u"../src/data/processed/val.csv"
path_test = u"../src/data/processed/test.csv"

Read in the pretrained word vectors. Only the first 500k vectors are loaded due to RAM limits.

**Todo:** Keep only the vocabulary entries that actually occur in the data set.

In [2]:
%%time
import gensim
import numpy as np
# Load the binary word2vec file, keeping only the first 500,000 vectors (limit=500_000).
embedding_model = gensim.models.KeyedVectors.load_word2vec_format('../src/data/embeddings/GoogleNews-vectors-negative300.bin', binary=True, limit=500_000)
# Register an out-of-vocabulary token whose vector is the mean of all loaded vectors.
# NOTE(review): replace=False presumably keeps an existing entry of the same name untouched - confirm against the gensim docs.
embedding_model.add('<oov>', np.mean(embedding_model.vectors, axis=0),replace=False)
# Register a padding token with an all-zero vector; 300 must match the dimensionality of the loaded vectors.
embedding_model.add('<padding>', np.zeros(300),replace=False)

CPU times: user 15.3 s, sys: 1.28 s, total: 16.6 s
Wall time: 16.4 s


Create a Dataset for the Amazon reviews.

In [3]:
from torch.utils.data import Dataset
import spacy
import pandas as pd
import numpy as np
import torch

class AmazonReviewDataset(Dataset):
    """In-memory dataset of tokenized, padded and index-encoded reviews.

    The CSV at ``path`` is read with pandas and must provide the columns
    ``review``, ``id``, ``label`` and ``alpha``. Each review is tokenized with
    spaCy, cut or padded to exactly ``padding`` tokens and converted to
    vocabulary indices of the module-level ``embedding_model``.

    Every sample is a tuple ``(token_indices, label)`` where ``token_indices``
    is a list of ``padding`` ints.

    Args:
        path: Path of the CSV file to load.
        padding: Fixed number of tokens per review.
        padding_token: Token used to fill reviews shorter than ``padding``.
        oov_token: Token substituted for words missing from the vocabulary.

    Note:
        ``embedding_model`` is read from the enclosing module scope and must
        already contain ``padding_token`` and ``oov_token``.
    """

    def __init__(self, path, padding=200, padding_token="<padding>", oov_token="<oov>"):

        nlp = spacy.load("en_core_web_sm")

        self.samples = []

        # Load all the data
        data = pd.read_csv(path)

        # Tokenize, pad and vectorize each review
        for _, record in data.iterrows():

            # Empty reviews are read by pandas as NaN (a float); treat them as
            # empty text instead of letting the tokenizer fail on a non-string.
            review = record["review"]
            text = "" if pd.isna(review) else str(review)

            # Tokenize. Only token texts are needed, so run the tokenizer alone
            # (make_doc) rather than the full tagger/parser/NER pipeline.
            tokens = [token.text for token in nlp.make_doc(text)]

            # Truncate to `padding` tokens, then fill up to exactly `padding`.
            # A negative repeat count yields an empty list, so long reviews are
            # only truncated.
            tokens = tokens[:padding] + [padding_token] * (padding - len(tokens))

            # Vectorize: three metadata fields followed by the tokens
            row = [record["id"], record["label"], record["alpha"]] + tokens
            self.samples.append(self.__row2tensor__(row, oov_token))

    def __len__(self):
        """Return the number of encoded reviews."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(token_indices, label)`` sample stored at ``idx``."""
        return self.samples[idx]

    @staticmethod
    def __row2tensor__(sentence, oov_token):
        """Encode one row as ``(token_indices, label)``.

        Args:
            sentence: List laid out as ``[id, label, alpha, token_0, token_1, ...]``.
            oov_token: Token whose index replaces words missing from the vocabulary.

        Returns:
            Tuple of the list of vocabulary indices for ``sentence[3:]`` and
            the label ``sentence[1]``.
        """
        vocab = embedding_model.vocab
        oov_entry = vocab.get(oov_token)

        sentence_as_int = []
        for word in sentence[3:]:
            # Single lookup per word; fall back to the OOV entry when unknown
            entry = vocab.get(word)
            if entry is None:
                entry = oov_entry
            sentence_as_int.append(entry.index)

        return sentence_as_int, sentence[1]

In [None]:
%%time
from os import path
import pickle

# Cache file holding the fully processed training dataset
pickle_path_train = "data.train-0_20_500k.pickle"

if not path.exists(pickle_path_train):
    # No cache yet: build the dataset from the CSV and store it for later runs
    print("Creating and saving Training Dataset")
    train_data = AmazonReviewDataset(path_train)
    with open(pickle_path_train, "wb") as pickled:
        pickle.dump(train_data, pickled)
else:
    # Cache hit: restore the previously processed dataset
    print("Loading Training Dataset")
    with open(pickle_path_train, "rb") as pickled:
        train_data = pickle.load(pickled)

Creating and saving Training Dataset


In [None]:
%%time
from os import path
import pickle

# Cache file holding the fully processed validation dataset
pickle_path_val = "data.val-0_10_500k.pickle"

if not path.exists(pickle_path_val):
    # No cache yet: build the dataset from the CSV and store it for later runs
    print("Creating and saving Validation Dataset")
    val_data = AmazonReviewDataset(path_val)
    with open(pickle_path_val, "wb") as pickled:
        pickle.dump(val_data, pickled)
else:
    # Cache hit: restore the previously processed dataset
    print("Loading Validation Dataset")
    with open(pickle_path_val, "rb") as pickled:
        val_data = pickle.load(pickled)

In [None]:
#%%time
#from os import path
#import pickle
#
#pickle_path_test = "data.test-0_05_500k.pickle"
#
#if path.exists(pickle_path_test):
#    print("Loading Test Dataset")
#    with open(pickle_path_test, "rb") as pickled:
#        test_data = pickle.load(pickled)
#else:
#    print("Creating and saving Test Dataset")
#    test_data = AmazonReviewDataset(path_test)
#    with open(pickle_path_test, "wb") as pickled:
#         pickle.dump(test_data, pickled)

In [None]:
#%%time
#train_data = AmazonReviewDataset(path_train)

In [None]:
#%%time
#val_data = AmazonReviewDataset(path_val)

Create a DataLoader as well, using a custom collate function for creating the batches.

In [None]:
def batch2tensor(batch):
    """Collate a list of ``(features, label)`` samples into two LongTensors.

    Args:
        batch: Sequence of samples; element 0 of each sample is the feature
            list and element 1 is its label.

    Returns:
        Tuple ``(X, Y)`` of ``torch.LongTensor`` objects built from the
        features and the labels, in batch order.
    """
    features = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]

    return torch.LongTensor(features), torch.LongTensor(targets)

In [None]:
%%time
from torch.utils.data import DataLoader
# Batches of 1024 samples, reshuffled on every pass, loaded by 4 worker processes;
# batch2tensor turns each list of samples into a pair of LongTensors.
dataloader_train = DataLoader(train_data, batch_size=1024, shuffle=True, num_workers=4, collate_fn=batch2tensor)
# NOTE(review): the validation loader is shuffled as well; the summed metrics computed later do not depend on order.
dataloader_val = DataLoader(val_data, batch_size=1024, shuffle=True, num_workers=4, collate_fn=batch2tensor)

Load one example batch.

In [None]:
%%time
dataiter_train = iter(dataloader_train)

In [None]:
%%time
# Use the built-in next(); iterator objects expose __next__, not a .next() method.
X_dash, Y_dash = next(dataiter_train)

In [None]:
X_dash.shape

In [None]:
X_dash = X_dash.cuda()
Y_dash = Y_dash.cuda()

In [None]:
X_dash.shape

In [None]:
Y_dash.shape

Definition of the network.

In [None]:
import torch.nn as nn

In [None]:
class FFN(nn.Module):
    """Feed-forward classifier over a flattened, fixed-length embedded sequence.

    Args:
        word_embeddings: Float tensor of pretrained vectors passed to
            ``nn.Embedding.from_pretrained``.
        embedding_size: Dimensionality of a single word vector.
        padding: Number of tokens per input sequence.
        category_amount: Number of output classes.
        dropout: Dropout probability applied after the hidden activation.
    """

    def __init__(self, word_embeddings, embedding_size=300, padding=200, category_amount=5, dropout=0.25):
        super().__init__()

        # Lookup table initialised from the pretrained vectors
        self.embedding = nn.Embedding.from_pretrained(word_embeddings)

        # Regularisation between the two linear layers
        self.dropout = nn.Dropout(p=dropout)

        # One hidden layer on the concatenation of all token embeddings
        flattened_size = embedding_size * padding
        self.l1 = nn.Linear(flattened_size, 256)
        self.l2 = nn.Linear(256, category_amount)

        # Sigmoid hidden activation and log-softmax output
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a batch of token-index sequences to per-class log-probabilities."""
        embedded = self.embedding(x)
        # Concatenate the embeddings of each sequence into one row per sample
        flat = embedded.view(embedded.shape[0], -1)
        hidden = self.dropout(self.sigmoid(self.l1(flat)))
        return self.softmax(self.l2(hidden))

In [None]:
model = FFN(torch.FloatTensor(embedding_model.vectors)).cuda()

In [None]:
model

In [None]:
model(X_dash).shape

In [None]:
model(X_dash)[0].shape

In [None]:
model(X_dash).shape

In [None]:
Y_dash.shape

In [None]:
X_dash.shape

Look at the untrained model.

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.bar(range(0,5), np.exp(model(X_dash).detach().cpu().numpy()[0]), alpha=0.5);

## Training the Model

In [None]:
from torch import optim

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.04, nesterov=True, momentum=0.1)
#optimizer = optim.SGD(model.parameters(), lr=0.05)

In [None]:
criterion = nn.NLLLoss()

In [None]:
%%time

epochs = 50

train_losses, train_accuracies, validation_losses, validation_accuracies = [], [], [], []

# Make sure dropout is active even if an earlier cell left the model in eval mode
model.train()

for e in range(epochs):

    training_loss = 0
    training_accuracy = 0

    for X, Y in dataloader_train:
        X = X.cuda()
        Y = Y.cuda()

        # Reset Gradients
        optimizer.zero_grad()

        # Forward, Loss, Backwards, Update
        output = model(X)
        loss = criterion(output, Y)
        loss.backward()
        optimizer.step()

        # Calculate Metrics
        # The criterion (NLLLoss with its default reduction) returns the mean
        # loss of the batch, so weight it by the batch size; dividing the sum
        # by the number of samples below then gives the true per-sample mean.
        training_loss += loss.item() * Y.size(0)
        # The class with the highest log-probability is the prediction
        training_accuracy += (output.argmax(dim=1) == Y).sum().item()

    # Validation pass: no dropout, no gradient tracking
    validation_loss = 0
    validation_accuracy = 0

    model.eval()

    with torch.no_grad():
        for X, Y in dataloader_val:
            X = X.cuda()
            Y = Y.cuda()

            output_validation = model(X)
            loss_val = criterion(output_validation, Y)
            validation_loss += loss_val.item() * Y.size(0)
            validation_accuracy += (output_validation.argmax(dim=1) == Y).sum().item()

    # Sums over samples -> per-sample averages
    training_loss /= len(train_data)
    training_accuracy /= len(train_data)
    validation_loss /= len(val_data)
    validation_accuracy /= len(val_data)

    # Saving metrics
    train_losses.append(training_loss)
    train_accuracies.append(training_accuracy)
    validation_losses.append(validation_loss)
    validation_accuracies.append(validation_accuracy)

    print("Epoch: {}/{}\n".format(e+1, epochs),
          "Training Loss: {:.6f}\n".format(training_loss),
          "Training Accuracy: {:.3f}\n".format(training_accuracy),
          "Validation Loss: {:.6f}\n".format(validation_loss),
          "Validation Accuracy: {:.3f}\n".format(validation_accuracy))

    # Back to training mode for the next epoch
    model.train()

In [None]:
plt.bar(range(0,5), np.exp(model(X_dash).detach().cpu().numpy()[0]), alpha=0.5);

In [None]:
Y_dash[0]

In [None]:
plt.plot(train_losses, label='Training loss')
plt.plot(validation_losses, label='Validation loss')
plt.legend(frameon=False);

In [None]:
plt.plot(train_accuracies, label='Training Accuracy')
plt.plot(validation_accuracies, label='Validation Accuracy')
plt.legend(frameon=False);

# LSTM

In [None]:
import torch.nn as nn

In [None]:
class LSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier on pretrained word embeddings.

    Each token index is embedded, the embedded sequence is run through the
    LSTM one token per time step, and the final hidden states of the last
    layer (both directions) are classified by a linear layer followed by
    log-softmax.

    Args:
        word_embeddings: Float tensor of pretrained vectors passed to
            ``nn.Embedding.from_pretrained``.
        embedding_size: Dimensionality of a single word vector; this is the
            LSTM input size.
        padding: Sequence length of the inputs. Not needed by the recurrent
            model; kept so existing constructor calls remain valid.
        category_amount: Number of output classes.
        dropout: Dropout probability applied before the linear layer.
        lstm_dropout: Dropout probability between stacked LSTM layers.
        hidden_dim: Hidden size per direction.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, word_embeddings, embedding_size=300, padding=200,
                 category_amount=5, dropout=0.25, lstm_dropout=0.25,
                 hidden_dim=256, n_layers=2):

        super().__init__()

        # Stored because init_hidden() needs them to size the state tensors
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.num_directions = 2  # the LSTM below is bidirectional

        # Predefined word embeddings
        self.embedding = nn.Embedding.from_pretrained(word_embeddings)

        # LSTM: consumes one embedding vector per time step, so the input size
        # is the embedding dimension (not embedding_size * padding)
        self.lstm = nn.LSTM(embedding_size, hidden_dim, n_layers,
                            batch_first=True, dropout=lstm_dropout,
                            bidirectional=True)

        # Dropout
        self.dropout = nn.Dropout(p=dropout)

        # FFN: forward and backward final states are concatenated
        self.l1 = nn.Linear(hidden_dim * self.num_directions, category_amount)

        # Log-softmax output. The sigmoid module is kept as an attribute for
        # compatibility but is no longer applied to the logits: squashing them
        # into (0, 1) right before the softmax caps how confident the model
        # can ever become.
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden=None):
        """Classify a batch of token-index sequences.

        Args:
            x: Long tensor of token indices, one row per sequence.
            hidden: Optional ``(h_0, c_0)`` tuple as produced by
                ``init_hidden``; ``None`` lets the LSTM start from zeros.

        Returns:
            Tuple ``(log_probs, hidden)``: per-class log-probabilities with one
            row per sequence, and the final ``(h_n, c_n)`` state of the LSTM.
        """
        x = self.embedding(x)

        # Keep the (batch, sequence, embedding) layout expected by batch_first=True
        _, hidden = self.lstm(x, hidden)

        # h_n is ordered by layer, then direction: the last two entries are the
        # forward and backward final states of the top layer
        h_n = hidden[0]
        x = torch.cat((h_n[-2], h_n[-1]), dim=1)

        x = self.dropout(x)
        x = self.l1(x)
        x = self.softmax(x)

        return x, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new zero tensors of size
        # (n_layers * num_directions) x batch_size x hidden_dim for the hidden
        # state and the cell state. new_zeros allocates them with the dtype
        # and device of the model's parameters, so no explicit .cuda() is needed.
        weight = next(self.parameters())
        state_shape = (self.n_layers * self.num_directions, batch_size, self.hidden_dim)

        hidden = (weight.new_zeros(state_shape),
                  weight.new_zeros(state_shape))

        return hidden

In [None]:
import torch

In [None]:
model = LSTM(torch.FloatTensor(embedding_model.vectors)).cuda()

In [None]:
# The network lives in `model`; size the initial state to the example batch
hidden = model.init_hidden(X_dash.shape[0])

In [None]:
model

In [None]:
# forward returns (log_probs, hidden); inspect the shape of the predictions
model(X_dash, hidden)[0].shape

# Old

In [None]:
import pandas as pd 

In [None]:
# Read the processed splits without a header row (header=None).
# NOTE(review): these are absolute paths into one user's home directory and will not resolve on
# other machines - presumably they should point at the same data directory as the relative paths
# used at the top of the notebook; confirm before changing.
train = pd.read_csv("/home/flennic/git/text-mining-project/src/data/processed/train.csv", header=None)
test = pd.read_csv("/home/flennic/git/text-mining-project/src/data/processed/test.csv", header=None) 

In [None]:
train.head()

In [None]:
test.head()

In [None]:
import logging
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

logger = logging.getLogger(__name__)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

In [None]:
model.config

In [None]:
model.eval();

In [None]:
model.to('cuda');

In [None]:
input_ids1 = torch.tensor(tokenizer.encode("Hello, my dog is cute".lower(), add_special_tokens=True)).unsqueeze(0).to('cuda')  # Batch size 1
input_ids2 = torch.tensor(tokenizer.encode("Hello, my cat is ugly".lower(), add_special_tokens=True)).unsqueeze(0).to('cuda')  # Batch size 1

In [None]:
input_ids1.shape

In [None]:
input_ids = torch.cat((input_ids1, input_ids2))

In [None]:
input_ids.shape

In [None]:
labels = torch.tensor([0, 1]).to('cuda')  # One label per sequence (batch size 2)

In [None]:
labels

In [None]:
outputs = model(input_ids, labels=labels)

In [None]:
outputs

In [None]:
loss, logits = outputs[:2]

In [None]:
import numpy as np

In [None]:
# The classification head returns unnormalised logits, not log-probabilities,
# so normalise with softmax rather than exp to obtain class probabilities.
torch.softmax(logits, dim=1).detach().cpu().numpy()

In [None]:
outputs = model(input_ids)

In [None]:
outputs

## Raw BERT

In [None]:
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
model.config

In [None]:
model.eval();

In [None]:
model.to('cuda');

In [None]:
# output_hidden_states is a model configuration option and has no effect on
# tokenization, so it is not passed to the tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0).to('cuda')  # Batch size 1
# Pure inference: skip building the autograd graph
with torch.no_grad():
    outputs = model(input_ids)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

In [None]:
tokenizer.tokenize("Hello, my dog is cute")

In [None]:
tokenizer.cls_token

In [None]:
tokenizer.convert_tokens_to_ids("[CLS]")

In [None]:
input_ids

In [None]:
outputs[1].shape

In [None]:
outputs[0].shape

In [None]:
outputs[0][0, 0,:].shape

In [None]:
a = ("a", "b", "c")

In [None]:
b = (1, 2) + a

In [None]:
b

In [None]:
from gensim.models import KeyedVectors

In [None]:
!pwd

In [None]:
model = KeyedVectors.load_word2vec_format('src/data/embeddings/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
model["hello"].shape