In [3]:
import os
import torch.nn as nn
import numpy as np
import torch
from collections import Counter
from nltk import word_tokenize
from torch.utils.data import Dataset
from torch.autograd import Variable
from models import *

# A simple RNN example
---

In this notebook, we'll se a simple (perhaps the most simple) way of using RNNs... For text classification.

For it, you'll need to download the data of IMDB sentiment classification: http://ai.stanford.edu/~amaas/data/sentiment/. This set is small enough for us to do it locally in our computers. It also provides a train and a test set. For our purpose, we'll use the test set as development/validation set. 

A lot of code of the preprocess Pipeline for IMDB dataset is from https://github.com/nyu-mll/DS-GA-1011-Fall2017/blob/master/hw1/HW01-student.ipynb (I recommend you doing this excersises and labs on your own! I took this class and it was extremely good. Syllable here: https://docs.google.com/document/d/1SIPSt4aeB3Lys9ztCp47Y4v68R6Awt8NBTlCObp2njg/edit). 

To tokenize, we're using the simple nltk word tokenizer (https://www.nltk.org/api/nltk.tokenize.html), but feel free to try moses or any other tokenizer. To use it, you'll probably have to download nltk's data. 

Creating this notebook, I came across someone who did something similar:
* https://towardsdatascience.com/sentiment-analysis-using-lstm-step-by-step-50d074f09948

## 0. File params

In [5]:
data_dir = "./data/aclImdb/"
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
TRAIN_SIZE = 23000 
VALIDATION_SIZE = 2000
TEST_SIZE = 2000 # I'm just loading 2000 out of the 25,000 to be able to work locally. Feel free to increase to 25k.
NUM_CLASSES = 2

# These can be treated as a hyperparm. 
VOCAB_SIZE = 20000 
BATCH_SIZE = 40
NUM_EPOCHS = 20
LEARNING_RATE = 0.001
HIDDEN_DIM = 256
EMBEDDING_DIM = 400
MAX_LEN = 200

# GPU use (I'll be asuming 1 device). This will be hard coded throughout the notebook: 
if torch.cuda.is_available():
    USE_CUDA = True
    DEFAULT_DEVICE = 'cuda:0'
else:
    DEFAULT_DEVICE = 'cpu'
print(DEFAULT_DEVICE)

cuda:0


## 1. Pre-process data

Here, we're just loading the data into memory and we'll tokenize it.

In [7]:
class IMDBDatum():
    """
    Class that represents a train/validation/test datum
    - self.raw_text
    - self.label: 0 neg, 1 pos
    - self.file_name: dir for this datum
    - self.tokens: list of tokens
    - self.token_idx: index of each token in the text
    """
    def __init__(self, raw_text, label, file_name):
        self.raw_text = raw_text
        self.label = label
        self.file_name = file_name
        self.tokens = self._set_tokens(raw_text)
        self.tokens_idx = []
    
    @staticmethod
    def _set_tokens(raw_text):
        return word_tokenize(raw_text)
        
    def set_token_idx(self, token2idx, unk_token):
        tokens_idx = []
        for token in self.tokens:
            token_idx = token2idx.get(token, token2idx[unk_token])
            tokens_idx.append(token_idx)

        self.tokens_idx = tokens_idx

def preprocess_text(text):
    """
    Function that cleans the string
    """
    text = text.lower().replace("<br />", "")
    return text
        
    
def read_file_as_datum(file_name, label):
    """
    Function that reads a file 
    """
    with open(file_name, "r") as f:
        content = f.read()
        content = preprocess_text(content)
    return IMDBDatum(raw_text=content, label=label, file_name=file_name)


def construct_dataset(dataset_dir, dataset_size, offset=0):
    """
    Function that loads a dataset
    """
    pos_dir = os.path.join(dataset_dir, "pos")
    neg_dir = os.path.join(dataset_dir, "neg")
    single_label_size = int(dataset_size / 2)
    output = []
    all_pos = os.listdir(pos_dir)
    all_neg = os.listdir(neg_dir)
    for i in range(offset, offset+single_label_size):
        output.append(read_file_as_datum(os.path.join(pos_dir, all_pos[i]), 1))
        output.append(read_file_as_datum(os.path.join(neg_dir, all_neg[i]), 0))
    return output

def filter_dataum_dataset(dataset, max_len):
    new_output = []
    removed_samples = 0
    total_samples = len(dataset)
    for sample in dataset:
        if len(sample.tokens) > max_len:
            removed_samples += 1
            continue
        new_output.append(sample)
            
    print('Removed {} samples, thats {}% of set'.format(removed_samples, 100*round(removed_samples/total_samples, 2)))
    print('Total samples: {}'.format(len(new_output)))
    return new_output

In [8]:
print('Working in train set...')
train_set = construct_dataset(train_dir, TRAIN_SIZE)
train_set = filter_dataum_dataset(train_set, MAX_LEN)
print('Working in dev set...')
validation_set = construct_dataset(train_dir, VALIDATION_SIZE, offset=int(TRAIN_SIZE/2))
validation_set = filter_dataum_dataset(validation_set, MAX_LEN)
print('Working in test set...')
test_set = construct_dataset(test_dir, TEST_SIZE)
test_set = filter_dataum_dataset(test_set, MAX_LEN)

Working in train set...
Removed 11291 samples, thats 49.0% of set
Total samples: 11709
Working in dev set...
Removed 980 samples, thats 49.0% of set
Total samples: 1020
Working in test set...
Removed 966 samples, thats 48.0% of set
Total samples: 1034


In [12]:
train_set[0].tokens[0:10]

['broad', 'enough', 'for', 'you', '?', 'wait', 'till', 'you', 'see', 'this']

In [13]:
train_set[0].tokens[-10:]

['to', 'film', 'well', '.', 'not', 'a', 'lot', 'of', 'fun', '.']

In [15]:
train_set[0].raw_text

"broad enough for you? wait till you see this heavy handedadaption of a little collegiate one act. what is shocking and wild incollege rarely holds up over time, and this is proof. to take on thecatholic church with broadside humor just isn't shocking orinteresting or funny, it's kind of boring. the performers are allgame, giving all they've got, but it's basically a play that doesn'topen up to film well. not a lot of fun."

## 2. Feature engineering

Decide what features we should use (maybe tokenization decision should be here?).

In [17]:
def build_token2idx(train_set, vocab_size=VOCAB_SIZE):
    """
    Function that loads the train set and return a dict that maps tokens to index. 
    Indexes 0 and 1 are reserved for Padding and Unknown tokens respectively. 
    Currently hard coded. 
    """
    tokens_counter = Counter()
    for datum in train_set:
        for token in datum.tokens:
            tokens_counter[token] += 1
            
    print('Number of unique tokens in train data: {}'.format(len(tokens_counter)))
    print('Subsetting to: {}'.format(vocab_size))
    top_k_tokens = tokens_counter.most_common(vocab_size) # This return a list of touples, not a Counter() object.
    token2idx = {'<PAD>': 0, '<UNK>': 1}
    for token_touples in top_k_tokens:
        token = token_touples[0]
        if token in token2idx.keys():
            continue
        
        token2idx[token] = len(token2idx) + 1
        
    return token2idx

def build_idx2tokens(token2idx):
    """
    Function to build a dictionary that maps indexes to tokens from the reverse maps/
    """
    return {v:k for k,v in token2idx.items()}

def set_tokens_idx_in_datum(data_set, token2idx, unk_token='<UNK>'):
    """
    Function to set the tokens in the list of datums.
    """
    for datum in data_set:
        datum.set_token_idx(token2idx, unk_token)

In [18]:
token2idx = build_token2idx(train_set)
idx2tokens = build_idx2tokens(token2idx)

Number of unique tokens in train data: 55344
Subsetting to: 20000


In [21]:
set_tokens_idx_in_datum(train_set, token2idx)
set_tokens_idx_in_datum(validation_set, token2idx)
set_tokens_idx_in_datum(test_set, token2idx)

In [22]:
train_set[0].tokens_idx[0:10]

[5185, 226, 19, 21, 61, 597, 1732, 21, 65, 13]

In [20]:
# Sanity check (ALWAYS DO SANITY CHECKS!!! It's so easy to screw up in this steps)
for x in train_set[0].tokens_idx[0:10]:
    print(idx2tokens[x])

broad
enough
for
you
?
wait
till
you
see
this


In [24]:
# Sanity check (ALWAYS DO SANITY CHECKS!!! It's so easy to screw up in this steps)
#idx2tokens
# token2idx

{'<PAD>': 0,
 '<UNK>': 1,
 'the': 3,
 '.': 4,
 ',': 5,
 'a': 6,
 'and': 7,
 'of': 8,
 'to': 9,
 'is': 10,
 'i': 11,
 'it': 12,
 'this': 13,
 'in': 14,
 'that': 15,
 'movie': 16,
 'was': 17,
 "'s": 18,
 'for': 19,
 'but': 20,
 'you': 21,
 'film': 22,
 'with': 23,
 'as': 24,
 "n't": 25,
 '!': 26,
 'not': 27,
 'have': 28,
 'on': 29,
 'are': 30,
 '``': 31,
 "''": 32,
 ')': 33,
 'one': 34,
 'be': 35,
 '(': 36,
 'all': 37,
 'at': 38,
 'like': 39,
 'so': 40,
 'they': 41,
 'an': 42,
 'do': 43,
 'good': 44,
 'his': 45,
 'just': 46,
 'if': 47,
 'from': 48,
 'he': 49,
 'who': 50,
 'by': 51,
 'about': 52,
 'there': 53,
 'out': 54,
 'very': 55,
 'or': 56,
 '...': 57,
 'my': 58,
 'what': 59,
 'has': 60,
 '?': 61,
 'some': 62,
 'would': 63,
 'really': 64,
 'see': 65,
 'more': 66,
 'story': 67,
 'time': 68,
 'can': 69,
 'even': 70,
 'me': 71,
 'were': 72,
 'no': 73,
 'when': 74,
 'great': 75,
 'bad': 76,
 'did': 77,
 'only': 78,
 'her': 79,
 'had': 80,
 'up': 81,
 'could': 82,
 'will': 83,
 'watch': 8

## 3. Pytorch pipeline

Build the pytorch pipeline

In [25]:
class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    See the documentation of pytorch's dataloaders here: 
    https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
    """
    
    def __init__(self, data_list):
        """
        @param data_list: list of IMDBDatum
        """
        self.data_list = data_list
        
    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        returns (tokens idxs list, len_of_sequence), label
        """
        tokens_idx, label = self.data_list[key].tokens_idx, self.data_list[key].label
        return (tokens_idx, len(tokens_idx)), label
    


def imdb_collate_and_force_len(batch, max_length=MAX_LEN):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length. 
    
    PAD token is hard coded (0).
    """
    data_list = []
    label_list = []
    length_list = []
    
    # Padd the sequence (using numpy)
    for datum in batch:
        label_list.append(datum[1])
        length_list.append(datum[0][1])
        padded_vec = np.pad(np.array(datum[0][0]), 
                            pad_width=((0,max_length-datum[0][1])), 
                            mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)).to(DEFAULT_DEVICE), torch.LongTensor(length_list).to(DEFAULT_DEVICE), torch.LongTensor(label_list).to(DEFAULT_DEVICE)]

In [26]:
# consturct datasets
imdb_train = IMDBDataset(train_set)
imdb_validation = IMDBDataset(validation_set)
imdb_test = IMDBDataset(test_set)    
    
train_loader = torch.utils.data.DataLoader(dataset=imdb_train, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_collate_and_force_len,
                                           shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=imdb_validation, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=imdb_collate_and_force_len,
                                           shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=imdb_test, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_collate_and_force_len,
                                           shuffle=False)

print("This is an training sample: {0}".format(imdb_train[0][0]))
print("This is a label: {0}".format(imdb_train[0][1]))

This is an training sample: ([5185, 226, 19, 21, 61, 597, 1732, 21, 65, 13, 1323, 1, 8, 6, 132, 1, 34, 535, 4, 59, 10, 1469, 7, 1351, 1, 1499, 1541, 81, 144, 68, 5, 7, 13, 10, 2816, 4, 9, 221, 29, 1, 1779, 23, 1, 379, 46, 10, 25, 1469, 1, 56, 130, 5, 12, 18, 271, 8, 272, 4, 3, 2958, 30, 1, 5, 814, 37, 41, 137, 170, 5, 20, 12, 18, 664, 6, 336, 15, 1, 81, 9, 22, 85, 4, 27, 6, 167, 8, 211, 4], 88)
This is a label: 0


## 4. Simple RNN model!!

Build our very own RNN model!

Check `models.py` to see specification

In [31]:
#model = OurAwsomeRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES, DEFAULT_DEVICE)
model = OurAwsomeLSTMWithTwoLayers(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES, DEFAULT_DEVICE)
# model = OurAwsomeRNNWithAllConnections(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES, DEFAULT_DEVICE, MAX_LEN)
model.to(DEFAULT_DEVICE)

OurAwsomeLSTMWithTwoLayers(
  (embedding): Embedding(20003, 400, padding_idx=0)
  (LSTM): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (linear): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

## 5. Training and validating

In [32]:
# Define our loss function
# See all of pytorch loss functions: https://pytorch.org/docs/stable/nn.html#loss-functions
criterion = nn.BCELoss()

# Define our optimizer:
# See all of pytorch's optimizers: https://pytorch.org/docs/stable/optim.html
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [33]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch = Variable(data)
        outputs = model(data)
        predicted = (outputs.data > 0.5).long().view(-1)
        total += labels.size(0)
        correct += (predicted == labels).sum().cpu().numpy()
    model.train()
    return (100 * correct / total)

In [34]:
validation_acc_history = []
stop_training = False
for epoch in range(NUM_EPOCHS):
    for i, (data, lengths, labels) in enumerate(train_loader):
        data_batch = Variable(data)
        optimizer.zero_grad()
        outputs = model(data_batch) 
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        
        if (i+1) % (BATCH_SIZE*2) == 0:
            train_acc = test_model(train_loader, model)
            val_acc = test_model(validation_loader, model)
            print('Epoch: [{0}/{1}], Step: [{2}/{3}], Loss: {4}, Train Acc: {5}, Validation Acc:{6}'.format( 
                   epoch+1, NUM_EPOCHS, i+1, len(imdb_train)//BATCH_SIZE, loss.data.item(), 
                   train_acc, val_acc))
            validation_acc_history.append(val_acc)
            
        # TODO: Implement early stopping or model selection based on Validation Acc!!!
print('Maximum in checkpoint: {}'.format(np.max(validation_acc_history)))

Epoch: [1/20], Step: [80/292], Loss: 0.6971672177314758, Train Acc: 50.01281065846784, Validation Acc:49.31372549019608
Epoch: [1/20], Step: [160/292], Loss: 0.6984521150588989, Train Acc: 50.02989153642497, Validation Acc:49.21568627450981
Epoch: [1/20], Step: [240/292], Loss: 0.6994984745979309, Train Acc: 50.12383636518917, Validation Acc:48.627450980392155
Epoch: [2/20], Step: [80/292], Loss: 0.6903266906738281, Train Acc: 50.57647963105303, Validation Acc:52.450980392156865
Epoch: [2/20], Step: [160/292], Loss: 0.6922510266304016, Train Acc: 50.86685455632419, Validation Acc:48.13725490196079
Epoch: [2/20], Step: [240/292], Loss: 0.6945252418518066, Train Acc: 51.575710991544966, Validation Acc:51.76470588235294
Epoch: [3/20], Step: [80/292], Loss: 0.7019426226615906, Train Acc: 50.64480314288154, Validation Acc:48.627450980392155
Epoch: [3/20], Step: [160/292], Loss: 0.6683098673820496, Train Acc: 51.51592791869502, Validation Acc:48.72549019607843
Epoch: [3/20], Step: [240/292],

## 6. Evaluating

In [18]:
# TODO - Evaluate based on best model