# Sentiment classification with LSTM
In this notebook we will use LSTMs to do sentiment classification on the [imdb dataset](http://ai.stanford.edu/~amaas/data/sentiment/). 

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 

## Dataset

To get the data: <br>
`wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`

In [2]:
from pathlib import Path
PATH = Path("/data2/yinterian/aclImdb/")
list(PATH.iterdir())

[PosixPath('/data2/yinterian/aclImdb/README'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-86.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-82.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-81.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-gru.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-78.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-88.pth'),
 PosixPath('/data2/yinterian/aclImdb/test'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-87.pth'),
 PosixPath('/data2/yinterian/aclImdb/imdbEr.txt'),
 PosixPath('/data2/yinterian/aclImdb/train'),
 PosixPath('/data2/yinterian/aclImdb/models'),
 PosixPath('/data2/yinterian/aclImdb/imdb.vocab')]

In [3]:
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## Tokenization

In [4]:
# first time run this
#!python3 -m spacy download en

In [5]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [6]:
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

### Computing vocab2index

In [7]:
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('/data2/yinterian/aclImdb/train/pos/8030_9.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/8819_10.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/6316_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/4781_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/10085_10.txt')]

In [28]:
word_count = Counter()
for path in all_files:
    word_count.update(spacy_tok(path.read_text()))

In [31]:
#word_count

In [30]:
len(word_count.keys())

103578

### Load pre-trained embeddings
To get glove pre-trained embeddings:  wget http://nlp.stanford.edu/data/glove.6B.zip

In [25]:
def loadGloveModel(gloveFile="/data2/yinterian/rotten_imdb/glove.6B.300d.txt"):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs
word_vecs = loadGloveModel()

### Creating final vocabulary

In [32]:
# delete if occurs < 5 times and it is not in our pretrained embeddings
for word in list(word_count):
    if word_count[word] < 5 and word not in word_vecs:
        del word_count[word]
len(word_count)

56291

In [33]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [14]:
#vocab2index

## Pre-trained weights for the embedding layer

In [34]:
def random_word_vector(D=300):
    """Create arandom word vector
    
    0.25 is chosen so the unknown vectors have (approximately) same variance 
    as pre-trained ones
    """
    return np.random.uniform(-0.25,0.25,D)

In [35]:
def create_embedding_matrix(word_vecs, vocab2index, words, D=300):
    """Creates embedding matrix from word vectors. """
    V = len(words)
    W = np.zeros((V, D), dtype="float32")
    W[0] = np.zeros(D, dtype='float32')
    i = 1
    for i in range(1, V):
        if words[i] in word_vecs:
            W[i] = word_vecs[words[i]]
        else:
            W[i] = random_word_vector()
    return W

In [36]:
embedding_matrix = create_embedding_matrix(word_vecs, vocab2index, words)
embedding_matrix.shape

(33920, 300)

## Dataset

In [37]:
# note that spacy_tok takes a while run it just once
def encode_sentence(path, vocab2index, N=400, padding_start=True):
    x = spacy_tok(path.read_text())
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc, l

In [39]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N=400)

(array([    1,   774,   101,  2247,   101,   239,    22,  3051,   106,
          455,   834,   123,    52,   940,   131,  1999,   276,  3050,
         1040,    94,   416,  4813,    94,  4814,    76,  2336,  1100,
           76, 31038,    47,   510,   145,  1661,    22,     1,    33,
           25, 18194,   376,   746,   931,    74,  1480,   205,  2770,
         3235,    52,     3,   392,  4605,    52, 11851,    29,  2879,
           12,   276,    99,    25,  1580,  1190,    62,     8,    67,
         6907,  2338,    47,   376,    58,    22,  2247,   376,  8076,
        28445,    74,  1108,   793,  1436,   145,   302,    62,  1999,
         1018,    47,   737,    74,    52,  1131,   847,  5916,    47,
         2090,    74,   283,    63,    72,    52,  6027,  4495,     3,
        18684,    74,   176,   518, 31038,    64, 14484,  8440,    47,
           62,    67,  2748,  4313,    58,     5,    74, 29624,   171,
          566,   176,   108,     1,   647,  4771,    72,    67,   166,
      

In [40]:
class ImdbDataset(Dataset):
    def __init__(self, PATH, train="train", N=400, padding_start=True):
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        # pos 1, neg 0
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)
        self.X = [encode_sentence(path, vocab2index, N, padding_start) for path in self.files]
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x, s = self.X[idx]
        return x, s, self.y[idx]

## Training and val loops

In [41]:
def train_epocs(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long().cuda()
            y = y.float().cuda()
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [42]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        x = x.long().cuda()
        y = y.float().cuda().unsqueeze(1)
        y_hat = model(x, s)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [43]:
# dataset with padding at the end
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

In [44]:
batch_size = 5000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

## Pre-trained embeddings

## GRU model with dropout

In [47]:
class GRUModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False ## freeze embeddings
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        
    def forward(self, x, s):
        s, sort_index = torch.sort(s, 0,descending=True)
        s = s.numpy().tolist()
        x = x[sort_index]
        x = self.embedding(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, list(s), batch_first=True)
        out_pack, ht= self.gru(x_pack)
        out = self.linear(ht[-1])
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1).cuda(), out)

In [48]:
vocab_size = len(words)
print(vocab_size)
model = GRUModel(vocab_size, 300, 50, embedding_matrix).cuda()

33920


In [50]:
train_epocs(model, epochs=30, lr=0.01)

train loss 0.656 val loss 0.611 and val accuracy 0.664
train loss 0.432 val loss 0.387 and val accuracy 0.835
train loss 0.329 val loss 0.298 and val accuracy 0.874
train loss 0.292 val loss 0.275 and val accuracy 0.886
train loss 0.266 val loss 0.264 and val accuracy 0.893
train loss 0.253 val loss 0.258 and val accuracy 0.894
