In [None]:
import torch
from torch import nn
import numpy as np
import os


### Download Glove & imdb data

In [None]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"

import requests, zipfile, io
r = requests.get(glove_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
path = os.path.abspath(".") + "/Glove"
z.extractall(path)

KeyboardInterrupt: 

In [None]:
import requests
import tarfile
import urllib.request
thetarfile = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
ftpstream = urllib.request.urlopen(thetarfile)
thetarfile = tarfile.open(fileobj=ftpstream, mode="r|gz")
thetarfile.extractall()
thetarfile.close()

'/notebooks'

### Load data into train_data

In [None]:
import os
import torch
from torch import nn
data_dir = 'aclImdb'
def read_imdb(data_dir, is_train):
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test',
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

train_data = read_imdb(data_dir, is_train=True)
print('# trainings:', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[0:30])

# trainings: 25000
label: 1 review: While the premise of the film 
label: 1 review: She is such an extraordinary s
label: 1 review: Love trap is a "must see" inde


### Data transformation
1. Remove new lines
2. Remove punctuation
3. Make a corpus

In [None]:
from string import punctuation
reviews = train_data[0].copy()
for i in range(len(reviews)):
    reviews[i] = reviews[i].replace("<br />", " ")
    reviews[i] = ''.join([c for c in reviews[i] if c not in punctuation])

all_text = ' '.join(reviews)
words = all_text.split()

4. Create a counter of words

In [None]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
int_to_vocab = [word for word in vocab_to_int]
reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each.split()])

5. Save the vocab_to_int dictionary - to be used in inference

In [None]:
import pickle

with open('vocab_to_int.pickle', 'wb') as handle:
    pickle.dump(vocab_to_int, handle, protocol=pickle.HIGHEST_PROTOCOL)


6. Pick labels

In [None]:
labels = train_data[1]


In [None]:
int_to_vocab[:10]

['the', 'a', 'and', 'of', 'to', 'is', 'in', 'I', 'that', 'it']

7. Check if there are zero-length reviews. Remove if they exist

In [None]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 0
Maximum review length: 2459


In [None]:
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
len(non_zero_idx)

25000

In [None]:
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])

### Preparing data
Use first 200 words of each review, if words are less than 200, pad them with zeros

In [None]:
seq_len = 200
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

In [None]:
features.shape

(25000, 200)

### Split data into training and testing
80% - training, 10% - validation, 10% - testing

In [None]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [None]:
split_frac = 0.8
split_idx = int(len(features)*0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [None]:
#https://debuggercafe.com/using-learning-rate-scheduler-and-early-stopping-with-pytorch/

In [None]:
def load_array(data_arrays, batch_size, is_train=True):
    dataset = torch.utils.data.TensorDataset(*data_arrays)
    return torch.utils.data.DataLoader(dataset, batch_size, shuffle=is_train)


### Preparing batches
Load train, validation data on-to GPU

In [None]:
if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  
device = torch.device(dev)  


In [None]:
train_x, train_y = torch.tensor(train_x).to(device), torch.tensor(train_y).to(device)
val_x, val_y = torch.tensor(val_x).to(device), torch.tensor(val_y).to(device)

Load train and validation set onto torch dataloader to create batches

In [None]:
batch_size = 64
train_iter = load_array((train_x, train_y),
                                batch_size)
test_iter = load_array((val_x, val_y), len(val_x), is_train=False)


In [None]:
len(train_x)

20000

### Classes for RNN & Embedding
Use nn.Module from torch to create Bi-directional RNN. We also write a feed-forward network that forwards the output.
We will define rest of the inputs later

In [None]:
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers,
                               bidirectional=True)
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        embeddings = self.embedding(inputs.T)
        self.encoder.flatten_parameters()
        outputs, _ = self.encoder(embeddings)
        encoding = torch.cat((outputs[0], outputs[-1]), dim=1)
        outs = self.decoder(encoding)
        return outs

Initialize weights using Xavier initialization
<br> ref: http://cs231n.stanford.edu/slides/2016/winter1516_lecture5.pdf


In [None]:

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    if type(m) == nn.LSTM:
        for param in m._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_uniform_(m._parameters[param])



Create a Glove Token Embedding Class to embed reviews' words

In [None]:
class GloveTokenEmbedding:
    def __init__(self, file_name):
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            file_name)
        self.unknown_idx = 0
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)}

    def _load_embedding(self, file_name):
        idx_to_token, idx_to_vec = ['<unk>'], []
        with open(file_name, 'rt') as f:
            for line in f:
                elems = line.rstrip().split(' ')
                token, elems = elems[0], [float(elem) for elem in elems[1:]]
                if len(elems) > 1:
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
        return idx_to_token, torch.tensor(idx_to_vec).to(device)

    def __getitem__(self, tokens):
        indices = [
            self.token_to_idx.get(token, self.unknown_idx)
            for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices).to(device)]
        return vecs

    def __len__(self):
        return len(self.idx_to_token)

Load Glove 100d

In [None]:
glove_path = 'Glove/glove.6B.100d.txt'
glove_embedding = GloveTokenEmbedding(glove_path)


In [None]:
embeds = glove_embedding[int_to_vocab]
embeds.shape, embeds.is_cuda

(torch.Size([132456, 100]), True)

### Training
Implement training, accuracy, early stopping

In [None]:
# Accuracy
reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs)
argmax = lambda x, *args, **kwargs: x.argmax(*args, **kwargs)
astype = lambda x, *args, **kwargs: x.type(*args, **kwargs)

def accuracy(y_hat, y):
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = argmax(y_hat, axis=1)
    cmp = astype(y_hat, y.dtype) == y
    return float(reduce_sum(astype(cmp, y.dtype)))

def evaluate_accuracy(net, data_iter):
    for f, l in data_iter:
        pred = net(f)
        accuracy_sum = accuracy(pred, l)
    return accuracy_sum/l.numel()

In [None]:
def train_rnn_batch(net, X, y, loss, trainer):
    net.train()
    trainer.zero_grad()
    pred = net(X)
    l = loss(pred, y)
    l.sum().backward()
    trainer.step()
    train_loss_sum = l.sum()
    train_acc_sum = accuracy(pred, y)
    return train_loss_sum, train_acc_sum



In [None]:
def get_val_loss(net, data_iter, loss):
    val_loss_sum, denom = 0, 0
    for X, y in data_iter:
        pred = net(X)
        l = loss(pred, y)
        val_loss_sum += l.sum()
        denom += y.shape[0]
    return val_loss_sum/denom

In [None]:
def train_rnn(net, train_iter, test_iter, loss, trainer, num_epochs, CYCLES_WITH_NO_IMPROVEMENTS, STOP_BY_EPOCH):
    min_val_loss = float("inf")
    reset_counter = 0
    print("Starting training... for %d epochs" %num_epochs)
    for epoch in range(num_epochs):
        count = 0
        for i, (features, labels) in enumerate(train_iter):
            count += 1
            l, acc = train_rnn_batch(net, features, labels, loss, trainer)
            test_acc = evaluate_accuracy(net, test_iter)
            val_loss = get_val_loss(net, test_iter, loss)
            # Early stopping
            if epoch > STOP_BY_EPOCH:
                if val_loss.cpu().detach().numpy() < 0: 
                    reset_counter = 0
                    min_val_loss = val_loss
                else: reset_counter += 1
                if reset_counter == CYCLES_WITH_NO_IMPROVEMENTS:
                    print("Stopping early")
                    return
            if (count) % 25 == 0: 
                print(f'epoch {epoch}, iteration {100*count*len(features)/len(train_x):.2f}%, loss {l / labels.shape[0]:.3f}, val_loss {val_loss:.3f}, train acc '
                  f'{acc / labels.numel():.3f}, test acc {test_acc:.3f}')
        

Rest of inputs: 2 hidden layers, embed size - 100 (same as Glove embedding)

In [None]:
num_epochs = 5
embed_size, num_hiddens, num_layers = 100, 100, 2
CYCLES_WITH_NO_IMPROVEMENTS, STOP_BY_EPOCH = 15, 3
final_plots = []
for lr in [0.003, 0.001, 0.0003, 0.0001]:
    print(lr)
    rnn = BiRNN(len(int_to_vocab), embed_size, num_hiddens, num_layers)
    rnn.to(device)

    rnn.apply(init_weights)
    rnn.embedding.weight.data.copy_(embeds)
    rnn.embedding.weight.requires_grad = False

    trainer = torch.optim.Adam(rnn.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss(reduction="none")
    train_rnn(rnn, train_iter, test_iter, loss, trainer, num_epochs, CYCLES_WITH_NO_IMPROVEMENTS, STOP_BY_EPOCH)
    PATH = './sentiment_analysis' + str(lr) + '.pth'
    torch.save(rnn.state_dict(), PATH)


0.003
Starting training... for 5 epochs
epoch 0, iteration 8.00%, loss 0.666, val_loss 0.836, train acc 0.609, test acc 0.000
epoch 0, iteration 16.00%, loss 0.710, val_loss 0.915, train acc 0.516, test acc 0.000
epoch 0, iteration 24.00%, loss 0.649, val_loss 1.058, train acc 0.641, test acc 0.000
epoch 0, iteration 32.00%, loss 0.587, val_loss 1.005, train acc 0.750, test acc 0.000
epoch 0, iteration 40.00%, loss 0.658, val_loss 0.913, train acc 0.594, test acc 0.024


KeyboardInterrupt: 

### Hyperparameter optimization
1. Started with lr = 1, loss increasing from 1.6 to 12.3
2. Decreased lr=0.3, loss oscillating from 1.5 to 3.5 to 1.2 to 0.7
3. Decreased lr=0.1, loss oscillating from 1.5 to 3.5 to 1.2 to 0.7
3. Decreased lr=0.03, loss oscillating from 0.6 to 0.8
4. Decreased lr=0.01, loss oscillating from 0.6 to 0.7
loss kept on oscillating, so I increased batch sizes from 64 to 256

In [None]:
def predict_sentiment(net, vocab, sequence):
    """Predict the sentiment of a text sequence."""
    sentence = [vocab_to_int[word] for word in sequence.split()]
    print(sentence)
    sequence = torch.tensor(sentence).to(device)
    label = torch.argmax(net(sequence.reshape(1, -1)), dim=1)
    return 'positive' if label == 1 else 'negative'


In [None]:
predict_sentiment(rnn, vocab, 'this movie is so great')


[11, 16, 6, 39, 84]


'positive'