## Movie review model generation
This notebook handles the model generation for the movie review dataset.
Before running this notebook, make sure to run the [data preparation notebook](./data_preparation.ipynb) to generate
the word embeddings required for the model.  
In addition to [PyTorch](https://pytorch.org/docs/stable/index.html), this notebook depends on [torchtext](https://pytorch.org/text/stable/index.html) for handling the input data. The notebook is also linked to [Weights & Biases](wandb.ai).

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import tqdm
import wandb

from torchtext.vocab import Vectors
from torchtext.data import get_tokenizer

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F

In [2]:
# mode = 'online'
mode = 'disabled'
wandb.init(project='movie-reviews', entity='dianna-ai', mode=mode)



In [3]:
# tunable hyperparameters
config = wandb.config
config.batch_size = 256
config.epochs = 20
config.output_dim = 2
config.dropout = .6913
config.n_filters = 245
config.filter_sizes = [3, 4, 5]
config.learning_rate = 0.0005261
config.weight_decay = 0.0005
config.max_samples = 10000  # maximum samples of training set to use

In [4]:
# select device to run on
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'PyTorch will use {device}')

PyTorch will use cuda


In [5]:
# path to data files
data_path = os.path.join(os.path.expanduser('~'), 'surfdrive/Shared/datasets/stanford_sentiment_treebank_v2')

# tokenizer for splitting input sentences
tokenizer = get_tokenizer('spacy', 'en_core_web_sm')

# word vectors as generated from data preparation notebook
vocab = Vectors('word_vectors.txt', cache='.')

In [6]:
# class to hold a dataset
# custom datasets need to implement init, len, and getitem
class MovieReviewDataset(Dataset):
    def __init__(self, filename, max_samples=None):
        self.data = pd.read_csv(filename, delimiter='\t')
        if max_samples is not None:
            self.data = self.data[:min(len(self.data), max_samples)]
          
    def __getitem__(self, idx):
        # get sentence and corresponding label
        sentence = self.data.iloc[idx]['sentence']
        label = self.data.iloc[idx]['label']
        return sentence, label.astype(np.float32)
    
    def __len__(self):
        return len(self.data)

In [7]:
# Load data into PyTorch dataset
# These datasets return the input as numerical values, suited for input to the model

train_data = MovieReviewDataset(os.path.join(data_path, 'train.tsv'), max_samples=config.max_samples)
val_data = MovieReviewDataset(os.path.join(data_path, 'validation.tsv'))

In [8]:
# Create iterators for the data
batch_size = config.batch_size
nworker = min(12, os.cpu_count() - 1)

train_data_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=nworker, pin_memory=True)
val_data_iterator = DataLoader(val_data, batch_size=batch_size, shuffle=True, num_workers=nworker, pin_memory=True)

In [9]:
# create a model
class Model(nn.Module):
    def __init__(self, vocab, tokenizer, n_filters, filter_sizes, dropout, output_dim, device=None):
        super().__init__()
        self.filter_sizes = filter_sizes
        self.device = device
        
        vocab_size, embedding_size = vocab.vectors.size()
        self.padding_idx = vocab.stoi['<pad>']
        
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=self.padding_idx)
        
        self.conv_layers = nn.ModuleList()
        for filter_size in filter_sizes:
            layer = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_size, embedding_size))
            self.conv_layers.append(layer)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)
        
    def collate(self, tokens):
        # get max sample size: each item in tokens is the input sentence split into numerical tokens
        max_size = max(max(map(len, tokens)), max(self.filter_sizes))
        # create input array with padding such that each element has the same size
        input_data = np.empty((len(tokens), max_size), dtype=int)
        for i, item in enumerate(tokens):
            # pad each element and store numericalized tokens
            tokens_numerical = [self.vocab.stoi[token] if token in self.vocab.stoi else self.vocab.stoi['<unk>'] for token in item]
            npad = max(max_size - len(item), 0)
            input_data[i] = tokens_numerical + [self.padding_idx] * npad

        # convert to tensors
        input_data = torch.LongTensor(input_data)
        if self.device is not None:
            input_data = input_data.to(self.device)
        return input_data
    
    def forward(self, text):
        # first split sentences into tokens
        tokens = [self.tokenizer(sentence) for sentence in text]
        # pad sentences so they all have the same length, convert tokens to numerical embedding, 
        # then make one big tensor for the entire batch of data
        tokens = self.collate(tokens)
        # shape = [batch size, max nword per sentence]
        embedding = self.embedding(tokens).unsqueeze(1)
        # shape = [batch_size, 1, nword, embedding dim]
        conved = [F.relu(conv(embedding)).squeeze(3) for conv in self.conv_layers]
        # shape = len(filter_sizes) list of [batch_size, n_filter, nword - filter_size + 1]
        # note: max_pool1d does not work with ONNX when output shape is dynamic
        # therefore switched to adaptive_max_pool1d
        pooled = [F.adaptive_max_pool1d(out, 1).squeeze(2) for out in conved]
        # shape = len(filter_sizes) list of [batch_size, n_filter]
        concat = torch.cat(pooled, dim=1)
        # shape = [batch_size * len(filter_sizes), n_filter]
        dropped = self.dropout(concat)
        return self.fc(dropped)

In [10]:
# init a model
output_dim = config.output_dim
dropout = config.dropout
n_filters = config.n_filters
filter_sizes = config.filter_sizes


model = Model(vocab, tokenizer, n_filters, filter_sizes, dropout, output_dim, device)
# copy pre-trained embeddings into model
model.embedding.weight.data.copy_(vocab.vectors)

model = model.to(device)

wandb.watch(model)

[<wandb.wandb_torch.TorchGraph at 0x7f5a002d9d30>]

In [11]:
# Define training and evaluation functions
# first define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
loss_func = nn.CrossEntropyLoss().to(device)

# function to calculate accuracy
def accuracy(model_output, y_true):
    y_pred = torch.argmax(model_output, dim=1)
    return (y_pred == y_true).sum() / len(y_pred)

def train(model, train_data, optimizer, loss_func):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in tqdm.tqdm(train_data, file=sys.stdout):
        input_data, label = batch
        label = torch.FloatTensor(label).to(device)
        
        optimizer.zero_grad()
        predictions = model(input_data).squeeze(1)
        
        loss = loss_func(predictions, label.long())
        acc = accuracy(predictions, label)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
        wandb.log({'train_loss': loss.item(), 'train_acc': acc.item()})
        
    epoch_loss /= len(train_data)
    epoch_acc /= len(train_data)
    
    wandb.log({'train_epoch_loss': epoch_loss, 'train_epoch_acc': epoch_acc})
                
    return epoch_loss, epoch_acc

def evaluate(model, data, loss_func):
    loss = 0
    acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in data:
            input_data, label = batch
            label = torch.FloatTensor(label).to(device)
            predictions = model(input_data).squeeze(1)

            loss += loss_func(predictions, label.long()).item()
            acc += accuracy(predictions, label).item()
            
    loss /= len(data)
    acc /= len(data)
        
    return loss, acc

In [12]:
# do the training
best_val_loss = np.inf
best_val_acc = np.inf

for epoch in range(config.epochs):
    train_epoch_loss, train_epoch_acc = train(model, train_data_iterator, optimizer, loss_func)
    val_epoch_loss, val_epoch_acc = evaluate(model, val_data_iterator, loss_func)
    # log the validation results to wandb
    wandb.log({'val_epoch_loss': val_epoch_loss, 'val_epoch_acc': val_epoch_acc})
    print(f'train loss: {train_epoch_loss:.2f} | train acc: {train_epoch_acc:.2f}')
    print(f'val   loss: {val_epoch_loss:.2f} | val   acc: {val_epoch_acc:.2f}')
    # store model with best validation loss
    if val_epoch_loss < best_val_loss:
        best_val_loss = val_epoch_loss
        best_val_acc = val_epoch_acc
        # ensure we are in eval mode
        model.eval()
        torch.save(model, 'movie_review_model.pytorch')

print(f"Best validation loss: {best_val_loss:.2f}, accuracy: {best_val_acc:.2f}")

100%|██████████| 40/40 [00:01<00:00, 25.08it/s]
train loss: 0.63 | train acc: 0.65
val   loss: 0.57 | val   acc: 0.74
100%|██████████| 40/40 [00:01<00:00, 35.38it/s]
train loss: 0.50 | train acc: 0.77
val   loss: 0.51 | val   acc: 0.75
100%|██████████| 40/40 [00:01<00:00, 36.05it/s]
train loss: 0.44 | train acc: 0.80
val   loss: 0.50 | val   acc: 0.75
100%|██████████| 40/40 [00:01<00:00, 35.60it/s]
train loss: 0.40 | train acc: 0.83
val   loss: 0.46 | val   acc: 0.78
100%|██████████| 40/40 [00:01<00:00, 37.10it/s]
train loss: 0.36 | train acc: 0.85
val   loss: 0.47 | val   acc: 0.78
100%|██████████| 40/40 [00:01<00:00, 35.61it/s]
train loss: 0.33 | train acc: 0.86
val   loss: 0.44 | val   acc: 0.80
100%|██████████| 40/40 [00:01<00:00, 35.86it/s]
train loss: 0.31 | train acc: 0.88
val   loss: 0.44 | val   acc: 0.80
100%|██████████| 40/40 [00:01<00:00, 36.79it/s]
train loss: 0.29 | train acc: 0.88
val   loss: 0.46 | val   acc: 0.78
100%|██████████| 40/40 [00:01<00:00, 36.22it/s]
train lo

In [13]:
# load best model from disk
loaded_model = torch.load('movie_review_model.pytorch')
loaded_model = loaded_model.to(device)
loaded_model.eval()

# store as ONNX, needs example input
x = next(iter(train_data_iterator))[0]
torch.onnx.export(loaded_model, x, 'movie_review_model.onnx', opset_version=11,
                  export_params=True, input_names=['input'], output_names=['output'],
                  do_constant_folding=False,
                  dynamic_axes={'input': {0: 'batch_size', 1: 'sentence_length'},
                                'output': {0: 'batch_size'}})

# check if we can load the onnx model
import onnx
mod = onnx.load('movie_review_model.onnx')
onnx.checker.check_model(mod)

In [14]:
mod.graph.input

[]

In [15]:
# class to predict sentiment from a sentence
class Predictor:
    def __init__(self, model, device):
        self.model = model.to(device)
        self.device = device
        self.classes = ['negative', 'positive']
        
        self.model.eval()
        
    def __call__(self, sentence):
        # feed to model
        pred = torch.softmax(model.forward([sentence]), dim=1)[0]
        # get predicted class
        idx = torch.argmax(pred)
        return pred[idx].item(), self.classes[idx]
    
predict_sentiment = Predictor(loaded_model, device)

In [16]:
# print some predictions from the (unlabeled) test set
sentences = pd.read_csv(os.path.join(data_path, 'test.tsv'), delimiter='\t')['sentence']
nmax = 10
classes = ['negative', 'positive']

for n, sentence in enumerate(sentences):
    if n == nmax:
        break
    output_numerical, predicted_class = predict_sentiment(sentence)
    print(f"\"{sentence}\" - {predicted_class} - {output_numerical:.2f}")

"uneasy mishmash of styles and genres ." - negative - 0.60
"this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation ." - negative - 0.81
"by the end of no such thing the audience , like beatrice , has a watchful affection for the monster ." - positive - 0.79
"director rob marshall went out gunning to make a great one ." - positive - 0.81
"lathan and diggs have considerable personal charm , and their screen rapport makes the old story seem new ." - positive - 0.99
"a well-made and often lovely depiction of the mysteries of friendship ." - positive - 1.00
"none of this violates the letter of behan 's book , but missing is its spirit , its ribald , full-throated humor ." - positive - 0.85
"although it bangs a very cliched drum at times , this crowd-pleaser 's fresh dialogue , energetic music , and good-natured spunk are often infectious ." - positive - 0.98
"it is not a mass-market enterta