## Movie review data preparation
This notebook prepares the data for a neural network that predicts the sentiment of movie reviews, trains that network, and exports the best model to ONNX.

[torchtext](https://pytorch.org/text/stable/index.html) is used for handling the input data

Additional dependencies:  
[spacy](https://spacy.io) for tokenizing (used by torchtext)

In [1]:
import os
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
import wandb

from torchtext.legacy.data import Field
from torchtext.data import get_tokenizer

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F

In [2]:
# Weights & Biases run mode: uncomment 'online' to upload the run,
# or keep 'disabled' to execute the notebook without any remote logging.
# mode = 'online'
mode = 'disabled'
# start the wandb run; `mode` selects whether anything is actually sent
wandb.init(project='movie-reviews', entity='dianna-ai', mode=mode)



In [3]:
# Hyperparameters are stored on the wandb run config and read back from it
# by the cells that build the data loaders, the model and the optimizer.
config = wandb.config
config.batch_size = 64  # samples per batch for the data loaders
config.epochs = 10  # number of passes over the training data
config.output_dim = 1  # size of the model output (one logit per sentence)
config.dropout = .4  # dropout probability applied before the final linear layer
config.n_filters = 128  # output channels of each convolution layer
config.filter_sizes = [3, 4, 5]  # convolution window heights, in tokens
config.learning_rate = 0.001  # learning rate handed to the Adam optimizer

In [4]:
# install spacy and download the English spaCy model used for tokenizing
# !pip install spacy
# !python3 -m spacy download en_core_web_sm

In [5]:
# Dataset wrapper around a tab-separated review file.
# A custom torch Dataset has to provide __init__, __len__ and __getitem__.
class MovieReviewDataset(Dataset):
    """Map-style dataset over a TSV file with ``sentence`` and ``label`` columns.

    Indexing returns the sentence converted to a list of vocabulary indices
    together with the label stored in the same row.
    """

    def __init__(self, filename, tokenizer, vocab, max_samples=None):
        """Read the file and optionally keep only the leading rows.

        Args:
            filename: path of the tab-separated file to read.
            tokenizer: callable turning a sentence into a list of tokens.
            vocab: object whose ``stoi`` mapping converts a token to its index.
            max_samples: if given, keep at most this many rows from the top.
        """
        self.tokenizer = tokenizer
        self.vocab = vocab

        frame = pd.read_csv(filename, delimiter='\t')
        if max_samples is not None:
            n_keep = min(len(frame), max_samples)
            frame = frame[:n_keep]
        self.data = frame

    def __len__(self):
        """Number of rows kept from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_indices, label)`` for the row at position ``idx``."""
        row = self.data.iloc[idx]
        # tokenize the sentence, then map every token to its vocabulary index
        words = self.tokenizer(row['sentence'])
        indices = [self.vocab.stoi[word] for word in words]
        return indices, row['label']
    
    
# collate function that pads variable-length samples into one rectangular batch
def collate(batch, padding_token_numerical):
    """Combine ``(token_indices, label)`` samples into padded tensors.

    Args:
        batch: sequence of samples; element 0 of each sample is a list of
            token indices and element 1 is its label.
        padding_token_numerical: index appended to shorter sentences so that
            every row has the length of the longest sentence in the batch.

    Returns:
        A two-element list ``[input_data, labels]`` holding a LongTensor of
        padded token indices and a FloatTensor of labels.
    """
    sequences = []
    targets = []
    for sample in batch:
        sequences.append(sample[0])
        targets.append(sample[1])

    # every sentence is right-padded up to the longest one in this batch
    longest = max(len(sequence) for sequence in sequences)
    padded = [
        sequence + [padding_token_numerical] * (longest - len(sequence))
        for sequence in sequences
    ]

    return [torch.LongTensor(padded), torch.FloatTensor(targets)]

In [6]:
# Build vocabulary from training data
data_path = os.path.join(os.path.expanduser('~'), 'surfdrive/Shared/datasets/stanford_sentiment_treebank_v2')

# tokenizer for splitting input sentences
tokenizer = get_tokenizer('spacy', 'en_core_web_sm')

# load raw training data
raw_train_data = pd.read_csv(os.path.join(data_path, 'train.tsv'), delimiter='\t')

# split input into tokens
train_tokens = [tokenizer(sentence) for sentence in raw_train_data['sentence']]

# Create field and build vocabulary; the GloVe vectors are attached to the vocabulary
text_field = Field()
text_field.build_vocab(train_tokens, vectors='glove.6B.100d')
# Take the size from the vocabulary itself (it already contains the padding and
# unknown tokens) so it always equals the number of rows in vocab.vectors, which
# are later copied into the embedding layer. Counting freqs + 2 only matches
# with the default build_vocab settings.
vocab_size = len(text_field.vocab)
embedding_size = text_field.vocab.vectors.shape[1]
padding_idx = text_field.vocab.stoi[text_field.pad_token]
unknown_idx = text_field.vocab.stoi[text_field.unk_token]


# define collate function with now known padding token
def collate_func(batch):
    """Collate a batch using the padding index of the vocabulary built above."""
    return collate(batch, padding_idx)


print(f'Vocab size: {vocab_size}')
print(f'Embedding size: {embedding_size}')
print(f'Padding token index: {padding_idx}')
print(f'Unknown token index: {unknown_idx}')

Vocab size: 13889
Embedding size: 100
Padding token index: 1
Unknown token index: 0


In [7]:
# Wrap the TSV files in PyTorch datasets.
# Items come out as numerical token indices, ready to be fed to the model.
max_samples = 10000  # upper limit on the number of training rows that are used

train_data = MovieReviewDataset(
    filename=os.path.join(data_path, 'train.tsv'),
    tokenizer=tokenizer,
    vocab=text_field.vocab,
    max_samples=max_samples,
)
# the validation set is used in full
val_data = MovieReviewDataset(
    filename=os.path.join(data_path, 'validation.tsv'),
    tokenizer=tokenizer,
    vocab=text_field.vocab,
)

In [8]:
# Create iterators for the data
batch_size = config.batch_size

# training batches are reshuffled every epoch
train_data_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_func)
# Validation is not shuffled: each batch is padded to its own longest sentence,
# so a fixed batch composition keeps the validation metrics (and therefore the
# choice of the best model) reproducible between epochs and runs.
val_data_iterator = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_func)

In [9]:
# pick the device to run on: the GPU when PyTorch can see one, the CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'PyTorch will use {device}')

PyTorch will use cuda


In [10]:
# convolutional sentence classifier
class Model(nn.Module):
    """Embedding, parallel convolutions, max-over-time pooling and a linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: length of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: window heights (in tokens), one convolution per entry.
        padding_idx: vocabulary index of the padding token.
        dropout: dropout probability applied before the linear layer.
        output_dim: number of outputs of the linear layer.
    """

    def __init__(self, vocab_size, embedding_size, n_filters, filter_sizes, padding_idx,
                 dropout, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)

        # one convolution per window size; each kernel spans the full embedding width
        self.conv_layers = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(window, embedding_size))
            for window in filter_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)

    def forward(self, text):
        """Compute the output of the linear layer for a batch of token indices.

        ``text`` has shape [batch_size, n_words]; the result has shape
        [batch_size, output_dim].
        """
        # add a channel axis: [batch_size, 1, n_words, embedding_size]
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)

        # per filter size: [batch_size, n_filters, n_words - filter_size + 1]
        feature_maps = []
        for conv in self.conv_layers:
            feature_maps.append(F.relu(conv(embedded)).squeeze(3))

        # max over the word axis, per filter size: [batch_size, n_filters]
        # adaptive_max_pool1d is used because max_pool1d does not work with
        # ONNX when the output shape is dynamic
        pooled = []
        for feature_map in feature_maps:
            pooled.append(F.adaptive_max_pool1d(feature_map, 1).squeeze(2))

        # [batch_size, n_filters * len(filter_sizes)]
        combined = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(combined))

In [11]:
# read the architecture hyperparameters back from the wandb config
output_dim = config.output_dim
dropout = config.dropout
n_filters = config.n_filters
filter_sizes = config.filter_sizes

# build the network
model = Model(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    n_filters=n_filters,
    filter_sizes=filter_sizes,
    padding_idx=padding_idx,
    dropout=dropout,
    output_dim=output_dim,
)
# initialise the embedding table with the pre-trained vectors of the vocabulary
model.embedding.weight.data.copy_(text_field.vocab.vectors)

# move the model to the selected device
model = model.to(device)

# let wandb track the model
wandb.watch(model)

[<wandb.wandb_torch.TorchGraph at 0x7f5cf3faf640>]

In [12]:
# Define training and evaluation functions
# first define optimizer and loss function
# Adam over all model parameters, with the learning rate taken from the wandb config
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
# BCEWithLogitsLoss applies the sigmoid itself, so it is fed the raw model output
loss_func = nn.BCEWithLogitsLoss().to(device)

# accuracy of raw model outputs against binary labels
def accuracy(model_output, y_true):
    """Fraction of predictions that equal the label.

    The raw model output is passed through a sigmoid and rounded to 0 or 1
    before being compared with ``y_true``. The result is returned as a tensor.
    """
    probabilities = torch.sigmoid(model_output)
    y_pred = probabilities.round()
    n_correct = torch.eq(y_pred, y_true).sum()
    return n_correct / len(y_pred)

def train(model, train_data, optimizer, loss_func):
    """Run one training epoch and return ``(epoch_loss, epoch_acc)``.

    Args:
        model: network to train; it is switched to training mode.
        train_data: iterable of ``(input_data, label)`` batches.
        optimizer: optimizer that updates the model parameters.
        loss_func: loss applied to ``(predictions, label)``; assumed to return
            the mean loss over the batch.

    Returns:
        Loss and accuracy averaged over all samples seen in the epoch. Batches
        are weighted by their size, so a smaller final batch does not count as
        much as a full one.

    Per-batch loss and accuracy, and the epoch averages, are logged to wandb.
    Batches are moved to the notebook-level ``device``.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0

    model.train()

    for input_data, label in tqdm.tqdm(train_data):
        input_data = input_data.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        # drop the output axis so predictions line up with the labels
        predictions = model(input_data).squeeze(1)

        loss = loss_func(predictions, label)
        acc = accuracy(predictions, label)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_acc = acc.item()
        batch_len = len(label)

        # accumulate per-sample totals so the epoch average is sample-weighted
        loss_sum += batch_loss * batch_len
        acc_sum += batch_acc * batch_len
        n_samples += batch_len

        wandb.log({'train_loss': batch_loss, 'train_acc': batch_acc})

    epoch_loss = loss_sum / n_samples
    epoch_acc = acc_sum / n_samples

    wandb.log({'train_epoch_loss': epoch_loss, 'train_epoch_acc': epoch_acc})

    return epoch_loss, epoch_acc

def evaluate(model, data, loss_func):
    """Compute loss and accuracy of ``model`` on ``data`` without updating it.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        data: iterable of ``(input_data, label)`` batches.
        loss_func: loss applied to ``(predictions, label)``; assumed to return
            the mean loss over the batch.

    Returns:
        ``(loss, acc)`` averaged over all samples. Batches are weighted by
        their size, so a smaller final batch does not count as much as a
        full one.

    Batches are moved to the notebook-level ``device``.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0

    model.eval()

    # no gradients are needed for evaluation
    with torch.no_grad():
        for input_data, label in data:
            input_data = input_data.to(device)
            label = label.to(device)

            # drop the output axis so predictions line up with the labels
            predictions = model(input_data).squeeze(1)

            batch_len = len(label)
            loss_sum += loss_func(predictions, label).item() * batch_len
            acc_sum += accuracy(predictions, label).item() * batch_len
            n_samples += batch_len

    loss = loss_sum / n_samples
    acc = acc_sum / n_samples

    return loss, acc

In [13]:
# do the training
best_val_loss = np.inf
# validation accuracy at the epoch with the best validation loss;
# nan (rather than +inf) until an epoch has actually improved the loss
best_val_acc = np.nan

for epoch in range(config.epochs):
    train_epoch_loss, train_epoch_acc = train(model, train_data_iterator, optimizer, loss_func)
    val_epoch_loss, val_epoch_acc = evaluate(model, val_data_iterator, loss_func)
    # log the validation results to wandb
    wandb.log({'val_epoch_loss': val_epoch_loss, 'val_epoch_acc': val_epoch_acc})
    print(f'train loss: {train_epoch_loss:.2f} | train acc: {train_epoch_acc:.2f}')
    print(f'val   loss: {val_epoch_loss:.2f} | val   acc: {val_epoch_acc:.2f}')
    # store model with best validation loss
    if val_epoch_loss < best_val_loss:
        best_val_loss = val_epoch_loss
        best_val_acc = val_epoch_acc
        # ensure we are in eval mode
        model.eval()
        # the whole model object is saved, so it can be loaded back without rebuilding it
        torch.save(model, 'movie_review_model.pytorch')

print(f"Best validation loss: {best_val_loss:.2f}, accuracy: {best_val_acc:.2f}")

100%|██████████| 157/157 [00:03<00:00, 40.51it/s]
  3%|▎         | 5/157 [00:00<00:03, 40.12it/s]

train loss: 0.49 | train acc: 0.76
val   loss: 0.44 | val   acc: 0.80


100%|██████████| 157/157 [00:03<00:00, 40.83it/s]
  3%|▎         | 5/157 [00:00<00:03, 40.85it/s]

train loss: 0.30 | train acc: 0.88
val   loss: 0.52 | val   acc: 0.76


100%|██████████| 157/157 [00:03<00:00, 40.75it/s]
  3%|▎         | 5/157 [00:00<00:03, 41.06it/s]

train loss: 0.20 | train acc: 0.92
val   loss: 0.47 | val   acc: 0.79


100%|██████████| 157/157 [00:03<00:00, 40.70it/s]
  3%|▎         | 5/157 [00:00<00:03, 40.72it/s]

train loss: 0.13 | train acc: 0.95
val   loss: 0.53 | val   acc: 0.78


100%|██████████| 157/157 [00:03<00:00, 40.72it/s]
  3%|▎         | 5/157 [00:00<00:03, 40.84it/s]

train loss: 0.09 | train acc: 0.97
val   loss: 0.55 | val   acc: 0.79


100%|██████████| 157/157 [00:03<00:00, 40.71it/s]
  3%|▎         | 5/157 [00:00<00:03, 40.38it/s]

train loss: 0.06 | train acc: 0.98
val   loss: 0.66 | val   acc: 0.78


100%|██████████| 157/157 [00:03<00:00, 40.68it/s]
  3%|▎         | 5/157 [00:00<00:03, 41.03it/s]

train loss: 0.05 | train acc: 0.99
val   loss: 0.66 | val   acc: 0.79


100%|██████████| 157/157 [00:03<00:00, 40.68it/s]
  3%|▎         | 5/157 [00:00<00:03, 40.77it/s]

train loss: 0.04 | train acc: 0.99
val   loss: 0.80 | val   acc: 0.76


100%|██████████| 157/157 [00:03<00:00, 40.65it/s]
  3%|▎         | 5/157 [00:00<00:03, 41.03it/s]

train loss: 0.03 | train acc: 0.99
val   loss: 0.80 | val   acc: 0.78


100%|██████████| 157/157 [00:03<00:00, 40.75it/s]


train loss: 0.03 | train acc: 0.99
val   loss: 0.96 | val   acc: 0.76
Best validation loss: 0.44, accuracy: 0.80


In [14]:
# load best model from disk
# NOTE: torch.load unpickles the file, so only load checkpoints you created yourself
loaded_model = torch.load('movie_review_model.pytorch')
loaded_model = loaded_model.to(device)
loaded_model.eval()

# store as ONNX, needs example input
x = next(iter(train_data_iterator))[0].to(device)
# export the best checkpoint that was just loaded, not `model`, which still
# holds the weights of the last training epoch
torch.onnx.export(loaded_model, x, 'movie_review_model.onnx', opset_version=11)

In [15]:
# function to predict sentiment from a sentence
def predict_sentiment(sentence):
    """Return the model's sigmoid output (a float between 0 and 1) for a sentence.

    Relies on the notebook-level ``loaded_model``, ``tokenizer``, ``text_field``,
    ``filter_sizes`` and ``device``.

    Args:
        sentence: raw text of the review.

    Returns:
        The sigmoid of the model output as a Python float.
    """
    # convert sentence into tokens
    tokens = tokenizer(sentence)
    # the input must be at least as long as the largest convolution window,
    # so short sentences are filled up with padding tokens
    min_length = max(filter_sizes)
    if len(tokens) < min_length:
        tokens = tokens + [text_field.pad_token] * (min_length - len(tokens))
    # convert to numerical and transfer to device
    indices = [text_field.vocab.stoi[word] for word in tokens]
    # add batch axis
    tensor = torch.tensor(indices).to(device).unsqueeze(0)
    # inference only, no gradients needed
    with torch.no_grad():
        prediction = loaded_model(tensor)
    return torch.sigmoid(prediction).item()

In [16]:
# show predictions for the first few sentences of the (unlabeled) test set
sentences = pd.read_csv(os.path.join(data_path, 'test.tsv'), delimiter='\t')['sentence']
nmax = 10
classes = ['negative', 'positive']

# only the first nmax sentences are scored
for sentence in sentences.iloc[:nmax]:
    output = predict_sentiment(sentence)
    # the rounded output (0 or 1) indexes into the class names
    predicted_class = classes[int(np.round(output))]
    print(f"\"{sentence}\" - {predicted_class} - {output:.2f}")

"uneasy mishmash of styles and genres ." - positive - 0.68
"this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation ." - negative - 0.34
"by the end of no such thing the audience , like beatrice , has a watchful affection for the monster ." - positive - 0.66
"director rob marshall went out gunning to make a great one ." - positive - 0.54
"lathan and diggs have considerable personal charm , and their screen rapport makes the old story seem new ." - positive - 0.94
"a well-made and often lovely depiction of the mysteries of friendship ." - positive - 0.99
"none of this violates the letter of behan 's book , but missing is its spirit , its ribald , full-throated humor ." - positive - 0.90
"although it bangs a very cliched drum at times , this crowd-pleaser 's fresh dialogue , energetic music , and good-natured spunk are often infectious ." - positive - 0.88
"it is not a mass-market enterta