Here, I build an initial version of the sentiment analysis model.

In [111]:
import gzip
import json
from collections import OrderedDict
from functools import reduce
from typing import Callable, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch as th
import torch.nn.functional as F
import torchmetrics
from torch import nn
from torch.nn import Embedding
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader, random_split
from torchmetrics import MeanSquaredError
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab, Vocab, GloVe, build_vocab_from_iterator
from tqdm import tqdm


In [2]:
#!python -m spacy download en_core_web_sm

# 2. Use pre-trained GloVe embeddings

In [4]:
# Reserved vocabulary entries: padding filler, end-of-sequence marker and the
# fallback used for out-of-vocabulary words.
PAD_TOKEN, EOS_TOKEN, UNK_TOKEN = "<pad>", "<eos>", "<unk>"
SPECIAL_TOKENS = (PAD_TOKEN, EOS_TOKEN, UNK_TOKEN)

In [5]:
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

embedding_dim = 100
embedding_vecs = GloVe(name="6B", dim=embedding_dim)

# Reuse the GloVe vocabulary, with the special tokens inserted ahead of it.
# The dict values are only read by `vocab` as frequencies: the special tokens
# get a dummy count of 1, the GloVe entries carry their row index.
embedding_dict = OrderedDict((special_token, 1) for special_token in SPECIAL_TOKENS)
embedding_dict.update(embedding_vecs.stoi)
# min_freq=0 is a hack so the GloVe token whose stoi value is 0 is not dropped.
voc = vocab(embedding_dict, min_freq=0)
# Unknown words fall back to the <unk> entry.
voc.set_default_index(voc[UNK_TOKEN])

In [105]:
class TwitterDataset(Dataset):
    """
    Sentiment data from tweets. Raw dataset downloaded from:
    http://help.sentiment140.com/for-students

    All tweets are tokenized, numericalized and padded once in the constructor,
    so indexing afterwards is a cheap tensor lookup.

    Args:
        tokenizer: callable mapping a raw tweet string to a list of tokens.
        voc: vocabulary mapping tokens to integer ids; must contain PAD_TOKEN.
        csv_path: location of the headerless sentiment140 CSV file. Defaults to
            the file name originally hard-coded in this notebook.
    """
    def __init__(
        self, tokenizer: Callable, voc: Vocab,
        csv_path: str = "data_twitter_sentiment.csv",
    ) -> None:
        self.tokenizer = tokenizer
        self.voc = voc

        # Only column 0 (raw sentiment) and column 5 (tweet text) are used, so
        # the other columns are not parsed at all.
        df = pd.read_csv(csv_path, header=None, encoding='latin-1', usecols=[0, 5])
        df = df.rename(columns={
            0: "sentiment_raw",
            5: "text",
        })
        # raw sentiment data is from 0 to 4. Scale it to the range 0 to 1.
        df["sentiment"] = df.sentiment_raw / 4
        df = df[["sentiment", "text"]]
        df = df.reset_index(drop=True)

        nz_texts = []  # numericalized_texts
        seq_lengths = []  # sequence lengths
        for text in tqdm(df.text):
            # Explicit integer dtype: an empty token list would otherwise
            # produce a float tensor rather than a tensor of ids.
            nz_text = th.tensor(self.voc(self.tokenizer(text)), dtype=th.long)
            nz_texts.append(nz_text)
            seq_lengths.append(len(nz_text))

        # shape of seqs is: T x B, where T is length of longest seq, B is number of tweets
        self.seqs = pad_sequence(nz_texts, padding_value=self.voc[PAD_TOKEN])
        self.seq_lengths = seq_lengths
        self.sentiments = df.sentiment

    def __len__(self) -> int:
        """Number of tweets in the dataset."""
        return len(self.sentiments)

    def __getitem__(self, i: int) -> Tuple[Tuple[th.Tensor, int], float]:
        """
        Returns ((padded token ids of tweet i, its unpadded length), sentiment).
        """
        # seqs is time-major, so one tweet is one column.
        seq = self.seqs[:, i]
        seq_length = self.seq_lengths[i]
        sentiment = self.sentiments[i]
        return (seq, seq_length), sentiment

In [49]:
%%time
# Building the dataset tokenizes and numericalizes every tweet up front, hence the timing magic.
full_ds = TwitterDataset(tokenizer, voc)

100%|██████████████████████████████████████████████████| 1600000/1600000 [01:41<00:00, 15817.41it/s]


CPU times: user 1min 47s, sys: 10.2 s, total: 1min 57s
Wall time: 1min 51s


In [51]:
# Padded id matrix; pad_sequence was called without batch_first, so axis 0 is time and axis 1 indexes tweets.
full_ds.seqs.shape

torch.Size([181, 1600000])

In [52]:
# 70 / 15 / 15 split; the test set takes whatever the integer truncation leaves over.
SPLIT_SEED = 42
num_train = int(0.7 * len(full_ds))
num_val = int(0.15 * len(full_ds))
num_test = len(full_ds) - num_train - num_val
batch_size = 64

# Seed the split so train/val/test membership is identical on every run;
# otherwise metrics from different kernel sessions are not comparable.
split_rng = th.Generator().manual_seed(SPLIT_SEED)
train_ds, val_ds, test_ds = random_split(
    full_ds, [num_train, num_val, num_test], generator=split_rng
)

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# no need to shuffle val or test dl's
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [77]:
# Grab the first validation batch for manual inspection.
test = next(iter(val_dl))

In [113]:
class LitVanillaRNN(pl.LightningModule):
    """
    Vanilla RNN sentiment regressor.

    Token ids are embedded, run through a (possibly stacked) nn.RNN as a packed
    sequence, and the final hidden state of the last layer is mapped through
    Linear + Sigmoid to one score per sequence in (0, 1). Trained with MSE.

    Args:
        embedding: embedding layer used to look up token ids.
        hidden_size: size of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        lr: learning rate passed to Adam.
        dropout: dropout probability between stacked RNN layers.
    """
    def __init__(
        self, embedding: nn.Embedding, hidden_size: int = 128, num_layers: int = 1,
        lr: float = 1e-3, dropout: float = 0.5,
    ):
        super().__init__()
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lr = lr
        self.dropout = dropout

        # TODO: try using bidirectional in rnn
        self.rnn = nn.RNN(
            self.embedding.embedding_dim, self.hidden_size, batch_first=True,
            # nn.RNN applies dropout only between stacked layers; with a single
            # layer a non-zero value does nothing except emit a warning.
            dropout=self.dropout if self.num_layers > 1 else 0.0,
            num_layers=self.num_layers,
        )
        self.fc = nn.Linear(self.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

        self.mse = MeanSquaredError()

    def forward(self, x: List[th.Tensor]) -> th.Tensor:
        """
        Args:
            x: pair (seqs, seq_lengths); seqs holds padded token ids with the
                batch on the first axis, seq_lengths the unpadded lengths.

        Returns:
            Tensor of shape (batch, 1) with sigmoid outputs.
        """
        assert len(x) == 2
        seqs, seq_lengths = x
        embedded = self.embedding(seqs)
        # pack_padded_sequence needs the lengths on the CPU even when the
        # batch itself lives on an accelerator.
        if isinstance(seq_lengths, th.Tensor):
            seq_lengths = seq_lengths.cpu()
        packed = pack_padded_sequence(
            embedded, seq_lengths, batch_first=True, enforce_sorted=False
        )

        # TODO: try using a randomly generated initial hidden state
        # (instead of the zero vector default)
        _, h_n = self.rnn(packed)

        # Compare against the actual batch (the last batch of an epoch can be
        # smaller than the loader's batch_size) and the configured layer count.
        assert h_n.shape == (self.num_layers, seqs.shape[0], self.hidden_size)

        # Final hidden state of the top layer.
        x = h_n[-1, :, :]
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

    def training_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        return self.generalized_step(batch, batch_idx, "train")

    def validation_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        return self.generalized_step(batch, batch_idx, "val")

    def test_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        return self.generalized_step(batch, batch_idx, "test")

    def generalized_step(self, batch: th.Tensor, batch_idx: int, label: str) -> th.Tensor:
        """
        Shared train/val/test step: forward pass, MSE loss, and logging under
        the `{label}_loss` / `{label}_mse` keys. Returns the loss.
        """
        x, y = batch
        # forward returns (B, 1) while the targets are (B,). Without the
        # squeeze, mse_loss broadcasts the pair to (B, B) and averages over
        # every prediction/target combination.
        predicted = self(x).squeeze(-1)
        # Targets may arrive in a different float precision than the model output.
        y = y.to(predicted.dtype)
        loss = F.mse_loss(predicted, y)
        self.log(f"{label}_loss", loss)
        self.log(f"{label}_mse", self.mse(predicted, y))
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        # `optim` was never imported in this notebook; go through the torch alias.
        optimizer = th.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

In [102]:
# `voc` places the special tokens in front of the GloVe words, so its ids do
# not line up with the rows of `embedding_vecs.vectors`. Build the weight
# matrix by looking every vocabulary entry up by token instead, which keeps
# row i == vector of voc token i. Tokens GloVe does not know (e.g. the special
# tokens) get whatever `unk_init` produces (zeros unless GloVe was configured
# otherwise).
voc_embedding_matrix = embedding_vecs.get_vecs_by_tokens(voc.get_itos())
assert voc_embedding_matrix.shape[0] == len(voc)

embedding = Embedding.from_pretrained(
    voc_embedding_matrix, freeze=True, padding_idx=voc[PAD_TOKEN]
)

vrnn = LitVanillaRNN(embedding)

In [114]:
# Smoke test: push one validation batch through the model and look at the
# output shape and values.
test = next(iter(val_dl))
sample_inputs = test[0]
print(vrnn(sample_inputs).shape)
print(vrnn(sample_inputs))

torch.Size([64, 1])
tensor([[0.4678],
        [0.5019],
        [0.5383],
        [0.5361],
        [0.5526],
        [0.5562],
        [0.4714],
        [0.5238],
        [0.5283],
        [0.4851],
        [0.4942],
        [0.4423],
        [0.5289],
        [0.5247],
        [0.5554],
        [0.4696],
        [0.5212],
        [0.5124],
        [0.5071],
        [0.4654],
        [0.5215],
        [0.5088],
        [0.5284],
        [0.5375],
        [0.4872],
        [0.5141],
        [0.4798],
        [0.4934],
        [0.5024],
        [0.5225],
        [0.4622],
        [0.5261],
        [0.5177],
        [0.5048],
        [0.5240],
        [0.5191],
        [0.5015],
        [0.5147],
        [0.4923],
        [0.4576],
        [0.6168],
        [0.4885],
        [0.5313],
        [0.4751],
        [0.4979],
        [0.4955],
        [0.5229],
        [0.4992],
        [0.5071],
        [0.5034],
        [0.5128],
        [0.5214],
        [0.5148],
        [0.4768],
        

In [19]:
# Toy example: build a vocabulary directly from a handful of sentences.
seqs = ["I like air.", "I like the sun.", "He likes the sun."]
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
# Every tokenized sentence is terminated with the EOS marker.
tk_seqs = [tokenizer(s) + [EOS_TOKEN] for s in seqs]
# NOTE(review): binding the result to `vocab` shadows the `vocab` factory
# imported from torchtext.vocab; the name is kept because later cells read it.
vocab = build_vocab_from_iterator(tk_seqs, specials=SPECIAL_TOKENS)
# Map out-of-vocabulary tokens to <unk>, matching how `voc` is configured,
# instead of raising on lookup.
vocab.set_default_index(vocab[UNK_TOKEN])

# numericalized sequences; must be tensor to be input to pad_sequence
nz_seqs = [th.tensor(vocab(tk_seq)) for tk_seq in tk_seqs]
nz_seqs_padded = pad_sequence(nz_seqs, padding_value=vocab[PAD_TOKEN])

In [22]:
# 100-dimensional GloVe 6B vectors for the toy-vocabulary experiment.
embedding_dim = 100
embedding_glove = GloVe(name="6B", dim=embedding_dim)

In [119]:
freeze_embedding = True

# Row i of the matrix is the GloVe vector of the i-th vocabulary token.
vocab_itos = vocab.get_itos()
embedding_matrix = th.stack([embedding_glove[token] for token in vocab_itos])

embedding = nn.Embedding.from_pretrained(
    embedding_matrix,
    freeze=freeze_embedding,
    padding_idx=vocab[PAD_TOKEN],
)

In [None]:
# TODO: factor the embedding-matrix construction repeated in the surrounding cells into this helper.
"""
* create_glove_embedding(embedding_dim, vocab, freeze=True, padding_idx=vocab[PAD_TOKEN])
"""

In [None]:
# NOTE(review): `dir_data` and `DisasterTweetsDataset` are not defined anywhere in the
# visible part of this notebook; this cell presumably came from another notebook and
# will fail on a fresh kernel until they are provided.
df = pd.read_csv(f"{dir_data}/train.csv")
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
# Rebinds `full_ds`, replacing any dataset previously stored under that name.
full_ds = DisasterTweetsDataset(df, tokenizer)

In [None]:
# 100-dimensional GloVe 6B vectors (same configuration as the earlier GloVe cells).
embedding_dim = 100
embedding_glove = GloVe(name='6B', dim=embedding_dim)

In [None]:
freeze_embedding = True

# "i to s": list of tokens ordered by their vocabulary index
vocab_itos = full_ds.vocab.get_itos()
# Stack one GloVe vector per token so that row i belongs to token i.
embedding_matrix = th.stack([embedding_glove[token] for token in vocab_itos])

embedding = nn.Embedding.from_pretrained(
    embedding_matrix,
    freeze=freeze_embedding,
    padding_idx=full_ds.pad_idx,
)

In [None]:
def lengths_of_sequences(sequences: th.Tensor, eos_idx: int) -> th.Tensor:
    """
    Finds the position of eos_idx in every sequence, which is used as the
    length of that sequence (the number of tokens preceding the EOS marker).

    Args:
        sequences: padded batch of shape (N, max_L), where N is the number of
            sequences and max_L the padded length.
        eos_idx: vocabulary index of the end-of-sequence token.

    Returns:
        1-D tensor with N entries; entry i is the column of eos_idx in row i.

    Raises:
        ValueError: if `sequences` is not 2-D, or if any row does not contain
            exactly one occurrence of eos_idx. Without this check a missing or
            duplicated EOS would silently yield a result whose entries no
            longer line up with the rows of the batch.
    """
    if sequences.dim() != 2:
        raise ValueError(
            f"expected a 2-D (N, max_L) tensor, got shape {tuple(sequences.shape)}"
        )

    eos_mask = sequences == eos_idx
    eos_per_row = eos_mask.sum(dim=1)
    if not bool((eos_per_row == 1).all()):
        raise ValueError("every sequence must contain exactly one eos_idx")

    # nonzero walks the mask row by row, so with exactly one hit per row the
    # column indices come out ordered by sequence.
    _, eos_positions = eos_mask.nonzero(as_tuple=True)
    return eos_positions


def test_lengths_of_sequences():
    """Checks the EOS columns recovered from a small hand-built batch."""
    eos_columns = [5, 4, 7, 9]
    x = th.zeros([len(eos_columns), 10])
    # Place a single EOS marker (id 1) in each row.
    for row, col in enumerate(eos_columns):
        x[row, col] = 1
    res = lengths_of_sequences(x, 1)
    assert (res == th.tensor(eos_columns)).all()


test_lengths_of_sequences()


class LitVanillaRNN(pl.LightningModule):
    """
    Vanilla RNN binary classifier over the "text" field of a batch.

    Token ids are embedded, packed using per-sequence lengths derived from the
    EOS position, and run through a (possibly stacked) nn.RNN. The final hidden
    state of the last layer goes through Linear + LogSoftmax to give
    log-probabilities over 2 classes. Trained with NLL loss.

    NOTE(review): this definition reuses the name of the regression model
    defined earlier in the notebook and replaces it once this cell runs.

    Args:
        embedding: embedding layer used to look up token ids.
        hidden_size: size of the RNN hidden state.
        lr: learning rate passed to Adam.
        dropout: dropout probability between stacked RNN layers.
        num_layers: number of stacked RNN layers.
        eos_idx: vocabulary index of the EOS token. When None (the default),
            the global `full_ds.eos_idx` is read at forward time, as before.
    """
    def __init__(
        self, embedding: nn.Embedding, hidden_size: int = 128, lr: float = 1e-3, dropout: float = 0.5,
        num_layers: int = 1, eos_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.lr = lr
        self.dropout = dropout
        self.num_layers = num_layers
        self.eos_idx = eos_idx

        # TODO: try using bidirectional in rnn
        self.rnn = nn.RNN(
            self.embedding.embedding_dim, self.hidden_size, batch_first=True,
            # nn.RNN applies dropout only between stacked layers; with a single
            # layer a non-zero value does nothing except emit a warning.
            dropout=self.dropout if self.num_layers > 1 else 0.0,
            num_layers=self.num_layers,
        )
        self.fc = nn.Linear(self.hidden_size, 2)
        self.log_softmax = nn.LogSoftmax(dim=-1)

        self.acc = torchmetrics.Accuracy()

    def forward(self, x: th.Tensor) -> th.Tensor:
        """
        Args:
            x: 3-element batch; only element 2 (the "text" part) is used.

        Returns:
            Log-probabilities with one row per sequence and 2 columns.
        """
        assert len(x) == 3
        batch_text = x[2]  # index points to "text" part of data (not "keyword" or "location")

        # Prefer the index given at construction; otherwise fall back to the
        # notebook-level dataset.
        eos_idx = self.eos_idx if self.eos_idx is not None else full_ds.eos_idx
        lengths = lengths_of_sequences(batch_text, eos_idx)
        lengths = lengths.cpu()  # need to be on cpu for pack_padded_sequence to work
        batch_text_embedded = self.embedding(batch_text)
        packed = pack_padded_sequence(batch_text_embedded, lengths, batch_first=True, enforce_sorted=False)
        # TODO: try using a randomly generated hidden state (instead of the zero vector default)
        _, h_n = self.rnn(packed)
        # Final hidden state of the top layer.
        x = h_n[-1, :, :]

        x = self.fc(x)
        x = self.log_softmax(x)
        return x

    def training_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        return self.generalized_step(batch, batch_idx, "train")

    def validation_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        return self.generalized_step(batch, batch_idx, "val")

    def test_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        return self.generalized_step(batch, batch_idx, "test")

    def generalized_step(self, batch: th.Tensor, batch_idx: int, label: str) -> th.Tensor:
        """
        Shared train/val/test step: forward pass, NLL loss, and logging under
        the `{label}_loss` / `{label}_acc` keys. Returns the loss.
        """
        x, y = batch
        predicted = self(x)
        loss = F.nll_loss(predicted, y)
        self.log(f"{label}_loss", loss)
        self.log(f"{label}_acc", self.acc(predicted, y))
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        # `optim` was never imported in this notebook; go through the torch alias.
        optimizer = th.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

In [None]:
# Learning-rate range test: sweep up to max_lr over num_training steps and plot the
# resulting curve with the suggested learning rate marked.
# NOTE(review): `auto_lr_find`, `gpus` and `trainer.tuner` depend on the installed
# pytorch_lightning version — confirm they exist in the version in use.
vrnn = LitVanillaRNN(embedding, dropout=0.5, lr=1e-4, num_layers=3)
trainer = pl.Trainer(auto_lr_find=True, gpus=1)
lr_finder = trainer.tuner.lr_find(vrnn, train_dl, max_lr=1e-1, num_training=1000)
fig = lr_finder.plot(suggest=True)
fig.show()
# lr_finder.suggestion()

# 3. Vanilla RNN

# Misc: code for future models

In [None]:
# for when you're building vocab from training dataset

seqs = ["I like air.", "I like the sun.", "He likes the sun."]
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
# Every tokenized sentence is terminated with the EOS marker.
tk_seqs = [tokenizer(s) + [EOS_TOKEN] for s in seqs]
# NOTE(review): binding the result to `vocab` shadows the `vocab` factory
# imported from torchtext.vocab; the name is kept because other cells read it.
vocab = build_vocab_from_iterator(tk_seqs, specials=SPECIAL_TOKENS)
# Map out-of-vocabulary tokens to <unk>, matching how `voc` is configured,
# instead of raising on lookup.
vocab.set_default_index(vocab[UNK_TOKEN])

# numericalized sequences; must be tensor to be input to pad_sequence
nz_seqs = [th.tensor(vocab(tk_seq)) for tk_seq in tk_seqs]
nz_seqs_padded = pad_sequence(nz_seqs, padding_value=vocab[PAD_TOKEN])

Parse and clean the data into a DataFrame

In [3]:
# Use example code from
# https://colab.research.google.com/drive/1Zv6MARGQcrBbLHyjPVVMZVnRWsRnVMpV#scrollTo=7igYuRaV4bF7

# The archive holds one JSON document per line; parse each line into a record.
data = []
with gzip.open("reviews_Office_Products_5.json.gz") as reviews_file:
    data.extend(json.loads(raw_line.strip()) for raw_line in reviews_file)

# number of parsed records (one per line of the file)
print(f"Total num items in dataset: {len(data)}")

# Keep only the review text and the overall rating, with a snake_case column name.
df = (
    pd.DataFrame.from_dict(data)
    .loc[:, ["reviewText", "overall"]]
    .rename(columns={"reviewText": "review_text"})
)

FileNotFoundError: [Errno 2] No such file or directory: 'reviews_Office_Products_5.json.gz'

In [32]:
# manual mapping from "overall rating" to "sentiment"
MIN_RATING = 1
MAX_RATING = 5
NEUTRAL_RATING = 3.5


def rating_to_sentiment(rating, neutral=NEUTRAL_RATING):
    """
    Piecewise-linear map from a rating to a sentiment score.

    MIN_RATING maps to -1, `neutral` maps to 0 and MAX_RATING maps to +1, with
    one linear segment below `neutral` and another one above it. Ratings
    outside [MIN_RATING, MAX_RATING] trip the assertion.
    """
    assert MIN_RATING <= rating <= MAX_RATING
    if rating > neutral:
        # Upper segment: neutral -> 0, MAX_RATING -> +1.
        upper_span = MAX_RATING - neutral
        return (rating - neutral) / upper_span
    # Lower segment: MIN_RATING -> -1, neutral -> 0.
    lower_span = neutral - MIN_RATING
    return -1 + (rating - MIN_RATING) / lower_span


# Anchor points with the default neutral rating.
for rating, expected in ((3.5, 0), (5, 1), (1, -1)):
    assert rating_to_sentiment(rating) == expected
# Anchor and midpoint values with neutral moved to 3.
for rating, expected in ((3, 0), (4, 0.5), (2, -0.5)):
    assert rating_to_sentiment(rating, neutral=3) == expected
# A higher neutral point makes the same rating look less positive.
assert rating_to_sentiment(4) < rating_to_sentiment(4, neutral=3)
assert rating_to_sentiment(2) < rating_to_sentiment(2, neutral=3)

In [33]:
# Derive a sentiment score for every review from its overall star rating.
df["sentiment"] = df["overall"].apply(rating_to_sentiment)

In [34]:
# Preview the first rows, including the derived sentiment column.
df.head()

Unnamed: 0,review_text,overall,sentiment
0,"I bought my first HP12C in about 1984 or so, a...",5.0,1.0
1,WHY THIS BELATED REVIEW? I feel very obliged t...,5.0,1.0
2,I have an HP 48GX that has been kicking for mo...,2.0,-0.6
3,I've started doing more finance stuff recently...,5.0,1.0
4,For simple calculations and discounted cash fl...,5.0,1.0
