# Implementing a Sentiment Analysis Model

- IMDB Movie Reviews, Binary Pos/Neg Sentiment
- Prune vocabulary to 30,000 most common words
- Pad each input sequence up to 500 words
- Inputs should be 500 dim vectors where each element correspnds to the vocab index of the corresponding word

In [1]:
import torch
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

In [None]:
#@title
def build_vocab():

In [None]:
#@title
text_vocab = build_vocab(yield_tokens(train_iter),
                         max_size=30000, 
                         specials=['<unk>', '<pad>'])
text_vocab.set_default_index(text_vocab['<unk>'])


In [3]:
# Load dataset
train_iter = IMDB(split=('train'))

In [4]:
# Define tokenizer and build vocabulary
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

# build vocab from iterator and add a list of any special tokens
text_vocab = build_vocab_from_iterator(yield_tokens(train_iter), 
                                       specials=['<unk>', '<pad>'])
text_vocab.set_default_index(text_vocab['<unk>'])

In [6]:
print(text_vocab(tokenizer("Hello is it me you're looking for?")))

[4645, 10, 11, 78, 26, 9, 183, 296, 19, 55]


In [5]:
# Alternate way
# rewrite build_vocab_from_iterator
# https://github.com/pytorch/text/blob/main/torchtext/vocab/vocab_factory.py

In [6]:
# # Define tokenizer and build vocabulary
# tokenizer = get_tokenizer('basic_english')

# def yield_tokens(data_iter):
#     for _, text in data_iter:
#         yield tokenizer(text)

# # build vocab from iterator and add a list of any special tokens
# full_vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=['<unk>', '<pad>'])
# full_vocab.set_default_index(full_vocab['<unk>'])
# text_vocab = lambda x: full_vocab(x) if full_vocab.get_stoi()[x] < 30000 else '<unk>'

In [5]:
# print(full_vocab.get_itos()[29999])
# print(full_vocab.get_itos()[30000])
# #print(text_vocab(full_vocab.get_itos()[29999]))
# print(text_vocab('wanderings'))
# print(text_vocab(full_vocab.get_itos()[30000]))


In [7]:
#define pipelines
def text_pipeline(x, max_size=500):
   text = tokenizer(x)
   
   # reduce vocab size
   pruned_text = []
   for token in text:
     if text_vocab.get_stoi()[token] >= 30000:
       token = '<unk>'
     pruned_text.append(token)
   
   # pad sequence or truncate
   if len(pruned_text) <= max_size:
     pruned_text += ['<pad>'] * (max_size - len(pruned_text))
   else:
     pruned_text = pruned_text[0:max_size]
   return text_vocab(pruned_text)

label_pipeline = lambda x: (0 if (x == 'neg') else 1)

In [8]:
# test pipelines
print(text_vocab.get_itos()[29999])
print(text_vocab.get_itos()[30000])
print(text_pipeline('hello, I saw the wanderings waned'))
print(len(text_pipeline('hello, I saw the wanderings waned')))

wanderings
waned
[4645, 4, 13, 220, 2, 29999, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

To complete the data preparation, we need to serve minibatches of a desired size from the underlying dataset. We can use the built-in DataLoader class from PyTorch to sample teh dataset in batches.  Before we do so, we need to define a function, collate_batch, that will tell the DataLoader how to preprocess each batch.

In [9]:
# define preprocessing
def collate_batch(batch):
  label_list, text_list = [], [] 
  for label, review in batch:
    label_list.append(label_pipeline(label))
    text_list.append(text_pipeline(review))
  return (torch.tensor(label_list, dtype=torch.float32),
          torch.tensor(text_list, dtype=torch.float32))


The collate_batch function simply runs the labels and review strings through each respective pipeline and returns the batch as a tuple of tensors (labels_batch, reviews_batch). Once the collate_fn is defined, we simply load the dataset and configure the dataloaders:

In [10]:
# Load datasets and create dataloaders for batching
from torch.utils.data import DataLoader

train_iter, val_iter = IMDB(split=('train','test'))
trainloader = DataLoader(train_iter, 
                         batch_size = 1, 
                         shuffle=False,
                         collate_fn=collate_batch)
valloader = DataLoader(val_iter, 
                       batch_size = 1, 
                       shuffle=False,
                       collate_fn=collate_batch)

In [11]:
# test pipelines and collate_batch()
for labels, reviews in trainloader:
  print(labels.shape)
  print(reviews.shape)
  break

torch.Size([1])
torch.Size([1, 500])


# Build TextClassification model

Now that the data is ready to go, we'll begin to construct the sentiment analysis model, step by step. First, we'll want to map each word in the input review to a word vector. To do this, we'll utilize an  embedding layer, which, as you may recall from the last chapter, is a simple lookup table that stores an embedding vector that corresponds to each word. Unlike in previous examples, where we treated the learning of the word embeddings as a separate problem (i.e., by building a Skip-Gram model), we'll learn the word embeddings jointly with the sentiment analysis problem by treating the embedding matrix as a matrix of parameters in the full problem. We accomplish this by using the PyTorch primitives for managing embeddings (remember that input represents one full minibatch at a time, not just one movie review vector):

In [13]:
import torch.nn as nn

In [14]:
embedding = nn.Embedding(
                      num_embeddings=30000,
                      embedding_dim=512,
                      padding_idx=text_vocab.get_stoi()['<pad>'])

In [18]:
#test embedding
emb = embedding(torch.randint(high=29999,size=(4,500)))
emb.shape

torch.Size([4, 500, 512])

In [19]:
class TextClassifier(nn.Module):
  def __init__(self):
    super(TextClassifier,self).__init__()
    self.layer_1 = nn.Embedding(
                      num_embeddings=30000,
                      embedding_dim=512,
                      padding_idx=1)                      
    self.layer_2 = nn.Sequential(
                      nn.LSTMCell(input_size=512, hidden_size=512),
                      nn.Dropout(p=0.5))
    self.layer_3 = nn.Sequential(
                      nn.Linear(512, 2),
                      nn.Sigmoid(),
                      nn.BatchNorm1d(2))
  def forward(self, x):
    x = self.layer_1(x)
    x = self.layer_2(x)
    return self.layer_3(x)



# Training Loop

In [62]:
import torch.optim as optim 

In [63]:
N_EPOCHS = 10
model = TextClassifier()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters())

for epoch in range(N_EPOCHS):
  running_loss = 0
  for labels, inputs in trainloader:
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = loss_fn(outputs, labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
  print(f'Epoch: {epoch} Loss: {loss.item()}')


RuntimeError: ignored

# Archive

In [None]:
# Build dataloader from iterable dataset, shuffle must be False
# use collate_fn for preprocessing pipeline
def collate_batch(batch):
  #for label, text in batch
  #return label_batch, text_batch
  

trainloader = DataLoader(train_iter, 
                         batch_size = 100, 
                         shuffle=False, 
                         collate_fn=collate_batch)

In [None]:
# Pass in list of tokens, get indices
print(text_vocab(['here', 'we', 'go']))
# pass in index, get tokens
print(text_vocab.get_itos()[0:10])



### Limiting vocab for 30000 tokens is a pain
see https://github.com/pytorch/text/blob/main/torchtext/vocab/vocab_factory.py

In [None]:
from collections import Counter
from torchtext.vocab import vocab
from typing import Dict, Iterable, Optional, List
from collections import Counter, OrderedDict

def build_vocab(iterator: Iterable, tokenizer, max_size: int = 30000,
                min_freq: int = 1, specials: Optional[List[str]] = None, 
                special_first: bool = True) -> Vocab:

    counter = Counter()
    for _, text in iterator:
        counter.update(tokenizer(text))

    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[0])
    sorted_by_freq_tuples.sort(key=lambda x: x[1], reverse=True)
    if len(sorted_by_freq_tuples) > max_size:
      sorted_by_freq_tuples = sorted_by_freq_tuples[0:max_size] 
    ordered_dict = OrderedDict(sorted_by_freq_tuples)

    word_vocab = vocab(ordered_dict, min_freq=min_freq)
    return word_vocab

In [None]:
word_vocab = build_vocab(train_iter, tokenizer, specials = ['<unk>'])
print(len(word_vocab))
print(word_vocab.get_itos()[0:10])

In [None]:
train_iter = IMDB(split='train')
trainX = [torch.tensor(text_vocab(tokenizer(text))) for _, text in train_iter]
# list of 25000 variable size tensors

In [None]:
trainX_pad = pad_sequence(trainX, batch_first=True)

In [None]:
label_dict = {'neg': 0, 'pos': 1}
train_iter = IMDB(split='train')
trainY = torch.tensor([label_dict[tag] for tag, _ in train_iter])

In [None]:
trainX_pad.shape, len(trainY)

In [None]:
class IMDBDataset():
    def __init__(self, X, Y):
        self.num_examples = len(X)
        self.inputs = X
        self.tags = Y
        self.ptr = 0

    def minibatch(self, size):
        ret = None
        if self.ptr + size < len(self.inputs):
            ret = self.inputs[self.ptr:self.ptr+size],self.tags[self.ptr:self.ptr+size]
        else:
            ret = np.concatenate((self.inputs[self.ptr:],
                  self.inputs[:size-len(
                  self.inputs[self.ptr:])])),
                  np.concatenate((self.tags[self.ptr:],
                  self.tags[:size-len(
                  self.tags[self.ptr:])]))
        self.ptr = (self.ptr + size) % len(self.inputs)
        return ret

In [None]:
train = IMDBDataset(trainX, trainY)
val = IMDBDataset(testX, testY)

## Define LSTM Model for Sentiment Analysis

In [None]:
embedding = nn.Embedding(emb_size = 30000, 
                         in_size = 512)

In [None]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
  def __init__(self, emb_size, in_size, hidden_size, keep_prob):
    super(SentimentLSTM, self).__init__()
    self.embedding = nn.Embedding(emb_size, in_size)
    self.lstm = nn.LSTM(in_size,
                   hidden_size = hidden_size, 
                   num_layers=2,
                   dropout = keep_prob)
    self.out_layer = nn.Linear(512,2)

  def forward(self, x):
    x = self.lstm(self.embedding(x))
    return self.out_layer(x)
    

In [None]:
lstm = SentimentLSTM(emb_size = 30000, 
                     in_size = 512, 
                     hidden_size = 512, 
                     keep_prob = 0.5) 

# Training the LSTM

## TO-DO: run this for new pytorch code

In [None]:
# Parameters
training_epochs = 1000
batch_size = 32
display_step = 1

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), 
                       lr=0.001, 
                       betas=(0.9,0.999), 
                       eps=1e-08)
writer = SummaryWriter()

for epoch in range(training_epochs):
  running_loss = 0
  for inputs, labels in trainloader:
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = loss_fn(outputs, labels)
    loss.backward()
    optimizer.step()
    running_loss += loss
  

  writer.add_scalar('Loss/train', running_loss/len(trainloader), epoch)
  #if (epoch % 100 == 0):
  print(f'Epoch: {epoch} Loss: {running_loss/len(trainloader)}')

In [None]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = AG_NEWS()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59) 