# Implementing a Sentiment Analysis Model

- IMDB Movie Reviews, Binary Pos/Neg Sentiment
- Prune vocabulary to 30,000 most common words
- Pad (or truncate) each input sequence to a fixed length of 512 tokens (the `max_size` default used in the code below)
- Inputs should be 512-dim vectors where each element corresponds to the vocab index of the corresponding word

In [1]:
import torch
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

In [2]:
# Load the training split of the IMDB reviews dataset.
train_iter = IMDB(split='train')

100%|██████████| 84.1M/84.1M [00:05<00:00, 14.2MB/s]


In [3]:
# Tokenizer shared by the vocabulary builder and the text pipeline.
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Lazily yield the token list of every sample in ``data_iter``.

    Each sample is unpacked as a two-element pair; the first element is
    ignored and the second (the review text) is passed through the
    module-level ``tokenizer``.
    """
    for sample in data_iter:
        _label, review = sample
        yield tokenizer(review)

# Build the vocabulary from the tokenized training reviews. '<unk>' and
# '<pad>' are registered as special tokens, and '<unk>' is installed as the
# default index so that lookups of unknown tokens resolve to it.
text_vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=['<unk>', '<pad>'],
)
text_vocab.set_default_index(text_vocab['<unk>'])

In [4]:
# Sanity check: tokenize a sample sentence and print its vocabulary indices.
print(text_vocab(tokenizer("Hello is it me you're looking for?")))

[4645, 10, 11, 78, 26, 9, 183, 296, 19, 55]


In [5]:
#define pipelines
def text_pipeline(x, max_size=512, vocab_size=30000):
    """Convert a raw review string into a fixed-length list of vocab indices.

    The text is tokenized with the module-level ``tokenizer`` and truncated
    to ``max_size`` tokens. Every index at or above ``vocab_size`` is
    replaced by the ``'<unk>'`` index, so the result only contains indices
    below ``vocab_size``. Sequences shorter than ``max_size`` are
    right-padded with the ``'<pad>'`` index.

    Args:
        x: Review text.
        max_size: Length of the returned list.
        vocab_size: Number of vocabulary indices to keep; any larger index
            is mapped to ``'<unk>'``.

    Returns:
        A list of exactly ``max_size`` integer indices.
    """
    unk_idx = text_vocab['<unk>']
    pad_idx = text_vocab['<pad>']

    # Look tokens up through the vocab object itself rather than through
    # ``get_stoi()``: the latter returns a full token->index dict on every
    # call (one call per token is very slow) and indexing that dict raises
    # KeyError for tokens that are not in the vocabulary, whereas the vocab
    # lookup falls back to the default ('<unk>') index.
    indices = text_vocab(tokenizer(x)[:max_size])

    # Prune the vocabulary: anything outside the kept range becomes '<unk>'.
    pruned = [idx if idx < vocab_size else unk_idx for idx in indices]

    # Right-pad up to the fixed length (no-op when already max_size long).
    pruned.extend([pad_idx] * (max_size - len(pruned)))
    return pruned

def label_pipeline(x):
    """Encode a sentiment label: ``'neg'`` becomes 0, anything else 1."""
    if x == 'neg':
        return 0
    return 1

In [6]:
# test pipelines
# Show the tokens sitting on either side of the pruning threshold used by
# text_pipeline (indices >= 30000 are replaced by '<unk>').
print(text_vocab.get_itos()[29999])
print(text_vocab.get_itos()[30000])
# Encode a short sentence containing both of those tokens, then confirm the
# encoded sequence has the fixed length produced by text_pipeline.
print(text_pipeline('hello, I saw the wanderings waned'))
print(len(text_pipeline('hello, I saw the wanderings waned')))

wanderings
waned
[4645, 4, 13, 220, 2, 29999, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

To complete the data preparation, we need to serve minibatches of a desired size from the underlying dataset. We can use the built-in DataLoader class from PyTorch to sample the dataset in batches. Before we do so, we need to define a function, collate_batch, that will tell the DataLoader how to preprocess each batch.

In [7]:
# define preprocessing
def collate_batch(batch):
    """Turn a list of ``(label, review)`` samples into a pair of tensors.

    Every label goes through ``label_pipeline`` and every review through
    ``text_pipeline``.

    Returns:
        ``(labels, reviews)`` where ``labels`` is a ``torch.long`` tensor and
        ``reviews`` is a ``torch.int32`` tensor built from the encoded
        reviews.
    """
    encoded = [
        (label_pipeline(raw_label), text_pipeline(raw_review))
        for raw_label, raw_review in batch
    ]
    targets = [pair[0] for pair in encoded]
    sequences = [pair[1] for pair in encoded]

    labels_tensor = torch.tensor(targets, dtype=torch.long)
    reviews_tensor = torch.tensor(sequences, dtype=torch.int32)
    return (labels_tensor, reviews_tensor)


The collate_batch function simply runs the labels and review strings through each respective pipeline and returns the batch as a tuple of tensors (labels_batch, reviews_batch). Once the collate_fn is defined, we simply load the dataset and configure the dataloaders:

In [8]:
# Load datasets and create dataloaders for batching
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

# Number of reviews per minibatch, shared by both loaders.
BATCH_SIZE = 4

train_iter, val_iter = IMDB(split=('train', 'test'))

# Convert the raw splits to map-style (indexable) datasets. This lets the
# DataLoader shuffle them and makes them re-iterable across epochs.
# NOTE(review): the raw splits appear to be single-pass, iterable-style
# datasets on which DataLoader rejects shuffle=True - confirm against the
# installed torchtext version.
train_iter = to_map_style_dataset(train_iter)
val_iter = to_map_style_dataset(val_iter)

# Shuffle the training data so that minibatches are not drawn in dataset
# order (presumably grouped by label in the raw split - verify), which
# would otherwise feed the model long runs of single-class batches.
trainloader = DataLoader(train_iter,
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         collate_fn=collate_batch)
# Evaluation order does not matter, so the validation loader is not shuffled.
valloader = DataLoader(val_iter,
                       batch_size=BATCH_SIZE,
                       shuffle=False,
                       collate_fn=collate_batch)

In [9]:
# test pipelines and collate_batch()
# Pull a single minibatch from the training loader and print the shapes of
# the label and review tensors produced by collate_batch, then stop.
for labels, reviews in trainloader:
  print(labels.shape)
  print(reviews.shape)
  break

torch.Size([4])
torch.Size([4, 512])


# Build TextClassification model

Now that the data is ready to go, we'll begin to construct the sentiment analysis model, step by step. First, we'll want to map each word in the input review to a word vector. To do this, we'll utilize an embedding layer, which, as you may recall from the last chapter, is a simple lookup table that stores an embedding vector that corresponds to each word. Unlike in previous examples, where we treated the learning of the word embeddings as a separate problem (i.e., by building a Skip-Gram model), we'll learn the word embeddings jointly with the sentiment analysis problem by treating the embedding matrix as a matrix of parameters in the full problem. We accomplish this by using the PyTorch primitives for managing embeddings (remember that input represents one full minibatch at a time, not just one movie review vector):

In [10]:
import torch.nn as nn

In [11]:
# Stand-alone embedding table with 30000 rows of 512-dim vectors; the row
# belonging to the '<pad>' token is designated as the padding index.
embedding = nn.Embedding(30000, 512, padding_idx=text_vocab['<pad>'])

In [12]:
#test embedding
# Feed a random (4, 500) batch of indices through the embedding table; the
# `high` bound of randint is exclusive, so every index is below 29999.
emb = embedding(torch.randint(high=29999,size=(4,500)))
# Last expression of the cell: displays the shape of the embedded batch.
emb.shape

torch.Size([4, 500, 512])

In [13]:
class TextClassifier(nn.Module):
    """Embedding -> LSTM cell unrolled over time -> dropout -> 2-way head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_size: Size of the LSTM hidden and cell states.
        padding_idx: Index of the padding token in the embedding table.
        dropout: Dropout probability applied to the final hidden state.

    The defaults reproduce the original hard-coded configuration
    (30000 / 512 / 512 / 1 / 0.5), so ``TextClassifier()`` is unchanged.
    """

    def __init__(self, vocab_size=30000, embedding_dim=512, hidden_size=512,
                 padding_idx=1, dropout=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.layer_1 = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx)
        self.layer_2 = nn.LSTMCell(input_size=embedding_dim,
                                   hidden_size=hidden_size)
        self.layer_3 = nn.Dropout(p=dropout)
        # NOTE(review): the head ends in Sigmoid + BatchNorm1d, while the
        # training loop in this file feeds the output to CrossEntropyLoss,
        # which is documented to expect unnormalized logits. Left unchanged
        # to keep the layer layout stable - confirm whether it is intended.
        self.layer_4 = nn.Sequential(
            nn.Linear(hidden_size, 2),
            nn.Sigmoid(),
            nn.BatchNorm1d(2))

    def forward(self, x):
        """Classify a batch of index sequences.

        Args:
            x: Integer tensor of token indices, indexed as (batch, time).

        Returns:
            Tensor of shape (batch, 2) produced by the classification head.
        """
        # (batch, time, emb) -> (time, batch, emb) so we can step over time.
        embedded = self.layer_1(x).permute(1, 0, 2)
        batch_size = embedded.shape[1]

        # Start from a zero state created on the same device/dtype as the
        # embeddings. A random initial state makes the output
        # nondeterministic even in eval mode, and a CPU-only state breaks
        # once the model lives on another device.
        h = embedded.new_zeros(batch_size, self.hidden_size)
        c = embedded.new_zeros(batch_size, self.hidden_size)

        for step_input in embedded.unbind(0):
            h, c = self.layer_2(step_input, (h, c))

        # Dropout is applied to the final output only, not to the hidden
        # state that is fed back into the recurrence at every time step.
        return self.layer_4(self.layer_3(h))



# Training Loop

*   The training loop takes approximately 2 minutes per batch.  Therefore, we set N_EPOCHS = 1 and break after the first batch so you can execute the file.
*   To do a complete training, set N_EPOCHS = 40 and remove the break statement.



In [14]:
import torch.optim as optim 

In [17]:
N_EPOCHS = 1
model = TextClassifier()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

for epoch in range(N_EPOCHS):
    # Make sure dropout and batch-norm layers are in training mode.
    model.train()
    running_loss = 0.0
    num_batches = 0
    for labels, inputs in trainloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
        # This code takes about 2 min per batch
        # We'll stop after first batch
        break
    # Report the mean loss over the batches actually processed this epoch
    # instead of only the last batch's loss; max() avoids a division by
    # zero (and an undefined `loss`) if the loader yielded nothing.
    epoch_loss = running_loss / max(num_batches, 1)
    print(f'Epoch: {epoch} Loss: {epoch_loss}')


Epoch: 0 Loss: 1.0499241352081299
