In [1]:
#!pip install torch==1.6.0 torchvision==0.7.0

In [2]:
import torch
print(torch.__version__)
from torchtext import data

1.7.0+cu101


In [3]:
SEED = 1111
torch.manual_seed(SEED)
TEXT = data.Field(tokenize='spacy', include_lengths=True) # If spacy not passed then it will split the text on the basis of spaces.
LABEL = data.LabelField(dtype=torch.float)

Downloading IMDB dataset from torchtext

In [4]:
%%time
from torchtext import datasets

train, test = datasets.IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   0%|          | 164k/84.1M [00:00<00:59, 1.42MB/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 73.6MB/s]


CPU times: user 1min 18s, sys: 8.46 s, total: 1min 26s
Wall time: 1min 27s


In [5]:
len(train), len(test)

(25000, 25000)

In [6]:
print(vars(train.examples[0]))

{'text': ['This', 'is', 'an', 'awesome', 'classic', 'monster', 'flick', 'from', 'the', '50', "'s", '!', 'I', 'just', 'love', 'the', 'look', 'of', 'the', '50', "'s", 'in', 'general', 'like', 'the', 'cars', 'and', 'the', 'music', '.', 'Anyway', ',', 'I', 'love', 'the', 'way', 'the', 'blob', 'looks', '.', 'I', 'love', 'when', 'the', 'everyone', 'is', 'at', 'the', 'late', 'night', 'horror', 'flick', 'at', 'the', 'theater', 'and', 'the', 'blob', 'comes', 'in', 'and', 'crashes', 'the', 'party', '.', 'Another', 'thing', 'I', 'love', 'about', 'it', 'is', 'that', 'it', 'takes', 'place', 'all', 'in', 'one', 'night', ',', 'just', 'like', 'Halloween', 'II.<br', '/><br', '/>When', 'Steve', 'and', 'Jane', 'are', 'making', 'out', ',', 'they', 'see', 'a', 'meteor', 'fall', 'from', 'space', '.', 'Inside', 'the', 'meteor', 'is', 'the', 'blob', '.', 'Whenever', 'the', 'blob', 'consumes', 'a', 'person', ',', 'it', 'grows', 'bigger', 'and', 'bigger', '.', 'They', 'try', 'to', 'convince', 'the', 'people', '

# We only have train and test data in IMDB dataset
Let's create validation data out of train data


In [7]:
# The following code automatically downloads the IMDb dataset and splits it into the 
# canonical train/test splits as torchtext.datasets objects.
import random
train, valid = train.split(random_state = random.seed(SEED))

Download the word embeddings - "glove.6B.100d". The reason to use pre_trained word embeddings is they are initialized with pre-trained vectors. These pre-trained vectors already have words with similar semantic meaning close together in vector space. This gives our embeding layer a good initialization as it doen not have to learn these relations from scratch.

In [8]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

In [9]:
#!unzip '/content/glove.6B.zip'

In [10]:
#import torchtext.vocab as vocab

In [11]:
#glove = vocab.GloVe(name='6B', dim=100)
#print('Loaded {} words'.format(len(glove.itos)))

In [12]:
# We have to build a vocabulary. A look up table where every unique word is mapped to a integer
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train, 
                 max_size=MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train)

.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                          
100%|█████████▉| 399098/400000 [00:15<00:00, 26174.48it/s]

In [13]:
len(TEXT.vocab), len(LABEL.vocab)
# The two additional tokens in TEXT.vocab is <unk>, <pad>

(25002, 2)

In [14]:
#vars(TEXT.vocab)

In [15]:
vars(LABEL.vocab)

{'freqs': Counter({'neg': 8726, 'pos': 8774}),
 'itos': ['pos', 'neg'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'neg': 1, 'pos': 0}),
 'vectors': None}

In [16]:
# Most common words in the voabulary with frequencies
TEXT.vocab.freqs.most_common(10)

[('the', 202592),
 (',', 192270),
 ('.', 165440),
 ('a', 109087),
 ('and', 108859),
 ('of', 100816),
 ('to', 93674),
 ('is', 76066),
 ('in', 61473),
 ('I', 53943)]

In [17]:
# To check the vocabulary
TEXT.vocab.itos[:10]     #itos - integer to string

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']

In [18]:
# Check the labels    # stor - string to integer
LABEL.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'neg': 1, 'pos': 0})

Final step is to create the iterators.
We'll use a BucketIterator which is a special type of
iterator that will return a batch of examples where each
example is of a similar length, minimizing the amount of 
padding per example.

In [19]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test),
    batch_size = BATCH_SIZE,
    sort_within_batch=True,
    device = device)

In [20]:
# Build the RNN Model

import torch.nn as nn

In [21]:
class RNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional=bidirectional, dropout=dropout)
    self.linear = nn.Linear(hidden_dim*2, output_dim)
    self.dropout = nn.Dropout(dropout)
  def forward(self, text, text_length):
    embedded = self.dropout(self.embedding(text))
    pack_embedding = nn.utils.rnn.pack_padded_sequence(embedded, text_length.cpu())
    pack_output, (hidden, cell) = self.rnn(pack_embedding)
    output, output_length = nn.utils.rnn.pad_packed_sequence(pack_output)
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
    return self.linear(hidden)

In [22]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

In [23]:
embeddings = TEXT.vocab.vectors
embeddings.shape

torch.Size([25002, 100])

In [24]:
model.embedding.weight.data.copy_(embeddings)

tensor([[-1.1065,  0.1614, -0.6850,  ..., -0.9348, -0.4778,  1.1286],
        [ 0.2091,  0.2932,  0.3151,  ...,  1.1860, -1.5726,  1.1354],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.5523,  0.9965, -1.0090,  ..., -0.7429, -0.5860,  0.2106],
        [-0.4512,  0.6889, -0.0336,  ..., -0.2433,  0.4338,  0.6551],
        [ 0.7438,  1.1903, -0.4427,  ..., -0.2208,  0.5125,  0.4214]])

In [25]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [26]:
model.embedding.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.5523,  0.9965, -1.0090,  ..., -0.7429, -0.5860,  0.2106],
        [-0.4512,  0.6889, -0.0336,  ..., -0.2433,  0.4338,  0.6551],
        [ 0.7438,  1.1903, -0.4427,  ..., -0.2208,  0.5125,  0.4214]])

In [27]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [28]:
model = model.to(device)
criterion = criterion.to(device)

In [29]:
def model_accuracy(predictions, y):
  pred = torch.round(torch.sigmoid(predictions))
  actual = (pred == y).float()
  acc = actual.sum() / len(actual)
  return acc

In [30]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  for batch in iterator:
    optimizer.zero_grad()    # zero the gradients
    #print(batch.text)
    text, text_length = batch.text
    #print("*")
    predictions = model(text, text_length).squeeze(1)
    #print("*")
    loss = criterion(predictions, batch.label.float())   # Calculate the loss
    acc = model_accuracy(predictions, batch.label)
    loss.backward()  # calculate the gradient of each parameter with loss.backward()
    optimizer.step() # update the parameters using the gradients and optimizer algorithm
    epoch_loss += loss.item()
    epoch_acc += acc.item()
        
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In eval function we do not want to update the parameters when evaluating.
So, we don't need optimizer.zero_grad(), loss.backward() and optimizer.step().

In [31]:
def eval(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()

  """No gradients are calculated on PyTorch operations inside the with 
  no_grad() block. This causes less memory to be 
  used and speeds up computation"""
  with torch.no_grad(): 
    for batch in iterator:
      text, text_length = batch.text
      predictions = model(text, text_length).squeeze(1)
      loss = criterion(predictions, batch.label)
      acc = model_accuracy(predictions, batch.label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)  

In [32]:
import time
EPOCHS = 5
opt_valid_loss = float('inf')

for epoch in range(EPOCHS):
  start_time = time.time()
  #print(start_time)
  train_loss, train_acc = train(model, train_iter, optimizer, criterion)
  print("*")
  valid_loss, valid_acc = eval(model, valid_iter, criterion)
  print("*")
  end_time = time.time()
  if valid_loss < opt_valid_loss:
    opt_valid_loss = valid_loss
    torch.save(model.state_dict(), 'LSTM-RNN-model.pt')
    
    print(f'Epoch: {epoch+1}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

100%|█████████▉| 399098/400000 [00:29<00:00, 26174.48it/s]

*
*
Epoch: 1
	Train Loss: 0.676 | Train Acc: 0.57%
	 Val. Loss: 0.606 |  Val. Acc: 0.70%
*
*
*
*
Epoch: 3
	Train Loss: 0.518 | Train Acc: 0.75%
	 Val. Loss: 0.383 |  Val. Acc: 0.83%
*
*
Epoch: 4
	Train Loss: 0.385 | Train Acc: 0.83%
	 Val. Loss: 0.351 |  Val. Acc: 0.85%
*
*
Epoch: 5
	Train Loss: 0.297 | Train Acc: 0.88%
	 Val. Loss: 0.285 |  Val. Acc: 0.88%


As we can see that the accuracy is poor. So, we need to improve the model by hypertuning it or to use different Neural Network.

In [33]:
model.load_state_dict(torch.load('/content/LSTM-RNN-model.pt'))

test_loss, test_acc = eval(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.2f}%')

Test Loss: 0.304 | Test Acc: 0.88%


In [43]:
import spacy
nlp = spacy.load('en')

def predict(model, text):
  model.eval()
  tokenize = [token.text for token in nlp.tokenizer(text)]
  index = [TEXT.vocab.stoi[t] for t in tokenize]
  length = [len(index)]
  tensor = torch.LongTensor(index).to(device).unsqueeze(1)
  len_tensor = torch.LongTensor(length)
  pred = torch.sigmoid(model(tensor, len_tensor))
  return pred.item()

In [44]:
predict(model, "This person is bad")

0.99434894323349

In [45]:
predict(model, "I am lovable")

0.1871410459280014