In [1]:
import torch
from torchtext import data
from torch.utils.data import Dataset

In [2]:
SEED = 1111
torch.manual_seed(SEED)
TEXT = data.Field(tokenize='spacy') # If spacy not passed then it will split the text on the basis of spaces.
LABEL = data.LabelField(dtype=torch.float)

Downloading IMDB dataset from torchtext

In [3]:
%%time
from torchtext import datasets

train, test = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 10.1MB/s]


CPU times: user 1min 45s, sys: 12.9 s, total: 1min 57s
Wall time: 2min 6s


In [4]:
len(train), len(test)

(25000, 25000)

In [5]:
print(vars(train.examples[0]))

{'text': ['I', 'have', 'no', 'idea', 'how', 'IMDb', 'sorts', 'reviews', 'but', 'I', 'do', 'know', 'that', ',', 'as', 'happens', 'often', 'on', 'Amazon.com', ',', 'there', 'are', 'a', 'striking', 'number', 'of', 'very', 'negative', 'reviews', 'for', 'this', 'movie', 'which', 'repeat', 'the', 'same', ',', 'somewhat', 'obscure', 'talking', 'points', ',', 'almost', 'verbatim', '.', 'A', 'campaign', '?', 'Only', 'IMDb', 'knows.<br', '/><br', '/>As', 'for', 'this', 'movie', ':', 'it', "'s", 'fine', '.', 'It', "'s", 'a', 'funny', ',', 'cute', 'and', 'very', 'straightforward', 'movie.<br', '/><br', '/>It', "'s", 'been', 'over', 'a', 'decade', 'since', 'I', 'worked', 'in', 'Brooklyn', ',', 'lived', 'in', 'Queens', 'and', 'visited', 'relatives', 'in', 'the', 'South', 'Bronx', '.', 'But', 'I', 'found', 'nothing', 'inauthentic', 'or', 'exploitative', 'about', 'these', 'kids', '.', 'Is', 'the', 'grandmother', 'a', 'bizarre', 'character', '?', 'Yup', '.', 'Do', 'the', 'dialogue', 'and', 'plot', 'ack

# We only have train and test data in IMDB dataset
Let's create validation data out of train data


In [7]:
# The following code automatically downloads the IMDb dataset and splits it into the 
# canonical train/test splits as torchtext.datasets objects.
import random
train, valid = train.split(random_state = random.seed(SEED))

In [10]:
# We have to build a vocabulary. A look up table where every unique word is mapped to a integer
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train)

In [11]:
len(TEXT.vocab), len(LABEL.vocab)
# The two additional tokens in TEXT.vocab is <unk>, <pad>

(25002, 2)

In [38]:
vars(LABEL.vocab)

{'freqs': Counter({'neg': 8726, 'pos': 8774}),
 'itos': ['pos', 'neg'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'neg': 1, 'pos': 0}),
 'vectors': None}

In [15]:
# Most common words in the voabulary with frequencies
TEXT.vocab.freqs.most_common(10)

[('the', 202064),
 (',', 192632),
 ('.', 165386),
 ('and', 109129),
 ('a', 108943),
 ('of', 100049),
 ('to', 93051),
 ('is', 76037),
 ('in', 61344),
 ('I', 54271)]

In [16]:
# To check the vocabulary
TEXT.vocab.itos[:10]     #itos - integer to string

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']

In [17]:
# Check the labels    # stor - string to integer
LABEL.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'neg': 1, 'pos': 0})

Final step is to create the iterators.
We'll use a BucketIterator which is a special type of
iterator that will return a batch of examples where each
example is of a similar length, minimizing the amount of 
padding per example.

In [19]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test),
    batch_size = BATCH_SIZE,
    device = device)

In [20]:
print(device)

cuda


In [21]:
train_iter

<torchtext.data.iterator.BucketIterator at 0x7f33697e94e0>

In [22]:
# Build the RNN Model

import torch.nn as nn

In [23]:
class RNN(nn.Module):
  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim)
    self.linear = nn.Linear(hidden_dim, output_dim)
  def forward(self, text):
    embedded = self.embedding(text)
    output, hidden = self.rnn(embedded)
    assert torch.equal(output[-1, :, :], hidden.squeeze(0))
    return self.linear(hidden.squeeze(0))

In [24]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [25]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters(),lr=  1e-3)
criterion = nn.BCEWithLogitsLoss()

In [26]:
model = model.to(device)
criterion = criterion.to(device)

In [27]:
def model_accuracy(predictions, y):
  pred = torch.round(torch.sigmoid(predictions))
  actual = (pred == y).float()
  acc = actual.sum() / len(actual)
  return acc

In [28]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  for i, batch in enumerate(iterator):
    optimizer.zero_grad()    # zero the gradients
    #print(batch.text)
    predictions = model(batch.text).squeeze(1)
    loss = criterion(predictions, batch.label)   # Calculate the loss
    acc = model_accuracy(predictions, batch.label)
    loss.backward()  # calculate the gradient of each parameter with loss.backward()
    optimizer.step() # update the parameters using the gradients and optimizer algorithm
    epoch_loss += loss.item()
    epoch_acc += acc.item()
        
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In eval function we do not want to update the parameters when evaluating.
So, we don't need optimizer.zero_grad(), loss.backward() and optimizer.step().

In [29]:
def eval(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()

  with torch.no_grad(): """No gradients are calculated on PyTorch operations inside the with no_grad() block. This causes less memory to be used and speeds up computation"""
    for batch in iterator:
      predictions = model(batch.text).squeeze(1)
      loss = criterion(predictions, batch.label)
      acc = model_accuracy(predictions, batch.label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)  

In [34]:
import time
EPOCHS = 10
opt_valid_loss = float('inf')

for epoch in range(EPOCHS):
  start_time = time.time()
  train_loss, train_acc = train(model, train_iter, optimizer, criterion)
  valid_loss, valid_acc = eval(model, valid_iter, criterion)
  end_time = time.time()
  if valid_loss < opt_valid_loss:
    opt_valid_loss = valid_loss
    torch.save(model.state_dict(), 'RNN-model.pt')
    
    print(f'Epoch: {epoch+1}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

Epoch: 1
	Train Loss: 0.693 | Train Acc: 0.50%
	 Val. Loss: 0.694 |  Val. Acc: 0.50%
Epoch: 2
	Train Loss: 0.693 | Train Acc: 0.50%
	 Val. Loss: 0.694 |  Val. Acc: 0.51%
Epoch: 4
	Train Loss: 0.693 | Train Acc: 0.50%
	 Val. Loss: 0.694 |  Val. Acc: 0.50%


As we can see that the accuracy is poor. So, we need to improve the model by hypertuning it or to use different Neural Network.

In [35]:
model.load_state_dict(torch.load('/content/RNN-model.pt'))

test_loss, test_acc = eval(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.2f}%')

Test Loss: 0.696 | Test Acc: 0.46%
