# Import dependencies

In [42]:
import random
from pprint import pprint

import tqdm
import torch
import torchtext

# Hyperparameters and constants

In [69]:
MAX_VOCAB_SIZE = 25_000
BATCH_SIZE = 64
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
LEARNING_RATE = 1e-3
N_EPOCHS = 5

# Device configuration

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# IMDb dataset

In [25]:
!python -m spacy download en_core_web_sm -Uqq

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [23]:
SEED = 1234

torch.manual_seed(SEED)

TEXT = torchtext.legacy.data.Field(tokenize="spacy", tokenizer_language="en_core_web_sm")
LABEL = torchtext.legacy.data.LabelField(dtype=torch.float)

In [26]:
train_data, test_data = torchtext.legacy.datasets.IMDB.splits(TEXT, LABEL)

In [41]:
pprint(vars(train_data.examples[0]))

{'label': 'neg',
 'text': ['This',
          'movie',
          'has',
          'got',
          'to',
          'be',
          'one',
          'of',
          'the',
          'worst',
          'I',
          'have',
          'ever',
          'seen',
          'make',
          'it',
          'to',
          'DVD',
          '!',
          '!',
          '!',
          'The',
          'story',
          'line',
          'might',
          'have',
          'clicked',
          'if',
          'the',
          'film',
          'had',
          'more',
          'funding',
          'and',
          'writers',
          'that',
          'would',
          'have',
          'cut',
          'the',
          'nonsense',
          'and',
          'sickly',
          'scenes',
          'that',
          'I',
          'highly',
          'caution',
          'parents',
          'on',
          '....',
          'But',
          'the',
          'story',
          'line',
     

In [28]:
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [29]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing data: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing data: 25000


# Build vocabulary

In [31]:
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [32]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [33]:
print(TEXT.vocab.freqs.most_common(10))
print(TEXT.vocab.itos[:10])

[('the', 203566), (',', 192495), ('.', 165544), ('and', 109443), ('a', 109116), ('of', 100702), ('to', 93766), ('is', 76328), ('in', 61255), ('I', 54004)]
['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [34]:
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


# Create iterators

In [37]:
train_iterator, valid_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
                                                    (train_data, valid_data, test_data),
                                                    batch_size=BATCH_SIZE,
                                                    device=device
                                                    )

# RNN network

In [84]:
class RNN(torch.nn.Module):

    def __init__(self, vocab_len, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(vocab_len, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """
        @params text list(int): with shape [sent_len, batch_size]
        """

        embedded = self.embedding(text)
        # embedded shape: [sent_len, batch_size, embedding_dim]

        output, hidden = self.rnn(embedded)
        # output shape: [sent_len, batch_size, hidden_dim]
        # hidden shape: [1, batch_size, hidden_dim]

        assert torch.equal(output[-1,:,:], hidden.squeeze(0))

        return self.fc(hidden.squeeze(0))


In [85]:
vocab_len = len(TEXT.vocab)
model = RNN(vocab_len, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [86]:
def count_parameter(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameter(model):,} trainable parameters")

The model has 2,592,105 trainable parameters


# Loss and optimizer

In [87]:
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [88]:
model = model.to(device)
criterion = criterion.to(device)

In [89]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).sum().float()
    acc = correct / len(y)
    return acc

# Train the model

In [110]:
def train(model, iterator, optimizer, criterion, tepoch):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        tepoch.update(1)
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        tepoch.set_postfix(train_loss=epoch_loss/len(iterator),
                            train_accuracy=epoch_acc/len(iterator))
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [111]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [112]:
best_valid_loss = float("inf")

for epoch in range(N_EPOCHS):
    tepoch = tqdm.tqdm(train_iterator, unit="batch", leave=True, position=0)
    tepoch.set_description(f"Epoch {epoch+1}")

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, tepoch)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "best_model.pt")
        
    tepoch.set_postfix(train_loss=train_loss, train_accuracy=train_acc,
                       valid_loss=valid_loss, valid_accuracy=valid_acc)

Epoch 1: 100%|██████████| 274/274 [11:03<00:00,  2.42s/batch, train_accuracy=0.502, train_loss=0.693, valid_accuracy=0.487, valid_loss=0.695]
Epoch 2:  89%|████████▊ | 243/274 [10:00<01:12,  2.33s/batch, train_accuracy=0.444, train_loss=0.612]  

# Test the model

In [94]:
model.load_state_dict(torch.load("save/best_model.pt"))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f"Test loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%")

Test loss: 0.689 | Test Acc: 54.97%
