In [1]:
import torch
from torchtext.legacy import data

SEED = 1234
# same seed generator
torch.manual_seed(SEED)

# to make sure input output are fixed/deterministic
torch.backends.cudnn.deterministic = True

# load the text with spacy tokenization (in English)
TEXT = data.Field(tokenize="spacy", tokenizer_language="en_core_web_sm")
# the label type is float
LABEL = data.LabelField(dtype=torch.float)

In [None]:
from torchtext.legacy import datasets

# load the IMDB dataset
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
print(f"Number of training examples: {len(train_data)}")
print(f"Number of testing examples: {len(test_data)}")
print(vars(train_data.examples[0]))

In [None]:
import random

# split the data into train set and valid set with random seed
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

In [None]:
MAX_VOCAB_SIZE = 25000

# build the vocab with given max vocab size
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)
# switching the <pad> and <unk>, since we need <pad> to be 0 index
TEXT.vocab = TEXT.vocab_cls(
    TEXT.vocab.freqs, max_size=MAX_VOCAB_SIZE, specials=["<pad>", "<unk>"]
)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(TEXT.vocab.itos)
# print(TEXT.vocab.freqs.most_common(20))
# print(TEXT.vocab.itos[:10])
# print(LABEL.vocab.stoi)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Do not use BucketIterator in your implementation because you are required to implement the padding and masking yourself.
# TODO: implementing padding and masking
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=1, device=device
)

In [None]:
# define the mini-batch size => tunable
batch_size = 4

# train_iter, test_iter = train_data.iters(batch_size=batch_size)

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
# label preprocess pipeline
def label_pipeline(label):
    return LABEL.vocab.stoi[label]


# text preprocess pipeline
def text_pipeline(text):
    return [TEXT.vocab.stoi[token] for token in text]


# preprocess batch data before loading each batch
def collate_batch(batch):
    label_list, text_list = [], []
    for example in batch:
        _label, _text = example.label, example.text
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True)
    return text_list.to(device), label_list.to(device)

In [None]:
train_loader = DataLoader(
    dataset=train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
valid_loader = DataLoader(
    dataset=valid_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)

In [None]:
# from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
# seq = torch.tensor([[1,2,0], [3,0,0], [4,5,6]])
# lens = [2, 1, 3]
# packed = pack_padded_sequence(seq, lens, batch_first=True, enforce_sorted=False)

In [None]:
import torch.nn as nn


class LR(nn.Module):
    def __init__(self, input_dim, embedding_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text).squeeze().sum(0)
        return self.fc(embedded)

In [None]:
import copy

In [None]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1

# vocab_len x 100 x 1 => a binary LR classifier
model_single_instance = LR(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)
model_mini_batch = LR(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

model_mini_batch.load_state_dict(copy.deepcopy(model_single_instance.state_dict()))

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model_single_instance):,} trainable parameters")

In [None]:
import torch.optim as optim

optimizer_si = optim.SGD(model_single_instance.parameters(), lr=1e-3)
optimizer_mb = optim.SGD(model_mini_batch.parameters(), lr=1e-3)

In [None]:
# Binary Cross Entropy with sigmoid layer
criterion = nn.BCEWithLogitsLoss()

In [None]:
model_single_instance = model_single_instance.to(device)
model_mini_batch = model_mini_batch.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [None]:
from tqdm import tqdm


def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for instance in tqdm(iterator, desc="Training...", total=len(iterator)):

        optimizer.zero_grad()

        predictions = model(instance.text)

        loss = criterion(predictions, instance.label)

        acc = binary_accuracy(predictions, instance.label)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():

        for instance in iterator:

            predictions = model(instance.text)

            loss = criterion(predictions, instance.label)

            acc = binary_accuracy(predictions, instance.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
import time


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 5

best_valid_loss = float("inf")

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(
        model_single_instance, train_iterator, optimizer_si, criterion
    )
    valid_loss, valid_acc = evaluate(model_single_instance, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_single_instance.state_dict(), "tut1-model.pt")

    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%")

In [None]:
N_EPOCHS = 5

best_valid_loss = float("inf")

for epoch in range(N_EPOCHS):

    start_time = time.time()
    for X, y in train_loader:
        train_loss, train_acc = train(
            model_mini_batch, train_iterator, optimizer_mb, criterion
        )

    valid_loss, valid_acc = evaluate(model_mini_batch, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     torch.save(model_single_instance.state_dict(), 'tut1-model.pt')

    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%")

Training...: 100%|██████████| 17500/17500 [00:22<00:00, 770.24it/s]


Epoch: 01 | Epoch Time: 0m 26s
	Train Loss: 12.339 | Train Acc: 62.22%
	 Val. Loss: 8.236 |  Val. Acc: 62.87%


Training...: 100%|██████████| 17500/17500 [00:23<00:00, 759.91it/s]


Epoch: 02 | Epoch Time: 0m 26s
	Train Loss: 7.725 | Train Acc: 65.55%
	 Val. Loss: 6.985 |  Val. Acc: 67.36%


Training...: 100%|██████████| 17500/17500 [00:22<00:00, 763.05it/s]


Epoch: 03 | Epoch Time: 0m 26s
	Train Loss: 7.196 | Train Acc: 67.15%
	 Val. Loss: 4.812 |  Val. Acc: 71.17%


Training...: 100%|██████████| 17500/17500 [00:22<00:00, 767.04it/s]


Epoch: 04 | Epoch Time: 0m 26s
	Train Loss: 6.424 | Train Acc: 68.81%
	 Val. Loss: 6.351 |  Val. Acc: 69.84%


Training...: 100%|██████████| 17500/17500 [00:22<00:00, 780.28it/s]


Epoch: 05 | Epoch Time: 0m 26s
	Train Loss: 6.011 | Train Acc: 69.81%
	 Val. Loss: 5.491 |  Val. Acc: 74.05%


In [None]:
model.load_state_dict(torch.load("tut1-model.pt"))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f"Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%")

Test Loss: 5.811 | Test Acc: 67.99%


In [None]:
# Test of model correctness
max_n_test_instances = 5
i = 1
for instance in valid_iterator:
    score = model(instance.text)
    print(score)
    if i >= max_n_test_instances:
        break
    else:
        i += 1

tensor([-2.3597], device='cuda:0', grad_fn=<AddBackward0>)
tensor([-3.1945], device='cuda:0', grad_fn=<AddBackward0>)
tensor([-8.8104], device='cuda:0', grad_fn=<AddBackward0>)
tensor([-7.3358], device='cuda:0', grad_fn=<AddBackward0>)
tensor([-21.1220], device='cuda:0', grad_fn=<AddBackward0>)
