In [3]:
!pip install torchdata
!pip install -U torchtext==0.13.0
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.4.0-cp37-cp37m-manylinux2014_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
[?25hCollecting urllib3>=1.25
  Downloading urllib3-1.26.11-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 30.5 MB/s 
Collecting portalocker>=2.0.0
  Downloading portalocker-2.5.1-py2.py3-none-any.whl (15 kB)
Collecting urllib3>=1.25
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 57.6 MB/s 
Installing collected packages: urllib3, portalocker, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed portalocker-2.5.1 torchdata-0.4.0 urllib3-1.25.11


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 5.2 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 73.2 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 70.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.7 MB/s 
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 68.6 MB/s 
[?25hCollecting response

In [226]:
import torch
import torchtext
import datasets

# Used hugging face tutorial as a starting point: https://notebooks.githubusercontent.com/view/ipynb?azure_maps_enabled=false&browser=chrome&color_mode=auto&commit=b4efbefa47672174394a8b6a27d4e7bc193bc224&device=unknown&enc_url=68747470733a2f2f7261772e67697468756275736572636f6e74656e742e636f6d2f62656e747265766574742f7079746f7263682d73656e74696d656e742d616e616c797369732f623465666265666134373637323137343339346138623661323764346537626331393362633232342f746f726368746578745f305f31305f696d64625f6e626f772e6970796e62&enterprise_enabled=false&logged_in=false&nwo=bentrevett%2Fpytorch-sentiment-analysis&path=torchtext_0_10_imdb_nbow.ipynb&platform=android&repository_id=114125096&repository_type=Repository&version=102

In [227]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [228]:
# get dataset from hugging face library
train_data, test_data = datasets.load_dataset('imdb', split = ['train', 'test'])



  0%|          | 0/2 [00:00<?, ?it/s]

In [229]:
train_data, test_data

(Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }), Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }))

In [230]:
from torchtext.data.utils import get_tokenizer

# use a standard english tokenizer
tokenizer = get_tokenizer('basic_english')

In [231]:
# function that tokenizes the data
def tokenize_data(ex):
  tokens = {'tokens': tokenizer(ex['text'])}
  return tokens

In [232]:
# add a tokens column with the tokens for each text review
# Documentation: https://huggingface.co/docs/datasets/v2.4.0/en/package_reference/main_classes#datasets.Dataset.map
train_data = train_data.map(tokenize_data)
test_data = test_data.map(tokenize_data)



In [233]:
train_data, test_data

(Dataset({
     features: ['text', 'label', 'tokens'],
     num_rows: 25000
 }), Dataset({
     features: ['text', 'label', 'tokens'],
     num_rows: 25000
 }))

In [234]:
#Documentation: https://huggingface.co/docs/datasets/v2.4.0/en/package_reference/main_classes#datasets.Dataset.train_test_split
# split the training data into training and validation data
train_valid_data = train_data.train_test_split() # default test_size is 0.25
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

In [235]:
train_data, valid_data, test_data

(Dataset({
     features: ['text', 'label', 'tokens'],
     num_rows: 18750
 }), Dataset({
     features: ['text', 'label', 'tokens'],
     num_rows: 6250
 }), Dataset({
     features: ['text', 'label', 'tokens'],
     num_rows: 25000
 }))

In [236]:
from torchtext.vocab import Vocab, build_vocab_from_iterator

UNK_IDX, PAD_IDX = 0, 1
special_symbols = ['<unk>', '<pad>']

# build vocab from the text tokens from training data
vocab = build_vocab_from_iterator(train_data['tokens'], min_freq = 2, specials = special_symbols)

# set unknown index as the default
vocab.set_default_index(UNK_IDX)


In [237]:
len(vocab)

45440

In [238]:
vocab.get_itos()[:5]

['<unk>', '<pad>', 'the', '.', ',']

In [239]:
# transform the tokens into numerical values 
def transform_tokens(ex):
  idxs = {'idxs': [vocab[token] for token in ex['tokens']]}
  return idxs

In [240]:
# add a column with the idxs
train_data = train_data.map(transform_tokens)
valid_data = valid_data.map(transform_tokens)
test_data = test_data.map(transform_tokens)

  0%|          | 0/18750 [00:00<?, ?ex/s]

  0%|          | 0/6250 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

In [241]:
train_data, valid_data, test_data

(Dataset({
     features: ['text', 'label', 'tokens', 'idxs'],
     num_rows: 18750
 }), Dataset({
     features: ['text', 'label', 'tokens', 'idxs'],
     num_rows: 6250
 }), Dataset({
     features: ['text', 'label', 'tokens', 'idxs'],
     num_rows: 25000
 }))

In [242]:
print(train_data[0]) # labels are 0 (neg) and 1 (pos)

{'text': 'Very bad. Very, very bad. As a fellow who aspires to make, be in or - at least - sniff the catering table at a movie set, I find it hard to criticize independents who actually got a movie of any sort made. However, this movie ... oh dear.<br /><br />Realizing Frightworld doesn\'t aspire to anything more than crude exploitation (an honorable thing in itself) and to try to make it conform to more mainstream standards is a mistake. And to be fair, it is more entertaining than - say - Red Zone Cuba ... but not by much. So I won\'t try to critique, just let me ask throw out some observations.<br /><br />1) If gore is the point of the movie, shouldn\'t you be able to see it?<br /><br />2) If you have hire three sound men make sure at least one knows how to operate the equipment.<br /><br />3) In a horror movie your lead maniac must be scarier than a smurf doll. Difficult I know but really...<br /><br />4) There is a lot of talented videographers in the Buffalo/Rochester area, most 

In [243]:
# set format of the idxs and labels to torch tensors
# Documentation: https://huggingface.co/docs/datasets/v2.4.0/en/package_reference/main_classes#datasets.Dataset.set_format
#train_data.set_format(type = 'torch', columns = ['idxs', 'label'])
#valid_data.set_format(type = 'torch', columns = ['idxs', 'label'])
#test_data.set_format(type = 'torch', columns = ['idxs', 'label'])

In [271]:
# SIMPLE RNN MODEL
from torch import nn

class RNN(nn.Module):
# I referred to this tutorial for help: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb
    def __init__(self, vocab_size, emb_dim, hid_dim, output_dim):
        super().__init__()
        # embedding layer
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # RNN layer
        self.rnn = nn.RNN(emb_dim, hid_dim) # default batch_first is False
        # linear layer
        self.linear = nn.Linear(hid_dim, output_dim)
    def forward(self, text):
        # input text is dimension [seq_len, batch_size]
        # apply embeddings to the words
        embedded = self.embedding(text)
        # embedded is dimension [seq_len, batch_size, emb_dim] because batch_first = False
        # run through RNN
        output, hidden = self.rnn(embedded)
        # hidden is dimension [1, batch_size, hid_dim]
        # get the predictions
        scores = self.linear(hidden.squeeze(0))
        # scores should have the dimension [batch_size, output_dim]
        return scores

In [272]:
from torch.nn.utils.rnn import pad_sequence

# collate function to put examples in batches 
def collate_batch(batch):
  batch_idxs, batch_labels = [], []

  for b in batch: 
    #print(b)
    batch_idxs.append(torch.LongTensor(b['idxs']))
    batch_labels.append(torch.tensor(b['label']))
  
  #print('batch_idxs')
  #print(batch_idxs)
  #print('batch_labels')
  #print(batch_labels)

  # pad idxs that are shorter than the longest sentence
  batch_idxs = pad_sequence(batch_idxs, batch_first = False, padding_value = PAD_IDX)
  batch_labels = torch.stack(batch_labels)
  
  #print('batch_idx again')
  #print(batch_idxs)

  batch = {'idxs': batch_idxs,
           'labels': batch_labels}
  
  return batch


In [276]:
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 2 # there are only 2 labels - 0 (neg) or 1 (pos)

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [277]:
from torch.utils.data import DataLoader

BATCH_SIZE = 128

train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=collate_batch)
valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=BATCH_SIZE, collate_fn=collate_batch)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=collate_batch)

In [278]:
for i, b in enumerate(train_dataloader):
  print('b')
  print(b)

  if i > -1:
    break

b
{'idxs': tensor([[   62,  2110,   682,  ...,   393, 13696,    56],
        [   85,  2818,  5745,  ...,   324,     4,    68],
        [    3,  1111,  2730,  ...,    10,  8145,    12],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]]), 'labels': tensor([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
        1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
        1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 1, 0, 1])}


In [279]:
import torch.optim as optim

# initialize optimizer
optimizer = optim.Adam(model.parameters())

In [280]:
def get_accuracy(predictions, actual_labels):
  #print('predictions')
  #print(predictions)
  #print('predictions shape')
  #print(predictions.shape)
  #print('actual_labels')
  #print(actual_labels)
  batch_size = predictions.shape[0]
  predicted_labels = predictions.argmax(1)
  #print('predicted_labels')
  #print(predicted_labels)
  num_correct = (predicted_labels == actual_labels).sum()
  #print('num_correct')
  #print(num_correct)
  accuracy = num_correct / len(actual_labels)
  #print('accuracy')
  #print(accuracy)
  return accuracy


In [281]:
def train(dataloader):
  model.train()

  epoch_loss, epoch_accuracy = 0, 0
  log_interval = 500

  for idx, batch in enumerate(dataloader):
    token_ids = batch['idxs'].to(device)
    labels = batch['labels'].to(device)
    optimizer.zero_grad()
    predictions = model(token_ids)
    loss = criterion(predictions, labels)
    accuracy = get_accuracy(predictions, labels)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_accuracy += accuracy.item()

    #if idx % log_interval == 0 and idx > 0:
    #  print(f'At iteration {idx} the loss is {loss:.3f}.')
    #  print(f'At iteration {idx} the accuracy is {accuracy:.3f}.')

  return epoch_loss / len(dataloader), epoch_accuracy / len(dataloader)


In [282]:
def evaluate(dataloader):
  model.eval()

  epoch_loss, epoch_accuracy = 0, 0

  with torch.no_grad():
    for idx, batch in enumerate(dataloader):
      token_ids = batch['idxs'].to(device)
      labels = batch['labels'].to(device)
      predictions = model(token_ids)
      loss = criterion(predictions, labels)
      accuracy = get_accuracy(predictions, labels)
      epoch_loss += loss.item()
      epoch_accuracy += accuracy.item()
  
  return epoch_loss / len(dataloader), epoch_accuracy / len(dataloader)
  

In [283]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [284]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
  
  start_time = time.time()

  train_loss, train_acc = train(train_dataloader)
  valid_loss, valid_acc = evaluate(valid_dataloader)

  end_time = time.time()
    
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # save the best model
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'tut2-model.pt')

  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}')
  print(f'valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}')


Epoch: 01 | Time: 0m 18s
train_loss: 0.702, train_acc: 0.497
valid_loss: 0.698, valid_acc: 0.490
Epoch: 02 | Time: 0m 18s
train_loss: 0.726, train_acc: 0.504
valid_loss: 0.698, valid_acc: 0.493
Epoch: 03 | Time: 0m 18s
train_loss: 0.699, train_acc: 0.496
valid_loss: 0.694, valid_acc: 0.499
Epoch: 04 | Time: 0m 18s
train_loss: 0.697, train_acc: 0.504
valid_loss: 0.695, valid_acc: 0.497
Epoch: 05 | Time: 0m 19s
train_loss: 0.696, train_acc: 0.504
valid_loss: 0.695, valid_acc: 0.497
Epoch: 06 | Time: 0m 18s
train_loss: 0.696, train_acc: 0.505
valid_loss: 0.695, valid_acc: 0.497
Epoch: 07 | Time: 0m 18s
train_loss: 0.696, train_acc: 0.505
valid_loss: 0.696, valid_acc: 0.498
Epoch: 08 | Time: 0m 18s
train_loss: 0.696, train_acc: 0.505
valid_loss: 0.696, valid_acc: 0.498
Epoch: 09 | Time: 0m 19s
train_loss: 0.696, train_acc: 0.504
valid_loss: 0.696, valid_acc: 0.498
Epoch: 10 | Time: 0m 18s
train_loss: 0.696, train_acc: 0.504
valid_loss: 0.696, valid_acc: 0.498


In [285]:
model.load_state_dict(torch.load('tut2-model.pt'))
test_loss, test_acc = evaluate(test_dataloader)
print(f'test_loss: {test_loss:.3f}, test_acc: {test_acc:.3f}')

test_loss: 0.694, test_acc: 0.504
