### Using torchtext datasets

In [1]:
from torchtext.datasets import IMDB
# Load each IMDB split and materialise it as a list, so it has a length and
# supports indexing (random_split and the shuffling DataLoader below rely on it).
# NOTE(review): the two splits use different root directories; presumably a
# shared root would avoid fetching the archive twice -- confirm before changing.
train_ds = list(IMDB('./data/imdb/train', split='train'))

# The raw test split object stays available as `test_ds`; the list copy is
# what the test DataLoader consumes.
test_ds = IMDB('./data/imdb/test', split='test')
test_dataset = list(test_ds)

In [2]:
import torch
from torch.utils.data.dataset import random_split

# Hold out 5,000 of the 25,000 training reviews for validation.
# The generator is seeded so the split -- and therefore the vocabulary built
# from `train_dataset` and every metric reported below -- is identical on each
# Restart & Run All instead of changing from run to run.
SPLIT_SEED = 1
train_dataset, valid_dataset = random_split(
    train_ds,
    [20000, 5000],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


In [3]:
from text_tokenizer import build_vocab
# Build the token statistics from the training subset only (the validation
# and test reviews are not passed in). `build_vocab` is a project helper; its
# result is used further down as a token -> count mapping.
token_counts = build_vocab(train_dataset)
vocab_size = len(token_counts)
print(f'vocab_size: {vocab_size}')

vocab_size: 69367


#### Creating the encoding dictionary


In [4]:
from torchtext.vocab import vocab
from collections import OrderedDict
# Order (token, count) pairs from most to least frequent; the sort is stable,
# so tokens with equal counts keep their original relative order.
sorted_tokens = list(token_counts.items())
sorted_tokens.sort(key=lambda pair: pair[1], reverse=True)

# An OrderedDict preserves that ordering when it is handed to torchtext's
# vocab() factory, which builds the token -> index lookup.
tokens_dict = OrderedDict(sorted_tokens)
text_vocab = vocab(tokens_dict)

In [5]:
# Reserve index 0 for padding and index 1 for unknown tokens.
# insert_token raises when the token is already present, so each insertion is
# guarded: the cell can be re-run against the same vocabulary without failing.
for special_index, special_token in enumerate(('<pad>', '<unk>')):
    if special_token not in text_vocab:
        text_vocab.insert_token(special_token, special_index)
    # Fail loudly if a special token sits anywhere but its reserved slot
    # (e.g. it already occurred in the corpus) instead of silently breaking
    # the padding / default-index assumptions made by later cells.
    assert text_vocab[special_token] == special_index, (
        f'{special_token} must map to index {special_index}'
    )

# Tokens that are not in the vocabulary resolve to the '<unk>' index.
text_vocab.set_default_index(1)

In [6]:
from text_tokenizer import tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence
def text_pipeline(text):
    """Convert a raw review string into a list of vocabulary indices.

    The text is split with the project ``tokenizer`` and every token is looked
    up in ``text_vocab``; tokens missing from the vocabulary resolve to the
    default index configured on ``text_vocab``.
    """
    token_ids = []
    for token in tokenizer(text):
        token_ids.append(text_vocab[token])
    return token_ids

def label_pipeline(label):
    """Map a dataset label to a float target: 1.0 for 'pos', 0.0 otherwise."""
    if label == 'pos':
        return 1.
    return 0.

def collate_batch(batch):
    """Turn a list of (label, text) samples into padded model inputs.

    Returns:
        A 3-tuple ``(texts, labels, lengths)``:
        * texts   -- int32 tensor of token ids, one row per sample, padded on
                     the right (pad_sequence default value 0) to the longest
                     sequence in the batch.
        * labels  -- tensor of float targets produced by ``label_pipeline``.
        * lengths -- tensor holding each sample's unpadded token count.
    """
    targets, sequences = [], []
    for label, text in batch:
        targets.append(label_pipeline(label))
        sequences.append(torch.tensor(text_pipeline(text), dtype=torch.int32))

    # Record the true lengths before padding hides them; the model needs them
    # to pack the padded batch.
    lengths = torch.tensor([sequence.size(0) for sequence in sequences])
    padded = pad_sequence(sequences, batch_first=True)
    return padded, torch.tensor(targets), lengths

In [7]:
from torch.utils.data import DataLoader
batch_size = 64

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_batch, num_workers=20)

train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [8]:
from torch import nn
# Shape demo on random data: compare what nn.RNN and nn.LSTM return.
# NOTE: this rebinds the notebook-level `batch_size` (64 in the DataLoader
# cell); the loaders already built are not affected.
features_size = 30
batch_size = 32
sequences_len = 62

# Both recurrent layers are configured identically.
recurrent_kwargs = dict(input_size=features_size, hidden_size=8, num_layers=1, batch_first=True)

rnn = nn.RNN(**recurrent_kwargs)
sample_batch = torch.rand(batch_size, sequences_len, features_size)

# a: output for every time step, b: final hidden state.
a, b = rnn(sample_batch)
print(a.shape, b.shape, sep='\n')

# For the LSTM the second return value is a tuple rather than a single tensor.
lstm = nn.LSTM(**recurrent_kwargs)
c, d = lstm(sample_batch)
print(type(c), type(d), sep='\n')

torch.Size([32, 62, 8])
torch.Size([1, 32, 8])
<class 'torch.Tensor'>
<class 'tuple'>


In [9]:
# Pull a single batch from the training loader to inspect what collate_batch
# produces: (padded token ids, labels, sequence lengths).
next(iter(train_dataloader))

(tensor([[ 127,   13,  249,  ...,    0,    0,    0],
         [  35,  301,  240,  ...,    0,    0,    0],
         [  11,   65,  143,  ...,    0,    0,    0],
         ...,
         [  10,   65,  507,  ...,    0,    0,    0],
         [   2, 1088,    5,  ...,    0,    0,    0],
         [   2,   20,    7,  ...,    0,    0,    0]], dtype=torch.int32),
 tensor([0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0.,
         1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1.,
         1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1.,
         1., 0., 1., 0., 1., 1., 0., 1., 0., 1.]),
 tensor([176, 322, 183, 110, 505, 381, 127, 168, 212, 178, 657, 207, 149, 119,
         174, 255, 104, 168, 105, 249, 122, 156,  90,  95, 293, 225, 309, 307,
          78, 201, 403,  81, 211, 526, 131, 147, 305, 363, 129, 135, 171, 288,
         167, 254, 134, 263, 529, 530, 162, 157, 157, 194, 285,  65, 460, 157,
         163, 323, 180, 925,  74, 195

In [10]:
# Stand-alone embedding layer, separate from the one created inside the model.
features_size = 16
vocab_size = len(token_counts)

# Size the table from text_vocab rather than token_counts: the vocabulary also
# contains the '<pad>' and '<unk>' entries inserted after counting, so a table
# with only len(token_counts) rows would leave the two highest token ids out
# of range. Row 0 is the padding row.
emb = nn.Embedding(num_embeddings=len(text_vocab), embedding_dim=features_size, padding_idx=0)

In [11]:
class SentimentModel0(nn.Module):
    """LSTM-based binary sentiment classifier over padded token-id batches.

    Pipeline: embedding -> LSTM -> final hidden state of the last layer ->
    linear + ReLU -> linear layer producing a single logit per sample.
    """

    def __init__(self, vocab_size, features_size=20, hidden_size=64, fc_size=64):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            features_size: embedding dimension, also the LSTM input size.
            hidden_size: LSTM hidden state size.
            fc_size: width of the intermediate fully connected layer.
        """
        super().__init__()
        # Token id 0 is treated as padding by the embedding layer.
        self.emb = nn.Embedding(vocab_size, features_size, padding_idx=0)
        self.rnn = nn.LSTM(features_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, fc_size)
        self.fc_relu = nn.ReLU()
        self.fc_out = nn.Linear(fc_size, 1)

    def forward(self, texts, texts_lengths):
        """Return one raw logit per sample, shaped (batch, 1).

        Args:
            texts: batch-first tensor of padded token ids.
            texts_lengths: tensor with the unpadded length of every row.
        """
        embedded = self.emb(texts)
        # Packing makes the LSTM stop at each sequence's true length. The
        # lengths are moved to the CPU for packing, and enforce_sorted=False
        # lets batches arrive in any length order.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, texts_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.rnn(packed)
        # Final hidden state of the last LSTM layer.
        last_hidden = hidden[-1]
        return self.fc_out(self.fc_relu(self.fc(last_hidden)))

# `vocab_size` counts only the corpus tokens; the two extra embedding rows
# cover the '<pad>' and '<unk>' entries inserted into the vocabulary.
model = SentimentModel0(vocab_size=vocab_size + 2)
print(model)

SentimentModel0(
  (emb): Embedding(69369, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=64, bias=True)
  (fc_relu): ReLU()
  (fc_out): Linear(in_features=64, out_features=1, bias=True)
)


In [12]:

from torch.functional import F
from torch.optim.lr_scheduler import MultiStepLR


def train(model, dataloader, optimizer, device, loss_fn, progress_bar):
    model.train()
    epoch_loss, epoch_acc = 0., 0.
    num_samples = len(dataloader.dataset)
    for text_batch, labels_batch, lengths_batch in dataloader:
        optimizer.zero_grad()
        text_batch = text_batch.to(device)
        labels_batch = labels_batch.to(device)
        lengths_batch = lengths_batch.to(device)
        y_pred = model(text_batch, lengths_batch)[:, 0]
        loss = loss_fn(y_pred, labels_batch)
        loss.backward()
        optimizer.step()
        progress_bar.update(1)
        epoch_loss += loss.item() * text_batch.size(0)
        epoch_acc += (torch.sigmoid(y_pred).round() == labels_batch).float().sum().item()
    return epoch_acc/num_samples, epoch_loss/num_samples    
        
        
def evaluate(model, dataloader, device, loss_fn):
    """Score ``model`` on ``dataloader`` without updating any parameters.

    Args:
        model: module called as ``model(texts, lengths)`` and returning one
            logit per sample in column 0.
        dataloader: yields ``(texts, labels, lengths)`` batches and exposes
            ``.dataset`` with a length.
        device: device every batch tensor is moved to.
        loss_fn: loss applied to the raw logits and the labels.

    Returns:
        ``(accuracy, mean_loss)``, both averaged over
        ``len(dataloader.dataset)`` samples.
    """
    model.eval()
    total_loss = 0.
    total_correct = 0.
    dataset_size = len(dataloader.dataset)

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for texts, targets, lengths in dataloader:
            texts, targets, lengths = texts.to(device), targets.to(device), lengths.to(device)
            logits = model(texts, lengths)[:, 0]
            batch_loss = loss_fn(logits, targets)

            # Weight by batch size so the final division is a per-sample mean.
            total_loss += batch_loss.item() * texts.size(0)
            hits = torch.sigmoid(logits).round() == targets
            total_correct += hits.float().sum().item()

    return total_correct / dataset_size, total_loss / dataset_size
    

In [14]:
from tqdm.auto import tqdm
# Use the GPU when one is present, otherwise fall back to the CPU, so the
# notebook also runs end to end on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model first so the optimizer is built over parameters that already
# live on the target device.
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# The model emits raw logits, so the sigmoid is folded into the loss.
loss_fn = nn.BCEWithLogitsLoss()
epochs = 10
# One tick per training batch across all epochs; train() advances the bar.
progress_bar = tqdm(range(epochs*len(train_dataloader)))

for epoch in range(epochs):
    train_acc, train_loss = train(model, train_dataloader, optimizer, device, loss_fn, progress_bar)
    valid_acc, valid_loss = evaluate(model, valid_dataloader, device, loss_fn)
    print(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Valid Loss: {valid_loss:.4f}, Valid Acc: {valid_acc:.4f}')


  0%|          | 0/3130 [00:00<?, ?it/s]

Epoch: 0, Train Loss: 0.1234, Train Acc: 0.9582, Valid Loss: 0.3727, Valid Acc: 0.8738
Epoch: 1, Train Loss: 0.0986, Train Acc: 0.9679, Valid Loss: 0.4003, Valid Acc: 0.8750
Epoch: 2, Train Loss: 0.0811, Train Acc: 0.9750, Valid Loss: 0.4324, Valid Acc: 0.8724
Epoch: 3, Train Loss: 0.0679, Train Acc: 0.9796, Valid Loss: 0.4846, Valid Acc: 0.8698
Epoch: 4, Train Loss: 0.0504, Train Acc: 0.9862, Valid Loss: 0.5121, Valid Acc: 0.8744
Epoch: 5, Train Loss: 0.0394, Train Acc: 0.9891, Valid Loss: 0.5614, Valid Acc: 0.8666
Epoch: 6, Train Loss: 0.0301, Train Acc: 0.9922, Valid Loss: 0.5898, Valid Acc: 0.8714
Epoch: 7, Train Loss: 0.0248, Train Acc: 0.9937, Valid Loss: 0.6181, Valid Acc: 0.8700
Epoch: 8, Train Loss: 0.0178, Train Acc: 0.9959, Valid Loss: 0.7040, Valid Acc: 0.8678
Epoch: 9, Train Loss: 0.0105, Train Acc: 0.9980, Valid Loss: 0.6920, Valid Acc: 0.8678


In [202]:
# Sanity check: the training loader wraps the 20,000-sample training subset
# produced by random_split.
len(train_dataloader.dataset)

20000