In [None]:
import pandas as pd
import numpy as np

np.random.seed(1337)

# read processed data
data = pd.read_csv('./imdb_processed_full.csv')
data = data.iloc[np.random.permutation(len(data))]
print(data['text'].str.len().describe())

CORPUS_MAX_SIZE = 752

data['text'] = data['text'].str.slice(0, CORPUS_MAX_SIZE)

count    50000.000000
mean       858.389040
std        658.428061
min         22.000000
25%        452.000000
50%        633.000000
75%       1044.000000
max       9434.000000
Name: text, dtype: float64


In [2]:
train_size = 0.8
validation_size = 0.5

split_id = int(len(data) * train_size)

temp_train_x, test_x = data.text[:split_id], data.text[split_id:]
temp_train_y, test_y = data.label[:split_id], data.label[split_id:]

# now we have train, val and test
print('Feature Shapes:')
print('===============')
# print('Train set: {}'.format(train_x.shape))
# print('Validation set: {}'.format(val_x.shape))
print('Test set: {}'.format(test_x.shape))

Feature Shapes:
Test set: (10000,)


In [3]:
from collections import Counter
from tqdm import tqdm
tqdm.pandas()

words = temp_train_x.str.cat(sep=' ').split()

# build vocabulary
frequency_counter = Counter(words)
# sort words by the frequency they appear in the text
vocab = sorted(frequency_counter, key=frequency_counter.get, reverse=True)

# associate a number to each word in the list in ascending order
# in this way the most frequent words have lower numbers
int2word = dict(enumerate(vocab[:5000], 2))
int2word[0] = '<PAD>'
int2word[1] = '<UNK>'
word2int = {word: id for id, word in int2word.items()}
# encode words
reviews_enc = [[word2int.get(word, 1) for word in review.split()] for review in tqdm(temp_train_x.values)]

100%|██████████| 40000/40000 [00:00<00:00, 85265.29it/s]


In [4]:
# TODO: maybe float 16
features = np.zeros((len(reviews_enc), CORPUS_MAX_SIZE), dtype=int)

for i, row in enumerate(reviews_enc):
  # print(f"ROW LENGTH {len(row)}")
  # print(f"ITEM {i} COULD BE INSERTED AT {1000 - len(row)}")
  # print(f"######################")
  index = CORPUS_MAX_SIZE - len(row)
  features[i, index:] = np.array(row)[:CORPUS_MAX_SIZE]
  # features[i, :len(row)] = np.array(row)[:1000]

# make val and test set
split_val_id = int(len(temp_train_x) * validation_size)
train_x, val_x = features[:split_val_id], features[split_val_id:]
train_y, val_y = temp_train_y[:split_val_id], temp_train_y[split_val_id:]

In [5]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

# define batch size
batch_size = 128

# create tensor datasets
trainset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y.to_numpy()))
validset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y.to_numpy()))

# create dataloaders
trainloader = DataLoader(trainset, shuffle=True, batch_size=batch_size)
valloader = DataLoader(validset, shuffle=True, batch_size=batch_size)

In [6]:
# model architecture

class SentimentModel(nn.Module):
  def __init__(self, vocab_size, output_size, hidden_size=128, embedding_size=400, n_layers=2, dropout=0.2):
    super(SentimentModel, self).__init__()
    # embedding layer is useful to map input into vector representation
    self.embedding = nn.Embedding(vocab_size, embedding_size)
    # LSTM layer preserved by PyTorch library
    self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, dropout=dropout, batch_first=True)
    # dropout layer
    self.dropout = nn.Dropout(0.3)
    # Linear layer for output
    self.fc = nn.Linear(hidden_size, output_size)
    # Sigmoid layer cz we will have binary classification
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    # convert feature to long
    x = x.long()
    # map input to vector
    x = self.embedding(x)
    # pass forward to lstm
    o, _ =  self.lstm(x)
    # get last sequence output
    o = o[:, -1, :]
    # apply dropout and fully connected layer
    o = self.dropout(o)
    o = self.fc(o)
    # sigmoid
    o = self.sigmoid(o)

    return o

In [7]:
# define training device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.mps.is_available() else 'cpu')
print(device)

mps


In [8]:
# model hyperparamters
vocab_size = len(word2int)
output_size = 1
embedding_size = 256
hidden_size = 512
n_layers = 2
dropout=0.25

print(f"VOCAB SIZE: {vocab_size}")

# model initialization
model = SentimentModel(vocab_size, output_size, hidden_size, embedding_size, n_layers, dropout)
print(model)

VOCAB SIZE: 5002
SentimentModel(
  (embedding): Embedding(5002, 256)
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.25)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [9]:
# training config
lr = 0.001
criterion = nn.BCELoss()  # we use BCELoss cz we have binary classification problem
optim = Adam(model.parameters(), lr=lr)
grad_clip = 5
epochs = 8
print_every = 1
history = {
  'train_loss': [],
  'train_acc': [],
  'val_loss': [],
  'val_acc': [],
  'epochs': epochs
}
es_limit = 5

In [None]:
model = model.to(device)

epochloop = tqdm(range(epochs), position=0, desc="Training", leave=True)

# early stop trigger
es_trigger = 0
# setting the minimun validation loss to infinity so we are sure it will be resetted
val_loss_min = torch.inf

for e in epochloop:
  # training
  model.train()
  train_loss, train_acc = 0, 0

  for id, (feature, target) in enumerate(trainloader):
    epochloop.set_postfix_str(f"Training batch {id}/{len(trainloader)}")
    feature, target = feature.to(device), target.to(device)

    # reset the optimizer to avoid accumulation of gradients across training iterations
    optim.zero_grad()
    out = model(feature)
    predicted = torch.tensor([1 if i == True else 0 for i in out > 0.5], device=device)
    equals = predicted == target
    acc = torch.mean(equals.type(torch.FloatTensor))
    train_acc += acc.item()

    loss = criterion(out.squeeze(), target.float())
    train_loss += loss.item()
    loss.backward()

    # gradient clipping to avoid exploding gradient
    nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

    # update optimizer
    optim.step()
    
    # free memory
    del feature, target, predicted
  
  history["train_loss"].append(train_loss / len(trainloader))
  history["train_acc"].append(train_acc / len(trainloader))

  # validation
  model.eval()
  val_loss, val_acc = 0, 0

  with torch.no_grad():
    for id, (feature, target) in enumerate(valloader):
      epochloop.set_postfix_str(f"Validation batch {id}/{len(valloader)}")
      feature, target = feature.to(device), target.to(device)

      out = model(feature)

      predicted = torch.tensor([1 if i == True else 0 for i in out > 0.5], device=device)
      equals = predicted == target
      acc = torch.mean(equals.type(torch.FloatTensor))
      val_acc += acc.item()

      loss = criterion(out.squeeze(), target.float())
      val_loss += loss.item()

      del feature, target, predicted

    history["val_loss"].append(val_loss/len(valloader))
    history["val_acc"].append(val_acc/len(valloader))

  model.train()
  epochloop.set_postfix_str(f"Val loss: {val_loss / len(valloader):.3f} | Val acc: {val_acc / len(valloader):.3f}")

  if (e+1) % print_every == 0:
    epochloop.write(f"epoch {e+1}/{epochs} | val loss: {val_loss/len(valloader):.3f} val acc: {val_acc/len(valloader):.3f}")
    epochloop.update()

  # save model if validation loss decrease 
  if val_loss / len(valloader) <= val_loss_min:
    torch.save(model.state_dict(), "./sentiment_lstm.pt")
    val_loss_min = val_loss / len(valloader)
    es_trigger = 0
  else:
    epochloop.write(f"[WARNING] validation loss did not improve")
    es_trigger += 1
  
  if es_trigger >= es_limit: 
    epochloop.write(f"early stopped at epoch {e+1}")
    history["epochs"] = e+1
    break

In [10]:
# map_location=torch.device('mps')
model = model.to(device)
model.load_state_dict(torch.load("./sentiment_lstm.pt", weights_only=True, map_location=torch.device('mps')))

test_enc = [[word2int.get(word, 1) for word in review.split()] for review in tqdm(test_x.values)]

test_features = np.zeros((len(test_enc), CORPUS_MAX_SIZE), dtype=int)

for i, row in enumerate(test_enc):
  index = CORPUS_MAX_SIZE - len(row)
  test_features[i, index:] = np.array(row)[:CORPUS_MAX_SIZE]

testset = TensorDataset(torch.from_numpy(test_features), torch.from_numpy(test_y.to_numpy()))
testloader = DataLoader(testset, shuffle=True, batch_size=batch_size)

# test loop
model.eval()

# metrics
test_loss = 0
test_acc = 0

all_target = []
all_predicted = []

testloop = tqdm(testloader, leave=True, desc='Inference')
with torch.no_grad():
  for feature, target in testloop:
    feature, target = feature.to(device), target.to(device)

    out = model(feature)
    
    predicted = torch.tensor([1 if i == True else 0 for i in out > 0.5], device=device)
    equals = predicted == target
    acc = torch.mean(equals.type(torch.FloatTensor))
    test_acc += acc.item()

    loss = criterion(out.squeeze(), target.float())
    test_loss += loss.item()

    all_target.extend(target.cpu().numpy())
    all_predicted.extend(predicted.cpu().numpy())

    print(f'Accuracy: {test_acc/len(testloader):.4f}, Loss: {test_loss/len(testloader):.4f}')

100%|██████████| 10000/10000 [00:00<00:00, 97380.48it/s]
Inference:   1%|▏         | 1/79 [00:01<01:22,  1.06s/it]

Accuracy: 0.0104, Loss: 0.0043


Inference:   3%|▎         | 2/79 [00:01<00:53,  1.44it/s]

Accuracy: 0.0212, Loss: 0.0085


Inference:   4%|▍         | 3/79 [00:01<00:43,  1.74it/s]

Accuracy: 0.0323, Loss: 0.0130


Inference:   5%|▌         | 4/79 [00:02<00:39,  1.88it/s]

Accuracy: 0.0422, Loss: 0.0180


Inference:   6%|▋         | 5/79 [00:02<00:37,  1.98it/s]

Accuracy: 0.0525, Loss: 0.0235


Inference:   8%|▊         | 6/79 [00:03<00:35,  2.06it/s]

Accuracy: 0.0621, Loss: 0.0293


Inference:   9%|▉         | 7/79 [00:03<00:33,  2.15it/s]

Accuracy: 0.0724, Loss: 0.0349


Inference:  10%|█         | 8/79 [00:04<00:32,  2.21it/s]

Accuracy: 0.0828, Loss: 0.0397


Inference:  11%|█▏        | 9/79 [00:04<00:31,  2.25it/s]

Accuracy: 0.0937, Loss: 0.0439


Inference:  13%|█▎        | 10/79 [00:05<00:30,  2.27it/s]

Accuracy: 0.1044, Loss: 0.0479


Inference:  14%|█▍        | 11/79 [00:05<00:29,  2.29it/s]

Accuracy: 0.1147, Loss: 0.0524


Inference:  15%|█▌        | 12/79 [00:05<00:28,  2.31it/s]

Accuracy: 0.1248, Loss: 0.0578


Inference:  16%|█▋        | 13/79 [00:06<00:28,  2.32it/s]

Accuracy: 0.1353, Loss: 0.0632


Inference:  18%|█▊        | 14/79 [00:06<00:28,  2.30it/s]

Accuracy: 0.1462, Loss: 0.0679


Inference:  19%|█▉        | 15/79 [00:07<00:27,  2.31it/s]

Accuracy: 0.1571, Loss: 0.0715


Inference:  20%|██        | 16/79 [00:07<00:27,  2.32it/s]

Accuracy: 0.1677, Loss: 0.0767


Inference:  22%|██▏       | 17/79 [00:08<00:27,  2.29it/s]

Accuracy: 0.1777, Loss: 0.0821


Inference:  23%|██▎       | 18/79 [00:08<00:26,  2.28it/s]

Accuracy: 0.1888, Loss: 0.0857


Inference:  24%|██▍       | 19/79 [00:08<00:26,  2.29it/s]

Accuracy: 0.1991, Loss: 0.0907


Inference:  25%|██▌       | 20/79 [00:09<00:26,  2.27it/s]

Accuracy: 0.2101, Loss: 0.0952


Inference:  27%|██▋       | 21/79 [00:09<00:25,  2.28it/s]

Accuracy: 0.2209, Loss: 0.0998


Inference:  28%|██▊       | 22/79 [00:10<00:25,  2.28it/s]

Accuracy: 0.2310, Loss: 0.1045


Inference:  29%|██▉       | 23/79 [00:10<00:24,  2.30it/s]

Accuracy: 0.2420, Loss: 0.1087


Inference:  30%|███       | 24/79 [00:11<00:24,  2.27it/s]

Accuracy: 0.2529, Loss: 0.1125


Inference:  32%|███▏      | 25/79 [00:11<00:23,  2.29it/s]

Accuracy: 0.2631, Loss: 0.1173


Inference:  33%|███▎      | 26/79 [00:11<00:22,  2.31it/s]

Accuracy: 0.2728, Loss: 0.1227


Inference:  34%|███▍      | 27/79 [00:12<00:22,  2.33it/s]

Accuracy: 0.2838, Loss: 0.1273


Inference:  35%|███▌      | 28/79 [00:12<00:21,  2.33it/s]

Accuracy: 0.2940, Loss: 0.1327


Inference:  37%|███▋      | 29/79 [00:13<00:21,  2.34it/s]

Accuracy: 0.3040, Loss: 0.1381


Inference:  38%|███▊      | 30/79 [00:13<00:20,  2.34it/s]

Accuracy: 0.3146, Loss: 0.1423


Inference:  39%|███▉      | 31/79 [00:14<00:20,  2.35it/s]

Accuracy: 0.3246, Loss: 0.1473


Inference:  41%|████      | 32/79 [00:14<00:20,  2.35it/s]

Accuracy: 0.3349, Loss: 0.1524


Inference:  42%|████▏     | 33/79 [00:14<00:19,  2.35it/s]

Accuracy: 0.3450, Loss: 0.1580


Inference:  43%|████▎     | 34/79 [00:15<00:19,  2.32it/s]

Accuracy: 0.3554, Loss: 0.1623


Inference:  44%|████▍     | 35/79 [00:15<00:19,  2.30it/s]

Accuracy: 0.3656, Loss: 0.1673


Inference:  46%|████▌     | 36/79 [00:16<00:18,  2.30it/s]

Accuracy: 0.3752, Loss: 0.1728


Inference:  47%|████▋     | 37/79 [00:16<00:18,  2.29it/s]

Accuracy: 0.3864, Loss: 0.1771


Inference:  48%|████▊     | 38/79 [00:17<00:17,  2.31it/s]

Accuracy: 0.3973, Loss: 0.1815


Inference:  49%|████▉     | 39/79 [00:17<00:17,  2.32it/s]

Accuracy: 0.4078, Loss: 0.1865


Inference:  51%|█████     | 40/79 [00:17<00:16,  2.33it/s]

Accuracy: 0.4182, Loss: 0.1911


Inference:  52%|█████▏    | 41/79 [00:18<00:16,  2.33it/s]

Accuracy: 0.4282, Loss: 0.1960


Inference:  53%|█████▎    | 42/79 [00:18<00:15,  2.34it/s]

Accuracy: 0.4395, Loss: 0.1995


Inference:  54%|█████▍    | 43/79 [00:19<00:15,  2.32it/s]

Accuracy: 0.4498, Loss: 0.2045


Inference:  56%|█████▌    | 44/79 [00:19<00:15,  2.33it/s]

Accuracy: 0.4599, Loss: 0.2096


Inference:  57%|█████▋    | 45/79 [00:20<00:14,  2.34it/s]

Accuracy: 0.4699, Loss: 0.2147


Inference:  58%|█████▊    | 46/79 [00:20<00:14,  2.34it/s]

Accuracy: 0.4810, Loss: 0.2191


Inference:  59%|█████▉    | 47/79 [00:20<00:13,  2.35it/s]

Accuracy: 0.4913, Loss: 0.2242


Inference:  61%|██████    | 48/79 [00:21<00:13,  2.35it/s]

Accuracy: 0.5014, Loss: 0.2292


Inference:  62%|██████▏   | 49/79 [00:21<00:12,  2.35it/s]

Accuracy: 0.5126, Loss: 0.2331


Inference:  63%|██████▎   | 50/79 [00:22<00:12,  2.35it/s]

Accuracy: 0.5235, Loss: 0.2372


Inference:  65%|██████▍   | 51/79 [00:22<00:11,  2.35it/s]

Accuracy: 0.5349, Loss: 0.2411


Inference:  66%|██████▌   | 52/79 [00:23<00:11,  2.36it/s]

Accuracy: 0.5456, Loss: 0.2454


Inference:  67%|██████▋   | 53/79 [00:23<00:11,  2.32it/s]

Accuracy: 0.5562, Loss: 0.2502


Inference:  68%|██████▊   | 54/79 [00:23<00:10,  2.31it/s]

Accuracy: 0.5664, Loss: 0.2549


Inference:  70%|██████▉   | 55/79 [00:24<00:10,  2.27it/s]

Accuracy: 0.5771, Loss: 0.2593


Inference:  71%|███████   | 56/79 [00:24<00:10,  2.26it/s]

Accuracy: 0.5874, Loss: 0.2648


Inference:  72%|███████▏  | 57/79 [00:25<00:09,  2.28it/s]

Accuracy: 0.5982, Loss: 0.2693


Inference:  73%|███████▎  | 58/79 [00:25<00:09,  2.29it/s]

Accuracy: 0.6090, Loss: 0.2736


Inference:  75%|███████▍  | 59/79 [00:26<00:08,  2.31it/s]

Accuracy: 0.6197, Loss: 0.2783


Inference:  76%|███████▌  | 60/79 [00:26<00:08,  2.32it/s]

Accuracy: 0.6300, Loss: 0.2828


Inference:  77%|███████▋  | 61/79 [00:27<00:07,  2.33it/s]

Accuracy: 0.6407, Loss: 0.2874


Inference:  78%|███████▊  | 62/79 [00:27<00:07,  2.31it/s]

Accuracy: 0.6521, Loss: 0.2910


Inference:  80%|███████▉  | 63/79 [00:27<00:06,  2.32it/s]

Accuracy: 0.6636, Loss: 0.2940


Inference:  81%|████████  | 64/79 [00:28<00:06,  2.32it/s]

Accuracy: 0.6741, Loss: 0.2988


Inference:  82%|████████▏ | 65/79 [00:28<00:06,  2.33it/s]

Accuracy: 0.6845, Loss: 0.3029


Inference:  84%|████████▎ | 66/79 [00:29<00:05,  2.33it/s]

Accuracy: 0.6947, Loss: 0.3080


Inference:  85%|████████▍ | 67/79 [00:29<00:05,  2.33it/s]

Accuracy: 0.7049, Loss: 0.3133


Inference:  86%|████████▌ | 68/79 [00:30<00:04,  2.33it/s]

Accuracy: 0.7155, Loss: 0.3178


Inference:  87%|████████▋ | 69/79 [00:30<00:04,  2.33it/s]

Accuracy: 0.7257, Loss: 0.3232


Inference:  89%|████████▊ | 70/79 [00:30<00:03,  2.33it/s]

Accuracy: 0.7354, Loss: 0.3283


Inference:  90%|████████▉ | 71/79 [00:31<00:03,  2.31it/s]

Accuracy: 0.7460, Loss: 0.3332


Inference:  91%|█████████ | 72/79 [00:31<00:03,  2.29it/s]

Accuracy: 0.7570, Loss: 0.3370


Inference:  92%|█████████▏| 73/79 [00:32<00:02,  2.30it/s]

Accuracy: 0.7669, Loss: 0.3424


Inference:  94%|█████████▎| 74/79 [00:32<00:02,  2.31it/s]

Accuracy: 0.7776, Loss: 0.3476


Inference:  95%|█████████▍| 75/79 [00:33<00:01,  2.32it/s]

Accuracy: 0.7893, Loss: 0.3513


Inference:  96%|█████████▌| 76/79 [00:33<00:01,  2.33it/s]

Accuracy: 0.7998, Loss: 0.3555


Inference:  97%|█████████▋| 77/79 [00:33<00:00,  2.33it/s]

Accuracy: 0.8106, Loss: 0.3597


Inference: 100%|██████████| 79/79 [00:34<00:00,  2.29it/s]

Accuracy: 0.8212, Loss: 0.3646
Accuracy: 0.8315, Loss: 0.3720





In [11]:
from sklearn.metrics import classification_report
print(classification_report(all_predicted, all_target))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      5004
           1       0.83      0.83      0.83      4996

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

