In [20]:
import pandas as pd
import numpy as np

# fix the global NumPy seed so the shuffle below is reproducible
np.random.seed(1337)

# load the pre-processed reviews and shuffle the rows once
data = pd.read_csv('./imdb_processed.csv')
row_order = np.random.permutation(len(data))
data = data.iloc[row_order]

# summary statistics of the review lengths (in characters) before truncation
review_lengths = data['text'].str.len()
print(review_lengths.describe())

# keep at most the first 1000 characters of every review
data['text'] = data['text'].str.slice(stop=1000)

count    50000.000000
mean       858.389040
std        658.428061
min         22.000000
25%        452.000000
50%        633.000000
75%       1044.000000
max       9434.000000
Name: text, dtype: float64


In [21]:
# fraction of all rows kept for fitting; the remaining rows form the test set
train_size = 0.8
# fraction used later, when the fitting rows are divided into train / validation
validation_size = 0.5

# position of the first test row
split_id = int(len(data) * train_size)

# positional split of the texts and of the labels at the same boundary
temp_train_x = data['text'].iloc[:split_id]
test_x = data['text'].iloc[split_id:]
temp_train_y = data['label'].iloc[:split_id]
test_y = data['label'].iloc[split_id:]

# at this point only the (train + validation) portion and the test portion
# exist; train_x / val_x are created after encoding, so only the test shape
# is reported here
print('Feature Shapes:')
print('===============')
print('Test set: {}'.format(test_x.shape))

Feature Shapes:
Test set: (10000,)


In [22]:
# NOTE: the vocabulary is fitted on temp_train_x, so the test split is
# excluded, but temp_train_x still contains the rows that are later split
# off for validation.
# TODO: fit the vocabulary on the final training rows only

from collections import Counter
from tqdm import tqdm
tqdm.pandas()

# flatten the whole corpus into one list of whitespace-separated tokens
words = temp_train_x.str.cat(sep=' ').split()

# count every token, then rank the distinct tokens from most to least frequent
frequency_counter = Counter(words)
vocab = sorted(frequency_counter, key=lambda token: frequency_counter[token], reverse=True)

# the 5000 most frequent words receive ids 2..5001 (lower id = more frequent);
# ids 0 and 1 are reserved for the padding and unknown-word markers
int2word = {token_id: word for token_id, word in enumerate(vocab[:5000], start=2)}
int2word.update({0: '<PAD>', 1: '<UNK>'})
word2int = {word: token_id for token_id, word in int2word.items()}


def _encode(review):
  """Return the ids of the whitespace-separated tokens of `review` (1 for unknown words)."""
  return [word2int.get(token, 1) for token in review.split()]


# encode every review as a list of ids
reviews_enc = [_encode(review) for review in tqdm(temp_train_x.values)]

100%|██████████| 40000/40000 [00:00<00:00, 92114.08it/s]


In [23]:
# Build a fixed-width integer matrix from the encoded reviews.
# Every review is LEFT-padded with 0 (the <PAD> id) so that the last column
# always holds the final real token of the review.
# TODO: ids only go up to 5001, so a narrower *integer* dtype (e.g. int16)
# would save memory; float16 is not suitable because it cannot represent
# integers above 2048 exactly.
seq_length = 1000
features = np.zeros((len(reviews_enc), seq_length), dtype=int)

for i, row in enumerate(reviews_enc):
  # keep at most the first `seq_length` tokens; without this a longer review
  # would produce a negative start index and a shape-mismatch error
  tokens = row[:seq_length]
  # skip empty reviews: `-0:` would address the whole row instead of nothing
  if len(tokens) > 0:
    features[i, -len(tokens):] = tokens

# make train and val set
# `validation_size` is the fraction of rows that goes to VALIDATION, so the
# training part is everything before the last `n_val` rows (for the current
# value of 0.5 this yields the same boundary as multiplying directly).
n_val = int(len(temp_train_x) * validation_size)
split_val_id = len(temp_train_x) - n_val
train_x, val_x = features[:split_val_id], features[split_val_id:]
train_y, val_y = temp_train_y.iloc[:split_val_id], temp_train_y.iloc[split_val_id:]

In [24]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

# number of samples per mini-batch, shared by both loaders
batch_size = 128

# pair the padded feature matrices with their labels as tensor datasets
trainset = TensorDataset(
  torch.from_numpy(train_x),
  torch.from_numpy(train_y.to_numpy()),
)
validset = TensorDataset(
  torch.from_numpy(val_x),
  torch.from_numpy(val_y.to_numpy()),
)
# TODO: no test dataset yet -- test_x still holds raw text, not encoded/padded arrays

# batch iterators over the two datasets (both reshuffled on every pass)
trainloader = DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
valloader = DataLoader(dataset=validset, batch_size=batch_size, shuffle=True)
# TODO: add a test loader once the test set has been encoded

In [25]:
# model architecture

class SentimentModel(nn.Module):
  """LSTM classifier that maps a batch of token-id sequences to sigmoid scores.

  Pipeline: embedding -> (stacked) LSTM -> output of the last time step ->
  dropout -> linear layer -> sigmoid.

  Args:
    vocab_size: number of rows in the embedding table.
    output_size: number of units produced by the final linear layer.
    hidden_size: size of the LSTM hidden state.
    embedding_size: dimension of the token embeddings.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability applied between stacked LSTM layers
      (ignored when `n_layers` is 1, where it would have no effect).
    head_dropout: dropout probability applied to the last LSTM output
      before the linear layer (previously hard-coded to 0.3).
    padding_idx: optional id of the padding token; when given, its embedding
      is initialised to zeros and receives no gradient updates. The default
      `None` keeps the embedding layer exactly as before.
  """

  def __init__(self, vocab_size, output_size, hidden_size=128, embedding_size=400, n_layers=2, dropout=0.2,
               head_dropout=0.3, padding_idx=None):
    super(SentimentModel, self).__init__()
    # embedding layer maps token ids to dense vectors
    self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
    # inter-layer dropout only exists between stacked layers; passing a
    # non-zero value with a single layer just triggers a PyTorch warning
    lstm_dropout = dropout if n_layers > 1 else 0.0
    # LSTM layer provided by PyTorch; batch_first => input is (batch, seq, feature)
    self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, dropout=lstm_dropout, batch_first=True)
    # dropout applied to the sequence summary
    self.dropout = nn.Dropout(head_dropout)
    # linear layer producing the output units
    self.fc = nn.Linear(hidden_size, output_size)
    # sigmoid squashes the outputs into (0, 1) for binary classification
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Return sigmoid scores of shape (batch, output_size) for ids `x` of shape (batch, seq_len)."""
    # embedding lookup requires integer (long) indices
    embedded = self.embedding(x.long())
    # run the whole sequence through the LSTM; final hidden/cell states are not needed
    lstm_out, _ = self.lstm(embedded)
    # keep only the output at the last time step
    last_step = lstm_out[:, -1, :]
    # dropout -> linear -> sigmoid
    scores = self.fc(self.dropout(last_step))
    return self.sigmoid(scores)

In [26]:
# define training device: prefer CUDA, then Apple-silicon MPS, then CPU.
# (A single if/elif chain is used so that a later check can no longer
# overwrite an earlier, better choice.)
if torch.cuda.is_available():
  device = torch.device('cuda')
elif torch.backends.mps.is_available():
  device = torch.device('mps')
else:
  device = torch.device('cpu')
print(device)

mps


In [27]:
# model hyperparameters
vocab_size = len(word2int)  # one embedding row per known id, special tokens included
output_size = 1             # a single output unit
embedding_size = 256
hidden_size = 512
n_layers = 2
dropout = 0.25              # dropout between the stacked LSTM layers

print(f"VOCAB SIZE: {vocab_size}")

# build the network from the settings above and show its layer summary
model = SentimentModel(
  vocab_size=vocab_size,
  output_size=output_size,
  hidden_size=hidden_size,
  embedding_size=embedding_size,
  n_layers=n_layers,
  dropout=dropout,
)
print(model)

VOCAB SIZE: 5002
SentimentModel(
  (embedding): Embedding(5002, 256)
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.25)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [28]:
# training configuration
lr = 1e-3
# binary cross-entropy on probabilities; matches the sigmoid output of the model
criterion = nn.BCELoss()
optim = Adam(model.parameters(), lr=lr)
grad_clip = 5    # presumably the gradient-clipping threshold -- confirm in the training loop
epochs = 8
print_every = 1  # presumably the reporting interval -- confirm in the training loop

# per-metric lists to be filled during training, plus the planned number of epochs
history = {metric: [] for metric in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}
history['epochs'] = epochs

es_limit = 5     # presumably the early-stopping patience -- confirm in the training loop

In [29]:
# TODO: read and understand training and testing data