In [1]:
#! pip install torchtext==0.17.2
#! pip install numpy==1.24.4

In [2]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import StepLR
from torch.nn.utils import clip_grad_norm_

from torchtext.data.utils import get_tokenizer

from collections import Counter

from sklearn.model_selection import train_test_split

import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
imdb_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Text DL/Text Classification CNN/IMDB Dataset.csv')
df = imdb_data.copy()

In [21]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [22]:
tokenizer = get_tokenizer('basic_english')

In [23]:
def preprocess(text):
  text = re.sub('<br />', ' ', text)
  text = re.sub('[^a-zA-Z]', ' ', text)
  text = text.lower()
  return tokenizer(text)

In [24]:
df['review'] = df['review'].apply(preprocess)

word_count = Counter(word for review in df['review'] for word in review)
vocabs = sorted(word_count, key=word_count.get, reverse=True)
w_i = {w:i+1 for i, w in enumerate(vocabs)}

encoded_review = [[w_i[word] for word in review] for review in df['review']]
encoded_labels = [1 if sentiment == 'positive' else 0 for sentiment in df['sentiment']]

padded_review = pad_sequence([torch.tensor(r) for r in encoded_review], batch_first=True, padding_value=0)[:,:200]

features = torch.tensor(padded_review)
labels = torch.tensor(encoded_labels)

  features = torch.tensor(padded_review)


In [25]:
class IMDBDataset(Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.features)

  def __getitem__(self, idx):
    return self.features[idx], self.labels[idx]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

train_ds = IMDBDataset(X_train, y_train)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)

test_ds = IMDBDataset(X_test, y_test)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=True)

In [50]:
class LSTMClassifier(nn.Module):
  def __init__(self, vocab_size, embedd_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
    super().__init__()
    self.embedded = nn.Embedding(vocab_size, embedd_dim, padding_idx=0)
    self.lstm = nn.LSTM(
        input_size=embedd_dim,
        hidden_size=hidden_dim,
        num_layers=n_layers,
        bidirectional=bidirectional,
        dropout=dropout if n_layers > 0 else 0,
        batch_first=True
    )
    self.fc = nn.Linear(hidden_dim * 2, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    embedded = self.embedded(text)
    _, (hidden, cell) = self.lstm(embedded)
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
    return self.fc(hidden)

In [51]:
VOCAB_SIZE = len(vocabs)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256  # Dimension of the hidden state
OUTPUT_DIM = 1
N_LAYERS = 2      # Number of stacked RNN layers
BIDIRECTIONAL = True
DROPOUT = 0.5

In [52]:
model = LSTMClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

device = ('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')
model = model.to(device)

device: cuda


In [53]:
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=5, gamma=0.2)

In [54]:
def calc_acc(preds, y):
  rounded = torch.round(torch.sigmoid(preds))
  correct = (rounded == y).float()
  acc = correct.sum() / len(correct)
  return acc

In [58]:
def train_model(model, iters, criterion, optimizer):
  epoch_loss, epoch_acc = 0, 0
  model.train()

  for batch in iters:
    text, labels = batch
    text, labels = text.to(device), labels.to(device)

    optimizer.zero_grad()
    output = model(text).squeeze(1)
    loss = criterion(output, labels.float())
    acc = calc_acc(output, labels.float())

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss/len(iters), epoch_acc/len(iters)

In [59]:
def eval_model(model, iters, criterion):
  epoch_loss, epoch_acc = 0, 0
  model.eval()

  with torch.no_grad():
    for batch in iters:
      text, labels = batch
      text, labels = text.to(device), labels.to(device)

      output = model(text).squeeze(1)
      loss = criterion(output, labels.float())
      acc = calc_acc(output, labels.float())

      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return epoch_loss/len(iters), epoch_acc/len(iters)


In [60]:
for epoch in range(20):
    train_loss, train_acc = train_model(model, train_dl, criterion, optimizer)
    valid_loss, valid_acc = eval_model(model, test_dl, criterion)
    scheduler.step()

    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')


Epoch: 01 | Train Loss: 0.539 | Train Acc: 73.21% | Val. Loss: 0.500 | Val. Acc: 76.27%
Epoch: 02 | Train Loss: 0.464 | Train Acc: 78.59% | Val. Loss: 0.471 | Val. Acc: 77.93%
Epoch: 03 | Train Loss: 0.416 | Train Acc: 81.71% | Val. Loss: 0.396 | Val. Acc: 83.01%
Epoch: 04 | Train Loss: 0.382 | Train Acc: 83.36% | Val. Loss: 0.379 | Val. Acc: 83.72%
Epoch: 05 | Train Loss: 0.355 | Train Acc: 84.71% | Val. Loss: 0.383 | Val. Acc: 83.84%
Epoch: 06 | Train Loss: 0.304 | Train Acc: 87.55% | Val. Loss: 0.360 | Val. Acc: 85.08%
Epoch: 07 | Train Loss: 0.292 | Train Acc: 88.03% | Val. Loss: 0.354 | Val. Acc: 85.49%
Epoch: 08 | Train Loss: 0.284 | Train Acc: 88.46% | Val. Loss: 0.354 | Val. Acc: 85.38%
Epoch: 09 | Train Loss: 0.278 | Train Acc: 88.71% | Val. Loss: 0.361 | Val. Acc: 85.66%
Epoch: 10 | Train Loss: 0.271 | Train Acc: 89.11% | Val. Loss: 0.347 | Val. Acc: 86.05%
Epoch: 11 | Train Loss: 0.258 | Train Acc: 89.71% | Val. Loss: 0.352 | Val. Acc: 85.93%
Epoch: 12 | Train Loss: 0.256 | 