<a href="https://colab.research.google.com/github/ggsmith842/pytorch-tutorials/blob/main/BIDIRECTIONAL_LSTM_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -Uq datasets transformers

In [3]:
import os
import time
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm

from transformers import AutoTokenizer
from datasets import load_dataset, concatenate_datasets, Dataset
from torch.utils.data import DataLoader, TensorDataset

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# Load & Stage Data

In [None]:
# load imdb dataset from huggingface
ds = load_dataset("stanfordnlp/imdb")

In [6]:
merged_dataset = concatenate_datasets([ds['train'], ds['test']])
split_dataset = merged_dataset.train_test_split(test_size=0.25, shuffle=True, seed=42)

train = split_dataset['train']
test = split_dataset['test']

In [7]:
train['text'][0]

"This movie was one of the most boring horror movies I have seen in a long time (and I have seen a lot). Personally I liked the piercing take on it all that was original but other than that it was pretty unwatchable. I could not stand Dee Snider as an actor nor as a singer. I seemed that he was trying with everything he said to make it a memorable quote, which they weren't. I can get movies for free and I still didn't think it was worth the time to get."

# Preprocessing

In [8]:
# import tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
def tokenize_reviews(data: Dataset):
  inputs = data['text']
  tokens = tokenizer(
      inputs,
      return_tensors='pt',
      padding=True,
      truncation=True,
      max_length=512)

  return (tokens['input_ids'], tokens['attention_mask'])

In [10]:
train_input_ids, train_attention_mask = tokenize_reviews(train)
test_input_ids, test_attention_mask = tokenize_reviews(test)

In [11]:
train_labels = torch.tensor(train['label'])
test_labels = torch.tensor(test['label'])

## Train & Validation Loader

In [12]:
# create custom datasets
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_labels
    )

test_dataset = TensorDataset(
    test_input_ids,
    test_attention_mask,
    test_labels
)


In [13]:
batch_size=32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=True)

# Model Instantiation and Training

In [21]:
class LSTM(nn.Module):
  def __init__(self, embedding_dim: int, hidden_dim: int , output_dim: int, num_layers: int, dropout: float = 0.0):
    super(LSTM, self).__init__()
    vocab_size = len(tokenizer.get_vocab())
    self.embedding = nn.Embedding.from_pretrained(torch.zeros(vocab_size,embedding_dim))
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)

    # dropout % to prevent overfitting
    self.dropout = nn.Dropout(dropout)
    # fully connected (linear) layer; x2 because bidirectional
    self.fc = nn.Linear(hidden_dim * 2, output_dim)

  def forward(self, input_ids, attention_mask):
    # pass input ids to get embeddings
    embedded = self.embedding(input_ids)
    outputs, _ = self.lstm(embedded)
    outputs = self.dropout(outputs)

    outputs = self.fc(outputs[:, -1, :])

    return outputs

In [27]:
# define training loop
def training_loop(model: nn.Module, dataloader: DataLoader, epochs: int = 10):
  model.train()

  for epoch in range(epochs):
    correct = 0
    total = 0

    for batch in tqdm(dataloader):
      batch = [item.to(device, non_blocking=True) for item in batch]
      input_ids, attention_mask, labels = batch
      optimizer.zero_grad()
      outputs = model(input_ids, attention_mask)
      predicted = (torch.sigmoid(outputs) > 0.5).float()
      total += labels.size(0)
      correct += (predicted.squeeze() == labels.float()).sum().item()
      loss = loss_func(outputs.squeeze(), labels.float())
      loss.backward()
      optimizer.step()
    accuracy = correct / total
    print(f'Epoch {1+epoch}/{epochs}: Loss {loss.item()} | Accuracy {accuracy*100:.2f}%\n')


In [35]:
def eval_loop(model: nn.Module, dataloader: DataLoader):
  model.eval()
  correct = 0
  total = 0
  with torch.no_grad():
    for batch in tqdm(dataloader):
      batch = [item.to(device) for item in batch]
      input_ids, attention_mask, labels = batch
      outputs = model(input_ids, attention_mask)
      predicted = (torch.sigmoid(outputs) > 0.5).float()
      total += labels.size(0)
      correct += (predicted.squeeze() == labels.float()).sum().item()
  accuracy = correct/ total

  print(f'Accuracy: {accuracy*100:.2f}%')

In [29]:
# Initialize the model
embedding_dim = 128
hidden_dim = 256
output_dim = 1
num_layers = 2
dropout = 0.5

In [30]:
lstm_model = LSTM(embedding_dim=embedding_dim, output_dim=output_dim, num_layers=num_layers, hidden_dim=hidden_dim).to(device)
optimizer = optim.Adam(lstm_model.parameters(), lr=0.003)
loss_func = nn.BCEWithLogitsLoss().to(device)

In [31]:
training_loop(lstm_model, train_dataloader, 10)

100%|██████████| 1172/1172 [01:49<00:00, 10.66it/s]


Epoch 1/10: Loss 0.6946743726730347 | Accuracy 50.52%


100%|██████████| 1172/1172 [02:05<00:00,  9.36it/s]


Epoch 2/10: Loss 0.6942399144172668 | Accuracy 50.14%


100%|██████████| 1172/1172 [01:58<00:00,  9.90it/s]


Epoch 3/10: Loss 0.6887026429176331 | Accuracy 50.22%


100%|██████████| 1172/1172 [01:53<00:00, 10.36it/s]


Epoch 4/10: Loss 0.692821204662323 | Accuracy 50.10%


100%|██████████| 1172/1172 [01:46<00:00, 10.97it/s]


Epoch 5/10: Loss 0.6931650042533875 | Accuracy 49.89%


100%|██████████| 1172/1172 [01:45<00:00, 11.06it/s]


Epoch 6/10: Loss 0.6937905550003052 | Accuracy 49.77%


100%|██████████| 1172/1172 [01:45<00:00, 11.13it/s]


Epoch 7/10: Loss 0.6930750608444214 | Accuracy 49.97%


100%|██████████| 1172/1172 [01:45<00:00, 11.12it/s]


Epoch 8/10: Loss 0.6940949559211731 | Accuracy 49.73%


100%|██████████| 1172/1172 [01:45<00:00, 11.13it/s]


Epoch 9/10: Loss 0.6928125023841858 | Accuracy 49.95%


100%|██████████| 1172/1172 [01:45<00:00, 11.14it/s]

Epoch 10/10: Loss 0.6923677325248718 | Accuracy 50.78%





In [36]:
eval_loop(lstm_model, test_dataloader)

100%|██████████| 391/391 [00:14<00:00, 27.15it/s]

Accuracy: 50.68%



