In [1]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import DataLoader, Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load IMDB dataset
def load_imdb_data():
    from datasets import load_dataset
    dataset = load_dataset("imdb")
    train_texts = dataset['train']['text']
    train_labels = dataset['train']['label']
    test_texts = dataset['test']['text']
    test_labels = dataset['test']['label']
    return train_texts, train_labels, test_texts, test_labels

In [3]:
# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [4]:
# LSTM-BERT Model
class LSTMBERT(nn.Module):
    def __init__(self, bert_model_name, hidden_size, num_classes):
        super(LSTMBERT, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(input_size=768, hidden_size=hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        lstm_output, _ = self.lstm(bert_output.last_hidden_state)
        output = self.fc(lstm_output[:, -1, :])
        return output

In [5]:
# Training Function
def train_model(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

In [6]:
# Evaluation Function
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=['Negative', 'Positive'])
    return accuracy, report

In [7]:
if __name__ == "__main__":
    # Load IMDB dataset
    train_texts, train_labels, test_texts, test_labels = load_imdb_data()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 45480.77 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 70039.22 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 72989.28 examples/s]


In [8]:
# Parameters
max_len = 128
batch_size = 16
epochs = 3
learning_rate = 2e-5
hidden_size = 256
num_classes = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Prepare datasets and dataloaders
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_len)
test_dataset = CustomDataset(test_texts, test_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [11]:
# Initialize LSTM-BERT model
model = LSTMBERT('bert-base-uncased', hidden_size, num_classes).to(device)

In [12]:
# Optimizer and Loss Function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [13]:
# Training Loop
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss:.4f}")

    test_accuracy, test_report = evaluate_model(model, test_loader, device)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print("Test Report:\n", test_report)

Epoch 1/3, Training Loss: 0.5460
Test Accuracy: 0.7977
Test Report:
               precision    recall  f1-score   support

    Negative       0.78      0.84      0.81     12500
    Positive       0.82      0.76      0.79     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000

Epoch 2/3, Training Loss: 0.4430
Test Accuracy: 0.8155
Test Report:
               precision    recall  f1-score   support

    Negative       0.82      0.81      0.82     12500
    Positive       0.81      0.82      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

Epoch 3/3, Training Loss: 0.4188
Test Accuracy: 0.8156
Test Report:
               precision    recall  f1-score   support

    Negative       0.79      0.87      0.82     12500
    Positive       0.85      0

In [14]:
# Save LSTM-BERT model
torch.save(model.state_dict(), "lstmb_bert_model_imdb.pth")
print("LSTM-BERT Model training and evaluation completed.")

LSTM-BERT Model training and evaluation completed.
