In [None]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
import sentencepiece as spm
from datasets import load_dataset
from sklearn.metrics import accuracy_score

# Fetch the IMDb reviews corpus through the `datasets` library
dataset = load_dataset('imdb')

# Pull the review strings and their labels out of each split
train_split, test_split = dataset['train'], dataset['test']
train_texts, train_labels = train_split['text'], train_split['label']
test_texts, test_labels = test_split['text'], test_split['label']


In [None]:
# Sequence-length limits
max_length = 512            # token ids kept per review after truncation
max_sentence_length = 4192  # SentencePiece limit (in bytes) for one training sentence

# Train a BPE tokenizer using SentencePiece.
# The corpus file is written as UTF-8 explicitly so the result does not depend
# on the platform's default encoding.
with open('train_texts.txt', 'w', encoding='utf-8') as f:
    for text in train_texts:
        # SentencePiece measures max_sentence_length in bytes, so compare the
        # encoded size rather than the character count.
        if len(text.encode('utf-8')) <= max_sentence_length:
            f.write("%s\n" % text)

spm.SentencePieceTrainer.train(
    input='train_texts.txt',
    model_prefix='bpe',
    # Without an explicit model_type SentencePiece trains its default unigram
    # model, which is not the BPE model this experiment is meant to compare.
    model_type='bpe',
    vocab_size=10000,
    max_sentence_length=max_sentence_length,
    # Reserve id 0 for <pad> so that zero-padding of batches does not collide
    # with <unk> (the default layout has no pad piece and puts <unk> at id 0).
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
sp_bpe = spm.SentencePieceProcessor(model_file='bpe.model')

# Tokenize using BPE and truncate
train_texts_bpe = [sp_bpe.encode_as_ids(text)[:max_length] for text in train_texts]
test_texts_bpe = [sp_bpe.encode_as_ids(text)[:max_length] for text in test_texts]

# Standard tokenization using BertTokenizer and truncate
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_texts_non_bpe = [tokenizer.encode(text, add_special_tokens=True, max_length=max_length, truncation=True) for text in train_texts]
test_texts_non_bpe = [tokenizer.encode(text, add_special_tokens=True, max_length=max_length, truncation=True) for text in test_texts]


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: train_texts.txt
  input_format: 
  model_prefix: bpe
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differ

In [None]:
class IMDBDataset(Dataset):
    """Map-style dataset pairing pre-tokenised reviews with their labels.

    Args:
        texts: Indexable collection; ``texts[i]`` is a sequence of integer
            token ids for example ``i``.
        labels: Indexable collection; ``labels[i]`` is the integer class
            label for example ``i``.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for example ``idx`` as int64 tensors."""
        # The dtype is pinned because torch.tensor([]) would otherwise produce
        # a float32 tensor for an empty token list, which an embedding lookup
        # cannot consume.
        token_ids = torch.tensor(self.texts[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return token_ids, label

def collate_fn(batch):
    """Merge ``(token_ids, label)`` pairs into one right-padded batch.

    Args:
        batch: Iterable of 2-tuples; the first item of each is a 1-D tensor
            of token ids, the second its label.

    Returns:
        A ``(padded_ids, labels)`` tuple: ``padded_ids`` stacks the sequences
        batch-first, filling shorter rows with 0 on the right; ``labels`` is
        a single tensor built from the per-example labels.
    """
    sequences, targets = zip(*batch)
    padded_ids = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_ids, torch.tensor(targets)

# Wrap each tokenisation's train/test ids in a Dataset (BPE first, then BERT).
train_dataset_bpe, test_dataset_bpe = (
    IMDBDataset(train_texts_bpe, train_labels),
    IMDBDataset(test_texts_bpe, test_labels),
)
train_dataset_non_bpe, test_dataset_non_bpe = (
    IMDBDataset(train_texts_non_bpe, train_labels),
    IMDBDataset(test_texts_non_bpe, test_labels),
)

# Settings shared by every loader; only the shuffle flag differs between
# training (shuffled) and evaluation (fixed order).
_loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)

train_loader_bpe = DataLoader(train_dataset_bpe, shuffle=True, **_loader_kwargs)
test_loader_bpe = DataLoader(test_dataset_bpe, shuffle=False, **_loader_kwargs)

train_loader_non_bpe = DataLoader(train_dataset_non_bpe, shuffle=True, **_loader_kwargs)
test_loader_non_bpe = DataLoader(test_dataset_non_bpe, shuffle=False, **_loader_kwargs)


In [None]:
class SentimentClassifier(nn.Module):
    """Bidirectional-LSTM classifier over right-padded batches of token ids.

    Args:
        vocab_size: Number of rows in the embedding table; every token id
            fed to ``forward`` must be smaller than this.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of logits produced per example.
        pad_idx: Token id used to right-pad shorter sequences in a batch.
            Defaults to 0.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` whose rows are
                right-padded with ``pad_idx``.
        """
        # True length of each row = 1-based position of its last non-pad
        # token. Only the trailing run of pad ids is dropped, so a pad_idx
        # value that happens to occur mid-sequence is still fed to the LSTM.
        # Rows that are entirely padding are given length 1 because packing
        # rejects zero-length sequences.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != self.pad_idx).long() * positions).max(dim=1).values.clamp(min=1)

        embedded = self.embedding(x)
        # Packing stops each sequence at its own length. Reading the last
        # column of the padded output instead would return the forward state
        # after the pad tokens and the backward state after one single step.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # For this single-layer bidirectional LSTM, h_n[-2] is the forward
        # direction's final state and h_n[-1] the backward direction's.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(summary)


In [None]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=1):
    """Train ``model`` in place and return the mean loss of the last epoch.

    Args:
        model: Module to optimise; batches are moved to the device its
            parameters live on.
        train_loader: Iterable yielding ``(inputs, labels)`` tensor pairs.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of full passes over ``train_loader`` (default 1).

    Returns:
        The batch-size-weighted mean of the per-batch loss values over the
        last epoch that saw data, as a float; ``nan`` if no batch was seen.
    """
    # Use the model's own device rather than a module-level global, so the
    # function works regardless of the order notebook cells were run in.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    mean_loss = float('nan')
    for _ in range(num_epochs):
        loss_sum, example_count = 0.0, 0
        for texts, labels in train_loader:
            texts, labels = texts.to(model_device), labels.to(model_device)
            optimizer.zero_grad()
            loss = criterion(model(texts), labels)
            loss.backward()
            optimizer.step()
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            example_count += batch_size
        if example_count:
            mean_loss = loss_sum / example_count
    return mean_loss

def evaluate_model(model, test_loader):
    """Return the classification accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module mapping a batch of inputs to per-class scores of shape
            ``(batch, num_classes)``; it is switched to eval mode.
        test_loader: Iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        The value of ``accuracy_score(true_labels, predicted_labels)``, where
        each prediction is the index of the highest score.
    """
    # Use the model's own device rather than a module-level global, so the
    # function works regardless of the order notebook cells were run in.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for texts, labels in test_loader:
            outputs = model(texts.to(model_device))
            all_preds.extend(outputs.argmax(dim=1).tolist())
            # Labels are only compared on the host, so they never need to
            # be moved to the model's device.
            all_labels.extend(labels.tolist())
    return accuracy_score(all_labels, all_preds)

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# start from the same state on every run.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters shared by both models
# The BPE vocabulary size is read from the trained SentencePiece model so the
# embedding table cannot drift out of sync with the tokenizer.
vocab_size_bpe = sp_bpe.get_piece_size()
vocab_size_non_bpe = tokenizer.vocab_size
embed_dim = 128
hidden_dim = 64
output_dim = 2

# Both models are trained with the same loss.
criterion = nn.CrossEntropyLoss()

# Model on SentencePiece ids
model_bpe = SentimentClassifier(vocab_size_bpe, embed_dim, hidden_dim, output_dim).to(device)
optimizer_bpe = optim.Adam(model_bpe.parameters(), lr=0.001)

train_model(model_bpe, train_loader_bpe, criterion, optimizer_bpe)
accuracy_bpe = evaluate_model(model_bpe, test_loader_bpe)

# Model on BertTokenizer ids
model_non_bpe = SentimentClassifier(vocab_size_non_bpe, embed_dim, hidden_dim, output_dim).to(device)
optimizer_non_bpe = optim.Adam(model_non_bpe.parameters(), lr=0.001)

train_model(model_non_bpe, train_loader_non_bpe, criterion, optimizer_non_bpe)
accuracy_non_bpe = evaluate_model(model_non_bpe, test_loader_non_bpe)

print(f"BPE Model Accuracy: {accuracy_bpe}")
print(f"Non-BPE Model Accuracy: {accuracy_non_bpe}")


BPE Model Accuracy: 0.50424
Non-BPE Model Accuracy: 0.50144
