In [1]:
# load data
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/imdb")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# split the dataset into training and testing
train_dataset, test_dataset = dataset['train'], dataset['test']

In [3]:
# lightweight preprocessing
import re

def preprocess_function(example):
    # lower case
    example['text'] = example['text'].lower()
    # remove html tags
    example['text'] = re.sub(r'<.*?>', '', example['text'])
    # remove special characters
    example['text'] = re.sub(r'[^a-zA-Z0-9 ]', '', example['text'])
    # remove extra spaces
    example['text'] = re.sub(r'\s+', ' ', example['text']).strip()
    
    return example


train_dataset = train_dataset.map(
    preprocess_function,
    desc="Running preprocessing on trainset",
)
test_dataset = test_dataset.map(
    preprocess_function,
    desc="Running preprocessing on testset",
)

In [4]:
train_dataset['text'][0]

'i rented i am curiousyellow from my video store because of all the controversy that surrounded it when it was first released in 1967 i also heard that at first it was seized by us customs if it ever tried to enter this country therefore being a fan of films considered controversial i really had to see this for myselfthe plot is centered around a young swedish drama student named lena who wants to learn everything she can about life in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states in between asking politicians and ordinary denizens of stockholm about their opinions on politics she has sex with her drama teacher classmates and married menwhat kills me about i am curiousyellow is that 40 years ago this was considered pornographic really the sex and nudity scenes are few and far between even then its not shot like some cheaply made

In [5]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True, return_tensors='pt')


In [6]:
import torch 

def setup_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using CUDA {torch.cuda.get_device_name(0)}")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using MPS")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

device = setup_device()

Using MPS


In [8]:
def to_device(inputs: dict[str, torch.Tensor], device: torch.device):
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor):
            inputs[k] = v.to(device)

    return inputs


In [9]:
model = model.to(device)

In [10]:
# extract embeddings from bert-base-uncased model
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

train_embeddings_mean_pooling = []
train_embeddings_cls_token = []
train_labels = []

with torch.inference_mode():
    for batch in tqdm(train_loader):
        labels = batch['label']
        encoded_input = tokenize_function(batch)
        encoded_input = to_device(encoded_input, device)
        output = model(**encoded_input)
        
        embeddings = output.last_hidden_state.mean(dim=1).cpu() # mean pooling
        train_embeddings_mean_pooling.append(embeddings)
        
        embeddings = output.pooler_output.cpu() # cls token
        train_embeddings_cls_token.append(embeddings)
        
        train_labels.append(labels)
        
        
train_embeddings_mean_pooling = torch.cat(train_embeddings_mean_pooling, dim=0)
train_embeddings_cls_token = torch.cat(train_embeddings_cls_token, dim=0)
train_labels = torch.cat(train_labels, dim=0)

100%|██████████| 1563/1563 [03:41<00:00,  7.05it/s]


In [11]:
test_loader = DataLoader(test_dataset, batch_size=16)

test_embeddings_mean_pooling = []
test_embeddings_cls_token = []
test_labels = []

with torch.inference_mode():
    for batch in tqdm(test_loader):
        labels = batch['label']
        encoded_input = tokenize_function(batch)
        encoded_input = to_device(encoded_input, device)
        output = model(**encoded_input)
        
        embeddings = output.last_hidden_state.mean(dim=1).cpu() # mean pooling
        test_embeddings_mean_pooling.append(embeddings)
        
        embeddings = output.pooler_output.cpu() # cls token
        test_embeddings_cls_token.append(embeddings)
        
        test_labels.append(labels)
        
        
test_embeddings_mean_pooling = torch.cat(test_embeddings_mean_pooling, dim=0)
test_embeddings_cls_token = torch.cat(test_embeddings_cls_token, dim=0)
test_labels = torch.cat(test_labels, dim=0)

100%|██████████| 1563/1563 [03:39<00:00,  7.13it/s]


In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def evaluate_model(X_train, y_train, X_test, y_test, model, method_name, model_name):
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return {
        "Method Name": method_name,
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

In [13]:
results = []

In [14]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

results.append(evaluate_model(train_embeddings_mean_pooling.numpy(), train_labels.numpy(), test_embeddings_mean_pooling.numpy(), test_labels.numpy(), LogisticRegression(max_iter=2000), "bert-base-uncased + mean pooling", "LogisticRegression"))
results.append(evaluate_model(train_embeddings_cls_token.numpy(), train_labels.numpy(), test_embeddings_cls_token.numpy(), test_labels.numpy(), LogisticRegression(max_iter=2000), "bert-base-uncased + cls token", "LogisticRegression"))

In [15]:
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

results.append(evaluate_model(train_embeddings_mean_pooling.numpy(), train_labels.numpy(), test_embeddings_mean_pooling.numpy(), test_labels.numpy(), KNeighborsClassifier(n_neighbors=5), "bert-base-uncased + mean pooling", "kNN"))
results.append(evaluate_model(train_embeddings_cls_token.numpy(), train_labels.numpy(), test_embeddings_cls_token.numpy(), test_labels.numpy(), KNeighborsClassifier(n_neighbors=5), "bert-base-uncased + cls token", "kNN"))

In [16]:
from xgboost import XGBClassifier

results.append(evaluate_model(train_embeddings_mean_pooling.numpy(), train_labels.numpy(), test_embeddings_mean_pooling.numpy(), test_labels.numpy(),XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), "bert-base-uncased + mean pooling", "XGBoost"))
results.append(evaluate_model(train_embeddings_cls_token.numpy(), train_labels.numpy(), test_embeddings_cls_token.numpy(), test_labels.numpy(), XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), "bert-base-uncased + cls token", "XGBoost"))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [18]:
import pandas as pd

metrics_df = pd.DataFrame(results)
metrics_df

Unnamed: 0,Method Name,Model,Accuracy,Precision,Recall,F1 Score
0,bert-base-uncased + mean pooling,LogisticRegression,0.8376,0.841922,0.83128,0.836567
1,bert-base-uncased + cls token,LogisticRegression,0.80924,0.817756,0.79584,0.806649
2,bert-base-uncased + mean pooling,kNN,0.74832,0.800659,0.66128,0.724325
3,bert-base-uncased + cls token,kNN,0.65368,0.677542,0.58648,0.628731
4,bert-base-uncased + mean pooling,XGBoost,0.81032,0.813075,0.80592,0.809482
5,bert-base-uncased + cls token,XGBoost,0.75376,0.759362,0.74296,0.751072
