<a href="https://colab.research.google.com/github/dbwls0087/BERT_MovieReviews/blob/main/code/LLM_BERT_MovieReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import os
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


In [16]:
csv_path = "/content/IMDB Dataset.csv"
model_name = "bert-base-uncased"
max_len = 256
batch_size = 16
epochs = 2
lr = 2e-5
save_dir = "/content/imdb_bert_sentiment"

device = "cuda" if torch.cuda.is_available() else "cpu"

print("device:", device)

device: cuda


##Preprocessing

In [17]:
df = pd.read_csv(csv_path).dropna(subset=["review", "sentiment"]).copy()

label_map = {"negative": 0, "positive": 1}
df["label"] = df["sentiment"].map(label_map).astype(int)

train_df, valid_df = train_test_split(
    df[["review", "label"]],
    test_size=0.1,
    random_state=42,
    stratify=df["label"]
)

print(len(train_df), len(valid_df))

45000 5000


In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

class imdb_dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.texts = dataframe["review"].astype(str).tolist()
        self.labels = dataframe["label"].astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len


    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

train_ds = imdb_dataset(train_df, tokenizer, max_len)
valid_ds = imdb_dataset(valid_df, tokenizer, max_len)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)


In [19]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [20]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

total_steps = epochs * len(train_loader)
warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)


In [21]:
def train_one_epoch():
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)


In [22]:
@torch.no_grad()
def valid_probs_labels():
    model.eval()
    probs_list, labels_list = [], []

    for batch in valid_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs = torch.softmax(logits, dim=-1)[:, 1]

        probs_list.append(probs.cpu().numpy())
        labels_list.append(labels)

    return np.concatenate(probs_list), np.concatenate(labels_list)


def tune_threshold(probs, labels):
    best_t, best_f1 = 0.5, -1.0
    for t in np.arange(0.3, 0.71, 0.02):
        preds = (probs >= t).astype(int)
        f1 = f1_score(labels, preds)
        if f1 > best_f1:
            best_f1, best_t = f1, float(t)
    return best_t, best_f1


In [23]:
os.makedirs(save_dir, exist_ok=True)

best_f1 = -1
best_threshold = 0.5

for epoch in range(epochs):
    loss = train_one_epoch()

    probs, labels = valid_probs_labels()
    t, f1 = tune_threshold(probs, labels)

    preds = (probs >= t).astype(int)
    acc = accuracy_score(labels, preds)

    print(epoch, loss, acc, f1)

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)


0 0.2652129758618854 0.9256 0.9265692854322938


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

1 0.12394454946938122 0.9294 0.9301957682420408


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
@torch.no_grad()
def infer(texts):
    tok = AutoTokenizer.from_pretrained(save_dir)
    mdl = AutoModelForSequenceClassification.from_pretrained(save_dir).to(device)
    mdl.eval()

    enc = tok(
        texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )

    logits = mdl(
        input_ids=enc["input_ids"].to(device),
        attention_mask=enc["attention_mask"].to(device)
    ).logits

    probs = torch.softmax(logits, dim=-1)[:, 1].detach().cpu().numpy()
    preds = (probs >= best_threshold).astype(int)
    return probs, preds

probs, preds = infer(["this movie was amazing", "worst movie ever"])
print("probs:", probs)
print("preds(1=positive):", preds)


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

probs: [0.9937196 0.0024023]
preds(1=positive): [1 0]


In [25]:
@torch.no_grad()
def predict_texts(texts, save_dir, device, max_len=256, batch_size=64, threshold=0.5):
    tok = AutoTokenizer.from_pretrained(save_dir)
    mdl = AutoModelForSequenceClassification.from_pretrained(save_dir).to(device)
    mdl.eval()

    all_probs = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        enc = tok(
            batch_texts,
            truncation=True,
            padding=True,
            max_length=max_len,
            return_tensors="pt"
        )

        logits = mdl(
            input_ids=enc["input_ids"].to(device),
            attention_mask=enc["attention_mask"].to(device)
        ).logits

        probs = torch.softmax(logits, dim=-1)[:, 1].detach().cpu().numpy()
        all_probs.append(probs)

    prob_pos = np.concatenate(all_probs, axis=0)
    pred_label = (prob_pos >= threshold).astype(int)
    pred_sentiment = np.where(pred_label == 1, "positive", "negative")

    return prob_pos, pred_label, pred_sentiment

In [26]:
df_all = pd.read_csv(csv_path).dropna(subset=["review"]).copy()
texts = df_all["review"].astype(str).tolist()

prob_pos, pred_label, pred_sentiment = predict_texts(
    texts=texts,
    save_dir=save_dir,
    device=device,
    max_len=max_len,
    batch_size=64,
    threshold=best_threshold
)

df_all["prob_pos"] = prob_pos
df_all["pred_label"] = pred_label
df_all["pred_sentiment"] = pred_sentiment

out_path = "/content/imdb_with_predictions.csv"
df_all.to_csv(out_path, index=False)

print("saved:", out_path)
df_all.head(3)

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

saved: /content/imdb_with_predictions.csv


Unnamed: 0,review,sentiment,prob_pos,pred_label,pred_sentiment
0,One of the other reviewers has mentioned that ...,positive,0.993809,1,positive
1,A wonderful little production. <br /><br />The...,positive,0.997703,1,positive
2,I thought this was a wonderful way to spend ti...,positive,0.997364,1,positive


In [30]:
@torch.no_grad()
def get_valid_probs_and_labels():
    model.eval()
    probs_list, labels_list = [], []

    for batch in valid_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs = torch.softmax(logits, dim=-1)[:, 1]

        probs_list.append(probs.cpu().numpy())
        labels_list.append(labels)

    return np.concatenate(probs_list), np.concatenate(labels_list)


In [31]:
from sklearn.metrics import confusion_matrix, classification_report


probs, labels = get_valid_probs_and_labels()


preds = (probs >= best_threshold).astype(int)

# confusion matrix
cm = confusion_matrix(labels, preds)
cm_df = pd.DataFrame(
    cm,
    index=["true_neg", "true_pos"],
    columns=["pred_neg", "pred_pos"]
)
print("confusion matrix (threshold =", best_threshold, ")")
print(cm_df)

# classification report
print("\nclassification report")
print(classification_report(labels, preds, target_names=["negative", "positive"], digits=4))


confusion matrix (threshold = 0.46000000000000013 )
          pred_neg  pred_pos
true_neg      2295       205
true_pos       148      2352

classification report
              precision    recall  f1-score   support

    negative     0.9394    0.9180    0.9286      2500
    positive     0.9198    0.9408    0.9302      2500

    accuracy                         0.9294      5000
   macro avg     0.9296    0.9294    0.9294      5000
weighted avg     0.9296    0.9294    0.9294      5000

