<a href="https://colab.research.google.com/github/dbwls0087/BERT_MovieReviews/blob/main/code/LLM_BERT_MovieReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup


In [27]:
# Run configuration: every value a reader might want to tune is set here.
csv_path = "/content/IMDB Dataset.csv"
model_name = "bert-base-uncased"
max_len = 128      # token cap handed to the tokenizer (longer reviews are truncated)
batch_size = 16
epochs = 1
lr = 2e-5
save_dir = "/content/imdb_bert_sentiment"
seed = 42

# Seed the global RNGs before any model or DataLoader is created, so that
# shuffled batching and freshly initialised weights repeat from run to run.
# NOTE(review): some GPU kernels can still be nondeterministic; treat the
# metrics as repeatable to within small noise rather than bit-exact.
np.random.seed(seed)
torch.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"

print("device:", device)

device: cuda


## Preprocessing

In [28]:
# Sentiment strings are encoded as integer class ids for the classifier.
label_map = {"negative": 0, "positive": 1}

# Read the reviews, drop rows missing either the text or its sentiment, and
# attach the integer label in one chain (no in-place column assignment).
df = (
    pd.read_csv(csv_path)
    .dropna(subset=["review", "sentiment"])
    .assign(label=lambda frame: frame["sentiment"].map(label_map).astype(int))
)

# Hold out 10% for validation, keeping the class ratio equal in both splits.
train_df, valid_df = train_test_split(
    df.loc[:, ["review", "label"]],
    test_size=0.1,
    stratify=df["label"],
    random_state=42,
)

print(len(train_df), len(valid_df))

45000 5000


In [29]:
# Load the tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

class imdb_dataset(Dataset):
    """Map-style dataset that tokenizes one review per lookup.

    Args:
        dataframe: Frame with a ``review`` column (cast to ``str``) and a
            ``label`` column (cast to ``int``).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors="pt")`` that returns
            a mapping with ``input_ids`` and ``attention_mask`` tensors whose
            leading dimension is 1.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Plain Python lists keep per-item indexing in __getitem__ simple.
        self.texts = dataframe["review"].astype(str).tolist()
        self.labels = dataframe["label"].astype(int).tolist()

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize review ``idx`` and return its tensors together with the label."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch dimension of 1; drop it
        # so the DataLoader can stack samples into a batch itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap both splits in the tokenizing dataset using the same tokenizer and length cap.
train_ds, valid_ds = (
    imdb_dataset(split, tokenizer, max_len) for split in (train_df, valid_df)
)

# Only the training batches are shuffled; validation keeps a fixed order.
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=False)


## model

In [30]:
# Pretrained checkpoint with a two-class classification head, moved to the
# selected device straight after loading.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
).to(device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [31]:
def train_one_epoch():
    """Run one optimisation pass over ``train_loader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``train_loader``
    and ``device``.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []

    for batch in train_loader:
        optimizer.zero_grad()

        # Passing the labels makes the model compute the loss itself.
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )

        outputs.loss.backward()
        optimizer.step()

        batch_losses.append(outputs.loss.item())

    return sum(batch_losses) / len(train_loader)



In [32]:
@torch.no_grad()
def eval_accuracy():
    """Score the notebook-level ``model`` on ``valid_loader``.

    Returns:
        The value of ``accuracy_score`` over all validation labels and the
        argmax predictions.
    """
    model.eval()
    pred_chunks, label_chunks = [], []

    for batch in valid_loader:
        batch_logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        # The predicted class is the index of the largest logit.
        pred_chunks.append(batch_logits.argmax(dim=-1).cpu().numpy())
        label_chunks.append(batch["labels"].cpu().numpy())

    y_pred = np.concatenate(pred_chunks)
    y_true = np.concatenate(label_chunks)
    return accuracy_score(y_true, y_pred)

In [33]:
# Fine-tune for the configured number of epochs, reporting the mean training
# loss and the validation accuracy after each one.
for epoch in range(1, epochs + 1):
    train_loss, val_acc = train_one_epoch(), eval_accuracy()
    print(f"epoch {epoch} | train_loss {train_loss:.4f} | val_acc {val_acc:.4f}")

# Save the weights and the tokenizer side by side so the directory can be
# reloaded on its own later.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("saved:", save_dir)

epoch 1 | train_loss 0.3057 | val_acc 0.8956


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

saved: /content/imdb_bert_sentiment


## inference

In [34]:
@torch.no_grad()
def infer(texts):
    """Score texts with the fine-tuned model stored in ``save_dir``.

    The tokenizer and model are reloaded from ``save_dir`` on every call, and
    all texts are run through the model as a single batch.

    Args:
        texts: A single string or an iterable of strings. Inputs are
            truncated to ``max_len`` tokens and padded to the longest text.

    Returns:
        ``(probs, preds)``: ``probs`` is a 1-D array holding the softmax
        probability of class index 1 for each text; ``preds`` is an integer
        array that is 1 where ``probs >= 0.5`` and 0 elsewhere. Both arrays
        are empty when ``texts`` is empty.
    """
    # Normalise the input to a list: a lone string becomes a batch of one and
    # other iterables (tuples, pandas Series, ...) are materialised.
    texts = [texts] if isinstance(texts, str) else list(texts)

    # Nothing to score: return empty results without loading the model.
    if not texts:
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=int)

    tok = AutoTokenizer.from_pretrained(save_dir)
    mdl = AutoModelForSequenceClassification.from_pretrained(save_dir).to(device)
    mdl.eval()

    enc = tok(
        texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )

    logits = mdl(
        input_ids=enc["input_ids"].to(device),
        attention_mask=enc["attention_mask"].to(device)
    ).logits

    # Column 1 of the softmax is the probability of class index 1.
    probs = torch.softmax(logits, dim=-1)[:, 1].cpu().numpy()
    preds = (probs >= 0.5).astype(int)
    return probs, preds

# Smoke test: one clearly positive and one clearly negative example.
sample_texts = ["this movie was amazing", "worst movie ever"]
probs, preds = infer(sample_texts)
print("probs:", probs)
print("preds(1=positive):", preds)


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

probs: [0.9958359  0.00181917]
preds(1=positive): [1 0]


In [46]:
@torch.no_grad()
def predict_labels(texts, batch_size = 16):
    """Predict a class id for every text using the model stored in ``save_dir``.

    The tokenizer and model are reloaded from ``save_dir`` on every call.
    Texts are processed in chunks of ``batch_size``; each chunk is truncated
    to ``max_len`` tokens and padded to its own longest text.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A 1-D ``int64`` array with one argmax class id per input text, in
        input order. The array is empty when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string would otherwise be sliced into character chunks below;
    # treat it as a single text. Other iterables are materialised so that
    # slicing always yields a plain list for the tokenizer.
    texts = [texts] if isinstance(texts, str) else list(texts)

    # np.concatenate rejects an empty list, so handle the no-input case up
    # front (this also avoids loading the model for nothing).
    if not texts:
        return np.empty(0, dtype=np.int64)

    tok = AutoTokenizer.from_pretrained(save_dir)
    mdl = AutoModelForSequenceClassification.from_pretrained(save_dir).to(device)
    mdl.eval()

    preds_all = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        enc = tok(batch_texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt")
        logits = mdl(
            input_ids=enc["input_ids"].to(device),
            attention_mask=enc["attention_mask"].to(device)
        ).logits

        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        preds_all.append(preds)

    return np.concatenate(preds_all)

In [47]:
out_path = "/content/imdb_pred.csv"

# Re-read the full CSV and keep every row that has review text.
# NOTE(review): this covers the rows used for training as well, so these
# predictions are not a held-out evaluation.
df_all = pd.read_csv(csv_path).dropna(subset=["review"]).copy()
texts = df_all["review"].astype(str).tolist()

# Attach the predicted class id and its human-readable name.
df_all = df_all.assign(
    pred_label=predict_labels(texts),
    pred_sentiment=lambda frame: np.where(
        frame["pred_label"] == 1, "positive", "negative"
    ),
)

df_all.to_csv(out_path, index=False)

print("saved:", out_path)
df_all.head()

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

saved: /content/imdb_pred.csv


Unnamed: 0,review,sentiment,pred_label,pred_sentiment
0,One of the other reviewers has mentioned that ...,positive,1,positive
1,A wonderful little production. <br /><br />The...,positive,1,positive
2,I thought this was a wonderful way to spend ti...,positive,1,positive
3,Basically there's a family where a little boy ...,negative,0,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,positive


## validation

In [50]:
@torch.no_grad()
def get_valid_preds_and_labels():
    """Collect predictions and true labels for the whole validation loader.

    Uses the notebook-level ``model``, ``valid_loader`` and ``device``.

    Returns:
        ``(preds, labels)``: two 1-D arrays in validation-loader order, the
        argmax class ids and the labels taken from the batches.
    """
    model.eval()
    collected = []

    for batch in valid_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        batch_logits = model(input_ids=ids, attention_mask=mask).logits

        collected.append(
            (
                batch_logits.argmax(dim=-1).cpu().numpy(),
                batch["labels"].cpu().numpy(),
            )
        )

    pred_chunks = [pred for pred, _ in collected]
    label_chunks = [label for _, label in collected]
    return np.concatenate(pred_chunks), np.concatenate(label_chunks)


In [53]:
preds, labels = get_valid_preds_and_labels()

# Pin the class ids explicitly so the matrix is always 2x2 and its rows and
# columns line up with the names below, even if one class never shows up in
# the labels or the predictions.
class_ids = [0, 1]
class_names = ["negative", "positive"]

# confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(labels, preds, labels=class_ids)
cm_df = pd.DataFrame(
    cm,
    index=["true_neg", "true_pos"],
    columns=["pred_neg", "pred_pos"]
)
print("confusion matrix")
print(cm_df)

# classification report
print("\nclassification report")
print(classification_report(labels, preds, labels=class_ids, target_names=class_names, digits=4))


confusion matrix
          pred_neg  pred_pos
true_neg      2243       257
true_pos       265      2235

classification report
              precision    recall  f1-score   support

    negative     0.8943    0.8972    0.8958      2500
    positive     0.8969    0.8940    0.8954      2500

    accuracy                         0.8956      5000
   macro avg     0.8956    0.8956    0.8956      5000
weighted avg     0.8956    0.8956    0.8956      5000

