<a href="https://colab.research.google.com/github/chihhui5/Hands_On_PyTorch-TensorFlow/blob/main/W3.Text%20Classification%20with%20BERW/W3_Transformer_Text_Classification_with_BERW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mission**: Sentiment classification of IMDb movie reviews with a fine-tuned BERT model

**Goal**: Accuracy ≥ 85%

**Dataset**: IMDb dataset (25,000 training examples, 25,000 test examples)

**Classification**: Positive / Negative

In [None]:
!pip install torch torchvision torchaudio
!pip install transformers datasets
!pip install matplotlib scikit-learn



In [None]:
#Step 1.import library

import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim #To use optim.AdamW

In [None]:
# Step 2. Load the dataset

# Fetch the IMDb dataset from the Hugging Face Hub and print the resulting
# DatasetDict so the available splits and their row counts are visible.
dataset = load_dataset(path="imdb")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [None]:
# Step 3. Tokenization

# Convert the raw review text into the numeric inputs that BERT understands.

# BertTokenizer (from the Hugging Face `transformers` library) is the tokenizer
# that matches BERT. It splits the text, maps the pieces to vocabulary IDs and
# adds the special tokens.
# 'bert-base-uncased': base = the basic 12-layer model; uncased = text is
# converted to lower case.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(batch):
  """Tokenize one batch of examples.

  Args:
    batch: mapping whose 'text' entry holds the review strings of the batch.

  Returns:
    The tokenizer output for the batch. Every sequence is padded up to, or
    truncated down to, exactly 128 tokens (padding="max_length",
    truncation=True, max_length=128).
  """
  return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)

# batched=True: hand several examples at a time to tokenize() instead of one.
encoded_dataset = dataset.map(tokenize, batched=True)

# The training and evaluation cells read the targets under the key "labels",
# so rename the "label" column.
# `encoded_dataset` is a DatasetDict, whose `column_names` is a dict keyed by
# split name ('train', 'test', ...). The column lists therefore have to be
# checked per split; testing `'label' in encoded_dataset.column_names` would
# only look at the split names and never trigger the rename.
if all('label' in split_columns for split_columns in encoded_dataset.column_names.values()):
  encoded_dataset = encoded_dataset.rename_column("label", "labels")

# Return PyTorch tensors, and only the columns listed here.
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

ValueError: Columns ['labels'] not in the dataset. Current columns in the dataset: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# Step 4. Build the DataLoaders

# Both loaders serve mini-batches of 16 examples.
# The training loader shuffles the examples; the test loader keeps the dataset
# order (shuffle=False is also DataLoader's default, spelled out here).
train_dataloader = DataLoader(
    encoded_dataset['train'],
    batch_size=16,
    shuffle=True,
)
test_dataloader = DataLoader(
    encoded_dataset['test'],
    batch_size=16,
    shuffle=False,
)

In [None]:
# Step 5. Load the BERT model

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# from_pretrained (Hugging Face) loads a pretrained model: its weights and its
# configuration. Here it is the base, lower-cased BERT checkpoint with a
# sequence-classification head for 2 labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Move the model to the selected device. As the last expression of the cell,
# this also makes the notebook display the module structure.
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Step 6. Define the optimizer and the learning-rate scheduler

# Number of passes over the training set. 2-4 is a common choice for
# fine-tuning; more passes increase the risk of overfitting.
num_epochs = 2

# AdamW updates the model's weights and biases.
# model.parameters() returns an iterator over the trainable parameters.
# lr (learning rate) is a hyperparameter of the optimizer: 2e-5 = 0.00002.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Total number of optimisation steps: len(train_dataloader) is the number of
# batches per epoch.
num_training_steps = len(train_dataloader) * num_epochs

# get_scheduler (Hugging Face) adjusts the learning rate during training.
# "linear" with no warm-up steps: the rate declines linearly from 2e-5 to 0
# over num_training_steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Step 7. Train the model

from tqdm.auto import tqdm

# Put the model into training mode.
model.train()

# Progress bar sized to the total number of training steps from Step 6;
# it is advanced by hand, once per optimisation step.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
  for batch in train_dataloader:
    # Move every tensor of the batch to the same device (GPU/CPU) as the model.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Forward pass: **batch unpacks the dict into keyword arguments
    # (input_ids, attention_mask, labels); the output carries the loss.
    loss = model(**batch).loss

    # Backpropagation: compute the gradients of the loss.
    loss.backward()

    # Update the parameters, then advance the learning-rate schedule.
    optimizer.step()
    lr_scheduler.step()

    # Clear the gradients so they do not accumulate into the next step.
    optimizer.zero_grad()

    # One training step done.
    progress_bar.update(1)

  0%|          | 0/3126 [00:00<?, ?it/s]

TypeError: BertForSequenceClassification.forward() got an unexpected keyword argument 'label'

In [None]:
# Step 8. Evaluate the model

# Put the model into evaluation mode.
model.eval()
preds, labels = [], []

# torch.no_grad() is a context manager that disables gradient calculation.
# Evaluation never updates the parameters, so no gradients are needed, and
# skipping them significantly reduces memory usage (e.g. on the GPU).
with torch.no_grad():
  for batch in test_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    logits = model(**batch).logits

    # argmax over the last dimension gives the index of the largest logit,
    # i.e. the predicted class of each example.
    batch_preds = logits.argmax(dim=-1)

    # .cpu() moves a tensor from the GPU back to the CPU so it can be turned
    # into a NumPy array; .extend() appends the batch results to the lists.
    preds.extend(batch_preds.cpu().numpy())
    labels.extend(batch["labels"].cpu().numpy())

# accuracy_score (sklearn.metrics) takes (y_true, y_pred).
acc = accuracy_score(labels, preds)
print(f"Accuracy: {acc*100:.2f}%")

In [None]:
# Step 9. Visualize the confusion matrix

from sklearn.metrics import ConfusionMatrixDisplay

# Build the confusion matrix from the true labels and the predictions
# collected in Step 8.
cm = confusion_matrix(y_true=labels, y_pred=preds)

# Display names for the two classes.
# NOTE(review): assumes label 0 = negative and label 1 = positive - confirm
# against the dataset card.
disp = ConfusionMatrixDisplay(cm, display_labels=['neg', 'pos'])

# cmap: the colormap used to shade the cells.
disp.plot(cmap='Blues')
plt.show()