<a href="https://colab.research.google.com/github/dodofk/IM-Camp-ML-Course/blob/main/Bert_Downstream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Downstream Fine-Tuning: SST-2 Sentiment Classification

## Install Packages

In [1]:
!pip install transformers datasets wandb --upgrade --quiet
!nvidia-smi

[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
[K     |████████████████████████████████| 298 kB 63.8 MB/s 
[K     |████████████████████████████████| 1.7 MB 66.5 MB/s 
[K     |████████████████████████████████| 596 kB 43.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 46.8 MB/s 
[K     |████████████████████████████████| 61 kB 622 kB/s 
[K     |████████████████████████████████| 895 kB 41.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 57.4 MB/s 
[K     |████████████████████████████████| 243 kB 71.8 MB/s 
[K     |████████████████████████████████| 132 kB 76.0 MB/s 
[K     |████████████████████████████████| 140 kB 72.4 MB/s 
[K     |████████████████████████████████| 180 kB 73.3 MB/s 
[K     |████████████████████████████████| 97 kB 8.7 MB/s 
[K     |████████████████████████████████| 63 kB 2.2 MB/s 
[K     |████████████████████████████████| 192 kB 70.7 MB/s 
[K     |████████████████████████████████| 271 kB 68.6 MB/s 
[K     |██████████████████████

In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from transformers import pipeline
from transformers import BertTokenizer, BertModel

from tqdm.autonotebook import tqdm

from datasets import load_dataset

from typing import Dict, List

import wandb

## Setup Wandb

In [33]:
# Start a Weights & Biases run in the "bert-sst" project; the wandb.log(...) calls
# in the training and evaluation cells further down report their metrics to it.
wandb.init(project="bert-sst")

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_acc,▄▆▆▆▆▇▇████▇▇▃▂▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▆▆▆▇▇▇▇▇
train_loss,▄▂▆▂▃▃█▃▁▁▃▅▅▅▅▅▅▅▅▅▅▅▅▆▅▅▅▅▅▅▅▅▇▂▆▂▄▃▂▂
valid_acc,▇▇▇▇▇▇▇▁▁▁▂▃▄▄███▇▆▅▁▁▁▁▃▄▄███▇▆▅▅███▇▇▇
valid_loss,▄▄▁▃▂▃▄▇▇▇▆▆▆▆▆▆▆█▇█▇▇▇▆▆▆▆▆▆▆▇▇▇▇▆▃▂▄▅█

0,1
train_acc,0.86187
train_loss,0.2467
valid_acc,0.81651
valid_loss,0.90513


## Setup Configurations

In [37]:
class cfg:
    """Notebook-wide settings, read directly as class attributes (e.g. ``cfg.lr``)."""

    # "cuda" when torch can see a GPU, otherwise "cpu".
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Hugging Face checkpoint name shared by the tokenizer, the fill-mask demo
    # and the classifier backbone.
    bert_model = "bert-base-uncased"
    # Samples per batch for every DataLoader built by build_loaders().
    batch_size = 8
    # Number of passes over the training split in train().
    epochs = 10
    # AdamW learning rate and weight decay used in train().
    lr = 1e-5
    weight_decay = 1e-2
    # Dataset identifier passed to datasets.load_dataset().
    dataset_path = "gpt3mix/sst2"
    # Tokenizer matching `bert_model`, created once and reused by every cell.
    # NOTE(review): padding/truncation/device are passed to from_pretrained here,
    # while each call site passes its own padding argument — confirm these
    # keyword arguments have any effect at this point.
    tokenizer = BertTokenizer.from_pretrained(bert_model, padding=True, truncation=False, device=device)
    # Where the best checkpoint is written by train() and read back for testing.
    # NOTE(review): relative path under drive/MyDrive — presumably needs Google
    # Drive mounted at ./drive; no mount cell is visible in this notebook. Confirm.
    save_path = "drive/MyDrive/MIULAB_DATASET/sst_downstream.pt"

## Demonstrate Bert

In [5]:
# Build a fill-mask pipeline on the same checkpoint (cfg.bert_model) to show what
# the pretrained model predicts before it is fine-tuned below.
unmasker = pipeline("fill-mask", model=cfg.bert_model)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


M1 Max is the most powerful chip ever created for a pro notebook.

In [6]:
# Ask the pipeline for its candidate fillers for the [MASK] position.
unmasker("M1 Max is the most [MASK] chip ever created for a pro notebook.")

[{'score': 0.33024832606315613,
  'sequence': 'm1 max is the most powerful chip ever created for a pro notebook.',
  'token': 3928,
  'token_str': 'powerful'},
 {'score': 0.14291511476039886,
  'sequence': 'm1 max is the most expensive chip ever created for a pro notebook.',
  'token': 6450,
  'token_str': 'expensive'},
 {'score': 0.09646724164485931,
  'sequence': 'm1 max is the most advanced chip ever created for a pro notebook.',
  'token': 3935,
  'token_str': 'advanced'},
 {'score': 0.03862152248620987,
  'sequence': 'm1 max is the most complex chip ever created for a pro notebook.',
  'token': 3375,
  'token_str': 'complex'},
 {'score': 0.031219029799103737,
  'sequence': 'm1 max is the most common chip ever created for a pro notebook.',
  'token': 2691,
  'token_str': 'common'}]

## Load Dataset

In [7]:
def default_collate_fn(inputs: List) -> Dict:
    """Merge a list of dataset samples into a single batch dictionary.

    Args:
        inputs: Samples, each indexable by the keys ``'label'`` and ``'text'``.

    Returns:
        A dict with ``'label'`` (a ``torch.Tensor`` built from the sample labels)
        and ``'text'`` (the sample texts as a plain list, in input order).
    """
    labels = []
    texts = []
    for sample in inputs:
        labels.append(sample['label'])
    for sample in inputs:
        texts.append(sample['text'])

    batch = {}
    batch['label'] = torch.Tensor(labels)
    batch['text'] = texts
    return batch
    

In [8]:
def build_loaders():
    """Load the train/validation/test splits and wrap each one in a DataLoader.

    All loaders use ``cfg.batch_size`` and ``default_collate_fn``. Only the
    training loader shuffles; the validation and test loaders iterate in dataset
    order so evaluation runs are repeatable.

    Returns:
        Tuple ``(train_dataloader, valid_dataloader, test_dataloader)``.
    """

    def _make_loader(split: str, shuffle: bool) -> DataLoader:
        # Single place for the settings shared by all three loaders.
        return DataLoader(
            load_dataset(cfg.dataset_path, split=split),
            batch_size=cfg.batch_size,
            collate_fn=default_collate_fn,
            shuffle=shuffle,
            # Every batch has exactly cfg.batch_size samples; trailing samples
            # that do not fill a whole batch are skipped.
            drop_last=True,
        )

    train_dataloader = _make_loader("train", shuffle=True)
    valid_dataloader = _make_loader("validation", shuffle=False)
    # Shuffling adds nothing to evaluation, so the test split keeps a fixed order.
    test_dataloader = _make_loader("test", shuffle=False)

    return train_dataloader, valid_dataloader, test_dataloader

## Setup Model

In [40]:
class SST_BERT(nn.Module):
    """BERT encoder followed by a linear head producing two class logits."""

    def __init__(
        self,
    ):
        """Load the ``cfg.bert_model`` checkpoint and attach the classification head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(cfg.bert_model)
        # Size the head from the loaded checkpoint's config instead of a literal
        # 768, so a different cfg.bert_model works without editing this class.
        # The nn.Sequential wrapper is kept so parameter names in saved
        # state dicts ("final_classification.0.*") stay the same.
        self.final_classification = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 2),
        )

    def forward(self, x):
        """Return the two logits per sequence.

        Args:
            x: Mapping of keyword arguments forwarded to the BERT encoder
                (here: the tokenizer output).

        Returns:
            The head's output for each sequence in the batch.
        """
        # Keep only the first position of every sequence (the [CLS] slot for
        # BERT tokenizers) as the sentence representation.
        hidden_state = self.bert(**x)['last_hidden_state'][:,0,:]
        return self.final_classification(hidden_state)

## Train

In [41]:
def train_epoch(model, train_loader, optimizer) -> None:
    """Run one optimisation pass over ``train_loader``.

    For every batch the texts are tokenized with ``cfg.tokenizer``, the model is
    updated with a BCE-with-logits loss against one-hot targets, and the batch
    loss plus the running accuracy are logged to wandb and shown on the
    progress bar.

    Args:
        model: Module called with the tokenizer output; expected to return one
            row of 2 logits per sample.
        train_loader: Iterable of batches with ``'label'`` and ``'text'`` entries.
        optimizer: Optimizer over the model's parameters.
    """
    tqdm_object = tqdm(train_loader, total=len(train_loader))
    loss_fn = nn.BCEWithLogitsLoss()

    cur_loss = 0
    cur_count = 0
    train_total = 0
    train_acc = 0
    for batch in tqdm_object:
        optimizer.zero_grad()

        # Class indices, one per sample.
        label = batch['label'].to(cfg.device).to(torch.int64)

        inputs = cfg.tokenizer(
            batch['text'],
            return_tensors='pt',
            padding=True,
            # Clip over-long texts to the model's maximum length instead of
            # failing inside the encoder.
            truncation=True,
        ).to(cfg.device)

        pred = model(inputs)
        pred_label = torch.argmax(pred, dim=1)

        # BCEWithLogitsLoss needs float targets shaped like the logits.
        target = F.one_hot(label, num_classes=2).to(torch.float32)
        loss = loss_fn(pred, target)
        loss.backward()

        optimizer.step()

        # Count the samples actually present in the batch rather than assuming
        # cfg.batch_size, so accuracy stays right even for a short batch.
        train_total += label.size(0)
        # Compare on-device against the index labels; no one-hot round trip or
        # CPU copies are needed.
        train_acc += (pred_label == label).sum().item()

        batch_loss = loss.item()
        wandb.log({"train_loss": batch_loss, "train_acc": train_acc/train_total})

        cur_count += 1
        cur_loss += batch_loss

        tqdm_object.set_postfix(train_loss=cur_loss/cur_count, train_acc=train_acc/train_total)

@torch.no_grad()
def valid_epoch(model, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` and return the mean batch loss.

    Gradients are disabled for the whole call. The batch loss and running
    accuracy are logged to wandb and shown on the progress bar.

    Args:
        model: Module called with the tokenizer output; expected to return one
            row of 2 logits per sample.
        valid_loader: Iterable of batches with ``'label'`` and ``'text'`` entries.

    Returns:
        Sum of the per-batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``valid_loader`` yields no batches.
    """
    tqdm_object = tqdm(valid_loader, total=len(valid_loader))
    loss_fn = nn.BCEWithLogitsLoss()

    cur_loss = 0
    cur_count = 0
    valid_total = 0
    valid_acc = 0
    for batch in tqdm_object:

        # Class indices, one per sample.
        label = batch['label'].to(cfg.device).to(torch.int64)

        inputs = cfg.tokenizer(
            batch['text'],
            return_tensors='pt',
            padding=True,
            # Clip over-long texts to the model's maximum length instead of
            # failing inside the encoder.
            truncation=True,
        ).to(cfg.device)

        pred = model(inputs)
        pred_label = torch.argmax(pred, dim=1)

        # BCEWithLogitsLoss needs float targets shaped like the logits.
        target = F.one_hot(label, num_classes=2).to(torch.float32)
        loss = loss_fn(pred, target)

        # Count the samples actually present in the batch rather than assuming
        # cfg.batch_size, so accuracy stays right even for a short batch.
        valid_total += label.size(0)
        valid_acc += (pred_label == label).sum().item()

        batch_loss = loss.item()
        wandb.log({"valid_loss": batch_loss, "valid_acc": valid_acc/valid_total})

        cur_count += 1
        cur_loss += batch_loss

        tqdm_object.set_postfix(valid_loss=cur_loss/cur_count, valid_acc=valid_acc/valid_total)

    return cur_loss/cur_count

def train() -> None:
    """Fine-tune SST_BERT for ``cfg.epochs`` epochs and keep the best checkpoint.

    After every epoch the model is evaluated on the validation loader; whenever
    the validation loss improves on the best value seen so far, the model's
    state dict is written to ``cfg.save_path``.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    train_loader, valid_loader, _ = build_loaders()
    model = SST_BERT().to(cfg.device)

    optimizer = torch.optim.AdamW(
        params=model.parameters(),
        lr=cfg.lr,
        weight_decay=cfg.weight_decay,
    )

    # Make sure the checkpoint directory exists before training starts, so the
    # first torch.save cannot fail on a missing folder after a full epoch.
    save_dir = os.path.dirname(cfg.save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    best_loss = float("inf")

    for epoch in range(cfg.epochs):
        print(f"Epoch: {epoch+1}")
        model.train()
        train_epoch(
            model=model,
            train_loader=train_loader,
            optimizer=optimizer,
        )
        # Switch to evaluation mode and disable autograd for validation.
        model.eval()
        with torch.no_grad():
            valid_loss = valid_epoch(
                model=model,
                valid_loader=valid_loader
            )

        # Only overwrite the checkpoint when validation loss improves.
        if valid_loss < best_loss:
            best_loss=valid_loss
            torch.save(model.state_dict(), cfg.save_path)
            print("Save Best Model!")

    

In [42]:
# Run the full fine-tuning loop; the best checkpoint is written to cfg.save_path.
train()

Using custom data configuration default
Reusing dataset sst2 (/root/.cache/huggingface/datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Using custom data configuration default
Reusing dataset sst2 (/root/.cache/huggingface/datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Using custom data configuration default
Reusing dataset sst2 (/root/.cache/huggingface/datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initi

Epoch: 1


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Save Best Model!
Epoch: 2


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Save Best Model!
Epoch: 3


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 4


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 5


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 6


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 7


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 8


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 9


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 10


  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/109 [00:00<?, ?it/s]

## Testing

In [43]:
# Rebuild the architecture, then restore the weights saved during training.
test_model = SST_BERT()
# map_location remaps the stored tensors to cfg.device, so a checkpoint saved
# from a GPU run can also be loaded on a CPU-only runtime.
test_model.load_state_dict(torch.load(cfg.save_path, map_location=cfg.device))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [44]:
# Evaluate the restored model on the test split and log the running accuracy.
test_model.eval()
test_model.to(cfg.device)
_, _, test_loader = build_loaders()

tqdm_object = tqdm(test_loader, total=len(test_loader))

test_total = 0
test_acc = 0
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for batch in tqdm_object:

        # Class indices, one per sample.
        label = batch['label'].to(cfg.device).to(torch.int64)

        inputs = cfg.tokenizer(
            batch['text'],
            return_tensors='pt',
            padding=True,
            # Clip over-long texts to the model's maximum length instead of
            # failing inside the encoder.
            truncation=True,
        ).to(cfg.device)

        pred = test_model(inputs)
        pred_label = torch.argmax(pred, dim=1)

        # Count the samples actually present in the batch rather than assuming
        # cfg.batch_size, and compare on-device against the index labels.
        test_total += label.size(0)
        test_acc += (pred_label == label).sum().item()

        wandb.log({"test_acc": test_acc/test_total})

        tqdm_object.set_postfix(test_acc=test_acc/test_total)

Using custom data configuration default
Reusing dataset sst2 (/root/.cache/huggingface/datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Using custom data configuration default
Reusing dataset sst2 (/root/.cache/huggingface/datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Using custom data configuration default
Reusing dataset sst2 (/root/.cache/huggingface/datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)


  0%|          | 0/227 [00:00<?, ?it/s]

## Result
[Wandb Link](https://wandb.ai/dodofk/bert-sst/runs/2r8f1auf/overview?workspace=user-dodofk)

## Demo

In [65]:
# Classify a single sentence with the fine-tuned model.
text = "If you sometimes like to go to the movies to have fun , Wasabi is a good place to start ."
# Named demo_inputs rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
demo_inputs = cfg.tokenizer(text, padding=True, return_tensors='pt').to(cfg.device)

# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    result = test_model(demo_inputs)

print(result)

# One row of two logits: class index 1 is reported as negative, index 0 as positive.
if result.argmax().item() == 1:
    print("negative")
else:
    print("positive")

tensor([[ 3.2919, -2.9967]], device='cuda:0', grad_fn=<AddmmBackward0>)
positive
