# Binary Classification with BERT

*Code is adapted from*
- https://huggingface.co/transformers/v3.2.0/custom_datasets.html
- https://mccormickml.com/2019/07/22/BERT-fine-tuning/
- https://luv-bansal.medium.com/fine-tuning-bert-for-text-classification-in-pytorch-503d97342db2

In [1]:
# Clone the NSMC repository; later cells read nsmc/ratings_train.txt and
# nsmc/ratings_test.txt from this checkout.
! git clone https://github.com/e9t/nsmc.git

In [None]:
import os

import pandas as pd
import numpy as np

from pathlib import Path

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, RobertaForSequenceClassification, AdamW

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from tqdm import tqdm

In [None]:
# Ask PyTorch to release cached GPU memory it is no longer using
# (presumably to start from a clean slate when re-running the notebook).
torch.cuda.empty_cache()

In [None]:
# Seed PyTorch's random number generator.
torch.manual_seed(70)

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [None]:
# Display which device was selected.
DEVICE

## Step 1: Load Dataset

In [None]:
def read_nsmc_split(path):
    """Load one NSMC split from a tab-separated file.

    Rows that contain a missing value in any column are dropped before the
    texts and labels are extracted.

    Args:
        path: Location of the tab-separated file. It must have ``document``
            and ``label`` columns.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists holding the
        ``document`` and ``label`` column values.
    """
    df = pd.read_csv(Path(path), delimiter='\t').dropna()
    return df["document"].to_list(), df["label"].to_list()

In [None]:
# Read the training and test splits from the cloned nsmc directory.
train_texts, train_labels = read_nsmc_split('nsmc/ratings_train.txt')
test_texts, test_labels = read_nsmc_split('nsmc/ratings_test.txt')

In [None]:
# Hold out 20% of the training data for validation. torch.manual_seed does not
# seed scikit-learn, so fix random_state explicitly to keep the split
# reproducible between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=70
)

In [None]:
# Load the tokenizer for the "klue/roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

In [None]:
# BERT can process up to 512 tokens, but that is too long for the Colab GPU
# to handle, so every sequence is truncated to 128 tokens.
MAX_LENGTH = 128

# The same tokenizer settings are applied to all three splits.
encode_options = dict(max_length=MAX_LENGTH, truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [None]:
class BertDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` must expose ``.items()`` yielding ``(name, sequence)`` pairs
    where each sequence can be indexed by sample position; ``labels`` is a
    sequence indexed the same way.
    """

    def __init__(self, encodings, labels):
        super().__init__()
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Number of samples per batch, passed to the DataLoaders built below.
BATCH_SIZE = 32

In [None]:
# Only the training loader is shuffled.
train_dataset = BertDataset(train_encodings, train_labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation metrics do not depend on sample order, so the validation and test
# loaders keep the dataset order. This keeps evaluation deterministic and
# avoids drawing from the global torch RNG between training epochs.
val_dataset = BertDataset(val_encodings, val_labels)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataset = BertDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Step 2: Define the model

In [None]:
# Two-label sequence classifier on top of the pretrained checkpoint; attention
# maps and hidden states are not returned.
classifier_options = dict(
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model = AutoModelForSequenceClassification.from_pretrained(
    'klue/roberta-base', **classifier_options
)
model.to(DEVICE)

In [None]:
# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

In [None]:
# TensorBoard writer; finetune() logs the training loss through it.
writer = SummaryWriter()
# NOTE(review): log_interval is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
log_interval = 50

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose argmax matches ``labels``.

    ``preds`` holds one row of scores per sample; ``labels`` is an array of
    the expected class indices.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = (predicted == expected).sum()
    return hits / len(expected)

In [None]:
def finetune(epochs, train_loader, val_loader, model, optimizer):
    """Fine-tune ``model`` and return it holding its best-validation weights.

    Each epoch runs one pass over ``train_loader`` followed by one pass over
    ``val_loader``. The weights from the epoch with the lowest validation
    loss are snapshotted and loaded back into ``model`` before returning.
    Simply keeping a reference to ``model`` would alias the live object and
    always yield the last epoch's weights.

    Relies on the module-level ``DEVICE`` (target device for batches) and
    ``writer`` (TensorBoard ``SummaryWriter``).

    Args:
        epochs: Number of training epochs. With ``0`` the model is returned
            unchanged.
        train_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Iterable of batches with the same structure.
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The same ``model`` object, with the best-epoch weights loaded.

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no samples.
    """
    best_loss = float('inf')
    best_state = None
    # Step counter that keeps increasing across epochs so that TensorBoard
    # curves from different epochs do not overwrite each other.
    global_step = 0

    for epoch in range(epochs):

        # ==========================
        #           Train
        # ==========================
        print("Training epoch: ", epoch+1)

        model.train()

        for batch in tqdm(train_loader):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            optimizer.zero_grad()

            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs.loss

            writer.add_scalar("Loss/train", loss.item(), global_step)
            global_step += 1

            loss.backward()
            optimizer.step()

        # ==========================
        #           Validation
        # ==========================
        print("Validating epoch: ", epoch+1)

        model.eval()

        # Accumulate per-sample totals so that a smaller final batch does not
        # skew the averages.
        num_samples = 0
        num_correct = 0
        summed_loss = 0.0

        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            with torch.no_grad():
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)

            batch_size = labels.size(0)
            num_samples += batch_size
            # The loss is weighted by batch size for the same reason.
            summed_loss += outputs.loss.item() * batch_size
            predictions = outputs.logits.argmax(dim=1)
            num_correct += (predictions == labels).sum().item()

        avg_val_accuracy = num_correct / num_samples
        print("\tAccuracy:{0:.2f}".format(avg_val_accuracy))

        avg_val_loss = summed_loss / num_samples
        print("\tValidation Loss:{0:.2f}".format(avg_val_loss))

        # The first epoch always becomes the baseline; later epochs replace it
        # only when they are at least as good.
        if best_state is None or avg_val_loss <= best_loss:
            best_loss = avg_val_loss
            # Clone to CPU so later optimizer steps cannot modify the snapshot.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

        print("\n\n")

    if best_state is not None:
        model.load_state_dict(best_state)

    return model

## Step 3: Train the model

In [None]:
# Fine-tune for 4 epochs and rebind `model` to the model returned by finetune().
model = finetune(4, train_loader, val_loader, model, optimizer)

## Step 5: Save the model

In [None]:
output_dir = "../resources/model_save/klue-RoBERTa-base-SA"

# exist_ok=True creates the directory tree when it is missing and is a no-op
# when it already exists, without a separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# Save the model and the tokenizer side by side so that both can be reloaded
# from the same directory.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## Step 6: Load the model and evaluate it on the test set

In [None]:
PATH = "../resources/model_save/klue-RoBERTa-base-SA"

# Both the tokenizer and the model are read from the local directory only.
tokenizer = AutoTokenizer.from_pretrained(PATH, local_files_only=True)

load_options = dict(
    local_files_only=True,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model = RobertaForSequenceClassification.from_pretrained(PATH, **load_options)
model = model.to(DEVICE)

In [None]:
def read_nsmc_split(path):
    """Read a tab-separated NSMC split and return its texts and labels.

    Any row with a missing value is discarded before extraction.

    Args:
        path: Path to a tab-separated file with ``document`` and ``label``
            columns.

    Returns:
        ``(texts, labels)``: two lists of equal length taken from the
        ``document`` and ``label`` columns respectively.
    """
    frame = pd.read_csv(Path(path), delimiter='\t').dropna()
    return frame["document"].to_list(), frame["label"].to_list()

In [None]:
# Read the test split from the cloned nsmc directory.
test_texts, test_labels = read_nsmc_split('nsmc/ratings_test.txt')

In [None]:
# Display how many test texts were loaded.
len(test_texts)

In [None]:
# BERT can process up to 512 tokens, but that is too long for the Colab GPU
# to handle, so every sequence is truncated to 128 tokens.
MAX_LENGTH = 128
test_encodings = tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
)

In [None]:
# Display the length of the first encoded test example
# (presumably its padded token count — TODO confirm).
len(test_encodings[0])

In [None]:
class BertDataset(Dataset):
    """Dataset wrapping tokenizer encodings together with their labels.

    ``encodings`` must provide ``.items()`` returning ``(name, sequence)``
    pairs whose sequences are indexable by sample position; ``labels`` is
    indexed the same way.
    """

    def __init__(self, encodings, labels):
        super().__init__()
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for sample ``idx``, with a 'labels' entry added."""
        fields = dict(self.encodings.items())
        sample = {name: torch.tensor(fields[name][idx]) for name in fields}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Number of samples per batch, passed to the test DataLoader built below.
BATCH_SIZE = 32

In [None]:
# Evaluation does not depend on sample order, so the test loader keeps the
# dataset order instead of shuffling.
test_dataset = BertDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
def flat_accuracy(preds, labels):
    """Compute the share of samples whose highest-scoring class equals the label.

    ``preds`` contains one row of class scores per sample and ``labels`` is
    an array of the expected class indices.
    """
    labels_flat = labels.flatten()
    best_class = np.argmax(preds, axis=1).flatten()
    matches = best_class == labels_flat
    return np.sum(matches) / len(labels_flat)

In [None]:
# Evaluate the loaded model on the test split.

# Report the true dataset size: len(test_loader) * BATCH_SIZE overstates it
# whenever the final batch is smaller than BATCH_SIZE.
print('Predicting labels for {:,} test sentences...'.format(len(test_loader.dataset)))

model.eval()

# Count correct predictions per sample so that a smaller final batch does not
# skew the reported accuracy.
num_test_correct = 0
num_test_samples = 0

for batch in tqdm(test_loader):

    input_ids = batch['input_ids'].to(DEVICE)
    attention_mask = batch['attention_mask'].to(DEVICE)
    labels = batch['labels'].to(DEVICE)

    with torch.no_grad():
        outputs = model(input_ids,
                        attention_mask=attention_mask)

    # The predicted class is the index of the largest logit in each row.
    predictions = outputs['logits'].argmax(dim=1)
    num_test_correct += (predictions == labels).sum().item()
    num_test_samples += labels.size(0)

avg_test_accuracy = num_test_correct / num_test_samples

print('DONE.')

print("Accuracy: {0:.2f}".format(avg_test_accuracy))