This notebook fine-tunes a pretrained BERT model from Hugging Face for sequence classification.

BERT Paper: https://arxiv.org/pdf/1810.04805.pdf

### Library Installations

In [None]:
!pip install bertviz transformers torch torchtext tqdm captum

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3 (from bertviz)
  Downloading boto3-1.34.86-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3

### Library Imports

In [None]:
from google.colab import drive
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_scheduler, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
from tqdm.auto import tqdm
from captum.attr import visualization as viz
from captum.attr import LayerConductance, LayerIntegratedGradients
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import datetime

### Folder Constants

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
BERT_MODEL = 'bert-base-uncased'

# File names of the CSVs consumed by this notebook.
MIXED_DATASET_TRAIN_FILE_NAME = 'merged_final_df_with_topics_new.csv'
ORIGINAL_TEST_DATASET_FILE_NAME = 'test_final_with_topics_new.csv'

# Directory layout on the mounted Google Drive.
FOLDER_PATH = '/content/drive/MyDrive/cs4248/'
DATASET_PATH = os.path.join(FOLDER_PATH, 'datasets')
MIXED_DATASET_PATH = os.path.join(DATASET_PATH, 'lun_dataset_modified')
ORIGINAL_DATASET_PATH = os.path.join(DATASET_PATH, 'lun_dataset_original')

# Each run saves its artefacts in a fresh, timestamped folder under models/<checkpoint>/.
TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
MODEL_SAVE_PATH = os.path.join(FOLDER_PATH, 'models', BERT_MODEL, TIMESTAMP)

### Device Constants

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    TORCH_DEVICE = 'cuda'
else:
    TORCH_DEVICE = 'cpu'

# Register the chosen device as PyTorch's default device.
torch.set_default_device(TORCH_DEVICE)

### Mount Google Drive

In [None]:
# Mount Google Drive (Colab only) at /content/drive, the root that FOLDER_PATH points into.
drive.mount('/content/drive')

In [None]:
# Make sure every directory this notebook reads from or writes to exists.
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
dir_paths = [FOLDER_PATH, ORIGINAL_DATASET_PATH, MIXED_DATASET_PATH, MODEL_SAVE_PATH]
for dir_path in dir_paths:
    os.makedirs(dir_path, exist_ok=True)

### Prepare Dataset

In [None]:
# Training data: the merged CSV stored in the modified ("mixed") LUN dataset folder.
train_dataframe = pd.read_csv(os.path.join(MIXED_DATASET_PATH, MIXED_DATASET_TRAIN_FILE_NAME))
# Alternative (disabled): train on the original LUN file, supplying the column names explicitly.
# train_dataframe = pd.read_csv(os.path.join(ORIGINAL_DATASET_PATH, 'fulltrain.csv'), names=["label", "text"])
# NOTE(review): the file name constant is ORIGINAL_TEST_DATASET_FILE_NAME, yet it is read from
# MIXED_DATASET_PATH — confirm the test CSV really lives in the modified-dataset folder.
test_dataframe = pd.read_csv(os.path.join(MIXED_DATASET_PATH, ORIGINAL_TEST_DATASET_FILE_NAME))

In [None]:
# Display the training frame as the cell's rich output.
# NOTE(review): the label says "Original", but train_dataframe is loaded from MIXED_DATASET_PATH —
# confirm the wording is still accurate.
print("Original Training Dataframe")
train_dataframe

In [None]:
# Display the test frame as the cell's rich output.
print("Test Dataframe")
test_dataframe

### Model Constants

In [None]:
# Every example is padded or truncated to this many tokens by the dataset class.
TOKEN_MAX_LENGTH = 512

# Optimisation hyper-parameters.
EPOCHS = 3
LEARNING_RATE = 3e-5

# Batch sizes for the two DataLoaders.
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8

# Size of the classification head: number of distinct values in the training `label` column.
N_CLASSES = len(set(train_dataframe['label']))

### Prepare Tokenizer

In [None]:
# Load the tokenizer that belongs to the BERT_MODEL checkpoint.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

### Prepare Model Dataset

In [None]:
# LUN Dataset Class
class BertLUNDataset(Dataset):
    """Map-style dataset that tokenizes one LUN row per ``__getitem__`` call.

    Args:
        dataframe: Frame providing a ``text`` column and a ``label`` column.
            Labels are assumed to be 1-based integers (they are shifted down
            by one here) — verify against the CSV.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Fixed sequence length; shorter texts are padded and longer
            ones truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized example at *position* ``index``.

        Rows are looked up positionally with ``.iloc`` so the dataset also
        works when the frame's index is not ``0..n-1`` (for example after
        filtering, sampling or concatenation); label-based lookup would raise
        ``KeyError`` or silently return a different row in that case.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors of
            length ``max_len`` and a scalar ``label`` tensor of dtype long.
        """
        text = self.data['text'].iloc[index]
        # Subtract 1 to map labels to the range [0, num_classes-1].
        label = self.data['label'].iloc[index] - 1

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch axis of 1; flatten it away
        # so the DataLoader can stack examples into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Prepare the train and test DataLoaders.
# Both loaders receive a torch.Generator created on TORCH_DEVICE
# (presumably required because the default device was changed earlier — confirm).
training_dataset = BertLUNDataset(train_dataframe, tokenizer, TOKEN_MAX_LENGTH)
train_dataloader = DataLoader(
    training_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,  # reshuffle the training examples every epoch
    generator=torch.Generator(device=TORCH_DEVICE),
)

test_dataset = BertLUNDataset(test_dataframe, tokenizer, TOKEN_MAX_LENGTH)
# The test loader uses its own TEST_BATCH_SIZE constant (previously TRAIN_BATCH_SIZE was
# passed here, leaving TEST_BATCH_SIZE unused). Order is kept fixed: no shuffle.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    generator=torch.Generator(device=TORCH_DEVICE),
)

### Model


In [None]:
# Configure the checkpoint with an N_CLASSES-way classification head and make every
# forward pass also return hidden states and attention maps.
bert_config = BertConfig.from_pretrained(
    BERT_MODEL,
    output_hidden_states=True,
    output_attentions=True,
    num_labels=N_CLASSES,
)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, config=bert_config)
# Move the model explicitly to the device the batches are sent to, instead of relying on
# the process-wide default device; this is a no-op when the weights are already there.
model.to(TORCH_DEVICE)
model

### Declare Loss Function

In [None]:
def criterion(outputs, targets):
    """Return the cross-entropy loss between raw logits and class-index targets.

    Args:
        outputs: Logits passed straight to ``nn.CrossEntropyLoss``.
        targets: Target class indices.

    Returns:
        The loss tensor produced by ``nn.CrossEntropyLoss`` with its default settings.
    """
    cross_entropy = nn.CrossEntropyLoss()
    loss = cross_entropy(outputs, targets)
    return loss

### Declare Optimizer

In [None]:
# Adam over all model parameters, using the learning rate from the constants cell.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

### Declare Scheduler

In [None]:
# Total number of batches processed over the whole run (batches per epoch x epochs).
num_training_steps = len(train_dataloader) * EPOCHS

# Linear schedule with no warm-up steps, spanning num_training_steps steps.
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

### Train the model

In [None]:
from tqdm import tqdm
from sklearn.metrics import f1_score

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler=None):
    """Run one training pass over ``data_loader`` and report its metrics.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler. When provided, its
            ``step()`` is called once after every ``optimizer.step()``.
            Defaults to ``None``, in which case the learning rate is left
            untouched (the behaviour before this parameter existed).

    Returns:
        Tuple ``(accuracy, mean_loss, weighted_f1)`` where ``accuracy`` is a
        0-dim float64 tensor, ``mean_loss`` is the average of the per-batch
        losses and ``weighted_f1`` is ``f1_score(..., average='weighted')``.
    """
    model.train()
    losses = []
    correct_predictions = 0
    total_samples = 0
    all_preds = []
    all_labels = []

    # Iterating through tqdm advances the progress bar once per batch.
    for batch in tqdm(data_loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Gradients accumulate by default, so clear them before each batch.
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)

        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_samples += labels.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        loss = loss_fn(outputs.logits, labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        # Advance the LR schedule only after the optimizer has applied its update.
        if scheduler is not None:
            scheduler.step()

    train_f1 = f1_score(all_labels, all_preds, average='weighted')
    return correct_predictions.double() / total_samples, sum(losses) / len(losses), train_f1

def eval_model(model, data_loader, loss_fn, device):
    """Score ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, mean_loss, weighted_f1)`` where ``accuracy`` is a
        0-dim float64 tensor, ``mean_loss`` is the average of the per-batch
        losses and ``weighted_f1`` is ``f1_score(..., average='weighted')``.
    """
    model.eval()

    batch_losses = []
    predicted = []
    gold = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluation'):
            labels = batch['label'].to(device)
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            logits = outputs.logits

            # Predicted class = index of the largest logit.
            preds = torch.max(logits, dim=1).indices
            n_correct += (preds == labels).sum()
            n_seen += labels.size(0)

            predicted.extend(preds.cpu().numpy())
            gold.extend(labels.cpu().numpy())

            batch_losses.append(loss_fn(logits, labels).item())

    val_f1 = f1_score(gold, predicted, average='weighted')
    accuracy = n_correct.double() / n_seen
    mean_loss = sum(batch_losses) / len(batch_losses)
    return accuracy, mean_loss, val_f1


In [None]:
# Training loop: every epoch runs one full pass over train_dataloader and then scores
# the model on test_dataloader.
# NOTE(review): `scheduler` is created earlier in the notebook but is neither passed to
# train_epoch nor stepped here, so the learning rate stays at LEARNING_RATE — confirm
# whether the linear decay was meant to be applied.
# NOTE(review): the "Val" metrics below are computed on the test set; there is no
# separate validation split in this notebook.
for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    # Training
    train_acc, train_loss, train_f1 = train_epoch(model, train_dataloader, criterion, optimizer, TORCH_DEVICE)

    # Evaluation
    val_acc, val_loss, val_f1 = eval_model(model, test_dataloader, criterion, TORCH_DEVICE)

    # Print progress


    print(f'Train Accuracy: {train_acc:.4f}, Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f}')
    print(f'Val Accuracy: {val_acc:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}')

### Test the model

In [None]:
# Collect one (decoded text, true label, predicted label) tuple per test example.
# Labels here are the 0-based values produced by the dataset class, not the raw CSV labels.
model.eval()
prediction_outputs = []
with torch.no_grad():
    # Iterate through tqdm so the bar advances once per batch; the previous manually
    # created bar was never updated and stayed at 0.
    for batch in tqdm(test_dataloader, desc='Evaluation'):
        input_ids = batch['input_ids'].to(TORCH_DEVICE)
        attention_mask = batch['attention_mask'].to(TORCH_DEVICE)
        labels = batch['label'].to(TORCH_DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs.logits, dim=1)
        for index in range(labels.size(0)):
            # NOTE(review): decode() is called without skip_special_tokens, so the stored
            # text presumably still contains the special/padding tokens — confirm this is wanted.
            text = tokenizer.decode(input_ids[index])
            label = labels[index]
            pred = preds[index]
            prediction_outputs.append((text, label.item(), pred.item()))

In [None]:
# Tabulate the collected (text, true label, predicted label) tuples and display the frame.
pred_df = pd.DataFrame(prediction_outputs, columns=['text', 'true_label', 'pred_label'])
pred_df

In [None]:
# Persist the predictions next to the other artefacts of this run.
# to_csv is called without index=False, so the row index is written as an extra first column.
pred_df.to_csv(os.path.join(MODEL_SAVE_PATH, 'predictions.csv'))

In [None]:
# Final test-set scores, computed from the stored predictions.
y_true, y_pred = pred_df['true_label'], pred_df['pred_label']

accuracy = metrics.accuracy_score(y_true, y_pred)
f1_micro = metrics.f1_score(y_true, y_pred, average='micro')
f1_macro = metrics.f1_score(y_true, y_pred, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_micro}")
print(f"F1 Score (Macro) = {f1_macro}")

### Save the model and tokenizer weights

In [None]:
# Write the tokenizer files into this run's timestamped folder.
tokenizer.save_pretrained(MODEL_SAVE_PATH)

In [None]:
# Reference: https://github.com/huggingface/transformers/issues/7849
# Checkpoint holding the model weights together with the optimizer state.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
}, os.path.join(MODEL_SAVE_PATH, 'bert_model_optim.pth'))

# Additionally serialise the whole model object.
# NOTE(review): saving the full module pickles the object itself, so loading it depends on
# compatible class definitions being importable — confirm this file is really needed
# alongside the state_dict checkpoint above.
torch.save(model, os.path.join(MODEL_SAVE_PATH, 'model.pth'))
