This notebook fine-tunes a pretrained BERT model (`bert-base-uncased`) from Hugging Face for text classification on the LUN dataset.

BERT Paper: https://arxiv.org/pdf/1810.04805.pdf

### Library Installations

In [None]:
!pip install bertviz transformers torch torchtext tqdm captum

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/157.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m153.6/157.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3 (from bertviz)
  Downloading boto3-1.34.84-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
C

### Library Imports

In [None]:
from google.colab import drive
from transformers import BertTokenizer, BertModel, BertConfig, get_scheduler, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
from tqdm.auto import tqdm
from captum.attr import visualization as viz
from captum.attr import LayerConductance, LayerIntegratedGradients
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import datetime

### Folder Constants

In [None]:
# Pretrained checkpoint name; it is also used as a sub-folder of the model output path.
BERT_MODEL = 'bert-base-uncased'

# Root project folder on the mounted Google Drive.
FOLDER_PATH = '/content/drive/MyDrive/cs4248/'

# Dataset folders.
DATASET_PATH = os.path.join(FOLDER_PATH, 'datasets')
ORIGINAL_DATASET_PATH = os.path.join(FOLDER_PATH, 'datasets', 'lun_dataset_original')
MIXED_DATASET_PATH = os.path.join(FOLDER_PATH, 'datasets', 'lun_dataset_modified')

# CSV file names read by the notebook.
ORIGINAL_TEST_DATASET_FILE_NAME = 'test_final_with_topics_new.csv'
MIXED_DATASET_TRAIN_FILE_NAME = 'merged_final_df_with_topics_new.csv'

# Every run writes its artefacts into its own timestamped folder.
TIMESTAMP = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
MODEL_SAVE_PATH = os.path.join(FOLDER_PATH, 'models', BERT_MODEL, TIMESTAMP)

### Device Constants

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    TORCH_DEVICE = 'cuda'
else:
    TORCH_DEVICE = 'cpu'

# Tensors created without an explicit device are placed on TORCH_DEVICE from here on.
torch.set_default_device(TORCH_DEVICE)

### Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; FOLDER_PATH and everything derived from it live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create the project, dataset and model-output folders if they do not exist yet.
dir_paths = [FOLDER_PATH, ORIGINAL_DATASET_PATH, MIXED_DATASET_PATH, MODEL_SAVE_PATH]
for dir_path in dir_paths:
  # exist_ok=True makes the call idempotent and avoids a separate exists() check.
  os.makedirs(dir_path, exist_ok=True)

### Prepare Dataset

In [None]:
# Both CSV files are read from the modified ("mixed") dataset folder.
# NOTE(review): the test file name constant is called ORIGINAL_* but is joined with
# MIXED_DATASET_PATH — confirm this is the intended location.
train_csv_path = os.path.join(MIXED_DATASET_PATH, MIXED_DATASET_TRAIN_FILE_NAME)
test_csv_path = os.path.join(MIXED_DATASET_PATH, ORIGINAL_TEST_DATASET_FILE_NAME)

train_dataframe = pd.read_csv(train_csv_path)
# Alternative training source kept for reference:
# train_dataframe = pd.read_csv(os.path.join(ORIGINAL_DATASET_PATH, 'fulltrain.csv'), names=["label", "text"])
test_dataframe = pd.read_csv(test_csv_path)

In [None]:
# Print a caption, then let the notebook render the frame as the cell's last expression.
print("Original Training Dataframe")
train_dataframe

Original Training Dataframe


Unnamed: 0,label,text,has_swear_word,severity,processed_text,topic
0,1,"A little less than a decade ago, hockey fans w...",False,0.0,"['little', 'less', 'decade', 'ago', 'hockey', ...",0
1,1,The writers of the HBO series The Sopranos too...,False,0.0,"['writers', 'hbo', 'series', 'sopranos', 'took...",4
2,1,Despite claims from the TV news outlet to offe...,False,0.0,"['despite', 'claims', 'tv', 'news', 'outlet', ...",4
3,1,After receiving 'subpar' service and experienc...,False,0.0,"['receiving', 'subpar', 'service', 'experienci...",0
4,1,After watching his beloved Seattle Mariners pr...,False,0.0,"['watching', 'beloved', 'seattle', 'mariners',...",0
...,...,...,...,...,...,...
59790,4,"A delegation of approximately 500 to 1,000 in...",False,0.0,"['delegation', 'approximately', '500', '1000',...",2
59791,4,Taiwan's Lu Yen-Hsun and his partner Janko Ti...,False,0.0,"['taiwans', 'lu', 'yenhsun', 'partner', 'janko...",4
59792,4,Democratic candidate Martha Coakley and Repub...,False,0.0,"['democratic', 'candidate', 'martha', 'coakley...",1
59793,4,Holding by-elections frequently due to electi...,False,0.0,"['holding', 'byelections', 'frequently', 'due'...",1


In [None]:
# Print a caption, then let the notebook render the frame as the cell's last expression.
print("Test Dataframe")
test_dataframe

Test Dataframe


Unnamed: 0,label,text,has_swear_word,severity,processed_text,topic
0,1,When so many actors seem content to churn out ...,False,0.0,"['many', 'actors', 'seem', 'content', 'churn',...",0
1,1,In what football insiders are calling an unex...,True,2.2,"['football', 'insiders', 'calling', 'unexpecte...",4
2,1,In a freak accident following Game 3 of the N....,False,0.0,"['freak', 'accident', 'following', 'game', '3'...",4
3,1,North Koreas official news agency announced to...,False,0.0,"['north', 'koreas', 'official', 'news', 'agenc...",4
4,1,The former Alaska Governor Sarah Palin would b...,False,0.0,"['former', 'alaska', 'governor', 'sarah', 'pal...",4
...,...,...,...,...,...,...
2991,4,The Air Force mistakenly gave rival companies ...,False,0.0,"['air', 'force', 'mistakenly', 'gave', 'rival'...",0
2992,4,The United Nations climate chief on Friday cha...,False,0.0,"['united', 'nations', 'climate', 'chief', 'fri...",0
2993,4,River Plate midfielder Diego Buonanotte has un...,False,0.0,"['river', 'plate', 'midfielder', 'diego', 'buo...",0
2994,4,Lawmakers were on the brink Tuesday of exempti...,False,0.0,"['lawmakers', 'brink', 'tuesday', 'exempting',...",0


### Model Constants

In [None]:
# Tokenisation: fixed length each text is padded/truncated to by the dataset class.
TOKEN_MAX_LENGTH = 512

# Optimisation hyper-parameters.
EPOCHS = 3
LEARNING_RATE = 3e-5

# Batch sizes for the train and test DataLoaders.
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8

# Number of target classes = number of distinct values in the training label column.
distinct_labels = set(train_dataframe['label'])
N_CLASSES = len(distinct_labels)

### Prepare Tokenizer

In [None]:
# Load the pretrained tokenizer that belongs to the BERT_MODEL checkpoint.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Prepare Model Dataset

In [None]:
# LUN Dataset Class
class BertLUNDataset(Dataset):
    """Torch dataset that turns rows of a LUN dataframe into model-ready tensors.

    Args:
        dataframe: pandas DataFrame with ``text``, ``label``, ``topic`` and
            ``severity`` columns.
        tokenizer: object exposing ``encode_plus`` that accepts the keyword
            arguments used in ``__getitem__`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: fixed token length every example is padded/truncated to.
        n_topics: width of the one-hot topic vector. Defaults to 5.
    """

    def __init__(self, dataframe, tokenizer, max_len, n_topics=5):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.n_topics = n_topics

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the feature dict for the row at *position* ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors),
            ``label`` (long scalar, shifted to start at 0), ``topic``
            (float one-hot vector of length ``n_topics``) and ``severity``
            (scalar tensor).
        """
        # Positional access (.iloc): a DataLoader asks for positions 0..len-1, which
        # differ from index labels whenever the dataframe was filtered, shuffled or split.
        text = self.data['text'].iloc[index]
        label = self.data['label'].iloc[index] - 1  # Subtract 1 to map indices to range [0, num_classes-1]
        topic = self.data['topic'].iloc[index]
        severity = self.data['severity'].iloc[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        # One-hot encode the topic id; a topic outside [0, n_topics) yields an all-zero vector.
        topic_one_hot = torch.tensor(
            [1.0 if topic == topic_id else 0.0 for topic_id in range(self.n_topics)],
            dtype=torch.float32,
        )

        return {
            # flatten() drops the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
            'topic': topic_one_hot,
            'severity': torch.tensor(severity)
        }


In [None]:
# Prepare the train and test DataLoaders.
# The generators are created on TORCH_DEVICE, the same device set as the torch default above.
training_dataset = BertLUNDataset(train_dataframe, tokenizer, TOKEN_MAX_LENGTH)
train_dataloader = DataLoader(training_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, generator=torch.Generator(device=TORCH_DEVICE))
test_dataset = BertLUNDataset(test_dataframe, tokenizer, TOKEN_MAX_LENGTH)
# The test loader uses its own batch-size constant so the two can be tuned independently.
test_dataloader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, generator=torch.Generator(device=TORCH_DEVICE))

### Model


In [None]:
# Create BERT + Linear Layer (for classification)
class BERTModel(torch.nn.Module):
  """BERT encoder followed by a single linear classification layer.

  The second element of the tuple returned by the BERT encoder (the pooled
  output, per the Hugging Face ``BertModel`` documentation) goes through
  dropout, is concatenated with the one-hot ``topic`` vector and the scalar
  ``severity``, and is projected to ``n_classes`` logits.

  Args:
    n_classes: number of output classes (size of the logits vector).
    bert_config: accepted for backward compatibility but not used; the
      encoder is loaded from ``BERT_MODEL`` with its default configuration.
    n_topics: width of the one-hot ``topic`` input. Defaults to 5.
  """
  def __init__(self, n_classes, bert_config, n_topics=5):
    super(BERTModel, self).__init__()
    self.bert_layer = BertModel.from_pretrained(BERT_MODEL)
    self.dropout1 = torch.nn.Dropout(0.1)
    # Classifier input = encoder hidden size + one-hot topic + 1 severity scalar
    # (768 + 5 + 1 = 774 for bert-base-uncased with the default n_topics).
    in_features = self.bert_layer.config.hidden_size + n_topics + 1
    self.fc1 = nn.Linear(in_features=in_features, out_features=n_classes, bias=True)

  def forward(self, input_ids, attention_mask, topic, severity):
    """Return class logits of shape (batch, n_classes).

    Args:
      input_ids: token id tensor, one row per example.
      attention_mask: mask tensor matching ``input_ids``.
      topic: one-hot topic tensor, one row per example.
      severity: 1-D tensor with one severity value per example.
    """
    # return_dict=False makes the encoder return a tuple; only its second element is used.
    _, output = self.bert_layer(input_ids, attention_mask = attention_mask, return_dict = False)
    output = self.dropout1(output)
    output = torch.cat((output, topic), dim=1)
    # severity is 1-D, so add a feature dimension before concatenating; cast to float32 for fc1.
    output = torch.cat((output, severity.unsqueeze(1)), dim=1).float()
    output = self.fc1(output)
    return output
# NOTE(review): bert_config is handed to BERTModel, but BERTModel.__init__ never reads it, so
# output_hidden_states / output_attentions / num_labels do not influence the model built here.
bert_config = BertConfig.from_pretrained(BERT_MODEL, output_hidden_states=True, output_attentions=True, num_labels=N_CLASSES)
model = BERTModel(N_CLASSES, bert_config)
# Move the model to the selected device; as the cell's last expression this also displays the module tree.
model.to(TORCH_DEVICE)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTModel(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
# Display the module tree of the model.
model

BERTModel(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

### Declare Loss Function

In [None]:
def criterion(outputs, targets):
  """Cross-entropy loss between class logits and integer class targets.

  Args:
    outputs: logits tensor with one row per example.
    targets: tensor of class indices, one per example.

  Returns:
    The loss tensor computed with PyTorch's default cross-entropy settings.
  """
  return nn.functional.cross_entropy(outputs, targets)

### Declare Optimizer

In [None]:
# Adam over all model parameters (BERT encoder and classification layer alike).
optimizer = torch.optim.Adam(params = model.parameters(),  lr=LEARNING_RATE)

### Declare Scheduler

In [None]:
# Total number of optimizer steps: one per training batch, for every epoch.
num_training_steps = EPOCHS * len(train_dataloader)
# "linear" schedule with no warm-up steps.
# NOTE(review): the schedule only takes effect if scheduler.step() is called after each
# optimizer step — confirm the training code does so.
scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

### Train the model

In [None]:
from tqdm import tqdm
from sklearn.metrics import f1_score

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler=None):
    """Run one training pass over ``data_loader`` and return epoch metrics.

    Args:
        model: module called with ``input_ids``, ``attention_mask``, ``topic``
            and ``severity`` keyword arguments; returns class logits.
        data_loader: iterable of batch dicts with the keys ``input_ids``,
            ``attention_mask``, ``label``, ``topic`` and ``severity``.
        loss_fn: callable ``(logits, labels) -> loss tensor``.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to.
        scheduler: optional learning-rate scheduler; when given it is stepped
            once after every optimizer step.

    Returns:
        Tuple ``(accuracy, mean_loss, weighted_f1)`` where ``accuracy`` is a
        0-dim float64 tensor, ``mean_loss`` the average of the per-batch
        losses and ``weighted_f1`` the weighted F1 over all examples.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.train()
    losses = []
    correct_predictions = 0
    total_samples = 0
    all_preds = []
    all_labels = []

    # Wrapping the loader in tqdm advances the progress bar once per batch.
    for batch in tqdm(data_loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        topic = batch['topic'].to(device)
        severity = batch['severity'].to(device)

        optimizer.zero_grad()

        # Call the module itself (not .forward) so registered hooks run.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            topic=topic,
            severity=severity
        )

        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_samples += labels.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        loss = loss_fn(outputs, labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

    if not losses:
        # Without batches there is nothing to average; fail with a clear message.
        raise ValueError("data_loader yielded no batches; cannot compute epoch metrics")

    train_f1 = f1_score(all_labels, all_preds, average='weighted')
    return correct_predictions.double() / total_samples, sum(losses) / len(losses), train_f1

def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        model: module called with ``input_ids``, ``attention_mask``, ``topic``
            and ``severity`` keyword arguments; returns class logits.
        data_loader: iterable of batch dicts with the keys ``input_ids``,
            ``attention_mask``, ``label``, ``topic`` and ``severity``.
        loss_fn: callable ``(logits, labels) -> loss tensor``.
        device: device every batch tensor is moved to.

    Returns:
        Tuple ``(accuracy, mean_loss, weighted_f1)`` where ``accuracy`` is a
        0-dim float64 tensor, ``mean_loss`` the average of the per-batch
        losses and ``weighted_f1`` the weighted F1 over all examples.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.eval()
    losses = []
    correct_predictions = 0
    total_samples = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        # Wrapping the loader in tqdm advances the progress bar once per batch.
        for batch in tqdm(data_loader, desc='Evaluation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            topic = batch['topic'].to(device)
            severity = batch['severity'].to(device)

            # Call the module itself (not .forward) so registered hooks run.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                topic=topic,
                severity=severity
            )

            # Predicted class = index of the largest logit.
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_samples += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            loss = loss_fn(outputs, labels)
            losses.append(loss.item())

    if not losses:
        # Without batches there is nothing to average; fail with a clear message.
        raise ValueError("data_loader yielded no batches; cannot compute evaluation metrics")

    val_f1 = f1_score(all_labels, all_preds, average='weighted')
    return correct_predictions.double() / total_samples, sum(losses) / len(losses), val_f1


In [None]:
# Training loop: one training pass followed by one evaluation pass per epoch.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')

    # Training pass over the training DataLoader.
    train_acc, train_loss, train_f1 = train_epoch(
        model, train_dataloader, criterion, optimizer, TORCH_DEVICE
    )

    # Evaluation pass. Note: the "Val" metrics below are computed on test_dataloader.
    val_acc, val_loss, val_f1 = eval_model(
        model, test_dataloader, criterion, TORCH_DEVICE
    )

    # Per-epoch summary.
    print(f'Train Accuracy: {train_acc:.4f}, Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f}')
    print(f'Val Accuracy: {val_acc:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}')

Epoch 1/3


Training: 100%|██████████| 7475/7475 [40:52<00:00,  3.05it/s]
Evaluation: 100%|██████████| 375/375 [01:02<00:00,  6.04it/s]


Train Accuracy: 0.9770, Train Loss: 0.0701, Train F1: 0.9770
Val Accuracy: 0.6175, Val Loss: 2.1477, Val F1: 0.5683
Epoch 2/3


Training: 100%|██████████| 7475/7475 [40:58<00:00,  3.04it/s]
Evaluation: 100%|██████████| 375/375 [01:04<00:00,  5.85it/s]


Train Accuracy: 0.9925, Train Loss: 0.0233, Train F1: 0.9925
Val Accuracy: 0.5995, Val Loss: 2.9130, Val F1: 0.5244
Epoch 3/3


Training: 100%|██████████| 7475/7475 [40:56<00:00,  3.04it/s]
Evaluation: 100%|██████████| 375/375 [01:02<00:00,  6.03it/s]

Train Accuracy: 0.9946, Train Loss: 0.0191, Train F1: 0.9946
Val Accuracy: 0.5864, Val Loss: 2.6429, Val F1: 0.5333





### Test the model

In [None]:
# Collect (decoded text, true label, predicted label) for every test example.
model.eval()
prediction_outputs = []
with torch.no_grad():
    # Wrapping the loader in tqdm advances the progress bar once per batch.
    for batch in tqdm(test_dataloader, desc='Evaluation'):
        input_ids = batch['input_ids'].to(TORCH_DEVICE)
        attention_mask = batch['attention_mask'].to(TORCH_DEVICE)
        labels = batch['label'].to(TORCH_DEVICE)
        topic = batch['topic'].to(TORCH_DEVICE)
        severity = batch['severity'].to(TORCH_DEVICE)
        # Call the module itself (not .forward) so registered hooks run.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            topic=topic,
            severity=severity
        )
        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs, dim=1)
        for index in range(labels.size(0)):
            # Decode the token ids back to text so each prediction row is human-readable.
            text = tokenizer.decode(input_ids[index])
            label = labels[index]
            pred = preds[index]
            prediction_outputs.append((text, label.item(), pred.item()))

Evaluation:   0%|          | 0/375 [01:02<?, ?it/s]


In [None]:
# Tabulate the collected (text, true label, predicted label) tuples.
prediction_columns = ['text', 'true_label', 'pred_label']
pred_df = pd.DataFrame(data=prediction_outputs, columns=prediction_columns)
pred_df

Unnamed: 0,text,true_label,pred_label
0,[CLS] when so many actors seem content to chur...,0,0
1,[CLS] in what football insiders are calling an...,0,0
2,[CLS] in a freak accident following game 3 of ...,0,0
3,[CLS] north koreas official news agency announ...,0,3
4,[CLS] the former alaska governor sarah palin w...,0,0
...,...,...,...
2991,[CLS] the air force mistakenly gave rival comp...,3,3
2992,[CLS] the united nations climate chief on frid...,3,3
2993,[CLS] river plate midfielder diego buonanotte ...,3,3
2994,[CLS] lawmakers were on the brink tuesday of e...,3,3


In [None]:
from sklearn.metrics import f1_score

# Ground-truth and predicted class ids collected from the test set.
y_true = pred_df['true_label']
y_pred = pred_df['pred_label']

# Compute the F1 score under the three averaging modes, then report them.
macro_f1 = f1_score(y_true, y_pred, average='macro')
micro_f1 = f1_score(y_true, y_pred, average='micro')
weighted_f1 = f1_score(y_true, y_pred, average='weighted')

print(f"F1 Macro Score: {macro_f1}")
print(f"F1 Micro Score: {micro_f1}")
print(f"F1 weighted Score: {weighted_f1}")

F1 Macro Score: 0.5335407791631717
F1 Micro Score: 0.5864485981308412
F1 weighted Score: 0.5333185882156726


In [None]:
# Persist the per-example predictions next to the model artefacts of this run.
pred_df.to_csv( os.path.join(MODEL_SAVE_PATH, 'predictions.csv'))

### Save the model and tokenizer weights

In [None]:
# Save the tokenizer files into this run's output folder.
tokenizer.save_pretrained(MODEL_SAVE_PATH)

In [None]:
# Reference: https://github.com/huggingface/transformers/issues/7849
# Checkpoint holding the model weights together with the optimizer state.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
checkpoint_path = os.path.join(MODEL_SAVE_PATH, 'bert_model_optim.pth')
torch.save(checkpoint, checkpoint_path)

# Additionally serialise the whole model object.
full_model_path = os.path.join(MODEL_SAVE_PATH, 'model.pth')
torch.save(model, full_model_path)
