This notebook contains a Finetuned BERT from Huggingface

BERT Paper: https://arxiv.org/pdf/1810.04805.pdf

### Library Installations

In [24]:
!pip install bertviz transformers torch torchtext tqdm captum



### Library Imports

In [25]:
from google.colab import drive
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_scheduler, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
from tqdm.auto import tqdm
from captum.attr import visualization as viz
from captum.attr import LayerConductance, LayerIntegratedGradients
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import datetime

### Folder Constants

In [26]:
BERT_MODEL = 'bert-base-uncased'
FOLDER_PATH = '/content/drive/MyDrive/cs4248/'
DATASET_PATH = os.path.join(FOLDER_PATH, 'datasets')
ORIGINAL_DATASET_PATH = os.path.join(DATASET_PATH, 'lun_dataset_original')
ORIGINAL_TEST_DATASET_FILE_NAME = 'test_final_with_topics_new.csv'
MIXED_DATASET_PATH = os.path.join(DATASET_PATH, 'lun_dataset_modified')
MIXED_DATASET_TRAIN_FILE_NAME = 'merged_final_df_with_topics_new.csv'
TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
MODEL_SAVE_PATH = os.path.join(FOLDER_PATH, 'models', BERT_MODEL, TIMESTAMP)

### Device Constants

In [27]:
TORCH_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(TORCH_DEVICE)

### Mount Google Drive

In [28]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# Create folder if does not exist
dir_paths = [FOLDER_PATH, ORIGINAL_DATASET_PATH, MIXED_DATASET_PATH, MODEL_SAVE_PATH]
for dir_path in dir_paths:
  if not os.path.exists(dir_path):
    os.makedirs(dir_path)

### Prepare Dataset

In [30]:
train_dataframe = pd.read_csv(os.path.join(MIXED_DATASET_PATH, MIXED_DATASET_TRAIN_FILE_NAME))
test_dataframe = pd.read_csv(os.path.join(MIXED_DATASET_PATH, ORIGINAL_TEST_DATASET_FILE_NAME))

In [31]:
print("Original Training Dataframe")
train_dataframe

Original Training Dataframe


Unnamed: 0,label,text,has_swear_word,severity,processed_text,topic
0,1,"A little less than a decade ago, hockey fans w...",False,0.0,"['little', 'less', 'decade', 'ago', 'hockey', ...",0
1,1,The writers of the HBO series The Sopranos too...,False,0.0,"['writers', 'hbo', 'series', 'sopranos', 'took...",4
2,1,Despite claims from the TV news outlet to offe...,False,0.0,"['despite', 'claims', 'tv', 'news', 'outlet', ...",4
3,1,After receiving 'subpar' service and experienc...,False,0.0,"['receiving', 'subpar', 'service', 'experienci...",0
4,1,After watching his beloved Seattle Mariners pr...,False,0.0,"['watching', 'beloved', 'seattle', 'mariners',...",0
...,...,...,...,...,...,...
59790,4,"A delegation of approximately 500 to 1,000 in...",False,0.0,"['delegation', 'approximately', '500', '1000',...",2
59791,4,Taiwan's Lu Yen-Hsun and his partner Janko Ti...,False,0.0,"['taiwans', 'lu', 'yenhsun', 'partner', 'janko...",4
59792,4,Democratic candidate Martha Coakley and Repub...,False,0.0,"['democratic', 'candidate', 'martha', 'coakley...",1
59793,4,Holding by-elections frequently due to electi...,False,0.0,"['holding', 'byelections', 'frequently', 'due'...",1


In [32]:
print("Test Dataframe")
test_dataframe

Test Dataframe


Unnamed: 0,label,text,has_swear_word,severity,processed_text,topic
0,1,When so many actors seem content to churn out ...,False,0.0,"['many', 'actors', 'seem', 'content', 'churn',...",0
1,1,In what football insiders are calling an unex...,True,2.2,"['football', 'insiders', 'calling', 'unexpecte...",4
2,1,In a freak accident following Game 3 of the N....,False,0.0,"['freak', 'accident', 'following', 'game', '3'...",4
3,1,North Koreas official news agency announced to...,False,0.0,"['north', 'koreas', 'official', 'news', 'agenc...",4
4,1,The former Alaska Governor Sarah Palin would b...,False,0.0,"['former', 'alaska', 'governor', 'sarah', 'pal...",4
...,...,...,...,...,...,...
2991,4,The Air Force mistakenly gave rival companies ...,False,0.0,"['air', 'force', 'mistakenly', 'gave', 'rival'...",0
2992,4,The United Nations climate chief on Friday cha...,False,0.0,"['united', 'nations', 'climate', 'chief', 'fri...",0
2993,4,River Plate midfielder Diego Buonanotte has un...,False,0.0,"['river', 'plate', 'midfielder', 'diego', 'buo...",0
2994,4,Lawmakers were on the brink Tuesday of exempti...,False,0.0,"['lawmakers', 'brink', 'tuesday', 'exempting',...",0


### Model Constants

In [33]:
TOKEN_MAX_LENGTH = 512
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 3e-5
N_CLASSES = len(set(train_dataframe['label']))

### Prepare Tokenizer

In [34]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

### Prepare Model Dataset

In [35]:
# LUN Dataset Class
class BertLUNDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.text[index]
        label = self.data.label[index] - 1  # Subtract 1 to map indices to range [0, num_classes-1]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [36]:
# Prepare Train Dataloader
training_dataset = BertLUNDataset(train_dataframe, tokenizer, TOKEN_MAX_LENGTH)
train_dataloader = DataLoader(training_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, generator=torch.Generator(device=TORCH_DEVICE))
test_dataset = BertLUNDataset(test_dataframe, tokenizer, TOKEN_MAX_LENGTH)
test_dataloader = DataLoader(test_dataset, batch_size=TRAIN_BATCH_SIZE, generator=torch.Generator(device=TORCH_DEVICE))

### Model


In [37]:
bert_config = BertConfig.from_pretrained(BERT_MODEL, output_hidden_states=True, output_attentions=True, num_labels=N_CLASSES)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, config=bert_config)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Declare Loss Function

In [38]:
def criterion(outputs, targets):
  return nn.CrossEntropyLoss()(outputs, targets)

### Declare Optimizer

In [39]:
optimizer = torch.optim.Adam(params = model.parameters(),  lr=LEARNING_RATE)

### Declare Scheduler

In [40]:
num_training_steps = EPOCHS * len(train_dataloader)
scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

### Train the model

In [41]:
from tqdm import tqdm
from sklearn.metrics import f1_score
def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    losses = []
    correct_predictions = 0
    total_samples = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        with tqdm(total=len(data_loader), desc='Evaluation') as pbar:
            for batch in data_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)

                _, preds = torch.max(outputs.logits, dim=1)
                correct_predictions += torch.sum(preds == labels)
                total_samples += labels.size(0)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

                loss = loss_fn(outputs.logits, labels)
                losses.append(loss.item())

                pbar.update(1)

    val_f1 = f1_score(all_labels, all_preds, average='weighted')
    return correct_predictions.double() / total_samples, sum(losses) / len(losses), val_f1


In [42]:
# Training loop
for epoch in range(EPOCHS):
    # Evaluation
    val_acc, val_loss, val_f1 = eval_model(model, test_dataloader, criterion, TORCH_DEVICE)
    print(f'Val Accuracy: {val_acc:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}')

Evaluation: 100%|██████████| 375/375 [01:22<00:00,  4.57it/s]


Val Accuracy: 0.2503, Val Loss: 1.5050, Val F1: 0.1010


Evaluation: 100%|██████████| 375/375 [01:10<00:00,  5.31it/s]


Val Accuracy: 0.2503, Val Loss: 1.5050, Val F1: 0.1010


Evaluation: 100%|██████████| 375/375 [01:08<00:00,  5.45it/s]

Val Accuracy: 0.2503, Val Loss: 1.5050, Val F1: 0.1010





### Test the model

In [43]:
model.eval()
prediction_outputs = []
with torch.no_grad():
    with tqdm(total=len(test_dataloader), desc='Evaluation') as pbar:
      for batch in test_dataloader:
        input_ids = batch['input_ids'].to(TORCH_DEVICE)
        attention_mask = batch['attention_mask'].to(TORCH_DEVICE)
        labels = batch['label'].to(TORCH_DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        for index in range(labels.size(0)):
          text = tokenizer.decode(input_ids[index])
          label = labels[index]
          pred = preds[index]
          prediction_outputs.append((text, label.item(), pred.item()))

Evaluation:   0%|          | 0/375 [01:09<?, ?it/s]


In [46]:
pred_df = pd.DataFrame(prediction_outputs, columns=['text', 'true_label', 'pred_label'])
pred_df

Unnamed: 0,text,true_label,pred_label
0,[CLS] when so many actors seem content to chur...,0,2
1,[CLS] in what football insiders are calling an...,0,2
2,[CLS] in a freak accident following game 3 of ...,0,2
3,[CLS] north koreas official news agency announ...,0,2
4,[CLS] the former alaska governor sarah palin w...,0,2
...,...,...,...
2991,[CLS] the air force mistakenly gave rival comp...,3,2
2992,[CLS] the united nations climate chief on frid...,3,2
2993,[CLS] river plate midfielder diego buonanotte ...,3,2
2994,[CLS] lawmakers were on the brink tuesday of e...,3,2


In [49]:
y_true = pred_df['true_label']
y_pred = pred_df['pred_label']
f1_micro= metrics.f1_score(y_true, y_pred, average='micro')
f1_macro= metrics.f1_score(y_true, y_pred, average='macro')
accuracy = metrics.accuracy_score(y_true, y_pred)
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_micro}")
print(f"F1 Score (Macro) = {f1_macro}")

Accuracy Score = 0.25033377837116155
F1 Score (Micro) = 0.25033377837116155
F1 Score (Macro) = 0.10085304487898782


In [None]:
pred_df.to_csv( os.path.join(MODEL_SAVE_PATH, 'predictions.csv'))