#### BERT Sequence Classifier Reference Blogpost 
https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer
from transformers import RobertaForSequenceClassification, RobertaConfig

from transformers import AdamW, BertConfig, get_linear_schedule_with_warmup
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import pickle5 as pkl
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.optim import Adam
from tqdm import tqdm
import time

In [2]:
# First checking if GPU is available
train_on_gpu=torch.backends.mps.is_available()
if(train_on_gpu):
    print('Training on GPU.')
    device = torch.device("mps")
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [3]:
train_dataset_path = "tos_clauses_train.csv"
test_dataset_path = "tos_clauses_dev.csv"
train_df = pd.read_csv(train_dataset_path, header=0)
test_df = pd.read_csv(test_dataset_path, header=0)

In [4]:
def read_embeddings(embeddings_file_path):
    with open(embeddings_file_path, mode="rb") as file:
        data = pkl.load(file) 
    return data

train_data = read_embeddings("train_bert_embeddings.pkl")
test_data = read_embeddings("test_bert_embeddings.pkl")


In [5]:
class TenDataset(Dataset):
    def __init__(self, X, Y):
        self.data1 = X
        self.data2 = Y
        
    def __len__(self):
        return len(self.data1)
    
    def __getitem__(self, index):
        x = self.data1[index]
        y = self.data2[index]      
        return torch.tensor(x), y 

test_len = len(train_data)
train_len = len(test_data)
X_train_tensor = TenDataset(train_data["embeddings"], train_df["label"].values)
X_test_tensor = TenDataset(test_data["embeddings"], test_df["label"].values)

num_of_workers = 0
batch_size = 10
valid_size = 0.2

num_train = len(X_train_tensor)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

print(len(train_idx))
print(len(valid_idx))
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader = torch.utils.data.DataLoader(X_train_tensor, batch_size=batch_size,
    sampler=train_sampler, num_workers=num_of_workers, drop_last=True)
valid_loader = torch.utils.data.DataLoader(X_train_tensor, batch_size=batch_size, 
    sampler=valid_sampler, num_workers=num_of_workers, drop_last=True)
test_loader = torch.utils.data.DataLoader(X_test_tensor, batch_size=batch_size, 
    num_workers=num_of_workers, drop_last=True)

6025
1506


In [6]:
train_fair = sum(train_df['label'] == 0)
train_unfair = sum(train_df['label'] == 1)

print("train_fair:" + str(train_fair))
print("train_unfair:" + str(train_unfair))

train_fair:6705
train_unfair:826


In [7]:
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

params = list(model.named_parameters())

print('The RoBERTa model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
    
optimizer = AdamW(model.parameters(),lr = 5e-5, eps = 1e-8)

epochs = 4
total_steps = len(train_loader) * epochs



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (



In [8]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [9]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [None]:
def train(model, train_dataloader, val_dataloader, learning_rate, epochs, train_idx, val_idx):
    epoch_nums_list = []
    train_loss_list = []
    val_loss_list = []
    model.train()
    criterion = nn.CrossEntropyLoss(weight = torch.FloatTensor([1/train_fair, 1/train_unfair]))
    for epoch_num in range(epochs):
        print("Epoch: " + str(epoch_num))
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            model.zero_grad()
            train_input = torch.reshape(train_input, (batch_size, 512, 768))
            
            output = model(inputs_embeds = train_input, labels = train_label.long())
            logits = output[1]
            loss = criterion(logits, train_label.long())
            total_loss_train += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            logits = logits.detach().numpy()
            acc = flat_accuracy(logits, train_label.numpy())
            total_acc_train += acc
            optimizer.step()
            
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:
                val_input = torch.reshape(val_input, (batch_size, 512, 768))
                output = model(inputs_embeds = val_input, labels = val_label.long())
                logits = output[1]
                loss = criterion(logits, val_label.long())
                total_loss_val += loss.item()
                    
                
                logits = logits.detach().numpy()
                acc = flat_accuracy(logits, val_label.numpy())
                total_acc_val += acc
            
        print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_idx): .3f} \
                | Train Accuracy: {total_acc_train / len(train_dataloader): .3f} \
                | Val Loss: {total_loss_val / len(val_idx): .3f} \
                | Val Accuracy: {total_acc_val / len(val_dataloader): .3f}')
        epoch_nums_list.append(epoch_num)
        train_loss_list.append(1- (total_acc_train / len(train_dataloader)))
        val_loss_list.append(1- (total_acc_val / len(val_dataloader)))
        if epoch_num in [0, 1, 2, 3, 4, 5]:
            torch.save(model.state_dict(), "roberta_sc_all_models/" + "rbscmod_" + str(epoch_num) + "_file.pt")
    return epoch_nums_list, train_loss_list, val_loss_list
EPOCHS = 5
LR = 1e-6
              
epoch_nums_list, train_loss_list, val_loss_list = train(model, train_loader, valid_loader, LR, EPOCHS, train_idx, valid_idx)

  return torch.tensor(x), y


Epoch: 0


 18%|██████▊                                | 106/602 [18:26<1:28:13, 10.67s/it]

In [None]:
plt.plot(epoch_nums_list, val_loss_list, color = 'green', label = 'val loss')
plt.plot(epoch_nums_list, train_loss_list, color = 'red', label = 'train loss')
plt.legend(loc = 'upper left')
plt.ylabel('error')
plt.xlabel('epochs')
plt.show()

In [None]:
def evaluate(model, test_dataloader, test_data):
    prediction_list = []
    actual_list = []
    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            test_input = torch.reshape(test_input, (len(test_data), 512, 768))
            output = model(inputs_embeds = test_input)
            logits = output[0]
            logits = logits.detach().numpy()
            print(logits)
            acc = flat_accuracy(logits, test_label.numpy())
            total_acc_test += acc
            
            prediction_list.append(logits.argmax(axis=1))
            actual_list.append(test_label)
    
    print(f'Test Accuracy: {total_acc_test / len(test_dataloader): .3f}')
    
    return prediction_list[0].tolist(), actual_list[0].tolist()

test_loader = torch.utils.data.DataLoader(X_test_tensor, batch_size=len(df_test), num_workers=num_of_workers, drop_last=True)
prediction_list, actual_list = evaluate(model, test_loader, df_test)
report = classification_report(actual_list, prediction_list, output_dict=True)
report

In [None]:
from tabulate import tabulate
import os 

def evaluate(test_dataloader, test_data):
    #bert_sc_avg_models
    path = "roberta_sc_all_models/"
    files = os.listdir("./roberta_sc_all_models/")
    files.sort()
    for file in files:
        print("Model Epoch: " + file)
        curr_model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels = 2,
            output_attentions = False,
            output_hidden_states = False,)

        curr_model.load_state_dict(torch.load(path + file))
        prediction_list = []
        actual_list = []
        total_acc_test = 0
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_input = torch.reshape(test_input, (len(test_data), 512, 768))
                output = curr_model(inputs_embeds = test_input)
                logits = output[0]
                logits = logits.detach().numpy()
                print(logits)
                acc = flat_accuracy(logits, test_label.numpy())
                total_acc_test += acc
            
                prediction_list.append(logits.argmax(axis=1))
                actual_list.append(test_label)
    
        print(f'Test Accuracy: {total_acc_test / len(test_dataloader): .3f}')
        report = classification_report(actual_list[0].tolist(), prediction_list[0].tolist())
        print(report)

test_loader = torch.utils.data.DataLoader(X_test_tensor, batch_size=len(df_test), num_workers=num_of_workers, drop_last=True)
evaluate(test_loader, df_test)