In [1]:
!pip install -U torchtext==0.10.0
!pip install -U torch==1.9.0



In [2]:
!pip install transformers



In [3]:
!pip install pickle5



In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import pickle5 as pkl
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.optim import Adam
from tqdm import tqdm
from tabulate import tabulate
import os 

In [2]:
train_dataset_path = "tos_clauses_train.csv"
test_dataset_path = "tos_clauses_dev.csv"
train_df = pd.read_csv(train_dataset_path, header=0)
test_df = pd.read_csv(test_dataset_path, header=0)

In [3]:
def read_embeddings(embeddings_file_path):
    with open(embeddings_file_path, mode="rb") as file:
        data = pkl.load(file) 
    return data

train_data = read_embeddings("train_bert_embeddings.pkl")
test_data = read_embeddings("test_bert_embeddings.pkl")

In [4]:
# class TensorDataset(Dataset):
#     def __init__(self, data):
#         self.data = data

#     def __len__(self):
#         return len(self.data["embeddings"])

#     def __getitem__(self, index):
#         features = self.data["embeddings"][index]
#         label = int(self.data["label"][index])
#         return features, label
    
#     def __getindexlist__(self):
#         return list(np.arange(0, len(self.data["embeddings"])+1, 1))

class TenDataset(Dataset):
    def __init__(self, X, Y):
        self.data1 = X
        self.data2 = Y
        
    def __len__(self):
        return len(self.data1)
    
    def __getitem__(self, index):
        x = self.data1[index]
        y = self.data2[index]      
        return torch.tensor(x), y 

test_len = len(train_data)
train_len = len(test_data)
X_train_tensor = TenDataset(train_data["embeddings"], train_df["label"].values)
X_test_tensor = TenDataset(test_data["embeddings"], test_df["label"].values)

num_of_workers = 0
batch_size = 10
valid_size = 0.2

num_train = len(X_train_tensor)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader = torch.utils.data.DataLoader(X_train_tensor, batch_size=batch_size,
    sampler=train_sampler, num_workers=num_of_workers, drop_last=True)
valid_loader = torch.utils.data.DataLoader(X_train_tensor, batch_size=batch_size, 
    sampler=valid_sampler, num_workers=num_of_workers, drop_last=True)
test_loader = torch.utils.data.DataLoader(X_test_tensor, batch_size=batch_size, 
    num_workers=num_of_workers, drop_last=True)


In [5]:
train_fair = sum(train_df['label'] == 0)
train_unfair = sum(train_df['label'] == 1)

print("train_fair:" + str(train_fair))
print("train_unfair:" + str(train_unfair))

train_fair:6705
train_unfair:826


In [6]:
# class BertClassifier(nn.Module):

#     def __init__(self, dropout=0.5):

#         super(BertClassifier, self).__init__()

#         self.bert = BertModel.from_pretrained('bert-base-cased')
#         self.dropout = nn.Dropout(dropout)
#         self.linear_1 = nn.Linear(768, 512)
#         self.relu = nn.ReLU()
#         self.linear_2 = nn.Linear(512, 10)
#         self.relu = nn.ReLU()
#         self.linear_3 = nn.Linear(10, 2)
#         self.relu = nn.ReLU()

#     def forward(self, inputs_embeds):

#         _, pooled_output = self.bert(inputs_embeds = inputs_embeds,return_dict=False)
#         dropout_output = self.dropout(pooled_output)
#         linear_output_1 = F.relu(self.linear_1(dropout_output))
#         linear_output_2 = F.relu(self.linear_2(linear_output_1))
#         final_layer = F.relu(self.linear_3(linear_output_2))
#         return final_layer

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear_1 = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, inputs_embeds):

        _, pooled_output = self.bert(inputs_embeds = inputs_embeds,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        final_layer = F.relu(self.linear_1(dropout_output))
        return final_layer

In [7]:
test_loader = torch.utils.data.DataLoader(X_test_tensor, batch_size=batch_size, num_workers=num_of_workers, drop_last=True)

def evaluate_after_epochs(model, test_dataloader, test_data):
    prediction_list = []
    actual_list = []
    total_acc_test = 0
    with torch.no_grad():
        for test_input, test_label in tqdm(test_dataloader):
            test_input = torch.reshape(test_input, (batch_size, 512, 768))
            output = curr_model(test_input)
            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
            
            prediction_list.extend(output.argmax(dim=1))
            actual_list.extend(test_label)
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    report = classification_report(actual_list, prediction_list)
    print(report)

In [None]:
def train(model, train_dataloader, val_dataloader, learning_rate, epochs, train_idx, val_idx):
    criterion = nn.CrossEntropyLoss(weight = torch.FloatTensor([1/train_fair, 1/train_unfair]))
    optimizer = Adam(model.parameters(), lr= learning_rate)
    epoch_nums_list = []
    train_loss_list = []
    val_loss_list = []
    for epoch_num in range(epochs):
        print("Epoch: " + str(epoch_num))
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_input = torch.reshape(train_input, (batch_size, 512, 768))
            output = model(train_input)
                
            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()
                
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in tqdm(val_dataloader):
                val_input = torch.reshape(val_input, (batch_size, 512, 768))
                output = model(val_input)

                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item()
                    
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
            
        print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_idx): .3f} \
                | Train Accuracy: {total_acc_train / len(train_idx): .3f} \
                | Val Loss: {total_loss_val / len(val_idx): .3f} \
                | Val Accuracy: {total_acc_val / len(val_idx): .3f}')
        epoch_nums_list.append(epoch_num)
        train_loss_list.append(1- (total_acc_train / len(train_idx)))
        val_loss_list.append(1- (total_acc_val / len(val_idx)))
        if epoch_num in [0, 1, 2, 3, 4, 5, 6]:
            torch.save(model.state_dict(), "bert_mod_all_models/" + "bmod_" + str(epoch_num) + "_file.pt")
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
epoch_nums_list, train_loss_list, val_loss_list = train(model, train_loader, valid_loader, LR, EPOCHS, train_idx, valid_idx)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  return torch.tensor(x), y


Epoch: 0


100%|███████████████████████████████████████| 602/602 [3:13:26<00:00, 19.28s/it]
100%|█████████████████████████████████████████| 150/150 [06:51<00:00,  2.75s/it]


Epochs: 1 | Train Loss:  0.067                 | Train Accuracy:  0.779                 | Val Loss:  0.065                 | Val Accuracy:  0.721


  0%|                                                   | 0/602 [00:00<?, ?it/s]

Epoch: 1


100%|███████████████████████████████████████| 602/602 [3:10:17<00:00, 18.97s/it]
100%|█████████████████████████████████████████| 150/150 [05:59<00:00,  2.40s/it]


Epochs: 2 | Train Loss:  0.065                 | Train Accuracy:  0.807                 | Val Loss:  0.066                 | Val Accuracy:  0.830


  0%|                                                   | 0/602 [00:00<?, ?it/s]

Epoch: 2


  9%|███▌                                    | 54/602 [06:48<1:07:48,  7.42s/it]

In [8]:
plt.plot(epoch_nums_list, val_loss_list, color = 'green', label = 'val loss')
plt.plot(epoch_nums_list, train_loss_list, color = 'red', label = 'train loss')
plt.legend(loc = 'upper left')
plt.ylabel('epochs')
plt.xlabel('loss')
plt.show()

NameError: name 'epoch_nums_list' is not defined

In [9]:
def evaluate(test_dataloader, test_data):
    path = "bert_mod_all_models/"
    files = os.listdir("./bert_mod_all_models/")
    for file in files:
        print("Model Epoch: " + file)
        curr_model = BertClassifier()

        curr_model.load_state_dict(torch.load(path + file))
        total_acc_test = 0
        prediction_list = []
        actual_list = []
        with torch.no_grad():
            for test_input, test_label in tqdm(test_dataloader):
                test_input = torch.reshape(test_input, (batch_size, 512, 768))
                output = curr_model(test_input)
                acc = (output.argmax(dim=1) == test_label).sum().item()
                total_acc_test += acc
            
                prediction_list.extend(output.argmax(dim=1))
                actual_list.extend(test_label)
                
        print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
        report = classification_report(actual_list, prediction_list)
        print(report)


evaluate(test_loader, test_df)

Model Epoch: bmod_0_file.pt


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  return torch.tensor(x), y
100%|█████████████████████████████████████████| 188/188 [07:31<00:00,  2.40s/it]


Test Accuracy:  0.702
              precision    recall  f1-score   support

           0       0.92      0.73      0.81      1675
           1       0.17      0.46      0.25       205

    accuracy                           0.70      1880
   macro avg       0.55      0.60      0.53      1880
weighted avg       0.84      0.70      0.75      1880

Model Epoch: bmod_1_file.pt


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  return torch.tensor(x), y
100%|█████████████████████████████████████████| 188/188 [07:35<00:00,  2.42s/it]

Test Accuracy:  0.852
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1675
           1       0.27      0.20      0.23       205

    accuracy                           0.85      1880
   macro avg       0.59      0.57      0.57      1880
weighted avg       0.84      0.85      0.84      1880




