In [1]:
# DELETE CELL IF RUNNING ON LOCAL MACHINE INSTEAD OF GOOGLE COLAB
# Installs the two Hugging Face packages that the import cell below relies on.
# NOTE(review): no versions are pinned; the saved output of this cell shows transformers 4.11.2
# being downloaded - consider pinning to keep the notebook reproducible.
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 14.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 54.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.0 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 2.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import  DataLoader

import transformers
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer 
from transformers import BertPreTrainedModel, BertModel

from datasets import load_metric

import numpy as np
import re
import os
import time
import json
from tqdm.auto import tqdm
from sklearn.metrics import classification_report

In [3]:
# DELETE CELL IF RUNNING ON LOCAL MACHINE INSTEAD OF GOOGLE COLAB
# Mount Google Drive at /content/drive so that files stored there become reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# DELETE CELL IF RUNNING ON LOCAL MACHINE INSTEAD OF GOOGLE COLAB
# Make the Drive root the working directory: load_from_directory resolves the json folders relative to os.getcwd().
%cd /content/drive/MyDrive

/content/drive/MyDrive


In [5]:
# Fix the seed of every random number generator used below, then pick the compute device
SEED = 1

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Run on the first GPU when one is visible, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
print(device)
print(use_cuda)

cuda:0
True


In [6]:
# Create list of training data files
def load_from_directory(directory):
    """
    Utility function to load all json-converted reports into a dataset.
    params: directory: string representing location on disk of json files, resolved against
                       the current working directory (a trailing separator is optional)
    returns: dataset: list of deserialised jsons, ordered by file name
    """
    path = os.path.join(os.getcwd(), directory)
    # os.listdir gives no ordering guarantee; sorting keeps the report indices reproducible between runs
    json_files = sorted(name for name in os.listdir(path) if name.endswith('.json'))

    dataset = []
    for filename in json_files:
        # os.path.join (instead of string concatenation) also works when `directory` has no trailing separator
        with open(os.path.join(path, filename), "r", encoding='utf-8') as read_file:
            dataset.append(json.load(read_file))

    return dataset

# Folders (relative to the working directory) holding the json reports of each split
train_folder = 'json_train/'
dev_folder = 'json_develop/'
test_folder = 'json_test/'

# Deserialise every split into a list of report dictionaries.
# NOTE(review): train_dataset and test_dataset are bound to SustainableDataset objects further down,
# so this cell has to be re-run before filter_dataset is re-run.
train_dataset = load_from_directory(train_folder)
development_dataset = load_from_directory(dev_folder)
test_dataset = load_from_directory(test_folder)

In [7]:
def filter_dataset(input_dataset):
    """
    Utility function to convert input dataset into custom data structure.
    Sentences that contain no token after splitting on punctuation/whitespace are skipped,
    so the global sentence index only counts the sentences that are kept.
    params: input_dataset: list of deserialised jsons
    returns: dictionary with the following structure:
            key: sentence_global_idx value, value: list of dictionaries
            dict0: key: report_no, value: int
            dict1: key: text, value: string (lower-cased tokens joined by single spaces)
            dict2: key: has_initiative, value: int (1 if the sentence carries at least one initiative ID, else 0)
            dict3: key: list_of_initiatives, value: list of strings with initiative IDs
            dict4: key: sentence_length, value: int (number of tokens)
    """
    structured_data = {}
    sentence_global_idx = 0
    # Raw string: \s must reach the regex engine untouched (in a plain literal it is an invalid escape sequence)
    punctuation_pattern = re.compile(r"[“”|()%&\s,_:;/'!?-]")

    for report_no, report in enumerate(input_dataset):
        for sentence in report['tokenised_sentences']:
            # consecutive separators produce empty strings, which are dropped
            tokens = [token for token in punctuation_pattern.split(sentence['text']) if token]
            if not tokens:
                continue
            initiative_ids = sentence['initiative_ids']
            structured_data[sentence_global_idx] = [
                {'report_no': report_no},
                {'text': ' '.join(token.lower() for token in tokens)},
                {'has_initiative': 1 if len(initiative_ids) > 0 else 0},
                {'list_of_initiatives': initiative_ids},
                {'sentence_length': len(tokens)},
            ]
            sentence_global_idx += 1

    return structured_data

In [8]:
# Convert the raw reports of each split into the per-sentence structure (train, development, test - in that order)
training_data, development_data, testing_data = map(
    filter_dataset,
    (train_dataset, development_dataset, test_dataset),
)

In [9]:
def assisted_labelling(data, lower_threshold, upper_threshold):
    """
    Rule-based pre-filter that decides which sentences are candidates for containing an initiative.
    A sentence is labelled 1 when its token count lies strictly between the two thresholds and at
    least one of its tokens contains a letter a-z/A-Z; every other sentence is labelled 0.
    params: data: dict {global_sentence_index: list of dictionaries} (element 1 holds 'text', element 4 'sentence_length')
            lower_threshold: int, token count that has to be exceeded
            upper_threshold: int, token count that must not be reached
    returns: dictionary {global_sentence_index: 0/1 label}
    """
    letter_pattern = re.compile('[a-zA-Z]')
    labeled_dataset = {}
    for sentence_no in range(len(data)):
        entry = data[sentence_no]
        words = re.split(' ', entry[1]['text'])
        has_alphabetical_word = any(letter_pattern.search(word) for word in words)
        length = entry[4]['sentence_length']
        is_candidate = (length > lower_threshold) and has_alphabetical_word and (length < upper_threshold)
        # short, long and non-alphabetical sentences are labelled as not having an initiative
        labeled_dataset[sentence_no] = 1 if is_candidate else 0
    return labeled_dataset

# The same token-count window (more than 5, fewer than 100 tokens) is applied to every split
labelling_window = {'lower_threshold': 5, 'upper_threshold': 100}
assistant_labeled_training_data = assisted_labelling(training_data, **labelling_window)
assistant_labeled_dev_data = assisted_labelling(development_data, **labelling_window)
assistant_labeled_test_data = assisted_labelling(testing_data, **labelling_window)

In [10]:
def reader(dataset, assisted_labels):
    """
    Utility function that splits the sentences into those handed to the model and those already
    settled by the assisted labelling.
    params: dataset: dict {global_sentence_index : list of dictionaries} (element 1 'text',
                     element 2 'has_initiative', element 3 'list_of_initiatives')
            assisted_labels: dict {global_sentence_index : assistant label}
    returns: train_dict: sentences whose assistant label is 1, labelled with their 'has_initiative' value
             pre_labeled_dict: all other sentences, labelled with their assistant label
             Both are dicts of parallel lists under the keys 'texts', 'labels', 'positions' and 'ID_list'.
    """
    field_names = ('texts', 'labels', 'positions', 'ID_list')
    train_dict = {name: [] for name in field_names}
    pre_labeled_dict = {name: [] for name in field_names}

    for sentence_no in range(len(dataset)):
        entry = dataset[sentence_no]
        assistant_label = assisted_labels[sentence_no]
        if assistant_label == 1:
            # candidate sentence: keep the gold label, the model decides later
            target, label = train_dict, entry[2]['has_initiative']
        else:
            # filtered-out sentence: the assistant label is final
            target, label = pre_labeled_dict, assistant_label
        target['texts'].append(entry[1]['text'])
        target['labels'].append(label)
        target['positions'].append(sentence_no)
        target['ID_list'].append(entry[3]['list_of_initiatives'])

    return train_dict, pre_labeled_dict

In [11]:
class SustainableDataset(torch.utils.data.Dataset):
    """Sentence/label dataset built from a dictionary with 'texts' and 'labels' lists.
    Items are returned as raw text; tokenisation only happens in collate_fn.
    """
    def __init__(self, tokenizer, input_set):
        self.tokenizer = tokenizer
        self.texts = input_set['texts']
        self.labels = input_set['labels']

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {'text': self.texts[idx], 'label': self.labels[idx]}

    def collate_fn(self, batch):
        """Tokenise a list of items as one padded batch and attach the labels as a tensor."""
        batch_texts = [example['text'] for example in batch]
        batch_labels = [example['label'] for example in batch]
        encodings = self.tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        encodings['labels'] = torch.tensor(batch_labels)
        return encodings

In [12]:

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split every corpus into sentences for the model and sentences already settled by the assisted labelling
train_data, pre_labeled_train_data = reader(training_data, assistant_labeled_training_data)
dev_data, pre_labeled_dev_data = reader(development_data, assistant_labeled_dev_data)
test_data, pre_labeled_test_data = reader(testing_data, assistant_labeled_test_data)

# Only the model-bound sentences are wrapped in datasets.
# NOTE(review): train_dataset and test_dataset held the raw json reports until this point and are rebound here.
train_dataset = SustainableDataset(tokenizer=tokenizer, input_set=train_data)
dev_dataset = SustainableDataset(tokenizer=tokenizer, input_set=dev_data)
test_dataset = SustainableDataset(tokenizer=tokenizer, input_set=test_data)

# Dataloaders for all three splits: batches of 16 raw-text items, kept in sentence order
loader_options = {'batch_size': 16, 'shuffle': False}
train_loader = DataLoader(train_dataset, **loader_options)
dev_loader = DataLoader(dev_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
# Unit tests -> check if assisted data labelling has been performed correctly
model_positions = set(train_data['positions'])
pre_labeled_positions = set(pre_labeled_train_data['positions'])
# no training sentence may end up in both splits ...
assert len(pre_labeled_positions & model_positions) == 0
# ... and together the two splits must cover every training sentence exactly once
assert len(train_data['texts']) + len(pre_labeled_train_data['texts']) == len(training_data)
assert (pre_labeled_positions | model_positions) == set(range(len(training_data)))
# same size check for the development split
assert len(dev_data['texts']) + len(pre_labeled_dev_data['texts']) == len(development_data)

In [14]:
class Sustainable_BERT(BertPreTrainedModel):
    """ BERT encoder followed by a dropout + single-unit linear head for fine-tuning.
    forward() returns the sigmoid of the head output, i.e. one score in (0, 1) per input sequence.
    """
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        # Layout kept as Sequential(dropout, linear) so the head's parameter names stay the same
        head_layers = [torch.nn.Dropout(0.1), torch.nn.Linear(config.hidden_size, 1)]
        self.projection = torch.nn.Sequential(*head_layers)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):

        # `labels` is accepted but not used here; this method does not compute a loss
        encoder_options = dict(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        encoder_outputs = self.bert(input_ids, **encoder_options)

        # element 1 of the encoder output is fed to the head (presumably the pooled representation - see BertModel docs)
        head_input = encoder_outputs[1]
        return torch.sigmoid(self.projection(head_input))

In [15]:
class Trainer_Sustainable(Trainer):
    """ Trainer subclass that computes binary cross-entropy on the scores returned by the model.
    """
    def compute_loss(self, model, inputs, return_outputs=False):
        # take the labels out of the batch; everything that is left is passed to the model
        targets = inputs.pop('labels')
        scores = model(**inputs)

        criterion = nn.BCELoss()
        loss = criterion(scores.squeeze(-1), targets.float())

        if not return_outputs:
            return loss
        # the second element is itself a (loss, outputs) pair
        return (loss, (loss, scores))

In [16]:
def model_predict(model, tokenizer, dataloader,device):
    """ Utility function to move the model to `device` and run inference over a dataloader.
    params: model: module called with the tokenizer output as keyword arguments; its result is squeezed on the last dimension
            tokenizer: callable turning a list of sentences into an encoding object that offers .to(device)
            dataloader: iterable of batches, each a dictionary whose 'text' entry is the list of sentences
            device: torch device used for the model and the encodings
    returns: flat list with one score per sentence, in dataloader order ([] for an empty dataloader)
    """
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            encodings = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            # copy each batch of scores to the CPU right away so results do not pile up in device memory
            predictions.append(model(**encodings).squeeze(-1).cpu())
    if not predictions:
        # torch.cat refuses an empty list, so an empty dataloader is answered directly
        return []
    return torch.cat(predictions).tolist()

In [17]:
# Instantiate and train model
# Load the 'bert-base-uncased' checkpoint into the custom architecture and move it to the selected device
model = Sustainable_BERT.from_pretrained('bert-base-uncased').to(device) 

# Upper bound on the number of epochs (early stopping is configured on the trainer below) and optimizer learning rate
total_epochs = 10 
learning_rate = 1e-5 

# Create evaluation metric F1 score 
metric = load_metric('f1')

def compute_metrics(eval_pred):
    """ Compute the F1 metric from an evaluation (predictions, labels) pair.
    The scores in column 0 of the predictions are turned into 0/1 labels with a 0.5 cut-off first.
    """
    raw_scores, labels = eval_pred
    first_column = raw_scores[:, 0]
    binary_predictions = [1 if score > 0.5 else 0 for score in first_column]
    return metric.compute(predictions=binary_predictions, references=labels)

# Define optimizer and lr schedule
optimizer = transformers.AdamW(model.parameters(),
                  lr = learning_rate,
                  )

# train_loader is used here only to count the batches per epoch (it uses the same batch size of 16 as the trainer below)
total_steps = len(train_loader) * total_epochs 
# warm-up length: 6% of all steps (this is a float, not an int)
warmup = 0.06 * total_steps
 
# Create the learning rate scheduler.
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup, 
                                            num_training_steps = total_steps) 

# Create training arguments
# Logging, checkpointing and evaluation all happen once per epoch; the checkpoint with the best 'f1' is reloaded at the end.
# NOTE(review): learning_rate is also passed here although an optimizer is handed to the trainer explicitly -
# presumably only the optimizer's value takes effect; verify.
training_args = TrainingArguments(
    output_dir='./bert_binary_paper',
    save_total_limit = 2,
    learning_rate = learning_rate, 
    logging_strategy = 'epoch',
    per_device_train_batch_size=16, 
    num_train_epochs = total_epochs, 
    load_best_model_at_end = True,
    do_eval = True,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    metric_for_best_model = 'f1',
    )

# Define trainer module
# The development set is used for the per-epoch evaluation; batches are built with the dataset's own collate_fn.
# NOTE(review): early_stopping_threshold is negative - confirm that this is intended.
trainer = Trainer_Sustainable(
    model=model,                         
    args=training_args,                 
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,                   
    data_collator=train_dataset.collate_fn, 
    callbacks =[transformers.EarlyStoppingCallback(early_stopping_patience = 5, early_stopping_threshold=-0.03)],
    compute_metrics = compute_metrics,
    optimizers = (optimizer, scheduler)
    )

trainer.train() 

# Persist the final model; the prediction cell reloads it from this folder
trainer.save_model('./final_bert_binary_paper')


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing Sustainable_BERT: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing Sustainable_BERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Sustainable_BERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Sustainable_BERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['proj

Downloading:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

***** Running training *****
  Num examples = 36920
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 23080


Epoch,Training Loss,Validation Loss,F1
1,0.2945,0.239332,0.141176
2,0.2142,0.230728,0.418388
3,0.1653,0.286943,0.361798
4,0.1171,0.404682,0.391348
5,0.0838,0.474779,0.36255
6,0.0543,0.608546,0.343359
7,0.0354,0.666486,0.369331


***** Running Evaluation *****
  Num examples = 20402
  Batch size = 8
Saving model checkpoint to ./bert_binary_paper/checkpoint-2308
Configuration saved in ./bert_binary_paper/checkpoint-2308/config.json
Model weights saved in ./bert_binary_paper/checkpoint-2308/pytorch_model.bin
Deleting older checkpoint [bert_binary_paper/checkpoint-7014] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20402
  Batch size = 8
Saving model checkpoint to ./bert_binary_paper/checkpoint-4616
Configuration saved in ./bert_binary_paper/checkpoint-4616/config.json
Model weights saved in ./bert_binary_paper/checkpoint-4616/pytorch_model.bin
Deleting older checkpoint [bert_binary_paper/checkpoint-18704] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20402
  Batch size = 8
Saving model checkpoint to ./bert_binary_paper/checkpoint-6924
Configuration saved in ./bert_binary_paper/checkpoint-6924/config.json
Model weights saved in ./bert_binary_paper/check

In [18]:
def reconcile_mapping(model_data, pre_labeled_data, model_predictions):
    """ Utility function to merge the model predictions with the pre-labeled sentences.
    params: model_data: dictionary outputted from reader function for the sentences given to the model
            pre_labeled_data: dictionary outputted from reader function for the pre-labeled sentences
            model_predictions: list of model scores, aligned with model_data['texts']
    returns: pred_mapping: dict {sentence position: (text, 0/1 label)}, ordered by position
             predictions: list of the 0/1 labels in the same order
    """
    pred_mapping = {}
    # model scores above 0.5 count as a positive prediction
    for text, position, score in zip(model_data['texts'], model_data['positions'], model_predictions):
        pred_mapping[position] = (text, 1 if score > 0.5 else 0)

    # pre-labeled sentences keep their assistant label (and win if a position appears in both inputs)
    for text, position, label in zip(pre_labeled_data['texts'], pre_labeled_data['positions'], pre_labeled_data['labels']):
        pred_mapping[position] = (text, label)

    pred_mapping = dict(sorted(pred_mapping.items(), key=lambda item: item[0]))
    predictions = [label for _, label in pred_mapping.values()]

    return pred_mapping, predictions

In [19]:
# Create predictions dictionary spanning initiatives
def sentence_to_initiative_aggregation(predictions):
    """ Utility function which groups every run of consecutive positive sentence predictions into one predicted initiative.
    params: predictions: list of binary predictions
    returns: predictions_dict: {initiative_number:list of sentence positions}, numbered from 0 in order of appearance
    """
    predictions_dict = {}
    current_span = []
    for position, prediction in enumerate(predictions):
        if prediction == 1:
            current_span.append(position)
            continue
        if current_span:
            # a non-positive sentence closes the running span; the next free number is the current dict size
            predictions_dict[len(predictions_dict)] = current_span
            current_span = []
    if current_span:
        # the list ended while a span was still open
        predictions_dict[len(predictions_dict)] = current_span
    return predictions_dict

In [20]:
# Create a dictionary with ID_reportNo as keys and values as list of sentence indices
def create_init_dict(data):
    """ Utility function to collect, for every gold initiative, the global indices of its sentences.
    The key is '<initiative ID>_<report number>'; only the first initiative ID of a sentence is used.
    """
    initiative_dict = {}  # keys are 'ID_reportNo' strings, values are lists of global sentence indices
    for sentence_no in range(len(data)):
        entry = data[sentence_no]
        if not entry[2]['has_initiative']:
            continue
        key = entry[3]['list_of_initiatives'][0] + '_' + str(entry[0]['report_no'])
        initiative_dict.setdefault(key, []).append(sentence_no)
    return initiative_dict

In [23]:
class Initiative_Evaluation():
    """ Class used to evaluate what % of initiatives have been correctly identified.

    Gold initiatives and predicted spans are both given as {ID: list of sentence positions}.
    A gold initiative and a predicted span are compared through two shares:
    overlap / len(gold span) and overlap / len(predicted span). Three matching levels are scored:
        fully correct: both shares equal 1
        half correct:  both shares are at least 0.5
        min correct:   both shares are above 0 (any overlap)
    On every level a gold initiative is credited with at most one prediction and a prediction
    is credited to at most one gold initiative.
    """
    def __init__(self, initiative_dict, predictions_dict):
        self.initiative_dict = initiative_dict
        self.predictions_dict = predictions_dict
        self.no_initiatives = len(self.initiative_dict)

    def evaluate(self):
        """ Score the predictions against the gold initiatives.
        returns: when both dictionaries are empty, a 6-tuple of ones
                 (proportions for fully/half/min correct, then F1 for fully/half/min correct);
                 otherwise a 12-tuple holding the same six values followed by
                 (precision, recall) for fully, half and min correct.
                 Ratios whose denominator is zero (no gold initiatives or no predictions) are reported as 0.
        """
        if len(self.initiative_dict) == len(self.predictions_dict) == 0:
            # nothing to find and nothing predicted: perfect by convention (6 values, as existing callers expect)
            return 1, 1, 1, 1, 1, 1

        # count the true positive predictions of each matching level
        fully_correct_TP = self._count_true_positives(lambda gold_share, pred_share: gold_share == 1 and pred_share == 1)
        half_correct_TP = self._count_true_positives(lambda gold_share, pred_share: gold_share >= 0.5 and pred_share >= 0.5)
        min_correct_TP = self._count_true_positives(lambda gold_share, pred_share: gold_share > 0 and pred_share > 0)

        fully_correct_FN, fully_correct_FP = self.compute_FN_FP(fully_correct_TP)
        fully_correct_F1, fully_correct_precision, fully_correct_recall = self.compute_F1(fully_correct_TP, fully_correct_FP, fully_correct_FN)

        half_correct_FN, half_correct_FP = self.compute_FN_FP(half_correct_TP)
        half_correct_F1, half_correct_precision, half_correct_recall = self.compute_F1(half_correct_TP, half_correct_FP, half_correct_FN)

        min_correct_FN, min_correct_FP = self.compute_FN_FP(min_correct_TP)
        min_correct_F1, min_correct_precision, min_correct_recall = self.compute_F1(min_correct_TP, min_correct_FP, min_correct_FN)

        # guarded division: with predictions but no gold initiatives the proportions are reported as 0
        fully_correctly_labeled_proportion = self._safe_ratio(fully_correct_TP, self.no_initiatives)
        half_correctly_labeled_proportion = self._safe_ratio(half_correct_TP, self.no_initiatives)
        min_correctly_labeled_proportion = self._safe_ratio(min_correct_TP, self.no_initiatives)

        return fully_correctly_labeled_proportion, half_correctly_labeled_proportion, min_correctly_labeled_proportion, fully_correct_F1, half_correct_F1, min_correct_F1, fully_correct_precision, fully_correct_recall, half_correct_precision, half_correct_recall, min_correct_precision, min_correct_recall

    def _count_true_positives(self, is_match):
        """ Utility method to count the gold initiatives that can be paired with a prediction.
        params: is_match: callable(gold_share, prediction_share) -> bool deciding whether an overlapping pair counts
        returns: number of gold initiatives paired with a prediction; for every initiative the first
                 still unused prediction (in dictionary order) that matches is taken
        """
        true_positives = 0
        used_prediction_IDs = set()  # predictions already credited to an earlier initiative
        for initiative_positions_list in self.initiative_dict.values():
            gold_positions = set(initiative_positions_list)
            for prediction_ID, prediction_positions_list in self.predictions_dict.items():
                if prediction_ID in used_prediction_IDs:
                    continue
                overlap = len(gold_positions.intersection(prediction_positions_list))
                if overlap == 0:
                    # spans do not touch (this also keeps empty spans away from the divisions below)
                    continue
                if is_match(overlap / len(initiative_positions_list), overlap / len(prediction_positions_list)):
                    true_positives += 1
                    used_prediction_IDs.add(prediction_ID)
                    break  # one prediction per initiative
        return true_positives

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """ Utility method: numerator / denominator, or 0 when the denominator is 0.
        """
        return numerator / denominator if denominator else 0

    def compute_F1(self, TP, FP, FN):
        """ Utility method to compute F1 score
        returns: (F1, precision, recall); precision and recall are 0 when their denominator is 0
        """
        precision = self._safe_ratio(TP, TP + FP)
        recall = self._safe_ratio(TP, TP + FN)
        if precision == recall == 0:
            F1 = 0
        else:
            F1 = 2 * precision * recall /(precision + recall)
        return F1, precision, recall

    def compute_FN_FP(self, TP):
        """ Utility method to compute FN and FP initiatives given the no of TP
        (defined as the set intersection between gold initiative span and prediction span)
        """
        FN = len(self.initiative_dict) - TP
        FP = len(self.predictions_dict) - TP
        return FN, FP

In [26]:
# Unit tests for Initiative_Evaluation Class
# Each case builds gold initiatives and predicted spans as {ID: list of sentence positions} and checks the
# three proportions (strict / medium / lenient) and the three F1 scores returned by evaluate().

# Case 1: the first initiative is predicted exactly, the second one is split into two single-sentence predictions
mock_initiative_dict_1 = {1:[1,2], 2:[3,4]}
mock_predictions_dict_1 = {1: [1,2], 2:[3], 3:[4]}
mock_evaluation_1 = Initiative_Evaluation(mock_initiative_dict_1, mock_predictions_dict_1)
mock_init_strict_accuracy_1, mock_init_medium_accuracy_1, mock_init_lenient_accuracy_1, mock1_fully_correct_F1, mock1_half_correct_F1, mock1_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_1.evaluate()
assert mock_init_strict_accuracy_1 == 0.5
assert mock_init_medium_accuracy_1 == 1
assert mock_init_lenient_accuracy_1 == 1
assert mock1_fully_correct_F1 == 0.4
assert mock1_half_correct_F1 == mock1_min_correct_F1 == 0.8


# Case 2: both initiatives are predicted exactly (the two spans are separated by a gap)
mock_initiative_dict_2 = {1:[1,2], 2:[4,5,6]}
mock_predictions_dict_2 = {1: [1,2], 2:[4,5,6]}
mock_evaluation_2 = Initiative_Evaluation(mock_initiative_dict_2, mock_predictions_dict_2)
mock_init_strict_accuracy_2, mock_init_medium_accuracy_2, mock_init_lenient_accuracy_2, mock2_fully_correct_F1, mock2_half_correct_F1, mock2_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_2.evaluate()
assert mock_init_strict_accuracy_2 == 1
assert mock_init_medium_accuracy_2 == 1
assert mock_init_lenient_accuracy_2 == 1
assert mock2_fully_correct_F1 == mock2_half_correct_F1 == mock2_min_correct_F1 == 1

# Case 3: both initiatives are predicted exactly (the two spans are adjacent)
mock_initiative_dict_3 = {1:[1,2], 2:[3,4]}
mock_predictions_dict_3 = {1: [1,2], 2:[3,4]}
mock_evaluation_3 = Initiative_Evaluation(mock_initiative_dict_3, mock_predictions_dict_3)
mock_init_strict_accuracy_3, mock_init_medium_accuracy_3, mock_init_lenient_accuracy_3, mock3_fully_correct_F1, mock3_half_correct_F1, mock3_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_3.evaluate()
assert mock_init_strict_accuracy_3 == 1
assert mock_init_medium_accuracy_3 == 1
assert mock_init_lenient_accuracy_3 == 1
assert mock3_fully_correct_F1 == mock3_half_correct_F1 == mock3_min_correct_F1 == 1

# Case 4: no gold initiatives and no predictions - evaluate() returns only 6 values here, all equal to 1
mock_initiative_dict_4 = {}
mock_predictions_dict_4 = {}
mock_evaluation_4 = Initiative_Evaluation(mock_initiative_dict_4, mock_predictions_dict_4)
mock_init_strict_accuracy_4, mock_init_medium_accuracy_4, mock_init_lenient_accuracy_4, mock4_fully_correct_F1, mock4_half_correct_F1, mock4_min_correct_F1 = mock_evaluation_4.evaluate()
assert mock_init_strict_accuracy_4 == 1
assert mock_init_medium_accuracy_4 == 1
assert mock_init_lenient_accuracy_4 == 1
assert mock4_fully_correct_F1 == mock4_half_correct_F1 == mock4_min_correct_F1 == 1

# Case 5: every predicted span is a single sentence and the single-sentence initiative 3 is never predicted
mock_initiative_dict_5 = {1:[1,2], 2:[3,4,5], 3:[6]}
mock_predictions_dict_5 = {1:[1], 2:[2], 3:[3], 4:[4], 5:[5]}
mock_evaluation_5 = Initiative_Evaluation(mock_initiative_dict_5, mock_predictions_dict_5)
mock_init_strict_accuracy_5, mock_init_medium_accuracy_5, mock_init_lenient_accuracy_5, mock5_fully_correct_F1, mock5_half_correct_F1, mock5_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_5.evaluate()
assert mock_init_strict_accuracy_5 == 0
assert mock_init_medium_accuracy_5 == 1/3
assert mock_init_lenient_accuracy_5 == 2/3
assert mock5_fully_correct_F1 == 0
assert mock5_half_correct_F1 == 0.25
assert mock5_min_correct_F1 == 0.5

# Case 6: one long prediction covers all three initiatives but can be credited to only one of them
mock_initiative_dict_6 = {1:[1,2], 2:[3,4,5], 3:[6]}
mock_predictions_dict_6 = {1:[1,2,3,4,5,6]}
mock_evaluation_6 = Initiative_Evaluation(mock_initiative_dict_6, mock_predictions_dict_6)
mock_init_strict_accuracy_6, mock_init_medium_accuracy_6, mock_init_lenient_accuracy_6, mock6_fully_correct_F1, mock6_half_correct_F1, mock6_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_6.evaluate()
assert mock_init_strict_accuracy_6 == 0
assert mock_init_medium_accuracy_6 == 1/3
assert mock_init_lenient_accuracy_6 == 1/3
assert mock6_fully_correct_F1 == 0
assert mock6_half_correct_F1 == mock6_min_correct_F1 == 0.5

In [27]:
# Perform predictions on dev dataset (the timing covers loading the saved model as well as the inference)
start_time = time.time()
sustainable_model = Sustainable_BERT.from_pretrained('final_bert_binary_paper')
dev_predictions_list = model_predict(
    model=sustainable_model,
    tokenizer=tokenizer,
    dataloader=dev_loader,
    device=device,
)
end_time = time.time()
print(f'Predicting results on dev set took {end_time-start_time} seconds')

# Reconcile predictions on the dev set: merge the model output with the pre-labeled sentences, ordered by position
dev_pred_mapping, dev_predictions = reconcile_mapping(
    model_data=dev_data,
    pre_labeled_data=pre_labeled_dev_data,
    model_predictions=dev_predictions_list,
)

loading configuration file final_bert_binary_paper/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "Sustainable_BERT"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file final_bert_binary_paper/pytorch_model.bin
All model checkpoint weights were used when initializing Sustainable_BERT.

All the weights of Sustainable_BERT were initialized from the model checkpoint at final_bert_binary_paper.
I

Predicting results on dev set took 40.871758460998535 seconds


In [28]:
# Extract ground truth dev data labels: {sentence index: (has_initiative, list of initiative IDs)}
dev_labels = {}
for sent_no in range(len(development_data)):
    dev_sentence = development_data[sent_no]
    dev_labels[sent_no] = (dev_sentence[2]['has_initiative'], dev_sentence[3]['list_of_initiatives'])
dev_label_values = [has_initiative for has_initiative, _ in dev_labels.values()]

# Sentence-level report over the whole development set (model predictions plus pre-labeled sentences)
target_names = ['No Initiative', 'Initiative']
print(f'Classification Report on the Development Dataset \n')
print(classification_report(dev_label_values, np.array(dev_predictions), target_names = target_names, digits=4))

Classification Report on the Development Dataset 

               precision    recall  f1-score   support

No Initiative     0.9789    0.9886    0.9837     53854
   Initiative     0.5060    0.3536    0.4163      1776

     accuracy                         0.9683     55630
    macro avg     0.7425    0.6711    0.7000     55630
 weighted avg     0.9638    0.9683    0.9656     55630



In [33]:
# Initiative-level evaluation on the development split.
# NOTE(review): create_init_dict, sentence_to_initiative_aggregation and
# Initiative_Evaluation are defined outside this section; presumably they build the
# gold initiatives, group sentence predictions into initiatives, and compare the two.
dev_initiative_dict = create_init_dict(development_data)
dev_predictions_dict = sentence_to_initiative_aggregation(dev_predictions)
dev_init_evaluation = Initiative_Evaluation(dev_initiative_dict, dev_predictions_dict)

# evaluate() is unpacked into twelve values; the names below follow that order.
# As reported by the prints: lenient = at least 1 sentence identified,
# medium = 50% of sentences, strict = 100% of sentences.
(
    dev_init_strict_accuracy, dev_init_medium_accuracy, dev_init_lenient_accuracy,
    dev_strict_F1, dev_medium_F1, dev_lenient_F1,
    dev_strict_precision, dev_strict_recall,
    dev_medium_precision, dev_medium_recall,
    dev_lenient_precision, dev_lenient_recall,
) = dev_init_evaluation.evaluate()

# Accuracy at each matching level
print(f'Percentage of correctly predicted initiatives where at least 1 sentence is identified is {dev_init_lenient_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where more than 50% of sentences are identified is {dev_init_medium_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where 100% of sentences are identified is {dev_init_strict_accuracy:.2%} \n')
# F1 / precision / recall at each matching level
print(f'F1 score where at least 1 sentence is identified is {dev_lenient_F1:.2%} \n')
print(f'Precision score where at least 1 sentence is identified is {dev_lenient_precision:.2%} \n')
print(f'Recall score where at least 1 sentence is identified is {dev_lenient_recall:.2%} \n')
print(f'F1 score where 50% of sentences are identified is {dev_medium_F1:.2%} \n')
print(f'Precision score where 50% of sentences are identified is {dev_medium_precision:.2%} \n')
print(f'Recall score where 50% of sentences are identified is {dev_medium_recall:.2%} \n')
print(f'F1 score where 100% of sentences are identified is {dev_strict_F1:.2%} \n')
print(f'Precision score where 100% of sentences are identified is {dev_strict_precision:.2%} \n')
print(f'Recall score where 100% of sentences are identified is {dev_strict_recall:.2%} \n')

Percentage of correctly predicted initiatives where at least 1 sentence is identified is 43.76% 

Percentage of correctly predicted initiatives where more than 50% of sentences are identified is 33.40% 

Percentage of correctly predicted initiatives where 100% of sentences are identified is 13.30% 

F1 score where at least 1 sentence is identified is 46.44% 

Precision score where at least 1 sentence is identified is 49.48% 

Recall score where at least 1 sentence is identified is 43.76% 

F1 score where 50% of sentences are identified is 35.45% 

Precision score where 50% of sentences are identified is 37.77% 

Recal score where 50% of sentences are identified is 33.40% 

F1 score where 100% of sentences are identified is 14.12% 

Precision score where 100% of sentences are identified is 15.04% 

Recall score where 100% of sentences are identified is 13.30% 



In [34]:
# Perform predictions on test dataset
# The fine-tuned checkpoint is loaded again from the local directory
# './final_bert_binary_paper/' so this cell does not depend on the dev cell's model object.
sustainable_model = Sustainable_BERT.from_pretrained('./final_bert_binary_paper/')
test_predictions_list = model_predict(sustainable_model, tokenizer, test_loader, device)

# Reconcile predictions on the test set
# NOTE(review): reconcile_mapping is defined outside this section; test_predictions is
# later compared one-to-one with the labels read from testing_data, so it is presumably
# one prediction per original sentence - confirm.
test_pred_mapping, test_predictions = reconcile_mapping(test_data, pre_labeled_test_data, test_predictions_list)

loading configuration file ./final_bert_binary_paper/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "Sustainable_BERT"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./final_bert_binary_paper/pytorch_model.bin
All model checkpoint weights were used when initializing Sustainable_BERT.

All the weights of Sustainable_BERT were initialized from the model checkpoint at ./final_bert_binary_pa

In [35]:
# Gold annotations for the test split, keyed by sentence index.
# Each value is a pair: (the 'has_initiative' label, the 'list_of_initiatives' entry).
test_labels = {
    sent_no: (
        testing_data[sent_no][2]['has_initiative'],
        testing_data[sent_no][3]['list_of_initiatives'],
    )
    for sent_no in range(len(testing_data))
}
# Sentence-level targets only (first element of each pair), in sentence order
test_label_values = [has_initiative for has_initiative, _ in test_labels.values()]

# Sentence-level precision / recall / F1 of the predictions against the gold labels
target_names = ['No Initiative', 'Initiative']
print('Classification Report on the Test Dataset \n')
print(
    classification_report(
        test_label_values,
        np.array(test_predictions),
        target_names=target_names,
        digits=4,
    )
)

Classification Report on the Test Dataset 

               precision    recall  f1-score   support

No Initiative     0.9792    0.9796    0.9794     48175
   Initiative     0.4157    0.4108    0.4132      1704

     accuracy                         0.9601     49879
    macro avg     0.6974    0.6952    0.6963     49879
 weighted avg     0.9599    0.9601    0.9600     49879



In [36]:
# Initiative-level evaluation on the test split.
# NOTE(review): create_init_dict, sentence_to_initiative_aggregation and
# Initiative_Evaluation are defined outside this section; presumably they build the
# gold initiatives, group sentence predictions into initiatives, and compare the two.
test_initiative_dict = create_init_dict(testing_data)
test_predictions_dict = sentence_to_initiative_aggregation(test_predictions)
test_init_evaluation = Initiative_Evaluation(test_initiative_dict, test_predictions_dict)

# evaluate() is unpacked into twelve values; the names below follow that order.
# As reported by the prints: lenient = at least 1 sentence identified,
# medium = 50% of sentences, strict = 100% of sentences.
(
    test_init_strict_accuracy, test_init_medium_accuracy, test_init_lenient_accuracy,
    test_strict_F1, test_medium_F1, test_lenient_F1,
    test_strict_precision, test_strict_recall,
    test_medium_precision, test_medium_recall,
    test_lenient_precision, test_lenient_recall,
) = test_init_evaluation.evaluate()

# Accuracy at each matching level
print(f'Percentage of correctly predicted initiatives where at least 1 sentence is identified is {test_init_lenient_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where more than 50% of sentences are identified is {test_init_medium_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where 100% of sentences are identified is {test_init_strict_accuracy:.2%} \n')
# F1 / precision / recall at each matching level
print(f'F1 score where at least 1 sentence is identified is {test_lenient_F1:.2%} \n')
print(f'Precision score where at least 1 sentence is identified is {test_lenient_precision:.2%} \n')
print(f'Recall score where at least 1 sentence is identified is {test_lenient_recall:.2%} \n')
print(f'F1 score where 50% of sentences are identified is {test_medium_F1:.2%} \n')
print(f'Precision score where 50% of sentences are identified is {test_medium_precision:.2%} \n')
print(f'Recall score where 50% of sentences are identified is {test_medium_recall:.2%} \n')
print(f'F1 score where 100% of sentences are identified is {test_strict_F1:.2%} \n')
print(f'Precision score where 100% of sentences are identified is {test_strict_precision:.2%} \n')
print(f'Recall score where 100% of sentences are identified is {test_strict_recall:.2%} \n')


Percentage of correctly predicted initiatives where at least 1 sentence is identified is 44.73% 

Percentage of correctly predicted initiatives where more than 50% of sentences are identified is 34.18% 

Percentage of correctly predicted initiatives where 100% of sentences are identified is 16.60% 

F1 score where at least 1 sentence is identified is 41.92% 

Precision score where at least 1 sentence is identified is 39.45% 

Recall score where at least 1 sentence is identified is 44.73% 

F1 score where 50% of sentences are identified is 32.04% 

Precision score where 50% of sentences are identified is 30.15% 

Recal score where 50% of sentences are identified is 34.18% 

F1 score where 100% of sentences are identified is 15.56% 

Precision score where 100% of sentences are identified is 14.64% 

Recall score where 100% of sentences are identified is 16.60% 



In [37]:
# Perform predictions on train dataset
# Reuses the sustainable_model object created in an earlier cell (it is not reloaded here),
# so that cell must have been run first.
train_predictions_list = model_predict(sustainable_model, tokenizer, train_loader, device)

# Reconcile predictions on the train set
# NOTE(review): reconcile_mapping is defined outside this section; train_predictions is
# later compared one-to-one with the labels read from training_data - confirm its shape.
train_pred_mapping, train_predictions = reconcile_mapping(train_data, pre_labeled_train_data, train_predictions_list)



In [38]:
# Gold sentence-level 'has_initiative' labels for the training split, in sentence order
training_labels = [
    training_data[sent_no][2]['has_initiative']
    for sent_no in range(len(training_data))
]

# Sentence-level precision / recall / F1 of the predictions against the gold labels
target_names = ['No Initiative', 'Initiative']
print('Classification Report on the Training Dataset \n')
print(
    classification_report(
        training_labels,
        np.array(train_predictions),
        target_names=target_names,
        digits=4,
    )
)

Classification Report on the Training Dataset 

               precision    recall  f1-score   support

No Initiative     0.9872    0.9882    0.9877     83801
   Initiative     0.7238    0.7072    0.7154      3668

     accuracy                         0.9764     87469
    macro avg     0.8555    0.8477    0.8515     87469
 weighted avg     0.9762    0.9764    0.9763     87469



In [39]:
# Initiative-level evaluation on the training split, timed from start to finish
start_time = time.time()

# NOTE(review): these helpers are defined outside this section; presumably they build the
# gold initiatives, group sentence predictions into initiatives, and compare the two.
train_initiative_dict = create_init_dict(training_data)
train_predictions_dict = sentence_to_initiative_aggregation(train_predictions)
train_init_evaluation = Initiative_Evaluation(train_initiative_dict, train_predictions_dict)

# evaluate() is unpacked into twelve values; the names below follow that order.
# Precision and recall are captured but only accuracy and F1 are reported in this cell.
(
    train_init_strict_accuracy, train_init_medium_accuracy, train_init_lenient_accuracy,
    train_strict_F1, train_medium_F1, train_lenient_F1,
    train_strict_precision, train_strict_recall,
    train_medium_precision, train_medium_recall,
    train_lenient_precision, train_lenient_recall,
) = train_init_evaluation.evaluate()

# Accuracy at each matching level (lenient / medium / strict)
print(f'Percentage of correctly predicted initiatives where at least 1 sentence is identified is {train_init_lenient_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where more than 50% of sentences are identified is {train_init_medium_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where 100% of sentences are identified is {train_init_strict_accuracy:.2%} \n')
# F1 at each matching level
print(f'F1 score where at least 1 sentence is identified is {train_lenient_F1:.2%} \n')
print(f'F1 score where 50% of sentences are identified is {train_medium_F1:.2%} \n')
print(f'F1 score where 100% of sentences are identified is {train_strict_F1:.2%} \n')

# Report the wall-clock time spent in this cell
end_time = time.time()
print(f'Evaluating initiatives on the train set took {end_time-start_time} seconds')

Percentage of correctly predicted initiatives where at least 1 sentence is identified is 70.74% 

Percentage of correctly predicted initiatives where more than 50% of sentences are identified is 59.22% 

Percentage of correctly predicted initiatives where 100% of sentences are identified is 30.83% 

F1 score where at least 1 sentence is identified is 67.75% 

F1 score where 50% of sentences are identified is 56.71% 

F1 score where 100% of sentences are identified is 29.53% 

Evaluating initiatives on the train set took 1.7844467163085938 seconds
