In [None]:
# DELETE CELL IF RUNNING ON LOCAL MACHINE INSTEAD OF GOOGLE COLAB
!pip install transformers
!pip install datasets
!pip install pytorch-crf

Collecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 2.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |█████████████████████▌          | 2.2 MB 54.9 MB/s eta 0:00:01

In [None]:
import torch
from torch import nn
from torchcrf import CRF
from torch.utils.data import  DataLoader

import transformers
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer 
from transformers import BertPreTrainedModel, BertModel

from datasets import load_metric

import numpy as np
import re
import json
import os
import time
from sklearn.metrics import classification_report

from tqdm.auto import tqdm

In [None]:
# DELETE CELL IF RUNNING ON LOCAL MACHINE INSTEAD OF GOOGLE COLAB
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# DELETE CELL IF RUNNING ON LOCAL MACHINE INSTEAD OF GOOGLE COLAB
%cd /content/drive/MyDrive 

In [None]:
# Setting random seed and device
SEED = 1

# Seed every random number generator this notebook already has in scope.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # covers every visible GPU; silently ignored when CUDA is unavailable
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False  # the cuDNN autotuner may otherwise pick different kernels between runs
# NOTE(review): transformers.Trainer is understood to re-seed from
# TrainingArguments.seed when it is constructed - confirm whether SEED should
# also be passed there.

# Prefer the first GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print(device)
print(use_cuda)

In [None]:
# Create list of training data files
def load_from_directory(directory):
    """
    Utility function to load all json-converted reports into a dataset.
    params: directory: string representing location on disk of json files,
            resolved against the current working directory (a trailing path
            separator is optional)
    returns: dataset: list of deserialised jsons, in sorted filename order
    """
    path = os.path.join(os.getcwd(), directory)
    # os.listdir gives no ordering guarantee; sort so that anything derived
    # from a report's position in the returned list is stable between runs.
    json_files = sorted(name for name in os.listdir(path) if name.endswith('.json'))

    dataset = []
    for filename in json_files:
        # Join rather than concatenate: plain string concatenation produced a
        # wrong path whenever `directory` had no trailing separator.
        with open(os.path.join(path, filename), "r", encoding='utf-8') as read_file:
            dataset.append(json.load(read_file))

    return dataset
    
# Folder (relative to the working directory) holding the json reports of each split
train_folder, dev_folder, test_folder = 'json_train/', 'json_develop/', 'json_test/'

# Deserialise every report of the train / development / test split, in that order
train_dataset, development_dataset, test_dataset = (
    load_from_directory(split_folder)
    for split_folder in (train_folder, dev_folder, test_folder)
)

In [None]:
def filter_dataset(input_dataset):
    """
    Utility function to convert input dataset into custom data structure
    params: input_dataset: list of deserialised jsons; each report is read through
            report['tokenised_sentences'][i]['text'] and [...]['initiative_ids']
    returns: dictionary with the following structure:
            key: sentence_global_idx value, value: list of single-key dictionaries
            dict0: key: report_no, value: int (position of the report in input_dataset)
            dict1: key: text, value: string (lower-cased tokens joined by single spaces)
            dict2: key: has_initiative, value: 1 if the sentence has any initiative ID, else 0
            dict3: key: list_of_initiatives, value: the sentence's 'initiative_ids' entry
            dict4: key: sentence_length, value: int (number of tokens after splitting)
    Sentences that yield no tokens after splitting are skipped and do not
    consume a global index.
    """
    structured_data = {}
    sentence_global_idx = 0
    # Raw string so that \s reaches the regex engine instead of being an
    # invalid string escape; compiled once because it is reused for every sentence.
    punctuation_splitter = re.compile(r"[“”|()%&\s,_:;/'!?-]")

    for report_no, report in enumerate(input_dataset):
        for sentence in report['tokenised_sentences']:
            # Splitting leaves empty strings between adjacent separators; drop them.
            tokens = [token for token in punctuation_splitter.split(sentence['text']) if token]
            if not tokens:
                continue
            initiative_ids = sentence['initiative_ids']
            structured_data[sentence_global_idx] = [
                {'report_no': report_no},
                {'text': ' '.join(token.lower() for token in tokens)},
                {'has_initiative': 1 if len(initiative_ids) > 0 else 0},
                {'list_of_initiatives': initiative_ids},
                {'sentence_length': len(tokens)},
            ]
            sentence_global_idx += 1

    return structured_data


In [None]:
# Set up datasets from json files
# One structured {global sentence index: [...]} dictionary per split
training_data, development_data, testing_data = (
    filter_dataset(raw_reports)
    for raw_reports in (train_dataset, development_dataset, test_dataset)
)

In [None]:
def assisted_labelling(data, lower_threshold, upper_threshold):
    """
    Utility function which decides, per sentence, whether it is passed on (1) or
    directly labelled as not having a sustainability initiative (0).
    params: data: dict {global_sentence_index: list of single-key dicts}; index 1
                  holds 'text' and index 4 holds 'sentence_length'
            lower_threshold: int, the sentence length must be strictly greater than this
            upper_threshold: int, the sentence length must be strictly smaller than this
    returns: dictionary {global_sentence_index: 0/1 label}
    """
    def _is_candidate(entry):
        # At least one space-separated word must contain a latin letter.
        has_alphabetical_word = any(
            re.search('[a-zA-Z]', word) for word in re.split(' ', entry[1]['text'])
        )
        length = entry[4]['sentence_length']
        return lower_threshold < length < upper_threshold and has_alphabetical_word

    # Short, long and non-alphabetical sentences get 0; everything else gets 1.
    return {
        sentence_no: (1 if _is_candidate(data[sentence_no]) else 0)
        for sentence_no in range(len(data))
    }

# Apply the same token-count window (more than 5, fewer than 100) to every split
assistant_labeled_training_data, assistant_labeled_dev_data, assistant_labeled_test_data = (
    assisted_labelling(split_data, lower_threshold=5, upper_threshold=100)
    for split_data in (training_data, development_data, testing_data)
)

In [None]:
def reader(dataset, assisted_labels, initiative_dict):
    """
    Utility function splitting a structured dataset into the sentences handed to
    the model and the sentences already labelled by the assistant.
    params: dataset: dict {global_sentence_index : list of single-key dicts
                     (report_no, text, has_initiative, list_of_initiatives, ...)}
            assisted_labels: dict {global_sentence_index : assistant label}
            initiative_dict: dict {'<first initiative ID>_<report_no>' : list of
                             global sentence indices}
    returns: actual_data_dict: dict of parallel lists 'texts', 'labels',
                 'positions', 'ID_list', 'report_no' for sentences whose assistant
                 label is 1. Labels: has_initiative value for sentences without an
                 initiative or alone in theirs, 2 = first sentence of an
                 initiative, 3 = inside, 4 = last
             pre_labeled_dict: dict of parallel lists 'texts', 'labels',
                 'positions', 'ID_list' for all remaining sentences, labelled with
                 the assistant label
    """
    actual_data_dict = {'texts': [], 'labels': [], 'positions': [], 'ID_list': [], 'report_no': []}
    pre_labeled_dict = {'texts': [], 'labels': [], 'positions': [], 'ID_list': []}

    for sentence_no in range(len(dataset)):
        assistant_label = assisted_labels[sentence_no]
        entry = dataset[sentence_no]
        initiative_ids = entry[3]['list_of_initiatives']

        if assistant_label != 1:
            # Sentence is settled by the assistant and never reaches the model.
            pre_labeled_dict['texts'].append(entry[1]['text'])
            pre_labeled_dict['labels'].append(assistant_label)
            pre_labeled_dict['positions'].append(sentence_no)
            pre_labeled_dict['ID_list'].append(initiative_ids)
            continue

        report_number = entry[0]['report_no']
        if not initiative_ids:
            label = entry[2]['has_initiative']
        else:
            # Only the first initiative ID of a sentence determines its span.
            members = initiative_dict[initiative_ids[0] + '_' + str(report_number)]
            if len(members) == 1:
                label = entry[2]['has_initiative']
            else:
                rank = members.index(sentence_no)
                if rank == 0:
                    label = 2
                elif rank == len(members) - 1:
                    label = 4
                else:
                    label = 3

        actual_data_dict['report_no'].append(report_number)
        actual_data_dict['texts'].append(entry[1]['text'])
        actual_data_dict['labels'].append(label)
        actual_data_dict['positions'].append(sentence_no)
        actual_data_dict['ID_list'].append(initiative_ids)

    return actual_data_dict, pre_labeled_dict

In [None]:
def context_builder(data_dict, left_context_size = 0, right_context_size = 0):
    """ Utility function to build context around a target sentence.
    params: data_dict: dict with parallel lists under 'texts', 'labels' and 'report_no'
            left_context_size: number of preceding sentences to include
            right_context_size: number of following sentences to include
    returns: context: for every sentence, the texts inside its window (clipped
                      to the corpus) that share the sentence's report number
             multi_sentence_labels: for every sentence, the labels of those same
                      sentences, right-padded with 0 up to
                      1 + left_context_size + right_context_size entries
    """
    texts = data_dict['texts']
    total_sentences = len(texts)
    window_size = 1 + left_context_size + right_context_size

    context = []
    multi_sentence_labels = []
    for sentence_index in range(total_sentences):
        labels = data_dict['labels']
        report_nos = data_dict['report_no']
        # Clip the window on both sides at once. Handling the start and the end
        # of the corpus in separate branches left sentences whose window
        # overflowed on both sides with an empty context.
        window_start = max(0, sentence_index - left_context_size)
        window_end = min(total_sentences, sentence_index + right_context_size + 1)
        # Context never crosses a report boundary.
        same_report = [
            context_index
            for context_index in range(window_start, window_end)
            if report_nos[context_index] == report_nos[sentence_index]
        ]
        context.append([texts[context_index] for context_index in same_report])

        sentence_context_labels = [labels[context_index] for context_index in same_report]
        # Pad with 0 labels for sentences with a smaller context, e.g. at the
        # beginning/end of documents.
        # NOTE(review): padding is appended on the right, so when the left
        # context is clipped the target sentence no longer sits at index
        # left_context_size of its window - confirm that code indexing a fixed
        # target position accounts for this.
        sentence_context_labels.extend([0] * (window_size - len(sentence_context_labels)))
        multi_sentence_labels.append(sentence_context_labels)
    return context, multi_sentence_labels

In [None]:
class SustainableDataset(torch.utils.data.Dataset):
    """Dataset class inheriting from pytorch to be used by dataloaders.

    Every item pairs a target sentence and its label with the list of context
    sentences and the list of context labels built for it.
    """
    def __init__(self, tokenizer, input_set, input_context, input_multi_sentence_labels, max_paragraph_length, global_target_sentence_index):
        """
        params: tokenizer: callable used by custom_tokenizer to encode a batch of strings
                input_set: dict with parallel lists under 'texts', 'labels' and 'report_no'
                input_context: list with one list of context sentences per item
                input_multi_sentence_labels: list with one list of context labels per item
                max_paragraph_length: number of sentence slots every context is padded to
                global_target_sentence_index: slot of the target sentence inside a context
        """
        self.tokenizer = tokenizer
        self.texts = input_set['texts']
        self.labels = input_set['labels']
        self.report_nos = input_set['report_no']
        self.contexts = input_context
        self.context_labels = input_multi_sentence_labels
        self.max_paragraph_length = max_paragraph_length
        self.global_target_sentence_index = global_target_sentence_index

    def collate_fn(self, batch):
        """ Tokenize the contexts of a batch and attach the context labels and separator positions.
        params: batch: list of items as returned by __getitem__
        returns: the tokenizer output extended with 'labels' (tensor of all
                 context labels) and 'sep_positions' (list of lists of ints)
        """
        contexts = [b['context'] for b in batch]
        context_labels = [b['context_label'] for b in batch]
        encodings, sep_positions = self.custom_tokenizer(batch = contexts)
        encodings['labels'] = torch.tensor(context_labels) # pass through labels for all sentences
        encodings['sep_positions'] = sep_positions
        return encodings

    def custom_collate_fn(self, batch):
        """ Regroup a batch of items into lists without tokenizing anything.
        returns: dict with 'texts', 'labels', 'contexts' and 'context_labels'
        """
        texts = [b['text'] for b in batch]
        labels = [b['label'] for b in batch]
        contexts = [b['context'] for b in batch]
        context_labels = [b['context_label'] for b in batch]
        return {'texts':texts, 'labels':labels, 'contexts':contexts, 'context_labels':context_labels}

    def custom_tokenizer(self, batch):
        """ Utility function to tokenize a list of sentences using [SEP] at the beginning of each sentence with fixed positions.
        params: batch: list of contexts, each a list of sentence strings
        returns: encoded_batch: tokenizer output whose 'token_type_ids' are
                     replaced by a mask that is 1 on the target sentence segment
                 batch_sep_positions: per sequence, the separator positions
                     padded to max_paragraph_length + 1 entries
        """
        # Ask the tokenizer for its separator id; fall back to 102, the value
        # this notebook previously hard-coded, when it does not expose one.
        sep_token_id = getattr(self.tokenizer, 'sep_token_id', None)
        if sep_token_id is None:
            sep_token_id = 102

        # Each sentence is prefixed with the separator. (A previous .strip()
        # call discarded its result and is dropped.)
        batch_sequences = [
            ''.join('[SEP]' + sentence for sentence in sequence_list)
            for sequence_list in batch
        ]
        encoded_batch = self.tokenizer(batch_sequences, padding='longest', truncation=True, max_length=512, return_tensors='pt')

        batch_sep_positions = []
        batch_token_type_ids = []
        target = self.global_target_sentence_index
        for encoded_sequence in encoded_batch['input_ids']:
            sequence_length = len(encoded_sequence)
            sep_positions = [index for index in range(sequence_length) if encoded_sequence[index] == sep_token_id]
            while len(sep_positions) < self.max_paragraph_length + 1: # repeat last sep position to get full sequence
                sep_positions.append(sep_positions[-1])
            batch_sep_positions.append(sep_positions)
            if target in range(len(sep_positions) - 1): # check to see that the target sentence is actually part of the context
                # Mark everything from the target's separator up to and including the next separator.
                target_segment = range(sep_positions[target], sep_positions[target + 1] + 1)
                custom_token_type_ids = [1 if index in target_segment else 0 for index in range(sequence_length)]
            else:
                custom_token_type_ids = [1] * sequence_length
            batch_token_type_ids.append(custom_token_type_ids)

        encoded_batch['token_type_ids'] = torch.tensor(batch_token_type_ids)
        return encoded_batch, batch_sep_positions

    def __len__(self):
        """Number of target sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text, label, context and context labels stored at position idx."""
        item = {'text': self.texts[idx],
                'label': self.labels[idx],
                'context': self.contexts[idx],
                'context_label' : self.context_labels[idx],
                }
        return item

In [None]:
def get_sep_positions(data_loader, global_target_sentence_index, max_paragraph_length, tokenizer):
    """ Utility function to get sep positions for given dataset.
    params: data_loader: iterable of batches, each a dict with a 'contexts' list
                         of lists of sentence strings
            global_target_sentence_index: accepted for signature parity with the
                         other helpers; not used here
            max_paragraph_length: number of sentence slots per context
            tokenizer: callable encoding a list of strings into a mapping with 'input_ids'
    returns: list with, per sequence, its separator positions padded to
             max_paragraph_length + 1 entries and then stripped of the last one
    """
    # Ask the tokenizer for its separator id; fall back to 102, the value this
    # notebook previously hard-coded, when it does not expose one.
    sep_token_id = getattr(tokenizer, 'sep_token_id', None)
    if sep_token_id is None:
        sep_token_id = 102

    dataset_sep_positions = []
    for batch in data_loader:
        # Each sentence is prefixed with the separator. (A previous .strip()
        # call discarded its result and is dropped.)
        batch_sequences = [
            ''.join('[SEP]' + sentence for sentence in sequence_list)
            for sequence_list in batch['contexts']
        ]
        encoded_batch = tokenizer(batch_sequences, padding='longest', truncation=True, max_length=512, return_tensors='pt')
        for encoded_sequence in encoded_batch['input_ids']:
            sep_positions = [index for index in range(len(encoded_sequence)) if encoded_sequence[index] == sep_token_id]
            while len(sep_positions) < max_paragraph_length + 1: # repeat last sep position to get full sequence
                sep_positions.append(sep_positions[-1])
            # The closing separator does not start a sentence, so it is dropped.
            dataset_sep_positions.append(sep_positions[:-1])
    return dataset_sep_positions
  

In [None]:
# Create a dictionary with ID_reportNo as keys and values as list of sentence indices
def create_init_dict(data):
    """ Utility function grouping global sentence indices by the initiative they belong to.
    params: data: dict {global_sentence_index: list of single-key dicts}; index 0
                  holds 'report_no', index 2 'has_initiative' and index 3
                  'list_of_initiatives'
    returns: dict {'<first initiative ID>_<report_no>': list of global sentence
             indices, in ascending order}
    """
    initiative_dict = {}
    for sentence_no in range(len(data)):
        entry = data[sentence_no]
        if not entry[2]['has_initiative']:
            continue
        # Only the first initiative ID of a sentence is used for grouping.
        unique_reference = entry[3]['list_of_initiatives'][0] + '_' + str(entry[0]['report_no'])
        initiative_dict.setdefault(unique_reference, []).append(sentence_no)
    return initiative_dict

In [None]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# With the 2-left / 2-right windows built below, index 2 is the slot of the
# target sentence inside a full window.
global_target_sentence_index = 2 #target sentence index to be used throughout entire script

# Create gold standard initiative dictionaries
train_initiative_dict = create_init_dict(training_data)
dev_initiative_dict = create_init_dict(development_data)
test_initiative_dict = create_init_dict(testing_data)

# Read in training and dev data and split into data to feed into the model and pre-labeled data
train_data, pre_labeled_train_data = reader(training_data, assistant_labeled_training_data, train_initiative_dict)
dev_data, pre_labeled_dev_data = reader(development_data, assistant_labeled_dev_data, dev_initiative_dict)
test_data, pre_labeled_test_data = reader(testing_data, assistant_labeled_test_data, test_initiative_dict)

# Construct context around each sentence per dataset
train_context, train_multi_sentence_labels = context_builder(train_data, left_context_size = 2, right_context_size = 2) 
dev_context, dev_multi_sentence_labels = context_builder(dev_data, left_context_size = 2, right_context_size = 2) 
test_context, test_multi_sentence_labels = context_builder(test_data, left_context_size = 2, right_context_size = 2)

# Longest label sequence in the training split; context_builder pads every
# label list to 1 + left + right entries, so this is expected to be 5 here.
max_paragraph_length = max([len(label_sequence) for label_sequence in train_multi_sentence_labels])

# Only data to be fed into the model is built into datasets
# NOTE: `train_dataset` and `test_dataset` held the raw json lists returned by
# load_from_directory until this point and are rebound here, so the earlier
# filter_dataset cell cannot be re-run after this one without reloading.
train_dataset = SustainableDataset(tokenizer, train_data, train_context, train_multi_sentence_labels, max_paragraph_length = max_paragraph_length, global_target_sentence_index = global_target_sentence_index)
dev_dataset = SustainableDataset(tokenizer, dev_data, dev_context, dev_multi_sentence_labels, max_paragraph_length = max_paragraph_length, global_target_sentence_index = global_target_sentence_index)
test_dataset = SustainableDataset(tokenizer, test_data, test_context, test_multi_sentence_labels, max_paragraph_length = max_paragraph_length, global_target_sentence_index = global_target_sentence_index)

# Create train and dev dataloaders
# These loaders use custom_collate_fn, so their batches are plain python lists
# (texts, labels, contexts, context_labels) and nothing is tokenized yet.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False, collate_fn = train_dataset.custom_collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False, collate_fn = dev_dataset.custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn = test_dataset.custom_collate_fn)

# Record sep positions for dev dataset (to be used for early stopping during training)
dev_sep_positions = get_sep_positions(dev_loader, global_target_sentence_index = global_target_sentence_index, max_paragraph_length = max_paragraph_length, tokenizer=tokenizer)

In [None]:
# Unit tests for building context assuming a window of one sentence on each side
mock_train_context, mock_train_multi_sentence_labels = context_builder(train_data, left_context_size = 1, right_context_size = 1)
mock_dev_context, mock_dev_multi_sentence_labels = context_builder(dev_data, left_context_size = 1, right_context_size = 1)
mock_train_dataset = SustainableDataset(tokenizer, train_data, mock_train_context, mock_train_multi_sentence_labels, max_paragraph_length = max_paragraph_length, global_target_sentence_index = global_target_sentence_index)
mock_dev_dataset = SustainableDataset(tokenizer, dev_data, mock_dev_context, mock_dev_multi_sentence_labels, max_paragraph_length = max_paragraph_length, global_target_sentence_index = global_target_sentence_index)
assert mock_train_dataset.contexts[11] == [mock_train_dataset.texts[10], mock_train_dataset.texts[11], mock_train_dataset.texts[12]]  #random corpus context
assert len(mock_train_dataset.contexts) == len(mock_train_dataset.texts) #there is a context for every target sentence in the train set
# Compare against the mock dataset itself rather than the unrelated dev_dataset object
assert mock_dev_dataset.contexts[11] == [mock_dev_dataset.texts[10], mock_dev_dataset.texts[11], mock_dev_dataset.texts[12]] #random corpus context
assert len(mock_dev_dataset.contexts) == len(mock_dev_dataset.texts) #there is a context for every target sentence in the dev set
assert [mock_train_dataset.texts[0], mock_train_dataset.texts[1]] == mock_train_dataset.contexts[0] #the context for the first sentence in the corpus is only itself and the following sentence
assert [mock_train_dataset.texts[0], mock_train_dataset.texts[1] , mock_train_dataset.texts[2]] ==  mock_train_dataset.contexts[1] #the second sentence in the corpus already has a full window
assert [mock_train_dataset.texts[-2] , mock_train_dataset.texts[-1]] == mock_train_dataset.contexts[-1] #the context for the last sentence in the corpus is only itself and the preceding sentence
assert [mock_train_dataset.texts[mock_train_dataset.report_nos.index(1)] , mock_train_dataset.texts[mock_train_dataset.report_nos.index(1)+1]]== mock_train_dataset.contexts[mock_train_dataset.report_nos.index(1)] # first sentence of second report should have a context of only itself and its following sentence
assert [mock_train_dataset.texts[mock_train_dataset.report_nos.index(1)-2], mock_train_dataset.texts[mock_train_dataset.report_nos.index(1)-1]] == mock_train_dataset.contexts[mock_train_dataset.report_nos.index(1)-1] # last sentence of first report should have a context of only itself and its preceding sentence

In [None]:
# Unit tests -> check if assisted data labelling has been performed correctly
# No sentence position may appear in both the model data and the pre-labeled data
assert len(set(pre_labeled_train_data['positions']).intersection(set(train_data['positions']))) == 0 
# Together the two parts account for every training sentence ...
assert (len(train_data['texts']) + len(pre_labeled_train_data['texts'])) == len(training_data)
# ... and their positions cover exactly the indices 0 .. len(training_data) - 1
assert set(pre_labeled_train_data['positions']).union(set(train_data['positions'])) == set(range(len(training_data)))
# Same size check for the development split
assert (len(dev_data['texts']) + len(pre_labeled_dev_data['texts'])) == len(development_data)

In [None]:
class Sustainable_BERT(BertPreTrainedModel):
    """ Transformer model class with custom output layer for fine-tuning.

    A BERT encoder followed by dropout and a linear layer that maps the hidden
    state of every token to 5 scores.
    """
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        # Token-level head: dropout, then hidden_size -> 5 scores per token.
        self.projection = torch.nn.Sequential(torch.nn.Dropout(0.1), torch.nn.Linear(config.hidden_size, 5))
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        """ Run the encoder and project every token's hidden state.
        params: all arguments except `labels` are forwarded unchanged to the
                BERT encoder; `labels` is accepted but not used here
        returns: logits: output of the projection head applied to the encoder's
                 last hidden state, for every token position
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index instead of using `.last_hidden_state`: with return_dict=False the
        # encoder returns a plain tuple, which has no such attribute. The last
        # hidden state is the first element in both output forms.
        logits = self.projection(outputs[0])
        return logits

In [None]:
class Trainer_Sustainable(Trainer):
    """ Class inheriting from Trainer to configure loss function used.

    The loss is the negated output of `crf_model` evaluated on the model
    outputs gathered at the separator position of every sentence slot.
    """

    def __init__(self,
        crf_model,
        model = None,
        args = None,
        data_collator = None,
        train_dataset = None,
        eval_dataset = None,
        tokenizer = None,
        model_init = None,
        compute_metrics = None,
        callbacks = None,
        optimizers = (None, None),        
        ):
        """
        params: crf_model: module called as crf_model(emissions, tags) inside compute_loss
                all other arguments are forwarded unchanged to Trainer
        """
        # Forward by keyword: the base signature is a long list of optional
        # parameters, so positional forwarding would silently mis-bind them if
        # the base class ever orders them differently.
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
        )
        self.crf_model = crf_model

    def compute_loss(self, model, inputs, global_target_sentence_index = global_target_sentence_index, max_paragraph_length = max_paragraph_length, return_outputs=False):
        """ Compute the CRF loss of one batch.
        params: model: called as model(**inputs) once 'labels' and
                       'sep_positions' have been removed from inputs
                inputs: batch mapping containing 'labels' and 'sep_positions'
                global_target_sentence_index: not used in this method
                max_paragraph_length: number of sentence slots per sequence
                return_outputs: when true, also return (loss, outputs)
        returns: loss, or (loss, (loss, outputs)) when return_outputs is true
        """
        labels = inputs.pop('labels')
        sep_positions = inputs.pop('sep_positions') # take all sep positions
        outputs = model(**inputs)

        # For every sequence gather the output rows at its separator positions.
        batch_preds = []
        for i, sentence_sep_positions in zip(range(outputs.shape[0]), sep_positions):
            sentence_preds = [outputs[i, j] for j in sentence_sep_positions]
            batch_preds.append(torch.cat(sentence_preds[:-1])) #ignore last sep token as we don't predict from it

        # Reshape to (batch, slots, 5) and move the slot axis first, as done for the labels below.
        preds = torch.cat(batch_preds).reshape((-1, max_paragraph_length, 5)).permute(1,0,2)
        labels = labels.permute(1,0)

        loss = -1 * self.crf_model(preds, labels)

        if return_outputs:
            return (loss, (loss, outputs))
        return loss

In [None]:
def model_predict(model, tokenizer, dataloader, device, global_target_sentence_index, max_paragraph_length, crf_model):
    """ Utility function to move the model to the given device and run inference over a dataloader.
    params: model: called as model(**encoded_batch); must support .to() and .eval()
            tokenizer: callable encoding a list of strings; its result must
                       support .to(device) and item access/assignment
            dataloader: iterable of batches, each a dict with a 'contexts' list
                        of lists of sentence strings
            device: device the model and the encoded batches are moved to
            global_target_sentence_index: slot of the target sentence inside a context
            max_paragraph_length: number of sentence slots per context
            crf_model: object whose decode(emissions) returns one tag sequence per batch element
    returns: predictions: list with the decoded tag of the target slot for every sequence
    """
    # Ask the tokenizer for its separator id; fall back to 102, the value this
    # notebook previously hard-coded, when it does not expose one.
    sep_token_id = getattr(tokenizer, 'sep_token_id', None)
    if sep_token_id is None:
        sep_token_id = 102

    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            # Each sentence is prefixed with the separator. (A previous .strip()
            # call discarded its result and is dropped.)
            batch_sequences = [
                ''.join('[SEP]' + sentence for sentence in sequence_list)
                for sequence_list in batch['contexts']
            ]
            encoded_batch = tokenizer(batch_sequences, padding='longest', truncation=True, max_length=512, return_tensors='pt').to(device)

            batch_sep_positions = []
            batch_token_type_ids = []
            for encoded_sequence in encoded_batch['input_ids']:
                sequence_length = len(encoded_sequence)
                sep_positions = [index for index in range(sequence_length) if encoded_sequence[index] == sep_token_id]
                while len(sep_positions) < max_paragraph_length + 1: # repeat last sep position to get full sequence
                    sep_positions.append(sep_positions[-1])
                batch_sep_positions.append(sep_positions)
                if global_target_sentence_index in range(len(sep_positions)-1): # check to see that the target sentence is actually part of the context
                    # Mark everything from the target's separator up to and including the next separator.
                    target_segment = range(sep_positions[global_target_sentence_index], sep_positions[global_target_sentence_index+1]+1)
                    custom_token_type_ids = [1 if index in target_segment else 0 for index in range(sequence_length)]
                else:
                    custom_token_type_ids = [1] * sequence_length
                batch_token_type_ids.append(custom_token_type_ids)

            encoded_batch['token_type_ids'] = torch.tensor(batch_token_type_ids).to(device)
            output = model(**encoded_batch)

            # For every sequence gather the output rows at its separator positions.
            batch_preds = []
            for i, sentence_sep_positions in zip(range(output.shape[0]), batch_sep_positions):
                sentence_preds = [output[i, j] for j in sentence_sep_positions]
                batch_preds.append(torch.cat(sentence_preds[:-1])) #ignore last sep token as we don't predict from it

            preds = torch.cat(batch_preds).reshape((-1, max_paragraph_length, 5)).permute(1,0,2)
            predicted_tags = [tag[global_target_sentence_index] for tag in crf_model.decode(preds)]
            predictions.extend(predicted_tags)
    return predictions


In [21]:
# Instantiate and train model
# Load the pretrained encoder weights into the custom token-level model and move it to the selected device
model = Sustainable_BERT.from_pretrained('bert-base-uncased').to(device) 

# Training hyper-parameters shared by the optimizer, the scheduler and the training arguments below
total_epochs = 10   
learning_rate = 1e-5 

# Create evaluation metric F1 score 
metric = load_metric('f1')

def compute_metrics(eval_pred, sep_positions = dev_sep_positions, global_target_sentence_index = global_target_sentence_index):
    """ Decode the target-sentence tag of every evaluation example and score it with macro F1.
    params: eval_pred: pair (raw_predictions, raw_labels); raw_predictions is
                       indexed as [example, token position]
            sep_positions: per example, the separator positions to read
                           predictions from (defaults to the dev set positions)
            global_target_sentence_index: slot of the target sentence inside a context
    returns: result of metric.compute with average='macro'
    Relies on the module-level names `trainer`, `metric`, `device` and
    `max_paragraph_length`, and writes the CRF model to disk on every call.
    """
    raw_predictions, raw_labels = eval_pred 
    # Gold label of the target slot of every example
    pooled_labels = [label[global_target_sentence_index] for label in raw_labels]
    pooled_predictions = []
    for i, sentence_sep_positions in zip(range(len(raw_predictions)), sep_positions):
        # Gather the scores at every separator position of this example
        sentence_preds = []
        for j in sentence_sep_positions:
            sentence_preds.append(torch.tensor(raw_predictions[i,j]))
        # Shape the scores as a batch of one sequence before decoding
        pred = torch.cat(sentence_preds).reshape((-1, max_paragraph_length, 5)).permute(1,0,2).to(device)
        predicted_tag = trainer.crf_model.decode(pred)[0][global_target_sentence_index] 
        pooled_predictions.append(predicted_tag)   
    torch.save(trainer.crf_model, f='crf_model_IOBES_5seq_paper.pt')  # save trained crf model to use at inference time
    return metric.compute(predictions=pooled_predictions, references=pooled_labels, average = 'macro')

# Define optimizer and lr schedule
# NOTE(review): only `model`'s parameters are handed to the optimizer; the CRF
# passed to the trainer below is not among them, so its parameters appear to
# keep their initial values throughout training - confirm whether that is intended.
optimizer = transformers.AdamW(model.parameters(),
                  lr = learning_rate, 
                  )

total_steps = len(train_loader) * total_epochs 
warmup = 0.06 * total_steps
 
# Create the learning rate scheduler.
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup, 
                                            num_training_steps = total_steps)

# Create training arguments
training_args = TrainingArguments(
    output_dir='./bert_sustainable_CRF_5seq_paper',
    save_total_limit = 2,
    learning_rate = learning_rate, 
    logging_strategy = 'epoch',
    per_device_train_batch_size=8, 
    num_train_epochs = total_epochs, 
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    do_eval = True,
    evaluation_strategy = 'epoch',
    metric_for_best_model = 'f1',
    # Must be a whole number of steps: a fractional value is not a valid step
    # count and prevents the periodic accumulation from triggering as intended.
    eval_accumulation_steps = max(1, int(0.1 * len(dev_loader))),
    gradient_accumulation_steps = 2, # effective training batch size of 16
    )

# Define trainer module
trainer = Trainer_Sustainable(
    model=model,                         
    args=training_args,                 
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,                   
    data_collator=train_dataset.collate_fn,
    # NOTE(review): the early stopping threshold is negative - confirm this is deliberate.
    callbacks =[transformers.EarlyStoppingCallback(early_stopping_patience = 5, early_stopping_threshold=-0.03)],
    compute_metrics = compute_metrics,
    optimizers = (optimizer, scheduler),
    crf_model = CRF(num_tags=5).to(device),
    )

trainer.train()

trainer.save_model('./final_bert_sustainable_CRF_5seq_paper')


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing Sustainable_BERT: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing Sustainable_BERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Sustainable_BERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Sustainable_BERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['proj

Downloading:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

***** Running training *****
  Num examples = 36920
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 23070
  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Epoch,Training Loss,Validation Loss,F1
0,17.8463,14.524508,0.192279
1,11.9857,17.910591,0.20963
2,9.5975,18.553713,0.257413
3,8.1373,19.403378,0.26965
4,7.187,21.352606,0.261746
5,6.2958,21.9375,0.271273
6,5.2382,22.279305,0.324231
7,4.4576,24.008759,0.3093
8,3.9224,25.032305,0.31056
9,3.5823,25.234215,0.31954


***** Running Evaluation *****
  Num examples = 20402
  Batch size = 8
Saving model checkpoint to ./bert_sustainable_CRF_5seq_paper/checkpoint-2307
Configuration saved in ./bert_sustainable_CRF_5seq_paper/checkpoint-2307/config.json
Model weights saved in ./bert_sustainable_CRF_5seq_paper/checkpoint-2307/pytorch_model.bin
Deleting older checkpoint [bert_sustainable_CRF_5seq_paper/checkpoint-4614] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20402
  Batch size = 8
Saving model checkpoint to ./bert_sustainable_CRF_5seq_paper/checkpoint-4614
Configuration saved in ./bert_sustainable_CRF_5seq_paper/checkpoint-4614/config.json
Model weights saved in ./bert_sustainable_CRF_5seq_paper/checkpoint-4614/pytorch_model.bin
Deleting older checkpoint [bert_sustainable_CRF_5seq_paper/checkpoint-6921] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20402
  Batch size = 8
Saving model checkpoint to ./bert_sustainable_CRF_5seq_paper/checkpoint

In [22]:
def reconcile_mapping(model_data, pre_labeled_data, model_predictions):
    """Merge model predictions with pre-labeled sentences into one position-ordered view.

    params: model_data: dict with parallel 'texts' and 'positions' sequences for
                        the sentences that were sent through the model
            pre_labeled_data: dict with parallel 'texts', 'positions' and 'labels'
                        sequences for the sentences that were labeled up front
            model_predictions: sequence of predictions, aligned with model_data
    returns: pred_mapping: dict {position: (text, label)} in ascending position order
             predictions: list of the labels of pred_mapping, in the same order
    """
    merged = {}
    for sentence, position, predicted_label in zip(model_data['texts'], model_data['positions'], model_predictions):
        merged[position] = (sentence, predicted_label)

    # Written second so that a pre-assigned label wins whenever a position
    # appears in both sources.
    for sentence, position, fixed_label in zip(pre_labeled_data['texts'], pre_labeled_data['positions'], pre_labeled_data['labels']):
        merged[position] = (sentence, fixed_label)

    pred_mapping = {position: merged[position] for position in sorted(merged)}
    predictions = [label for _, label in pred_mapping.values()]

    return pred_mapping, predictions

In [23]:
# Create predictions dictionary spanning initiatives
def _complete_span_length(predictions, start, max_span_length):
    """Return the length of the Beginning-Inside*-End span that opens at `start`, or 1 if it is incomplete."""
    # Last index that may still carry the End label: limited both by the maximum
    # span length and by the end of the prediction list (no out-of-range reads).
    last_allowed = min(start + max_span_length, len(predictions)) - 1
    end = start + 1
    while end < last_allowed and predictions[end] == 3:
        end += 1
    if end <= last_allowed and predictions[end] == 4:
        return end - start + 1
    return 1


def sentence_to_initiative_aggregation(predictions, predictions_report_numbers, max_span_length=5):
    """ Utility function which takes in a list of IOBES predictions per sentence and aggregates these into a dictionary of initiatives.

    Label encoding: 0 = no initiative, 1 = singleton, 2 = beginning, 3 = inside, 4 = end.
    A beginning label opens a multi-sentence initiative only when it is followed by
    zero or more inside labels and then an end label, all within `max_span_length`
    sentences. Every other non-zero label (including a beginning label whose
    structure is incomplete or cut off by the end of the list) becomes a
    one-sentence initiative of its own.

    params: predictions: list of multi-class predictions, one per sentence
            predictions_report_numbers: report number of each sentence, aligned with
                predictions; only the value at the first sentence of a span is used,
                to build the dictionary key
            max_span_length: maximum number of sentences in one aggregated initiative
                (default 5)
    returns: predictions_dict: {initiative_number_report_number:list of sentence positions}
    """
    predictions_dict = {}
    initiative_index = 0
    prediction_index = 0
    while prediction_index < len(predictions):
        label = predictions[prediction_index]
        if label == 0:  # no initiative
            prediction_index += 1
            continue

        if label == 2:  # beginning of initiative: look for a complete B-I*-E structure
            span_length = _complete_span_length(predictions, prediction_index, max_span_length)
        else:  # singletons and stray inside/end labels are kept as one-sentence initiatives
            span_length = 1

        initiative_key = str(initiative_index) + '_' + str(predictions_report_numbers[prediction_index])
        predictions_dict[initiative_key] = list(range(prediction_index, prediction_index + span_length))
        initiative_index += 1
        prediction_index += span_length

    return predictions_dict

In [24]:
class Initiative_Evaluation():
    """ Class used to evaluate what % of initiatives have been correctly identified.

    Both constructor arguments map an identifier to the list of sentence positions
    that make up one span: `initiative_dict` holds the gold initiatives and
    `predictions_dict` the predicted ones. A gold initiative and a prediction are
    compared through two ratios, overlap / len(gold span) and
    overlap / len(predicted span), and three criteria are scored:
        fully correct: both ratios equal 1
        half correct:  both ratios are at least 0.5
        min correct:   both ratios are above 0 (any overlap)
    For each criterion an initiative is credited with at most one prediction
    (the first qualifying one in dictionary order) and a prediction is credited
    to at most one initiative.
    """

    # Acceptance tests for the two overlap ratios, ordered fully / half / min.
    _OVERLAP_CRITERIA = (
        lambda ratio: ratio == 1,
        lambda ratio: ratio >= 0.5,
        lambda ratio: ratio > 0,
    )

    def __init__(self, initiative_dict, predictions_dict):
        self.initiative_dict = initiative_dict
        self.predictions_dict = predictions_dict
        self.no_initiatives = len(self.initiative_dict)

    def evaluate(self):
        """ Score the predictions against the gold initiatives.

        returns: when both dictionaries are empty, a 6-tuple of ones
                 (fully/half/min proportion, fully/half/min F1);
                 otherwise a 12-tuple: the same six values followed by
                 fully precision, fully recall, half precision, half recall,
                 min precision, min recall.
                 Ratios whose denominator is zero (no gold initiatives or no
                 predictions) are reported as 0 instead of raising.
        """
        if len(self.initiative_dict) == len(self.predictions_dict) == 0:
            # Nothing to find and nothing predicted: treated as a perfect score.
            return 1, 1, 1, 1, 1, 1

        fully_correct_TP, half_correct_TP, min_correct_TP = self._count_true_positives()

        fully_correct_F1, fully_correct_precision, fully_correct_recall = self._score(fully_correct_TP)
        half_correct_F1, half_correct_precision, half_correct_recall = self._score(half_correct_TP)
        min_correct_F1, min_correct_precision, min_correct_recall = self._score(min_correct_TP)

        return (
            self._proportion(fully_correct_TP),
            self._proportion(half_correct_TP),
            self._proportion(min_correct_TP),
            fully_correct_F1,
            half_correct_F1,
            min_correct_F1,
            fully_correct_precision,
            fully_correct_recall,
            half_correct_precision,
            half_correct_recall,
            min_correct_precision,
            min_correct_recall,
        )

    def _count_true_positives(self):
        """ Count the credited initiatives per criterion, returned in fully / half / min order."""
        criteria = self._OVERLAP_CRITERIA
        true_positives = [0] * len(criteria)
        # Predictions already credited to an earlier initiative, per criterion,
        # so that one prediction is never counted twice.
        credited_prediction_IDs = [set() for _ in criteria]

        for initiative_positions_list in self.initiative_dict.values():
            initiative_positions = set(initiative_positions_list)
            initiative_matched = [False] * len(criteria)
            for prediction_ID, prediction_positions_list in self.predictions_dict.items():
                overlap = len(initiative_positions.intersection(prediction_positions_list))
                if overlap == 0:  # spans do not touch
                    continue
                initiative_coverage = overlap / len(initiative_positions_list)
                prediction_coverage = overlap / len(prediction_positions_list)
                for level, is_met in enumerate(criteria):
                    if initiative_matched[level] or prediction_ID in credited_prediction_IDs[level]:
                        continue
                    if is_met(initiative_coverage) and is_met(prediction_coverage):
                        initiative_matched[level] = True
                        credited_prediction_IDs[level].add(prediction_ID)
                        true_positives[level] += 1

        return true_positives

    def _score(self, TP):
        """ Return (F1, precision, recall) for a true-positive count."""
        FN, FP = self.compute_FN_FP(TP)
        return self.compute_F1(TP, FP, FN)

    def _proportion(self, TP):
        """ Share of gold initiatives that were credited; 0 when there are no gold initiatives."""
        return TP / self.no_initiatives if self.no_initiatives else 0

    def compute_F1(self, TP, FP, FN):
        """ Utility method to compute F1 score
        returns: (F1, precision, recall); precision or recall is 0 when its
                 denominator is 0, and F1 is 0 when both are 0
        """
        precision = TP / (TP + FP) if (TP + FP) else 0
        recall = TP / (TP + FN) if (TP + FN) else 0
        if precision == recall == 0:
            F1 = 0
        else:
            F1 = 2 * precision * recall /(precision + recall)
        return F1, precision, recall

    def compute_FN_FP(self, TP):
        """ Utility method to compute FN and FP initiatives given the no of TP
        (defined as the set intersection between gold initiative span and prediction span)
        """
        FN = len(self.initiative_dict) - TP
        FP = len(self.predictions_dict) - TP
        return FN, FP

In [25]:
# Unit tests for Initiative_Evaluation Class
# Each case builds a gold dictionary and a prediction dictionary
# ({id: [sentence positions]}), runs evaluate() and checks the three
# proportions (strict / medium / lenient) and the three F1 scores.

# Case 1: the first initiative is predicted exactly; the second is split into
# two one-sentence predictions, each covering half of it. Only one of the two
# halves is credited, so strict TP = 1 and medium/lenient TP = 2 out of 3 predictions.
mock_initiative_dict_1 = {1:[1,2], 2:[3,4]}
mock_predictions_dict_1 = {1: [1,2], 2:[3], 3:[4]}
mock_evaluation_1 = Initiative_Evaluation(mock_initiative_dict_1, mock_predictions_dict_1)
mock_init_strict_accuracy_1, mock_init_medium_accuracy_1, mock_init_lenient_accuracy_1, mock1_fully_correct_F1, mock1_half_correct_F1, mock1_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_1.evaluate()
assert mock_init_strict_accuracy_1 == 0.5
assert mock_init_medium_accuracy_1 == 1
assert mock_init_lenient_accuracy_1 == 1
assert mock1_fully_correct_F1 == 0.4
assert mock1_half_correct_F1 == mock1_min_correct_F1 == 0.8


# Case 2: every initiative is predicted exactly (spans separated by a gap).
mock_initiative_dict_2 = {1:[1,2], 2:[4,5,6]}
mock_predictions_dict_2 = {1: [1,2], 2:[4,5,6]}
mock_evaluation_2 = Initiative_Evaluation(mock_initiative_dict_2, mock_predictions_dict_2)
mock_init_strict_accuracy_2, mock_init_medium_accuracy_2, mock_init_lenient_accuracy_2, mock2_fully_correct_F1, mock2_half_correct_F1, mock2_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_2.evaluate()
assert mock_init_strict_accuracy_2 == 1
assert mock_init_medium_accuracy_2 == 1
assert mock_init_lenient_accuracy_2 == 1
assert mock2_fully_correct_F1 == mock2_half_correct_F1 == mock2_min_correct_F1 == 1

# Case 3: every initiative is predicted exactly (adjacent spans).
mock_initiative_dict_3 = {1:[1,2], 2:[3,4]}
mock_predictions_dict_3 = {1: [1,2], 2:[3,4]}
mock_evaluation_3 = Initiative_Evaluation(mock_initiative_dict_3, mock_predictions_dict_3)
mock_init_strict_accuracy_3, mock_init_medium_accuracy_3, mock_init_lenient_accuracy_3, mock3_fully_correct_F1, mock3_half_correct_F1, mock3_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_3.evaluate()
assert mock_init_strict_accuracy_3 == 1
assert mock_init_medium_accuracy_3 == 1
assert mock_init_lenient_accuracy_3 == 1
assert mock3_fully_correct_F1 == mock3_half_correct_F1 == mock3_min_correct_F1 == 1

# Case 4: no gold initiatives and no predictions. In this case evaluate()
# returns only six values (no precision/recall), all equal to 1.
mock_initiative_dict_4 = {}
mock_predictions_dict_4 = {}
mock_evaluation_4 = Initiative_Evaluation(mock_initiative_dict_4, mock_predictions_dict_4)
mock_init_strict_accuracy_4, mock_init_medium_accuracy_4, mock_init_lenient_accuracy_4, mock4_fully_correct_F1, mock4_half_correct_F1, mock4_min_correct_F1 = mock_evaluation_4.evaluate()
assert mock_init_strict_accuracy_4 == 1
assert mock_init_medium_accuracy_4 == 1
assert mock_init_lenient_accuracy_4 == 1
assert mock4_fully_correct_F1 == mock4_half_correct_F1 == mock4_min_correct_F1 == 1

# Case 5: only one-sentence predictions. [1] covers half of [1,2] (medium and
# lenient hit), [3] covers a third of [3,4,5] (lenient hit only) and [6] is
# never predicted, so medium TP = 1 and lenient TP = 2 out of 5 predictions.
mock_initiative_dict_5 = {1:[1,2], 2:[3,4,5], 3:[6]}
mock_predictions_dict_5 = {1:[1], 2:[2], 3:[3], 4:[4], 5:[5]}
mock_evaluation_5 = Initiative_Evaluation(mock_initiative_dict_5, mock_predictions_dict_5)
mock_init_strict_accuracy_5, mock_init_medium_accuracy_5, mock_init_lenient_accuracy_5, mock5_fully_correct_F1, mock5_half_correct_F1, mock5_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_5.evaluate()
assert mock_init_strict_accuracy_5 == 0
assert mock_init_medium_accuracy_5 == 1/3
assert mock_init_lenient_accuracy_5 == 2/3
assert mock5_fully_correct_F1 == 0
assert mock5_half_correct_F1 == 0.25
assert mock5_min_correct_F1 == 0.5

# Case 6: a single prediction spanning all six sentences. A prediction is
# credited to at most one initiative per criterion, so the lenient count is 1
# (first initiative) and the medium count is 1 (second initiative, which is
# the only one making up at least half of the prediction).
mock_initiative_dict_6 = {1:[1,2], 2:[3,4,5], 3:[6]}
mock_predictions_dict_6 = {1:[1,2,3,4,5,6]}
mock_evaluation_6 = Initiative_Evaluation(mock_initiative_dict_6, mock_predictions_dict_6)
mock_init_strict_accuracy_6, mock_init_medium_accuracy_6, mock_init_lenient_accuracy_6, mock6_fully_correct_F1, mock6_half_correct_F1, mock6_min_correct_F1,_,_,_,_,_,_ = mock_evaluation_6.evaluate()
assert mock_init_strict_accuracy_6 == 0
assert mock_init_medium_accuracy_6 == 1/3
assert mock_init_lenient_accuracy_6 == 1/3
assert mock6_fully_correct_F1 == 0
assert mock6_half_correct_F1 == mock6_min_correct_F1 == 0.5

In [26]:
# Perform context predictions on dev dataset.
# The timer covers loading both saved models as well as the prediction pass.
start_time = time.time()
sustainable_model = Sustainable_BERT.from_pretrained(
    './final_bert_sustainable_CRF_5seq_paper/'
)
# NOTE(review): torch.load unpickles the file and is called without
# map_location; only load checkpoints from a trusted source, and confirm the
# file can be loaded on the current device.
loaded_crf_model = torch.load(
    'crf_model_IOBES_5seq_paper.pt'
)
dev_predictions_list = model_predict(
    sustainable_model,
    tokenizer,
    dev_loader,
    device,
    global_target_sentence_index = global_target_sentence_index,
    max_paragraph_length = max_paragraph_length,
    crf_model = loaded_crf_model,
)
end_time = time.time()
print(f'Predicting results on dev set took {end_time-start_time} seconds')

# Reconcile predictions on the dev set: combine the model output with the
# pre-labeled sentences into one position-ordered list of labels.
dev_pred_mapping, dev_predictions = reconcile_mapping(
    dev_data,
    pre_labeled_dev_data,
    dev_predictions_list,
)

loading configuration file ./final_bert_sustainable_CRF_5seq_paper/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "Sustainable_BERT"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./final_bert_sustainable_CRF_5seq_paper/pytorch_model.bin
All model checkpoint weights were used when initializing Sustainable_BERT.

All the weights of Sustainable_BERT were initialized from the model checkpoi

Predicting results on dev set took 275.13120007514954 seconds


In [27]:
# Extract ground truth dev data labels
# (0 = no initiative, 1 = singleton, 2 = beginning, 3 = inside, 4 = end)
dev_label_values = []
dev_report_numbers = []
for sent_no, dev_sentence in enumerate(development_data):
    dev_sentence_report_no = dev_sentence[0]['report_no']
    dev_report_numbers.append(dev_sentence_report_no)

    # Sentence positions of the initiative this sentence belongs to, if any.
    dev_initiative_sentences = None
    dev_sentence_initiatives = dev_sentence[3]['list_of_initiatives']
    if dev_sentence_initiatives:
        initiative_unique_reference = dev_sentence_initiatives[0] + '_' + str(dev_sentence_report_no)
        dev_initiative_sentences = dev_initiative_dict[initiative_unique_reference]

    if dev_initiative_sentences is None or len(dev_initiative_sentences) == 1:
        # 1 for singletons, 0 for non-initiative sentences
        dev_sentence_label = dev_sentence[2]['has_initiative']
    else:
        dev_position_in_initiative = dev_initiative_sentences.index(sent_no)
        if dev_position_in_initiative == 0:
            dev_sentence_label = 2  # beginning of initiative
        elif dev_position_in_initiative == len(dev_initiative_sentences) - 1:
            dev_sentence_label = 4  # end of initiative
        else:
            dev_sentence_label = 3  # inside an initiative
    dev_label_values.append(dev_sentence_label)


target_names = ['No Initiative', 'Singleton', 'Beginning', 'Inside', 'End']
print(f'Classification Report on the Development Dataset \n')
print(
    classification_report(
        dev_label_values,
        np.array(dev_predictions),
        target_names = target_names,
        digits=4,
    )
)

Classification Report on the Development Dataset 

               precision    recall  f1-score   support

No Initiative     0.9787    0.9904    0.9845     53854
    Singleton     0.3487    0.1349    0.1946       504
    Beginning     0.2440    0.2100    0.2257       481
       Inside     0.1884    0.0419    0.0686       310
          End     0.1769    0.1684    0.1725       481

     accuracy                         0.9635     55630
    macro avg     0.3873    0.3091    0.3292     55630
 weighted avg     0.9554    0.9635    0.9587     55630



In [28]:
# Collapse the five IOBES classes into initiative (1) vs. no initiative (0).
dev_binary_predictions = [int(bool(prediction)) for prediction in dev_predictions]
dev_label_binary_values = [int(bool(value)) for value in dev_label_values]
print(f'Binary Classification Report on the Development Dataset \n')
target_names = ['No Initiative', 'Initiative']
print(
    classification_report(
        dev_label_binary_values,
        np.array(dev_binary_predictions),
        target_names = target_names,
        digits=4,
    )
)

Binary Classification Report on the Development Dataset 

               precision    recall  f1-score   support

No Initiative     0.9787    0.9904    0.9845     53854
   Initiative     0.5440    0.3480    0.4245      1776

     accuracy                         0.9699     55630
    macro avg     0.7614    0.6692    0.7045     55630
 weighted avg     0.9649    0.9699    0.9667     55630



In [29]:
# Aggregate sentence-level dev predictions into initiatives and score them
# against the gold dev initiatives.
dev_predictions_dict = sentence_to_initiative_aggregation(dev_predictions, dev_report_numbers)
dev_init_evaluation = Initiative_Evaluation(dev_initiative_dict, dev_predictions_dict)
(
    dev_init_strict_accuracy,
    dev_init_medium_accuracy,
    dev_init_lenient_accuracy,
    dev_strict_F1,
    dev_medium_F1,
    dev_lenient_F1,
    dev_strict_precision,
    dev_strict_recall,
    dev_medium_precision,
    dev_medium_recall,
    dev_lenient_precision,
    dev_lenient_recall,
) = dev_init_evaluation.evaluate()

# Share of gold initiatives found, from most lenient to strictest criterion
print(f'Percentage of correctly predicted initiatives where at least 1 sentence is identified is {dev_init_lenient_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where more than 50% of sentences are identified is {dev_init_medium_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where 100% of sentences are identified is {dev_init_strict_accuracy:.2%} \n')

# Lenient criterion
print(f'F1 score where at least 1 sentence is identified is {dev_lenient_F1:.2%} \n')
print(f'Precision score where at least 1 sentence is identified is {dev_lenient_precision:.2%} \n')
print(f'Recall score where at least 1 sentence is identified is {dev_lenient_recall:.2%} \n')

# Medium criterion
print(f'F1 score where 50% of sentences are identified is {dev_medium_F1:.2%} \n')
print(f'Precision score where 50% of sentences are identified is {dev_medium_precision:.2%} \n')
print(f'Recal score where 50% of sentences are identified is {dev_medium_recall:.2%} \n')

# Strict criterion
print(f'F1 score where 100% of sentences are identified is {dev_strict_F1:.2%} \n')
print(f'Precision score where 100% of sentences are identified is {dev_strict_precision:.2%} \n')
print(f'Recall score where 100% of sentences are identified is {dev_strict_recall:.2%} \n')

Percentage of correctly predicted initiatives where at least 1 sentence is identified is 43.35% 

Percentage of correctly predicted initiatives where more than 50% of sentences are identified is 36.85% 

Percentage of correctly predicted initiatives where 100% of sentences are identified is 20.10% 

F1 score where at least 1 sentence is identified is 43.73% 

Precision score where at least 1 sentence is identified is 44.11% 

Recall score where at least 1 sentence is identified is 43.35% 

F1 score where 50% of sentences are identified is 37.17% 

Precision score where 50% of sentences are identified is 37.50% 

Recal score where 50% of sentences are identified is 36.85% 

F1 score where 100% of sentences are identified is 20.28% 

Precision score where 100% of sentences are identified is 20.45% 

Recall score where 100% of sentences are identified is 20.10% 



In [30]:
# Perform predictions on test dataset, reusing the models loaded for the dev set
start_time = time.time()
test_predictions_list = model_predict(
    sustainable_model,
    tokenizer,
    test_loader,
    device,
    global_target_sentence_index = global_target_sentence_index,
    max_paragraph_length = max_paragraph_length,
    crf_model = loaded_crf_model,
)
end_time = time.time()
print(f'Predicting results on test set took {end_time-start_time} seconds')

# Reconcile predictions on the test set: combine the model output with the
# pre-labeled sentences into one position-ordered list of labels.
test_pred_mapping, test_predictions = reconcile_mapping(
    test_data,
    pre_labeled_test_data,
    test_predictions_list,
)

Predicting results on test set took 286.40934896469116 seconds


In [31]:
# Extract ground truth test data labels
# (0 = no initiative, 1 = singleton, 2 = beginning, 3 = inside, 4 = end)
test_label_values = []
test_report_numbers = []
for sent_no, test_sentence in enumerate(testing_data):
    test_sentence_report_no = test_sentence[0]['report_no']
    test_report_numbers.append(test_sentence_report_no)

    # Sentence positions of the initiative this sentence belongs to, if any.
    test_initiative_sentences = None
    test_sentence_initiatives = test_sentence[3]['list_of_initiatives']
    if test_sentence_initiatives:
        initiative_unique_reference = test_sentence_initiatives[0] + '_' + str(test_sentence_report_no)
        test_initiative_sentences = test_initiative_dict[initiative_unique_reference]

    if test_initiative_sentences is None or len(test_initiative_sentences) == 1:
        # 1 for singletons, 0 for non-initiative sentences
        test_sentence_label = test_sentence[2]['has_initiative']
    else:
        test_position_in_initiative = test_initiative_sentences.index(sent_no)
        if test_position_in_initiative == 0:
            test_sentence_label = 2  # beginning of initiative
        elif test_position_in_initiative == len(test_initiative_sentences) - 1:
            test_sentence_label = 4  # end of initiative
        else:
            test_sentence_label = 3  # inside an initiative
    test_label_values.append(test_sentence_label)


target_names = ['No Initiative', 'Singleton', 'Beginning', 'Inside', 'End']
print(f'Classification Report on the Test Dataset \n')
print(
    classification_report(
        test_label_values,
        np.array(test_predictions),
        target_names = target_names,
        digits=4,
    )
)

Classification Report on the Test Dataset 

               precision    recall  f1-score   support

No Initiative     0.9764    0.9803    0.9783     48175
    Singleton     0.2518    0.1802    0.2101       577
    Beginning     0.1674    0.1633    0.1653       447
       Inside     0.1410    0.0472    0.0707       233
          End     0.1336    0.1745    0.1513       447

     accuracy                         0.9521     49879
    macro avg     0.3340    0.3091    0.3152     49879
 weighted avg     0.9493    0.9521    0.9505     49879



In [32]:
# Collapse the five IOBES classes into initiative (1) vs. no initiative (0).
test_binary_predictions = [int(bool(prediction)) for prediction in test_predictions]
test_label_binary_values = [int(bool(value)) for value in test_label_values]
print(f'Binary Classification Report on the Test Dataset \n')
target_names = ['No Initiative', 'Initiative']
print(
    classification_report(
        test_label_binary_values,
        np.array(test_binary_predictions),
        target_names = target_names,
        digits=4,
    )
)

Binary Classification Report on the Test Dataset 

               precision    recall  f1-score   support

No Initiative     0.9764    0.9803    0.9783     48175
   Initiative     0.3719    0.3298    0.3496      1704

     accuracy                         0.9581     49879
    macro avg     0.6742    0.6551    0.6640     49879
 weighted avg     0.9557    0.9581    0.9569     49879



In [34]:
# Aggregate sentence-level test predictions into initiatives and score them
# against the gold test initiatives.
test_predictions_dict = sentence_to_initiative_aggregation(test_predictions, test_report_numbers)
test_init_evaluation = Initiative_Evaluation(test_initiative_dict, test_predictions_dict)
(
    test_init_strict_accuracy,
    test_init_medium_accuracy,
    test_init_lenient_accuracy,
    test_strict_F1,
    test_medium_F1,
    test_lenient_F1,
    test_strict_precision,
    test_strict_recall,
    test_medium_precision,
    test_medium_recall,
    test_lenient_precision,
    test_lenient_recall,
) = test_init_evaluation.evaluate()

# Share of gold initiatives found, from most lenient to strictest criterion
print(f'Percentage of correctly predicted initiatives where at least 1 sentence is identified is {test_init_lenient_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where more than 50% of sentences are identified is {test_init_medium_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where 100% of sentences are identified is {test_init_strict_accuracy:.2%} \n')

# Lenient criterion
print(f'F1 score where at least 1 sentence is identified is {test_lenient_F1:.2%} \n')
print(f'Precision score where at least 1 sentence is identified is {test_lenient_precision:.2%} \n')
print(f'Recall score where at least 1 sentence is identified is {test_lenient_recall:.2%} \n')

# Medium criterion
print(f'F1 score where 50% of sentences are identified is {test_medium_F1:.2%} \n')
print(f'Precision score where 50% of sentences are identified is {test_medium_precision:.2%} \n')
print(f'Recal score where 50% of sentences are identified is {test_medium_recall:.2%} \n')

# Strict criterion
print(f'F1 score where 100% of sentences are identified is {test_strict_F1:.2%} \n')
print(f'Precision score where 100% of sentences are identified is {test_strict_precision:.2%} \n')
print(f'Recall score where 100% of sentences are identified is {test_strict_recall:.2%} \n')

Percentage of correctly predicted initiatives where at least 1 sentence is identified is 39.55% 

Percentage of correctly predicted initiatives where more than 50% of sentences are identified is 33.59% 

Percentage of correctly predicted initiatives where 100% of sentences are identified is 19.34% 

F1 score where at least 1 sentence is identified is 34.18% 

Precision score where at least 1 sentence is identified is 30.09% 

Recall score where at least 1 sentence is identified is 39.55% 

F1 score where 50% of sentences are identified is 29.03% 

Precision score where 50% of sentences are identified is 25.56% 

Recal score where 50% of sentences are identified is 33.59% 

F1 score where 100% of sentences are identified is 16.71% 

Precision score where 100% of sentences are identified is 14.71% 

Recall score where 100% of sentences are identified is 19.34% 



In [35]:
# Perform predictions on train dataset, reusing the models loaded for the dev set
start_time = time.time()
train_predictions_list = model_predict(
    sustainable_model,
    tokenizer,
    train_loader,
    device,
    global_target_sentence_index = global_target_sentence_index,
    max_paragraph_length = max_paragraph_length,
    crf_model = loaded_crf_model,
)
end_time = time.time()
print(f'Predicting results on train set took {end_time-start_time} seconds')

# Reconcile predictions on the train set: combine the model output with the
# pre-labeled sentences into one position-ordered list of labels.
train_pred_mapping, train_predictions = reconcile_mapping(
    train_data,
    pre_labeled_train_data,
    train_predictions_list,
)

Predicting results on train set took 476.2395040988922 seconds


In [36]:
# Extract ground truth train data labels
# (0 = no initiative, 1 = singleton, 2 = beginning, 3 = inside, 4 = end)
training_labels = []
train_report_numbers = []
for sent_no, train_sentence in enumerate(training_data):
    train_sentence_report_no = train_sentence[0]['report_no']
    train_report_numbers.append(train_sentence_report_no)

    # Sentence positions of the initiative this sentence belongs to, if any.
    train_initiative_sentences = None
    train_sentence_initiatives = train_sentence[3]['list_of_initiatives']
    if train_sentence_initiatives:
        initiative_unique_reference = train_sentence_initiatives[0] + '_' + str(train_sentence_report_no)
        train_initiative_sentences = train_initiative_dict[initiative_unique_reference]

    if train_initiative_sentences is None or len(train_initiative_sentences) == 1:
        # 1 for singletons, 0 for non-initiative sentences
        train_sentence_label = train_sentence[2]['has_initiative']
    else:
        train_position_in_initiative = train_initiative_sentences.index(sent_no)
        if train_position_in_initiative == 0:
            train_sentence_label = 2  # beginning of initiative
        elif train_position_in_initiative == len(train_initiative_sentences) - 1:
            train_sentence_label = 4  # end of initiative
        else:
            train_sentence_label = 3  # inside an initiative
    training_labels.append(train_sentence_label)

target_names = ['No Initiative', 'Singleton', 'Beginning', 'Inside', 'End']
print(f'Classification Report on the Training Dataset \n')
print(
    classification_report(
        training_labels,
        np.array(train_predictions),
        target_names = target_names,
        digits = 4,
    )
)

Classification Report on the Training Dataset 

               precision    recall  f1-score   support

No Initiative     0.9983    0.9870    0.9926     83801
    Singleton     0.6730    0.9100    0.7738      1045
    Beginning     0.6603    0.7678    0.7100       995
       Inside     0.6598    0.9621    0.7828       633
          End     0.7032    0.7930    0.7454       995

     accuracy                         0.9812     87469
    macro avg     0.7389    0.8840    0.8009     87469
 weighted avg     0.9847    0.9812    0.9825     87469



In [37]:
# Aggregate sentence-level train predictions into initiatives and score them
# against the gold train initiatives; the whole step is timed.
start_time = time.time()
train_predictions_dict = sentence_to_initiative_aggregation(train_predictions, train_report_numbers)
train_init_evaluation = Initiative_Evaluation(train_initiative_dict, train_predictions_dict)
(
    train_init_strict_accuracy,
    train_init_medium_accuracy,
    train_init_lenient_accuracy,
    train_strict_F1,
    train_medium_F1,
    train_lenient_F1,
    train_strict_precision,
    train_strict_recall,
    train_medium_precision,
    train_medium_recall,
    train_lenient_precision,
    train_lenient_recall,
) = train_init_evaluation.evaluate()

# Share of gold initiatives found, from most lenient to strictest criterion
print(f'Percentage of correctly predicted initiatives where at least 1 sentence is identified is {train_init_lenient_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where more than 50% of sentences are identified is {train_init_medium_accuracy:.2%} \n')
print(f'Percentage of correctly predicted initiatives where 100% of sentences are identified is {train_init_strict_accuracy:.2%} \n')

# F1 per criterion (precision and recall are unpacked above but not printed here)
print(f'F1 score where at least 1 sentence is identified is {train_lenient_F1:.2%} \n')
print(f'F1 score where 50% of sentences are identified is {train_medium_F1:.2%} \n')
print(f'F1 score where 100% of sentences are identified is {train_strict_F1:.2%} \n')
end_time = time.time()
print(f'Evaluating initiatives on the train set took {end_time-start_time} seconds')

Percentage of correctly predicted initiatives where at least 1 sentence is identified is 96.76% 

Percentage of correctly predicted initiatives where more than 50% of sentences are identified is 85.88% 

Percentage of correctly predicted initiatives where 100% of sentences are identified is 72.55% 

F1 score where at least 1 sentence is identified is 68.19% 

F1 score where 50% of sentences are identified is 60.52% 

F1 score where 100% of sentences are identified is 51.12% 

Evaluating initiatives on the train set took 2.879420042037964 seconds
