In [8]:
from datasets import load_dataset
from utils import Evaluator
from transformers import AutoTokenizer

### LOAD BEST MODEL RESULTS

In [9]:
# Repository id that is passed to both the dataset loader and the tokenizer loader below.
checkpoint = "ferrazzipietro/LS_Llama-2-7b-hf_adapters_en.layer1_NoQuant_16_32_0.05_2_0.0002_3EpochsLast"
# Load only the "test" split. Per the displayed summary it already contains the
# 'predictions' and 'ground_truth_labels' columns that the evaluation relies on.
data = load_dataset(checkpoint, split="test")
# NOTE(review): the tokenizer is resolved from the same id as the dataset —
# presumably the repository also hosts the tokenizer files; confirm.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Bare last expression so the notebook renders the dataset summary.
data

Dataset({
    features: ['sentence', 'entities', 'original_text', 'original_id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels', 'predictions', 'ground_truth_labels'],
    num_rows: 681
})

In [57]:
import pandas as pd
from datasets import Dataset

class Evaluator():
    """
    Token-by-token evaluation of predicted labels against ground-truth labels.

    Every example of `data` is read through the keys 'sentence', 'predictions'
    and 'ground_truth_labels'. The label 'O' is treated as the negative class;
    every other label counts as a positive.

    NOTE: defining this class in a notebook cell shadows any `Evaluator` that
    was imported under the same name earlier in the session.
    """

    # Column layout of `disallined_df` (attribute name kept as in the original API).
    _DISALLINED_COLUMNS = ['id', 'token', 'prediction', 'ground_truth_label']

    def __init__(self, data, tokenizer):
        """
        Args:
        data: iterable of examples (each one a mapping). `extract_FP_FN_TP_token_by_token`
            additionally requires it to offer a `.map(fn, batched=False)` method.
        tokenizer: callable as `tokenizer(sentence, add_special_tokens=False)` returning a
            mapping with "input_ids", and offering `convert_ids_to_tokens(ids)`.
        """
        self.data = data
        self.evaluation_table = {}
        self.tokenizer = tokenizer
        # Filled by `print_disallined_Is`; starts empty so the attribute always exists.
        self.disallined_df = pd.DataFrame(columns=self._DISALLINED_COLUMNS)

    @staticmethod
    def _precision_recall_f1(TP, FP, FN):
        """
        Derive (precision, recall, f1) from raw counts.

        Any ratio whose denominator is zero is reported as 0.0 instead of
        raising or producing NaN, so the metrics always have a float type.
        """
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0
        return precision, recall, f1

    def _compare_prediction_label_one_example_token_by_token(self, example) -> dict:
        """
        Compare the predictions with the ground-truth labels of one example.

        Counting rules, applied position by position:
        - TP: prediction equals the label and the label is not 'O'
        - FP: prediction differs from the label and the label is 'O'
        - FN: prediction differs from the label and the prediction is 'O'
        NOTE: a mismatch between two non-'O' labels (e.g. 'B' predicted where the
        truth is 'I') is counted in none of the three buckets.
        NOTE: `zip` stops at the shorter sequence, so surplus predictions or
        labels are ignored silently.

        Args:
        example (dict): must contain 'predictions' and 'ground_truth_labels'
        return:
        dict: the same `example` object, updated in place with the keys
            'TP', 'FP', 'FN', 'precision', 'recall' and 'f1'
        """
        predictions = example['predictions']
        labels = example['ground_truth_labels']
        TP, FP, FN = 0, 0, 0
        for pred, lab in zip(predictions, labels):
            if pred == lab:
                if lab != 'O':
                    TP += 1
            elif lab == 'O':
                FP += 1
            elif pred == 'O':
                FN += 1

        precision, recall, f1 = self._precision_recall_f1(TP, FP, FN)

        example['TP'] = TP
        example['FP'] = FP
        example['FN'] = FN
        example['precision'] = precision
        example['recall'] = recall
        example['f1'] = f1
        return example

    def extract_FP_FN_TP_token_by_token(self) -> None:
        """
        Add the per-example counts and metrics to every example of `self.data`.

        `self.data` is replaced by the result of mapping
        `_compare_prediction_label_one_example_token_by_token` over it, one
        example at a time. Nothing is returned.
        """
        self.data = self.data.map(self._compare_prediction_label_one_example_token_by_token, batched=False)

    def create_evaluation_table(self):
        """
        Aggregate the per-example counts into corpus-level (micro) metrics.

        Requires the 'TP', 'FP' and 'FN' fields to be present on every example,
        i.e. `extract_FP_FN_TP_token_by_token` must have been run first.

        return:
        dict: keys 'TP', 'FP', 'FN', 'precision', 'recall', 'f1'; also stored
            in `self.evaluation_table`
        """
        tmp_data = pd.DataFrame(self.data)
        # Cast to plain ints: the sums come back as numpy integers, for which a
        # zero denominator would yield NaN (plus a warning) instead of a clean 0.
        TP = int(tmp_data['TP'].sum())
        FP = int(tmp_data['FP'].sum())
        FN = int(tmp_data['FN'].sum())
        precision, recall, f1 = self._precision_recall_f1(TP, FP, FN)
        self.evaluation_table = {'TP': TP, 'FP': FP, 'FN': FN,
                                  'precision':precision, 'recall':recall, 'f1':f1}

        return self.evaluation_table

    def get_examples_based_on_metric(self, metric, threshold, lower=False):
        """
        Select the examples based on the metric and the threshold.

        Both comparisons are strict: examples whose value equals the threshold
        are never selected.

        Args:
        metric (str): the key of the per-example value to compare
        threshold (float): the threshold to compare against
        lower (bool): if True keep examples with value < threshold,
            otherwise keep examples with value > threshold
        return:
        Dataset: the examples that satisfy the condition
        """
        if lower:
            out = [example for example in self.data if example[metric] < threshold]
        else:
            out = [example for example in self.data if example[metric] > threshold]
        return Dataset.from_pandas(pd.DataFrame(out))

    def _token_prediction_label_triples(self, example):
        """
        Re-tokenize the sentence of `example` (without special tokens) and pair
        each token with its prediction and its ground-truth label.

        return:
        list[tuple]: (token, prediction, ground_truth_label) triples, truncated
            to the shortest of the three sequences
        """
        tokenized_input = self.tokenizer(example['sentence'], add_special_tokens=False)
        tokens = self.tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
        return list(zip(tokens, example['predictions'], example['ground_truth_labels']))

    @staticmethod
    def _has_I_right_after_O(predictions):
        """True if some 'I' prediction is immediately preceded by an 'O' prediction."""
        # An 'I' in first position has no predecessor and is therefore not flagged.
        return any(prev == 'O' and cur == 'I'
                   for prev, cur in zip(predictions, predictions[1:]))

    def format_data(self, data):
        """
        Print, for every example of `data`, its list of
        (token, prediction, ground_truth_label) triples.

        return:
        the `data` argument, unchanged
        """
        for example in data:
            print(self._token_prediction_label_triples(example))
        return data

    def print_disallined_Is(self):
        """
        Print and collect the examples in which an 'I' prediction directly
        follows an 'O' prediction.

        The collected rows are stored in `self.disallined_df` with the columns
        'id', 'token', 'prediction' and 'ground_truth_label'. 'id' numbers the
        flagged examples consecutively from 0 (it is not the position of the
        example in `self.data`). The frame is built once from all rows, so it
        carries a unique 0..n-1 index.
        """
        print('(token, prediction, ground_truth_label)')
        records = []
        id_sentence = 0
        for example in self.data:
            if not self._has_I_right_after_O(example['predictions']):
                continue
            triples = self._token_prediction_label_triples(example)
            records.extend(
                {'id': id_sentence, 'token': t, 'prediction': p, 'ground_truth_label': ground_truth_label}
                for t, p, ground_truth_label in triples
            )
            print(triples)
            id_sentence += 1
        # Single construction instead of repeated pd.concat inside the loop.
        self.disallined_df = pd.DataFrame(records, columns=self._DISALLINED_COLUMNS)

In [62]:
# NOTE(review): `eval` shadows the Python builtin of the same name for the rest
# of the session; renaming it requires updating every later cell that uses it.
eval = Evaluator(data, tokenizer)
# Adds the per-example TP/FP/FN counts and precision/recall/f1 to the dataset held by the evaluator.
eval.extract_FP_FN_TP_token_by_token()
# Aggregates the per-example counts into corpus-level metrics (stored in eval.evaluation_table).
eval.create_evaluation_table()
print(eval.evaluation_table)
# Optional inspection of examples where an 'I' prediction directly follows an 'O':
#eval.print_disallined_Is()
#eval.disallined_df

{'TP': 7085, 'FP': 1438, 'FN': 3409, 'precision': 0.8312800657045641, 'recall': 0.6751477034495903, 'f1': 0.7451227848766893}


In [66]:
# Keep only the examples whose per-example F1 is strictly below 0.1.
# NOTE(review): the variable name suggests a 0.30 cut-off, but the threshold passed here is 0.1 — confirm which is intended.
data_f1Lower30 = eval.get_examples_based_on_metric('f1', 0.1, lower=True)
# Print the (token, prediction, ground-truth label) triples of every selected example.
eval.format_data(data_f1Lower30)

[('▁In', 'O', 'O'), ('▁mut', 'O', 'O'), ('ated', 'I', 'O'), ('▁SER', 'O', 'O'), ('P', 'I', 'O'), ('IN', 'I', 'O'), ('C', 'O', 'O'), ('1', 'O', 'O'), ('▁protein', 'O', 'O'), ('▁a', 'O', 'O'), ('▁new', 'O', 'O'), ('▁N', 'O', 'O'), ('-', 'O', 'O'), ('link', 'O', 'O'), ('ed', 'I', 'O'), ('▁g', 'O', 'O'), ('ly', 'O', 'O'), ('cos', 'O', 'O'), ('yl', 'O', 'O'), ('ation', 'O', 'O'), ('▁site', 'O', 'O'), ('▁is', 'O', 'O'), ('▁formed', 'O', 'B'), (',', 'O', 'I'), ('▁however', 'O', 'O'), (',', 'O', 'O'), ('▁it', 'O', 'O'), ('▁is', 'O', 'O'), ('▁unclear', 'O', 'B'), ('▁if', 'O', 'O'), ('▁the', 'O', 'O'), ('▁g', 'O', 'O'), ('ly', 'O', 'O'), ('cos', 'O', 'O'), ('yl', 'O', 'O'), ('ation', 'O', 'O'), ('▁at', 'O', 'O'), ('▁', 'B', 'O'), ('2', 'I', 'O'), ('1', 'I', 'O'), ('9', 'O', 'O'), ('-', 'I', 'O'), ('2', 'I', 'O'), ('2', 'I', 'O'), ('1', 'O', 'O'), ('▁site', 'O', 'O'), ('▁is', 'O', 'O'), ('▁possible', 'O', 'O'), ('.', 'O', 'O')]
[('▁Before', 'O', 'O'), ('▁starting', 'O', 'B'), ('▁cort', 'O', 'B'),

Dataset({
    features: ['sentence', 'entities', 'original_text', 'original_id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels', 'predictions', 'ground_truth_labels', 'TP', 'FP', 'FN', 'precision', 'recall', 'f1'],
    num_rows: 18
})