# HebSafeHarbor Model Evaluation

In [1]:
import sys
sys.path.append('../')

In [2]:
from ner_evaluation.ner_eval import collect_named_entities
from ner_evaluation.ner_eval import compute_metrics, Evaluator
from ner_evaluation.ner_eval import compute_precision_recall_wrapper
from ner_evaluation.ner_eval import Entity

from hebsafeharbor import HebSafeHarbor

from glob import glob
import os
import logging
import re
import numpy as np
import pandas as pd
from difflib import SequenceMatcher


In [3]:
def fb_score(precision:float,recall:float,beta:int=2)->float:
    '''
    Compute F beta score of a model
    :param precision: the model's precision score
    :param recall: the model's recall score
    :param beta: which metric to compute (1 for F1 score, 2 for F2 score etc.)
    '''
    return (1+(beta**2))*(precision*recall)/(((beta**2)*precision)+recall)

def flatten_results(res):
    '''
    Takes a list of Evaluator outputs and inserts it into a pandas dataframe
    :param res: a list of Evaluator results
    :returns a pandas dataframe 
    '''
    entity_list = []
    for item in res:
        for match_type in item['results'].keys():
            
            for entities in item['results'][match_type]:
                item_dict = {}
                item_dict['idx'] = item['idx']
                item_dict['match_type'] = match_type
                if match_type =='spurious':
                    item_dict['pred_entity'] = entities.e_type
                    item_dict['pred_start'] = entities.start_offset
                    item_dict['pred_end'] = entities.end_offset
                elif match_type =='missed':
                    item_dict['true_entity'] = entities.e_type
                    item_dict['true_start'] = entities.start_offset
                    item_dict['true_end'] = entities.end_offset
                else:
                    item_dict['pred_entity'] = entities[1].e_type
                    item_dict['pred_start'] = entities[1].start_offset
                    item_dict['pred_end'] = entities[1].end_offset
                    item_dict['true_entity'] = entities[0].e_type
                    item_dict['true_start'] = entities[0].start_offset
                    item_dict['true_end'] = entities[0].end_offset
                entity_list.append(item_dict)
    return entity_list




# Load Documents

In [4]:
annotation_date = "08-03-2022"
folders = glob(f"/Users/ayabellicha/Documents/phi_evaluation_set/phi_annotations_{annotation_date}/*/", recursive = True)
annotations_list = []
txt_list = []
i=0
idx_to_folder = {}
for folder in folders:
    annotations_fname = glob(f"{folder}*.ann", recursive = True)[0]
    idx_to_folder[i] = '/'.join(annotations_fname.split('/')[-3:-1])
    txt_fname = glob(f"{folder}*.txt", recursive = True)[0]

    with open(annotations_fname) as f:
        annotations_list.append(f.readlines())

    with open(txt_fname) as f:
        txt_list.append(' '.join(f.readlines()))

    i+=1

# Run HebSafeHarbor NER

In [5]:
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [6]:
doc_list = [{"text": txt} for txt in txt_list]
hebrew_phi = HebSafeHarbor()
output = hebrew_phi(doc_list);



# Map Entity Types

In [7]:
# Annotated entity types to include in the analysis
monitored_entities = ['ETHNICITY','NAME','ADDRESS','DATE','PHONE_OR_FAX','ID','EMAIL','URL','IP_ADDRESS','ORGANIZATION']

#annot_entity_mapping = {'SOCIAL_STATUS':'ORG','CARE_ENVIRONMENT':'ORG','ADDRESS':'LOC'}
annot_entity_mapping = {'ADDRESS':'LOC','ORGANIZATION':'ORG'}
sh_entity_mapping = {'MEDICAL_DATE':'DATE','BIRTH_DATE':'DATE','CITY':'LOC','COUNTRY':'LOC',
                     'EMAIL_ADDRESS':'EMAIL','ISRAELI_ID_NUMBER':'ID','PER':'NAME','PERS':'NAME',
                     'PHONE_NUMBER':'PHONE_OR_FAX','FAC':'LOC','GPE':'LOC','MISC__AFF':'ETHNICITY'}

# Entity types included in the analysis
tags = ['LOC','EMAIL','ID','ORG','DATE','NAME','ETHNICITY','PHONE_OR_FAX','URL','IP_ADDRESS'] 

# Create Entity Lists

In [8]:
agg_predictions = []
for prediction in output:
    predicted_entity_list = []
    for entity in prediction.granular_analyzer_results:
        entity_dict = entity.__dict__
        entity_type = sh_entity_mapping.get(entity_dict['entity_type'],entity_dict['entity_type'])
        predicted_entity_list.append(Entity(entity_type,entity_dict['start'],entity_dict['end']))
    agg_predictions.append(predicted_entity_list)



In [9]:

annotations = annotations_list[0]
agg_true = []
for annotations in annotations_list:
    entity_list = []
    for a in annotations:
        entity = re.split('\t|\n|\s',a) 
        if (len(entity[0])<1) or (entity[0][0]!='T') or not(entity[1] in monitored_entities):
            continue
        entity_type = annot_entity_mapping.get(entity[1],entity[1])
        entity_list.append(Entity(entity_type,int(entity[2]),int(entity[3])))
    agg_true.append(entity_list)
    




# Run Model Evaluator

In [10]:
evaluator = Evaluator(agg_true,agg_predictions,tags)
metrics = evaluator.evaluate()

# Create a dataframe containing the results

In [11]:

examples = flatten_results(metrics[2])
entities_df = pd.DataFrame(examples)

entities_df['pred_text'] = entities_df.apply(lambda x: txt_list[x['idx']][int(x['pred_start']):int(x['pred_end'])] if x['pred_entity'] is not np.nan else None,axis=1)
entities_df['true_text'] = entities_df.apply(lambda x: txt_list[x['idx']][int(x['true_start']):int(x['true_end'])] if x['true_entity'] is not np.nan else None,axis=1)

# Analyze Results

## Match type

In [29]:
entities_df['match_type'].value_counts()

strict      530
type        139
spurious     72
missed       57
partial      43
exact        36
Name: match_type, dtype: int64

## Classifications and Misclassifications

In [30]:
entities_df[(entities_df.match_type != 'spurious') & (entities_df.match_type != 'missed')].groupby(['pred_entity','true_entity'])['idx'].count()

pred_entity   true_entity 
DATE          DATE            121
EMAIL         EMAIL            24
ETHNICITY     ETHNICITY        12
              ORG               1
ID            DATE              2
              ID              138
              LOC               9
              PHONE_OR_FAX     18
LOC           ETHNICITY         1
              LOC             101
              ORG              18
NAME          ETHNICITY         1
              LOC               1
              NAME            182
              ORG               8
ORG           DATE              3
              LOC               8
              NAME              8
              ORG              84
PHONE_OR_FAX  PHONE_OR_FAX      2
URL           EMAIL             1
              URL               5
Name: idx, dtype: int64

## F2-Score

### The evaluator's built in metrics
The metric penalized partial matches at 50%

In [68]:
f2_dict = {}
f2_dict['semeval'] = {}
for k in metrics[0].keys():
    print(f'{k} F2 score: ',fb_score(metrics[0][k]['precision'],metrics[0][k]['recall']))
    f2_dict['semeval'][k] = fb_score(metrics[0][k]['precision'],metrics[0][k]['recall'])

ent_type F2 score:  0.827970297029703
partial F2 score:  0.8131188118811882
strict F2 score:  0.6559405940594059
exact F2 score:  0.7004950495049505


### Give partial matches an equal weight

In [70]:
TP = entities_df[~entities_df['match_type'].isin(['spurious','missed'])].shape[0]
FP = entities_df[entities_df['match_type'].isin(['spurious'])].shape[0]
FN = entities_df[entities_df['match_type'].isin(['missed'])].shape[0]

precision = TP/(TP+FP)
recall = TP/(TP+FN)
print(f'Partial F2 score: {fb_score(precision=precision,recall=recall)}')

Partial F2 score: 0.9257425742574259


In [71]:
f2_dict['equal_weight'] = {}
f2_dict['equal_weight']['partial'] = fb_score(precision=precision,recall=recall)

### Weighted F2 score 
Weight partial matches according to their overlap ratio

In [73]:

tp_df = entities_df[~entities_df['match_type'].isin(['spurious','missed'])].copy()
tp_df['weight'] = tp_df.apply(lambda x: SequenceMatcher(None,x['pred_text'],x['true_text']).ratio(),axis=1)
weighted_TP = tp_df['weight'].sum()
precision = weighted_TP/(weighted_TP+FP)
recall = weighted_TP/(weighted_TP+FN)
fb_score(precision=precision,recall=recall)
print(f'Weighted partial F2 score: {fb_score(precision=precision,recall=recall)}')

Weighted partial F2 score: 0.9190370169212858


In [74]:
f2_dict['weighted_score'] = {}
f2_dict['weighted_score']['partial'] = fb_score(precision=precision,recall=recall)

In [77]:
f2_df = pd.DataFrame(f2_dict)

## Match types by predicted entity type

In [16]:
entities_df.groupby(['pred_entity','match_type'])['idx'].count().reset_index().rename(columns={'idx':'count'}).sort_values(['pred_entity','count'],ascending=[True,False]).set_index(['pred_entity','match_type'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
pred_entity,match_type,Unnamed: 2_level_1
DATE,strict,116
DATE,spurious,5
DATE,type,5
EMAIL,strict,24
ETHNICITY,strict,10
ETHNICITY,spurious,5
ETHNICITY,type,2
ETHNICITY,partial,1
ID,strict,138
ID,exact,21


## Match types by annotated entity type

In [17]:
entities_df.groupby(['true_entity','match_type'])['idx'].count().reset_index().rename(columns={'idx':'count'}).sort_values(['true_entity','count'],ascending=[True,False]).set_index(['true_entity','match_type'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
true_entity,match_type,Unnamed: 2_level_1
DATE,strict,116
DATE,type,5
DATE,partial,3
DATE,exact,2
EMAIL,strict,24
EMAIL,exact,1
ETHNICITY,strict,10
ETHNICITY,type,2
ETHNICITY,exact,1
ETHNICITY,partial,1


## Spurious Entities

In [18]:
spurious_df = entities_df[entities_df['match_type']=='spurious']
spurious_df['pred_context'] = spurious_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['pred_start'])-50)):(int(x['pred_end'])+50)],axis=1)
spurious_df = spurious_df.groupby(['pred_entity','idx'])[['pred_text','pred_context']].agg(list).reset_index() #.to_csv('spurious.csv',encoding = 'utf-8-sig')

spurious_df['folder_name'] = spurious_df['idx'].apply(lambda x: '/'.join(folders[x].split('/')[-3:]))
#spurious_df
spurious_df.to_csv('spurious.csv',encoding = 'utf-8-sig')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spurious_df['pred_context'] = spurious_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['pred_start'])-50)):(int(x['pred_end'])+50)],axis=1)


## Missed Entities

In [21]:
missed_df = entities_df[entities_df['match_type']=='missed']
missed_df['true_context'] = missed_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['true_start'])-50)):(int(x['true_end'])+50)],axis=1)
missed_df = missed_df.groupby(['true_entity','idx'])[['true_text','true_context']].agg(list).reset_index() #.to_csv('spurious.csv',encoding = 'utf-8-sig')

missed_df['folder_name'] = missed_df['idx'].apply(lambda x: '/'.join(folders[x].split('/')[-3:]))
#spurious_df
missed_df.to_csv('missed.csv',encoding = 'utf-8-sig')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missed_df['true_context'] = missed_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['true_start'])-50)):(int(x['true_end'])+50)],axis=1)


## Misclassifications

In [23]:
missclass_df = entities_df[(entities_df['match_type']!='missed') & (entities_df['match_type']!='spurious') & (entities_df['pred_entity']!=entities_df['true_entity'])]
missclass_df['true_context'] = missclass_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['true_start'])-50)):(int(x['true_end'])+50)],axis=1)
missclass_df['pred_context'] = missclass_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['pred_start'])-50)):(int(x['pred_end'])+50)],axis=1)
#missclass_df = missed_df.groupby(['true_entity','pred_entity','idx'])[['true_text','true_context']].agg(list).reset_index() #.to_csv('spurious.csv',encoding = 'utf-8-sig')

missclass_df['folder_name'] = missclass_df['idx'].apply(lambda x: '/'.join(folders[x].split('/')[-3:]))
missclass_df = missclass_df.sort_values(['idx','match_type','true_entity','pred_entity'])[['idx','match_type','pred_entity','true_entity','pred_text','true_text','true_context','folder_name']]
#spurious_df
missclass_df.to_csv('missclass.csv',encoding = 'utf-8-sig')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missclass_df['true_context'] = missclass_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['true_start'])-50)):(int(x['true_end'])+50)],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missclass_df['pred_context'] = missclass_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['pred_start'])-50)):(int(x['pred_end'])+50)],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pyd

## Partial Matches

In [26]:
partial_df = entities_df[(entities_df['match_type']=='type')]
partial_df['true_context'] = partial_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['true_start'])-50)):(int(x['true_end'])+50)],axis=1)
partial_df['pred_context'] = partial_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['pred_start'])-50)):(int(x['pred_end'])+50)],axis=1)
#missclass_df = missed_df.groupby(['true_entity','pred_entity','idx'])[['true_text','true_context']].agg(list).reset_index() #.to_csv('spurious.csv',encoding = 'utf-8-sig')

partial_df['folder_name'] = partial_df['idx'].apply(lambda x: '/'.join(folders[x].split('/')[-3:]))
partial_df = partial_df.sort_values(['idx','match_type','true_entity','pred_entity'])[['idx','match_type','pred_entity','true_entity','pred_text','true_text','true_context','folder_name']]
#spurious_df
partial_df.to_csv('partial.csv',encoding = 'utf-8-sig')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_df['true_context'] = partial_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['true_start'])-50)):(int(x['true_end'])+50)],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_df['pred_context'] = partial_df.apply(lambda x: txt_list[x['idx']][(max(0,int(x['pred_start'])-50)):(int(x['pred_end'])+50)],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

In [53]:
#files = glob(f"/Users/ayabellicha/Documents/HebSafeHarbor/evaluation/*", recursive = True)

#fname = glob(f"{folder}*.ann", recursive = True)[0]

['6']

In [59]:
!mkdir results

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
mkdir: results: File exists


In [88]:

files = glob(f"{os.getcwd()}/results/FP_and_FN_{annotation_date}*")
if len(files)>0 and len(re.findall(r'_v(\d).xlsx',files[0]))>0:
    version = max(np.array([int(re.findall(r'_v(\d).xlsx',x)[0])+1 for x in files]))
else:
    version = 1

options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

with pd.ExcelWriter(f'results/FP_and_FN_{annotation_date}_v{version}.xlsx',engine='xlsxwriter',engine_kwargs={'options':options}) as writer:
    f2_df.to_excel(writer,sheet_name='F2 score',index=True,encoding = 'utf-8-sig')
    spurious_df.to_excel(writer,sheet_name='FP',index=False,encoding = 'utf-8-sig')
    missed_df.to_excel(writer,sheet_name='FN',index=False,encoding = 'utf-8-sig')
    missclass_df.to_excel(writer,sheet_name='Misclassification',index=False,encoding = 'utf-8-sig')
    partial_df.to_excel(writer,sheet_name='Partial match',index=False,encoding = 'utf-8-sig')
    