In [1]:
from ner_eval import Evaluator
# ref: http://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/
# Common scenarios in NER evaluation:
# Message Understanding Conference (MUC)
# MUC introduced detailed metrics in an evaluation considering different categories of errors;
# these metrics can be defined in terms of comparing the response of a system against the golden annotation:

# Correct (COR) : both are the same;
# Incorrect (INC) : the output of a system and the golden annotation don’t match;
# Partial (PAR) : system and the golden annotation are somewhat “similar” but not the same;
# Missing (MIS) : a golden annotation is not captured by a system;
# Spurious (SPU) : system produces a response which doesn’t exist in the golden annotation;

# Six toy sentences, one per MUC error category. Each pair holds the gold
# tag sequence (trueN) and the system output (predN); tags use the BIO scheme.

# scenario 1 (COR): boundaries and entity type both agree with the gold annotation
true1 = 'O O B-ORG I-ORG I-ORG O O O O B-LOC I-LOC O O'.split()
pred1 = 'O O B-ORG I-ORG I-ORG O O O O B-LOC I-LOC O O'.split()
# scenario 2 (SPU): the system tags an entity where the gold annotation has none
true2 = 'O O O O O O'.split()
pred2 = 'O B-MISC I-MISC O O O'.split()
# scenario 3 (MIS): the system overlooks an entity present in the gold annotation
true3 = 'O B-MISC I-MISC O O O'.split()
pred3 = 'O O O O O O'.split()
# scenario 4 (INC): boundaries are right but the entity type is wrong
true4 = 'O B-LOC I-LOC O O O O'.split()
pred4 = 'O B-ORG I-ORG O O O O'.split()
# scenario 5 (PAR): entity type is right but the predicted span starts one token early
true5 = 'O O O B-PER I-PER O O'.split()
pred5 = 'O O B-PER I-PER I-PER O O'.split()
# scenario 6 (INC): both the boundaries and the entity type are wrong
true6 = 'O O O B-PER I-PER O O'.split()
pred6 = 'O O B-MISC I-MISC I-MISC O O'.split()


# Parallel lists: true_labels[i] is the gold sequence for pred_labels[i].
true_labels = [true1, true2, true3, true4, true5, true6]
pred_labels = [pred1, pred2, pred3, pred4, pred5, pred6]

In [2]:
# The SemEval’13 introduced four different ways to measure precision/recall/f1-score results 
# based on the metrics defined by MUC.

# Strict (****): exact boundary surface string match and entity type;
# Exact (***): exact boundary match over the surface string, regardless of the type;
# Type (**): some overlap between the system tagged entity and the gold annotation is required;
# Partial (*): partial boundary match over the surface string, regardless of the type;

# each of these ways to measure the performance accounts for 
# correct, incorrect, partial, missed and spurious (as in Message Understanding Conference (MUC)) 
# in different ways.

# scenario 1: Strict (1), Exact (1), Partial (1), Type (1)
# scenario 2: Strict (0), Exact (0), Partial (0), Type (0)
# scenario 3: Strict (0), Exact (0), Partial (0), Type (0)
# scenario 4: Strict (0), Exact (1), Partial (1), Type (0)
# scenario 5: Strict (0), Exact (0), Partial (1), Type (1)
# scenario 6: Strict (0), Exact (0), Partial (1), Type (0)

In [3]:
# Entity types handed to the Evaluator as its third positional argument.
entity_types = ['LOC', 'MISC', 'PER', 'ORG']
evaluator = Evaluator(true_labels, pred_labels, entity_types)

# evaluate() returns a pair. Judging by the outputs displayed further down,
# the first item holds overall metrics per evaluation schema and the second
# holds the same metrics broken down by entity type.
results, results_agg = evaluator.evaluate()

2020-06-04 20:30:35 root INFO: Imported 6 predictions for 6 true examples


In [4]:
# Add an 'f1' entry (harmonic mean of precision and recall) to the metrics
# dict of every evaluation schema. The inner dicts are updated in place.
for metrics in results.values():
    precision, recall = metrics['precision'], metrics['recall']
    total = precision + recall
    if total > 0:
        metrics['f1'] = 2 * (precision * recall) / total
    else:
        # Precision and recall are both zero: F1 is defined as 0 to avoid dividing by zero.
        metrics['f1'] = 0

results

{'ent_type': {'correct': 3,
  'incorrect': 2,
  'partial': 0,
  'missed': 1,
  'spurious': 1,
  'possible': 6,
  'actual': 6,
  'precision': 0.5,
  'recall': 0.5,
  'f1': 0.5},
 'partial': {'correct': 3,
  'incorrect': 0,
  'partial': 2,
  'missed': 1,
  'spurious': 1,
  'possible': 6,
  'actual': 6,
  'precision': 0.6666666666666666,
  'recall': 0.6666666666666666,
  'f1': 0.6666666666666666},
 'strict': {'correct': 2,
  'incorrect': 3,
  'partial': 0,
  'missed': 1,
  'spurious': 1,
  'possible': 6,
  'actual': 6,
  'precision': 0.3333333333333333,
  'recall': 0.3333333333333333,
  'f1': 0.3333333333333333},
 'exact': {'correct': 3,
  'incorrect': 2,
  'partial': 0,
  'missed': 1,
  'spurious': 1,
  'possible': 6,
  'actual': 6,
  'precision': 0.5,
  'recall': 0.5,
  'f1': 0.5}}

In [5]:
# Same F1 computation as for the overall results, applied to the nested
# mapping: outer values are per-label dicts, inner values are the metrics
# dicts that receive the new 'f1' entry in place.
for per_schema in results_agg.values():
    for metrics in per_schema.values():
        precision, recall = metrics['precision'], metrics['recall']
        total = precision + recall
        if total > 0:
            metrics['f1'] = 2 * (precision * recall) / total
        else:
            # Precision and recall are both zero: F1 is defined as 0 to avoid dividing by zero.
            metrics['f1'] = 0

results_agg

{'LOC': {'ent_type': {'correct': 1,
   'incorrect': 1,
   'partial': 0,
   'missed': 0,
   'spurious': 1,
   'possible': 2,
   'actual': 3,
   'precision': 0.3333333333333333,
   'recall': 0.5,
   'f1': 0.4},
  'partial': {'correct': 2,
   'incorrect': 0,
   'partial': 0,
   'missed': 0,
   'spurious': 1,
   'possible': 2,
   'actual': 3,
   'precision': 0.6666666666666666,
   'recall': 1.0,
   'f1': 0.8},
  'strict': {'correct': 1,
   'incorrect': 1,
   'partial': 0,
   'missed': 0,
   'spurious': 1,
   'possible': 2,
   'actual': 3,
   'precision': 0.3333333333333333,
   'recall': 0.5,
   'f1': 0.4},
  'exact': {'correct': 2,
   'incorrect': 0,
   'partial': 0,
   'missed': 0,
   'spurious': 1,
   'possible': 2,
   'actual': 3,
   'precision': 0.6666666666666666,
   'recall': 1.0,
   'f1': 0.8}},
 'MISC': {'ent_type': {'correct': 0,
   'incorrect': 0,
   'partial': 0,
   'missed': 1,
   'spurious': 1,
   'possible': 1,
   'actual': 1,
   'precision': 0.0,
   'recall': 0.0,
   'f1': 0