# New valuation for realistic run with morphology
1. Create a set of mentions from every sentence: sent_pred_set, sent_true_set
   1. add numbering to mentions if they are the same
1. Join sets with sentence identifer: pred_set, true_set
1. Precision: intersection(pred_set, true_set) / true_set
1. Recall: intersection(pred_set, true_set) / pred_set

## Prepare corpus

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline

In [4]:
import pandas as pd
import numpy as np

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

In [6]:
from conlleval import evaluate

In [7]:
df = (pd.read_csv('curation.csv.gz')
      .assign(sent=lambda x: x.file+'_'+x.sent.astype(str)))

In [8]:
df.head()

Unnamed: 0,sent_tok_num,tok_offset,token,FEAT_gender,FEAT_number,FEAT_case,FEAT_degree,FEAT_transitivity,FEAT_tense,FEAT_mood,...,dep_flavor,dep_lex_morph_pos,dep_arc,EXTRA,sent,ner_layers,ner_type,is_ner,biose,file
0,1-1,0-5,עשרות,Fem,Plur,*,*,*,*,*,...,basic,1-2,,,dev_1-100.tsv_1,0,_,False,O,dev_1-100.tsv
1,1-2,6-11,אנשים,Masc,Plur,*,*,*,*,*,...,basic,1-3,,,dev_1-100.tsv_1,0,_,False,O,dev_1-100.tsv
2,1-3,12-18,מגיעים,Masc,Plur,*,*,*,*,*,...,basic,1-3,,,dev_1-100.tsv_1,0,_,False,O,dev_1-100.tsv
3,1-4,19-20,מ,_,_,_,_,_,_,_,...,basic,1-5,,,dev_1-100.tsv_1,0,_,False,O,dev_1-100.tsv
4,1-5,21-27,תאילנד,_,_,_,_,_,_,_,...,basic,1-3,,,dev_1-100.tsv_1,1,GPE,True,S-GPE,dev_1-100.tsv


In [9]:
df.head().T

Unnamed: 0,0,1,2,3,4
sent_tok_num,1-1,1-2,1-3,1-4,1-5
tok_offset,0-5,6-11,12-18,19-20,21-27
token,עשרות,אנשים,מגיעים,מ,תאילנד
FEAT_gender,Fem,Masc,Masc,_,_
FEAT_number,Plur,Plur,Plur,_,_
FEAT_case,*,*,*,_,_
FEAT_degree,*,*,*,_,_
FEAT_transitivity,*,*,*,_,_
FEAT_tense,*,*,*,_,_
FEAT_mood,*,*,*,_,_


In [44]:
x=df
sents = x.groupby('sent')[['token', 'biose', 'ner_type']].apply(lambda x: x.values.tolist())
sents[:5]

sent
dev_1-100.tsv_1      [[עשרות, O, _], [אנשים, O, _], [מגיעים, O, _],...
dev_1-100.tsv_10     [[ישראל, B-PER, PER], [ארד, E-PER, PER], [,, O...
dev_1-100.tsv_100    [[טום, B-PER, PER], [הארקין, E-PER, PER], [ה, ...
dev_1-100.tsv_11     [[ח"כ, O, _], [אלי, B-PER, PER], [דיין, E-PER,...
dev_1-100.tsv_12     [[חברות, O, _], [ה, O, _], [מעסיקות, O, _], [ע...
dtype: object

## Create set of mentions from every sentence

In [50]:
sents_fixed = list(zip(list(sents.index), [[(tok, bio[0], cat) for tok, bio, cat in sent] for sent in sents]))

sents_fixed[:2]

[('dev_1-100.tsv_1',
  [('עשרות', 'O', '_'),
   ('אנשים', 'O', '_'),
   ('מגיעים', 'O', '_'),
   ('מ', 'O', '_'),
   ('תאילנד', 'S', 'GPE'),
   ('ל', 'O', '_'),
   ('ישראל', 'S', 'GPE'),
   ('כש', 'O', '_'),
   ('הם', 'O', '_'),
   ('נרשמים', 'O', '_'),
   ('כ', 'O', '_'),
   ('מתנדבים', 'O', '_'),
   (',', 'O', '_'),
   ('אך', 'O', '_'),
   ('למעשה', 'O', '_'),
   ('משמשים', 'O', '_'),
   ('עובדים', 'O', '_'),
   ('שכירים', 'O', '_'),
   ('זולים', 'O', '_'),
   ('.', 'O', '_')]),
 ('dev_1-100.tsv_10',
  [('ישראל', 'B', 'PER'),
   ('ארד', 'E', 'PER'),
   (',', 'O', '_'),
   ('סמנכ"ל', 'O', '_'),
   ('ה', 'B', 'ORG'),
   ('ביטוח', 'I', 'ORG'),
   ('ה', 'I', 'ORG'),
   ('לאומי', 'E', 'ORG'),
   (',', 'O', '_'),
   ('אמר', 'O', '_'),
   ('כי', 'O', '_'),
   ('ממלא', 'O', '_'),
   ('מקום', 'O', '_'),
   ('שר', 'O', '_'),
   ('ה', 'B', 'ORG'),
   ('עבודה', 'I', 'ORG'),
   ('ו', 'I', 'ORG'),
   ('ה', 'I', 'ORG'),
   ('רווחה', 'E', 'ORG'),
   (',', 'O', '_'),
   ('דוד', 'B', 'PER'),
   ('מגן'

In [115]:
from collections import defaultdict
from itertools import islice

def sent_to_mentions_dict(sent, sent_id, truncate=80):
    mentions = defaultdict(lambda: 0)
    current_mention= None
    current_cat = None
    for tok, bio, cat in islice(sent, truncate):
        if bio=='S':
            mentions[(sent_id, tok, cat)]+=1
        if bio=='B':
            current_mention = [tok]
            current_cat = cat
        if bio=='I' and current_mention is not None:
            current_mention.append(tok)
        if bio=='E' and current_mention is not None:
            current_mention.append(tok)
            mentions[(sent_id, ' '.join(current_mention), current_cat)]+=1
        if bio=='O':
            current_mention = None
            current_cat = None
    return mentions
ments = [sent_to_mentions_dict(sent, sent_id) for sent_id, sent in sents_fixed]
        

In [116]:
ments[:5]

[defaultdict(<function __main__.sent_to_mentions_dict.<locals>.<lambda>()>,
             {('dev_1-100.tsv_1', 'תאילנד', 'GPE'): 1,
              ('dev_1-100.tsv_1', 'ישראל', 'GPE'): 1}),
 defaultdict(<function __main__.sent_to_mentions_dict.<locals>.<lambda>()>,
             {('dev_1-100.tsv_10', 'ישראל ארד', 'PER'): 1,
              ('dev_1-100.tsv_10', 'ה ביטוח ה לאומי', 'ORG'): 1,
              ('dev_1-100.tsv_10', 'ה עבודה ו ה רווחה', 'ORG'): 1,
              ('dev_1-100.tsv_10', 'דוד מגן', 'PER'): 1}),
 defaultdict(<function __main__.sent_to_mentions_dict.<locals>.<lambda>()>,
             {('dev_1-100.tsv_100', 'טום הארקין', 'PER'): 1,
              ('dev_1-100.tsv_100', 'טום טקי', 'PER'): 1,
              ('dev_1-100.tsv_100', 'ה מפרץ ה פרסי', 'LOC'): 1}),
 defaultdict(<function __main__.sent_to_mentions_dict.<locals>.<lambda>()>,
             {('dev_1-100.tsv_11', 'אלי דיין', 'PER'): 1,
              ('dev_1-100.tsv_11', 'מערך', 'ORG'): 1}),
 defaultdict(<function __main__.sent

In [117]:
def get_ment_set(ments):
    ment_set = []
    for ment in ments:
        for k, val in ment.items():
            for i in range(val):
                ment_set.append((k[0], k[1], k[2], i+1))
    return ment_set
ment_set = ment_set[:5]
            

In [118]:
def sents_to_mentions(sents):
    sents_fixed = zip(list(sents.index), [[(tok, bio[0], cat) for tok, bio, cat in sent] for sent in sents])
    ments = [sent_to_mentions_dict(sent, sent_id) for sent_id, sent in sents_fixed]
    ment_set = get_ment_set(ments)
    return ment_set

In [178]:
import pickle
with open('treebank_results_2_test25.pkl', 'rb') as f:
    tb_results = pickle.load(f)
len(tb_results)

8

In [193]:
all_preds = [r[2] for r in tb_results]
all_preds[0][0][:20]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'E-PER',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'E-ORG',
 'O',
 'O',
 'O']

In [161]:
from sklearn.model_selection import train_test_split

splits = [train_test_split(range(len(sents)), 
                           sents, test_size=0.25, 
                           random_state=42+i) 
          for i in range(3)]

In [162]:
sum([len(s)-80 for s in splits[0][3] if len(s)>80])

138

In [194]:
sum(len(s) for s in splits[0][3]), len(all_preds[0][0])

(39576, 39438)

In [196]:
def get_sents_with_pred_tags(splits, preds, truncate=80):
    sents_preds = []
    for split, pred in zip(splits, preds):
        spl_preds = []
        test_sents = split[3]
        
        i=0
        for sent in test_sents:
            new_sent = []
            for tok, bio, cat in islice(sent, truncate):
                pred_tag = pred[i].split('-')
                pred_bio = pred_tag[0]
                if len(pred_tag)>1:
                    pred_cat = pred_tag[1]
                else:
                    pred_cat = '_'
                new_sent.append((tok, pred_bio, pred_cat))
                i+=1
            spl_preds.append(new_sent)
        spl_preds = pd.Series(spl_preds, index=test_sents.index)
        sents_preds.append(spl_preds)
    return sents_preds

all_sents_preds = [get_sents_with_pred_tags(splits, preds) for preds in all_preds]

In [200]:
splits_sents = [sents.iloc[split[1]] for split in splits]
splits_ments = [sents_to_mentions(ss) for ss in splits_sents]
all_ments_preds = [[sents_to_mentions(sp) for sp in sents_preds] for sents_preds in all_sents_preds]

In [201]:
splits_ments[0][:10]


[('train_1801-1900.tsv_97', 'גדעון שור', 'PER', 1),
 ('train_2501-2600.tsv_79', 'אלן מקויסט', 'PER', 1),
 ('train_2501-2600.tsv_79', 'ריינגרס', 'ORG', 1),
 ('train_4101-4200.tsv_81', 'רוסיה', 'GPE', 1),
 ('train_1001-1100.tsv_28', 'מוסקווה', 'GPE', 1),
 ('train_1001-1100.tsv_28', 'בלניגרד', 'GPE', 1),
 ('train_1601-1700.tsv_42', 'משטרת לייפציג', 'ORG', 1),
 ('train_1501-1600.tsv_42', 'ארה"ב', 'GPE', 1),
 ('train_1501-1600.tsv_42', 'ה_ מפרץ', 'LOC', 1),
 ('train_1501-1600.tsv_42', 'מלחמת ה עולם ה שנייה', 'EVE', 1)]

In [202]:
all_ments_preds[0][0][:10]

[('train_1801-1900.tsv_97', 'גדעון שור', 'PER', 1),
 ('train_1801-1900.tsv_97', 'מחלקת ה הסברה ב ה_ בנק', 'ORG', 1),
 ('train_2501-2600.tsv_79', 'אלן מקויסט', 'PER', 1),
 ('train_4101-4200.tsv_81', 'רוסיה', 'GPE', 1),
 ('train_1001-1100.tsv_28', 'מוסקווה', 'GPE', 1),
 ('train_1601-1700.tsv_42', 'משטרת לייפציג', 'ORG', 1),
 ('train_1501-1600.tsv_42', 'ארה"ב', 'GPE', 1),
 ('train_1501-1600.tsv_42', 'ה_ מפרץ צבא', 'ORG', 1),
 ('train_1501-1600.tsv_42', 'מלחמת ה עולם ה שנייה', 'ORG', 1),
 ('train_5701-5725.tsv_22', 'עדי גורדון', 'PER', 1)]

In [210]:
def evaluate_mentions(true_ments, pred_ments):
        t, p = set(true_ments), set(pred_ments)
        correct = p.intersection(t)
        prec = len(correct) / len(t)
        recall = len(correct) / len(p)
        f1 = 2*prec*recall/(prec+recall)
        print('Precision:', round(prec, 2))
        print('Recall:   ', round(recall, 2))
        print('F1:       ', round(f1, 2))
        print('FP ex.:', [e[1] for e in list(p-t)[:5]])
        print('FN ex.:', [e[1] for e in list(t-p)[:5]])
        return prec, recall, f1
for ments_preds in all_ments_preds:
    for t, p in zip(splits_ments, ments_preds):
        print(len(t), 'mentions,', len(p), 'found,', len(set(p).intersection(set(t))), 'correct.')
        evaluate_mentions(t, p)

1893 mentions, 1574 found, 1084 correct.
Precision: 0.57
Recall:    0.69
F1:        0.63
FP ex.: ['ה מועצה ל ענף ה לול להנהיג תלושים', 'סאלם איסמעיל מובארק לכדי נזיפה', '73', 'ה_ גליל', 'אוצר']
FN ex.: ['ואליום', 'אוסישקין', 'סומייל', 'אולפני מ - ג', 'לשכת עורכי - ה דין']
1905 mentions, 1712 found, 1174 correct.
Precision: 0.62
Recall:    0.69
F1:        0.65
FP ex.: ['רולטה התכוננו', 'בנק ל פיתוח ה תעשייה', 'אופטימי', 'שורת ה הספדים', 'אירופה']
FN ex.: ['האפט את האפט', 'אוסישקין', 'אושוויץ', 'בקעה', 'ה רשות ל שיתוף פעולה']
1982 mentions, 1786 found, 1222 correct.
Precision: 0.62
Recall:    0.68
F1:        0.65
FP ex.: ['קלן', 'יעקבי התראיין', 'סייף', 'הובסו', 'מדליק נר נשמה']
FN ex.: ['אינדיאנה', 'האפט את האפט', 'ה ספרייה ה לאומית', 'דורטמונד', 'ימק"א']
1893 mentions, 1794 found, 1341 correct.
Precision: 0.71
Recall:    0.75
F1:        0.73
FP ex.: ['לאומי', 'ה_ גליל', 'אירופה', 'אוצר', 'ניו - יורק']
FN ex.: ['ואליום', 'אוסישקין', 'סומייל', 'אולפני מ - ג', 'לשכת עורכי - ה דין']
1905 m

## Ru