In [170]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [171]:
%matplotlib inline

In [172]:
import pandas as pd
import numpy as np

In [173]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

In [174]:
import sys
sys.path.append('/home/nlp/danb')
sys.path.append('/home/nlp/danb/NER')

import bclm
import ne_evaluate_mentions as nem

## Get all OOV words


In [7]:
data_folder = '../NER/data/for_ncrf'

datasets = {
    'morph': {
        '_unit': 'morpheme',
        '_scheme': 'bioes',
        'train_dir': 'morph_gold_train.bmes',
        'dev_dir': 'morph_gold_dev.bmes',
        'test_dir': 'morph_gold_test.bmes', 
    },
    'token': {
        '_unit': 'token',
        '_scheme': 'bioes',
        'train_dir': 'token_gold_train_fix.bmes',
        'dev_dir': 'token_gold_dev_fix.bmes',
        'test_dir': 'token_gold_test_fix.bmes',
    },
    'multitok': {
        '_unit': 'token',
        '_scheme': 'concat_bioes',
        'seg': False,
        'train_dir': 'token_gold_train_concat.bmes',
        'dev_dir': 'token_gold_dev_concat.bmes',
        'test_dir': 'token_gold_test_concat.bmes',
    },
}

### out-of-training-vocabulary words (OOTV)

In [8]:
import os

In [9]:
from collections import Counter

# Collect, per dataset, the vocabulary of its training file: the first
# space-separated column of every non-blank line.
train_words = {}
for n, ds in datasets.items():
    for k in ds:
        if 'train' not in k:
            continue
        path = os.path.join(data_folder, ds[k])
        print(path)
        # `with` closes the handle deterministically (the bare open() in the
        # loop leaked it). The encoding is pinned so decoding does not depend
        # on the machine's locale -- assumes the data files are UTF-8.
        with open(path, 'r', encoding='utf-8') as train_file:
            first_columns = (line.split(' ')[0].strip() for line in train_file)
            # blank separator lines yield '' and are skipped
            train_words[n] = {word for word in first_columns if word != ''}

[len(x) for x in train_words.values()]

../NER/data/for_ncrf/morph_gold_train.bmes
../NER/data/for_ncrf/token_gold_train_fix.bmes
../NER/data/for_ncrf/token_gold_train_concat.bmes


[15959, 24889, 24889]

### out-of-embedding-vocabulary words (OOEV) 

In [10]:
word_embedding_files = {
    'ft_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.fasttext_skipgram.model.vec.nofirstline',
    'ft_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.fasttext_skipgram.model.vec.nofirstline',
    'ft_oov_yap': 'data/htb_all_words.wikipedia.alt_tok.yap_form.fasttext_skipgram.txt',
    'ft_oov_tok': 'data/htb_all_words.wikipedia.alt_tok.tokenized.fasttext_skipgram.txt',
    #'w2v_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.word2vec_skipgram.txt.nofirstline',
    #'w2v_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.word2vec_skipgram.txt.nofirstline',
    'glv_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.glove.txt',
    'glv_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.glove.txt',
    'no_word': None,
}

In [11]:
emb_words = {}

def get_words_from_emb_file(path, encoding='utf-8'):
    """Return the set of words listed in a text-format embedding file.

    Only the first whitespace-separated field of each non-blank line is
    kept; the remaining fields on the line are ignored.

    Args:
        path: Path of the embedding file to read.
        encoding: Text encoding used to decode the file (default UTF-8).

    Returns:
        set: The distinct first fields of all non-blank lines.
    """
    # The context manager closes the file even if reading fails part-way;
    # the previous bare open() left the handle to the garbage collector.
    with open(path, encoding=encoding) as emb_file:
        # str.split() with no argument ignores surrounding whitespace, and a
        # whitespace-only line produces an empty list, which is filtered out.
        return {fields[0] for fields in map(str.split, emb_file) if fields}

emb_words['tok'] = get_words_from_emb_file(word_embedding_files['glv_tok'])
emb_words['yap'] = get_words_from_emb_file(word_embedding_files['glv_yap'])

In [12]:
len(emb_words['tok']), len(emb_words['yap'])

(359320, 245642)

In [18]:
for k, v in word_embedding_files.items():
    if v is not None:
        !wc -l {v}


245641 ../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.fasttext_skipgram.model.vec.nofirstline
359320 ../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.fasttext_skipgram.model.vec.nofirstline
32852 data/htb_all_words.wikipedia.alt_tok.yap_form.fasttext_skipgram.txt
32852 data/htb_all_words.wikipedia.alt_tok.tokenized.fasttext_skipgram.txt
245642 ../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.glove.txt
359320 ../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.glove.txt


### IV (in-vocabulary)

In [13]:
inter = {}
for ds, trw in train_words.items():
    inter[ds] = {}
    for eu, emw in emb_words.items():
        inter[ds][eu] = trw & emw
        print (ds, eu, len(inter[ds][eu]))

morph tok 14753
morph yap 14926
token tok 22487
token yap 14175
multitok tok 22487
multitok yap 14175


### out-of-both-vocabulary words (OOBV)

anything that isn't in the other three

## Find mentions that contain (parts of) unknown tokens

1. Token unknown (T)
2. At least one morpheme unknown (M)
3. At least one lemma unknown (L)
4. All possible combinations

In [175]:
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')
spdf = spdf[(~spdf.sent_id.isin(dropped))]

### Full unknown
Token unknown and at least one morpheme unknown (most obvious when token is made up of 1 morpheme)

In [176]:
train_gold = spdf[spdf.set=='train']
dev_gold = spdf[spdf.set=='dev']
test_gold = spdf[spdf.set=='test']

In [177]:
train_tokens = set(train_gold.token_str.tolist())
dev_tokens =   set(dev_gold.token_str.tolist())
test_tokens =  set(test_gold.token_str.tolist())

In [178]:
dev_unk_tokens = dev_tokens-train_tokens
test_unk_tokens = test_tokens-train_tokens
len(dev_unk_tokens), len(dev_tokens)

(1500, 3995)

In [179]:
dev_gold[~dev_gold.token_str.isin(train_tokens)].shape, dev_gold.shape

((2738, 27), (11301, 27))

In [180]:
dev_gold[~dev_gold.token_str.isin(train_tokens)].groupby(['sent_id', 'token_id']).size().shape, dev_gold.groupby(['sent_id', 'token_id']).size().shape

((1748,), (8531,))

In [181]:
train_morphs  = set(train_gold.form.tolist())
dev_morphs    =   set(dev_gold.form.tolist())
test_morphs   =  set(test_gold.form.tolist())

In [182]:
train_lemmas  =   set(train_gold.lemma.tolist())
dev_lemmas    =   set(dev_gold.lemma.tolist())
test_lemmas  =    set(test_gold.lemma.tolist())

In [183]:
dev_gold.head()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,token_id,sent_id,token_str,global_sent_id,...,deps,misc,ner_escaped,set,duplicate_sent_id,very_similar_sent_id,biose_layer0,biose_layer1,biose_layer2,biose_layer3
0,1,עשרות,עשר,CDT,CDT,gen=F|num=P,1,1,עשרות,1,...,_,_,_,dev,,,O,O,O,O
1,2,אנשים,איש,NN,NN,gen=M|num=P,2,1,אנשים,1,...,_,_,_,dev,,,O,O,O,O
2,3,מגיעים,הגיע,BN,BN,gen=M|num=P|per=A|HebBinyan=HIFIL,3,1,מגיעים,1,...,_,_,_,dev,,,O,O,O,O
3,4,מ,מ,PREPOSITION,PREPOSITION,_,4,1,מתאילנד,1,...,_,_,_,dev,,,O,O,O,O
4,5,תאילנד,תאילנד,NNP,NNP,_,4,1,מתאילנד,1,...,_,_,GPE,dev,,,S-GPE,O,O,O


In [184]:
def unk_funq(x):
    """Summarise one group of rows that share a token.

    Reads the module-level sets `train_tokens`, `train_morphs` and
    `train_lemmas`.

    Args:
        x: DataFrame group with the columns `token_str`, `form`, `lemma`,
            `biose_layer0` and `token_id`.

    Returns:
        pd.Series with, in order:
            token_unk      -- the group's first `token_str` is not in `train_tokens`
            morph_unk      -- at least one `form` is not in `train_morphs`
            lemma_unk      -- at least one `lemma` is not in `train_lemmas`
            unk_type       -- 'T' and/or 'M' concatenated from the first two flags
                              (the lemma flag does not contribute)
            has_ner        -- some row's `biose_layer0` differs from 'O'
            morpheme_count -- number of rows in the group
            token_id_rep   -- the group's first `token_id`
    """
    token_is_unknown = x.token_str.iat[0] not in train_tokens
    any_morph_unknown = (~x.form.isin(train_morphs)).any()
    any_lemma_unknown = (~x.lemma.isin(train_lemmas)).any()

    # Token flag first, then morpheme flag: '', 'T', 'M' or 'TM'.
    unk_code = ('T' if token_is_unknown else '') + ('M' if any_morph_unknown else '')

    return pd.Series({
        'token_unk': token_is_unknown,
        'morph_unk': any_morph_unknown,
        'lemma_unk': any_lemma_unknown,
        'unk_type': unk_code,
        'has_ner': not x.biose_layer0.eq('O').all(),
        'morpheme_count': len(x),
        'token_id_rep': x.token_id.iat[0],
    })

dev_unk = (dev_gold.groupby(['sent_id', 'token_id', 'token_str'])
                           .apply(unk_funq)).reset_index()

In [185]:
dev_unk.groupby(['unk_type', 'has_ner']).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
unk_type,has_ner,Unnamed: 2_level_1
,False,6252
,True,519
M,False,12
T,False,617
T,True,53
TM,False,820
TM,True,258


In [186]:
# Reuse the `unk_funq` defined in the dev-set cell above rather than
# redefining an identical copy here: a second definition silently shadows
# the first, and the two can drift apart when only one of them is edited.
test_unk = (test_gold.groupby(['sent_id', 'token_id', 'token_str'])
                           .apply(unk_funq)).reset_index()

In [187]:
test_unk.groupby(['unk_type', 'has_ner']).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
unk_type,has_ner,Unnamed: 2_level_1
,False,8746
,True,940
M,False,38
M,True,2
T,False,944
T,True,102
TM,False,1389
TM,True,458


## Create evaluation

In [188]:
mev = pd.read_pickle('final_setup/mev2.pkl')
mev.head()

Unnamed: 0,gold_name,unit,arch,w_embed,seed_num,p_m,r_m,f_m,pred_set,acc,...,seg,model_file_name,dset_file_name,relevant_score,input_unit,embed_unit,embed_type,cm,pred_set_sub,pred_set_main
0,morph_dev_gold,morph,char_cnn,ft_yap,44_seed,0.859155,0.733467,0.791351,dev_gold,0.9667,...,,morph.char_cnn.ft_yap.44_seed.83.model,morph.char_cnn.ft_yap.44_seed.dset,0.7905,morph,morph,ft,Match,gold,dev
1,morph_dev_yap,morph,char_cnn,ft_yap,44_seed,0.780193,0.647295,0.707558,dev_yap,0.9667,...,,morph.char_cnn.ft_yap.44_seed.83.model,morph.char_cnn.ft_yap.44_seed.dset,0.7905,morph,morph,ft,Match,yap,dev
2,morph_test_gold,morph,char_cnn,ft_yap,44_seed,0.80485,0.747854,0.775306,test_gold,0.9667,...,,morph.char_cnn.ft_yap.44_seed.83.model,morph.char_cnn.ft_yap.44_seed.dset,0.7905,morph,morph,ft,Match,gold,test
3,morph_dev_gold,morph,char_cnn,ft_oov_tok,44_seed,0.843612,0.767535,0.803778,dev_gold,0.969,...,,morph.char_cnn.ft_oov_tok.44_seed.183.model,morph.char_cnn.ft_oov_tok.44_seed.dset,0.8038,morph,token,ft_oov,Clash,gold,dev
4,morph_test_yap,morph,char_cnn,ft_yap,44_seed,0.721411,0.636266,0.676169,test_yap,0.9667,...,,morph.char_cnn.ft_yap.44_seed.83.model,morph.char_cnn.ft_yap.44_seed.dset,0.7905,morph,morph,ft,Match,yap,test


In [698]:
output_folder = 'final_setup/decode_output'
# Build the path from `output_folder` so the directory is configured in one
# place; the previous single-argument os.path.join() was a no-op that
# repeated the folder name inline.
pred_path = os.path.join(output_folder, 'morph_dev_gold.morph.char_cnn.ft_oov_tok.44_seed.bmes')
pred_samp = nem.read_file_sents(pred_path)
pred_samp.head()

0    [(עשרות, O), (אנשים, O), (מגיעים, O), (מ, O), ...
1    [(תופעה, O), (זו, O), (התבררה, O), (אתמול, O),...
2    [(יו"ר, O), (ה, O), (וועדה, O), (,, O), (ח"כ, ...
3    [(מ, O), (צד, O), (אחד, O), (רוצה, O), (ה, B-O...
4    [(נמיר, S-PER), (הודיעה, O), (כי, O), (תפנה, O...
dtype: object

In [699]:
# One row per predicted label: (1-based sentence id, 1-based position inside
# the sentence, tag). The token text itself is not needed here.
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent that works on both old and new versions.
for_df = [
    (sent_idx + 1, position, bio)
    for sent_idx, line in pred_samp.items()
    for position, (_tok, bio) in enumerate(line, start=1)
]
ner_samp = pd.DataFrame(for_df, columns=['sent_id', 'id', 'biose_layer0'])
ner_samp.head()

Unnamed: 0,sent_id,id,biose_layer0
0,1,1,O
1,1,2,O
2,1,3,O
3,1,4,O
4,1,5,S-GPE


In [700]:
yap_out = bclm.read_yap_output(treebank_set=None,
                               tokens_path=bclm.TREEBANK_TOKEN_PATHS['dev'], 
                                dep_path='final_setup/pruned/yap_output/dev.multitok.char_cnn.ft_oov_tok.53_seed.conll',
                                map_path='final_setup/pruned/yap_output/dev.multitok.char_cnn.ft_oov_tok.53_seed.map',)
yap_out.head()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,misc,sent_id,token_id,token_str
0,1,עשרות,עשר,CDT,CDT,gen=F|num=P,2,num,_,_,1,1,עשרות
1,2,אנשים,איש,NN,NN,gen=M|num=P,3,subj,_,_,1,2,אנשים
2,3,מגיעים,הגיע,BN,BN,gen=M|num=P|per=A,14,conj,_,_,1,3,מגיעים
3,4,מ,מ,PREPOSITION,PREPOSITION,,3,comp,_,_,1,4,מתאילנד
4,5,תאילנד,תאילנד,NNP,NNP,gen=F|num=S,4,pobj,_,_,1,4,מתאילנד


In [701]:
dev_gold = spdf[spdf.set=='dev']
dev_gold = dev_gold.merge(dev_unk, how='left')

In [702]:
full_samp = yap_out.merge(ner_samp)
full_samp = full_samp.merge(dev_unk, how='left')
full_samp.tail()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,misc,sent_id,token_id,token_str,biose_layer0,token_unk,morph_unk,lemma_unk,unk_type,has_ner,morpheme_count
11220,1,מתחיל,התחייל,BNT,BNT,gen=M|num=S|per=A,0,ROOT,_,_,500,1,מתחיל,O,False,False,False,,False,1
11221,2,מסע,מסע,NNT,NNT,gen=M|num=S,1,gobj,_,_,500,2,מסע,O,False,False,False,,False,1
11222,3,ה,ה,DEF,DEF,,4,def,_,_,500,3,הנקמה,O,False,False,False,,False,2
11223,4,נקמה,נקמה,NN,NN,gen=F|num=S,2,gobj,_,_,500,3,הנקמה,O,False,False,False,,False,2
11224,5,.,,yyDOT,yyDOT,,1,punct,_,_,500,4,.,O,False,False,False,,False,1


In [703]:
# Rebuild `full_samp` from the gold dev rows, replacing the YAP-based
# `full_samp` created in the previous cell.
full_samp = dev_gold.copy()
# Overwrite the gold layer-0 labels with the labels held in `ner_samp`.
# Column assignment aligns on the index, not on (sent_id, id).
# NOTE(review): assumes `dev_gold` and `ner_samp` have the same row order and index -- confirm.
full_samp['biose_layer0'] = ner_samp['biose_layer0']
# Left merge keeps every row of `full_samp`; with no `on=` given, pandas joins
# on all columns the two frames have in common.
full_samp = full_samp.merge(dev_unk, how='left')
full_samp.tail()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,token_id,sent_id,token_str,global_sent_id,...,biose_layer0,biose_layer1,biose_layer2,biose_layer3,token_unk,morph_unk,lemma_unk,unk_type,has_ner,morpheme_count
11296,1,מתחיל,התחיל,BN,BN,gen=M|num=S|per=A,1,500,מתחיל,500,...,O,O,O,O,False,False,False,,False,1
11297,2,מסע,מסע,NNT,NNT,gen=M|num=S,2,500,מסע,500,...,O,O,O,O,False,False,False,,False,1
11298,3,ה,ה,DEF,DEF,_,3,500,הנקמה,500,...,O,O,O,O,False,False,False,,False,2
11299,4,נקמה,נקמה,NN,NN,gen=F|num=S,3,500,הנקמה,500,...,O,O,O,O,False,False,False,,False,2
11300,5,.,_,yyDOT,yyDOT,_,4,500,.,500,...,O,O,O,O,False,False,False,,False,1


In [704]:
full_samp.shape, yap_out.shape, ner_samp.shape

((11301, 33), (11301, 13), (11301, 3))

In [678]:
tuple(sorted(set([1,4,1,5,2])))

(1, 2, 4, 5)

## Grouped mention evaluation

In [28]:
from collections import defaultdict

In [48]:
def ut_gm(x):
    """Collapse per-token labels into a sorted tuple of the distinct non-empty ones.

    Args:
        x: Iterable of labels; the empty string '' is ignored.

    Returns:
        tuple: Sorted distinct non-empty labels (``()`` when there are none).
    """
    return tuple(sorted({label for label in x if label != ''}))


def _mention_group_key(group_maker, groups, length, cat, ment_len, ment_cat):
    """Build the bucket key under which one finished mention is counted."""
    key = group_maker(groups)
    if ment_len:
        # With no group of its own (group_maker returned None) the length is the key.
        key = length if key is None else (key, length)
    if ment_cat:
        key = (key, cat)
    return key


def sent_to_mentions_dict(sent, sent_id, truncate=None, group_maker=ut_gm, ment_len=False,
                         ment_cat=False):
    """Extract the mentions of one sentence, bucketed by a per-mention group key.

    Args:
        sent: Iterable of ``(tok, bio, cat, group)`` tuples, where `bio` is one
            of 'B', 'I', 'E', 'S', 'O' (any other value is ignored).
        sent_id: Identifier stored in every mention key.
        truncate: If not None, only the first `truncate` items of `sent` are read.
        group_maker: Callable that turns the list of `group` values of a
            mention's tokens into the mention's group key.
        ment_len: If True, the mention length (number of tokens) is added to
            the key: the key becomes ``(group_key, length)``, or just ``length``
            when `group_maker` returned None.
        ment_cat: If True, the key is further wrapped as ``(key, category)``.

    Returns:
        defaultdict mapping group key -> defaultdict mapping
        ``(sent_id, mention_text, category)`` -> occurrence count. Multi-token
        mention text is the tokens joined by a single space, and the category
        is the one seen on the opening 'B' tag.

    Changes relative to the previous version:
        * `truncate` no longer raises NameError (`islice` was never imported).
        * `ment_cat` now applies to multi-token mentions even when `ment_len`
          is False, matching the single-token ('S') branch.
        * A 'B' tag starts a fresh group list, so labels of an unterminated
          mention (e.g. 'B' directly followed by 'B') no longer leak into
          the next mention.
    """
    mentions = defaultdict(lambda: defaultdict(int))

    if truncate is not None:
        # Local import keeps this cell self-contained.
        from itertools import islice
        it = islice(sent, truncate)
    else:
        it = sent

    current_mention = None
    current_cat = None
    current_group = []

    for tok, bio, cat, group in it:
        if bio == 'S':
            key = _mention_group_key(group_maker, [group], 1, cat, ment_len, ment_cat)
            mentions[key][(sent_id, tok, cat)] += 1
            current_mention, current_cat, current_group = None, None, []
        elif bio == 'B':
            # Start from scratch: discard whatever an unterminated mention left behind.
            current_mention, current_cat, current_group = [tok], cat, [group]
        elif bio == 'I' and current_mention is not None:
            current_mention.append(tok)
            current_group.append(group)
        elif bio == 'E' and current_mention is not None:
            current_mention.append(tok)
            current_group.append(group)
            key = _mention_group_key(group_maker, current_group, len(current_mention),
                                     current_cat, ment_len, ment_cat)
            mentions[key][(sent_id, ' '.join(current_mention), current_cat)] += 1
            current_mention, current_cat, current_group = None, None, []
        elif bio == 'O':
            current_mention, current_cat, current_group = None, None, []

    return mentions

In [30]:
def get_sent(g, cols):
    """Return the rows of `g`, restricted to `cols`, as a list of plain tuples.

    Args:
        g: DataFrame (e.g. one group of a groupby).
        cols: List of column labels to keep, in output order.

    Returns:
        list[tuple]: One tuple per row of `g`, in row order.
    """
    # itertuples() walks the selected columns only, so each value keeps its
    # own column's dtype and no per-row Series is built. The previous
    # iterrows() loop was slow and upcast values across the whole row
    # (e.g. an int column became float next to a float column).
    return list(g[cols].itertuples(index=False, name=None))

In [31]:
def get_sents_fixed(sents):
    """Split the tag of every token into prefix and category, per sentence.

    Args:
        sents: Object that iterates over sentences and exposes `.index`
            (e.g. a pandas Series). Each sentence is a sequence of
            ``(tok, tag, *extra)`` tuples, where `tag` looks like 'B-PER' or 'O'.

    Returns:
        list of ``(index_label, sentence)`` pairs, where every token of
        `sentence` is ``(tok, prefix, category, group)``:
            prefix   -- the part of `tag` before the first '-'
            category -- the part after the first '-', or '_' if there is no '-'
            group    -- the single extra value if there is exactly one,
                        otherwise a tuple of all extra values
    """
    fixed = []
    for sent in sents:
        new_sent = []
        for tok, biose, *extra in sent:
            group = extra[0] if len(extra) == 1 else tuple(extra)
            # partition() splits on the first '-' only, so a category that
            # itself contains '-' is kept whole (split('-')[1] truncated it).
            prefix, sep, cat = biose.partition('-')
            new_sent.append((tok, prefix, cat if sep else '_', group))
        fixed.append(new_sent)
    return list(zip(list(sents.index), fixed))

In [32]:
def get_ment_set(sents):
    """Flatten per-sentence mention counts into one list of mentions per group.

    Args:
        sents: Iterable of mappings ``group -> {mention_key: count}``, where
            `mention_key` is indexable with at least three elements.

    Returns:
        defaultdict(list) mapping each group to tuples
        ``(key[0], key[1], key[2], occurrence)``; a mention counted `count`
        times contributes occurrences 1..count.
    """
    by_group = defaultdict(list)
    counted = (
        (group, key, count)
        for sent in sents
        for group, key_counts in sent.items()
        for key, count in key_counts.items()
    )
    for group, key, count in counted:
        for occurrence in range(1, count + 1):
            by_group[group].append((key[0], key[1], key[2], occurrence))
    return by_group

In [230]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([('T',), (), ('TM',), ('T', 'TM')])

In [231]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([('T',), (), ('TM',), ('T', 'TM')])

In [232]:
ment_set.keys() | gold_ment_set.keys()

{(), ('T',), ('T', 'TM'), ('TM',)}

In [233]:
for k in ment_set.keys() | gold_ment_set.keys():
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k])

('T', 'TM')
6 mentions, 6 found, 3 correct.
Precision: 0.5
Recall:    0.5
F1:        0.5
FP ex.: ['שכונת קטמון', 'ארגון נפגעי ה משכנתאות', 'כביש באב אל - ואד']
FN ex.: ['נאום אימפריית ה רשע', 'ארגון נפגעי ה משכנתאות ו חסרי ה דיור', 'ה קרב על מנזר סן סימון']
('TM',)
204 mentions, 178 found, 151 correct.
Precision: 0.85
Recall:    0.74
F1:        0.79
FP ex.: ['ל אקספרס', 'קטמון', 'איסט סייד', 'ירסקו', 'אולין']
FN ex.: ['חברה קדישא', 'נווה - מונסון', 'מפם', 'טום דאיין', 'ה שדולה ה פרו - ישראלית']
()
244 mentions, 236 found, 194 correct.
Precision: 0.82
Recall:    0.8
F1:        0.81
FP ex.: ['גרוסבורד', 'מסצוסטס', 'שלום ה רב כהנא', 'קרן גון', 'ה שטחים']
FN ex.: ['ה ליגה ל זכויות ה אדם', 'מסצוסטס', 'פיו', 'ה התאחדות ל עולם טוב יותר', 'פיו']
('T',)
45 mentions, 26 found, 23 correct.
Precision: 0.88
Recall:    0.51
F1:        0.65
FP ex.: ['ישיבת ה רעיון', 'ה ארה"ב', 'ה מסצוסטס']
FN ex.: ['ארה"ב', 'תנועת ה מושבים', 'ה מצביע ה אמריקאי', 'ה קרב על סן סימון', 'תנועת ה מושבים']


## Morpheme count

In [705]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'morpheme_count'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id, group_maker=max) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([2, 3, 1])

In [706]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'morpheme_count'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id, group_maker=max) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([2, 3, 1])

In [707]:
ment_set.keys() | gold_ment_set.keys()

{1, 2, 3}

In [708]:
for k in ment_set.keys() | gold_ment_set.keys():
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k], examples=11)

1
275 mentions, 260 found, 225 correct.
Precision: 0.87
Recall:    0.82
F1:        0.84
FP ex.: ['גרוסבורד', 'מסצוסטס', 'קרן גון', 'סן סימון', 'קטמון', 'יונה', 'ירסקו', 'אולין', 'קטמון', 'וייטנאם', 'רעיה']
FN ex.: ['מפם', 'טום דאיין', 'פיו', 'מסצוסטס', 'טובייה', 'פיו', 'ורמונט', 'דה מוין', 'מקארתור', 'סמית - ריצרדסון', 'קטמון']
2
188 mentions, 164 found, 133 correct.
Precision: 0.81
Recall:    0.71
F1:        0.76
FP ex.: ['ליון דן', 'שלום ה רב כהנא', 'ה שמרנים', 'לוס אוחוס', 'שיקאגו', 'ה ארץ', 'מדינות ה ברית', 'ישיבת ה רעיון ה יהודי', 'מקורות', 'ה ארץ', 'קטיף ה הדרים']
FN ex.: ['ה ליגה ל זכויות ה אדם', 'חברה קדישא', 'ה שדולה ה פרו - ישראלית', 'קרן פורד', 'ליון', 'תנועת ה מושבים', 'ה מצביע ה אמריקאי', 'מלאך ה מוות', 'אושוויץ', 'ה ברוקרים של ה רעיונות : צוותות חשיבה ו עלייתה של עלית מדיניות חדשה', 'ה קרב על סן סימון']
3
36 mentions, 30 found, 25 correct.
Precision: 0.83
Recall:    0.69
F1:        0.76
FP ex.: ['ה שטחים', 'ה קטיף', 'ה קטיף', 'ה קטיף', 'ה קטיף']
FN ex.: ['ה בית ה לבן', 'ה

In [None]:
['ה בית ה לבן', 'ה מלחמה ה קרה', 'ה קרב על ירושלים', 'ה קרב על מנזר סן סימון', 'ה קונסוליה ה ישראלית ב שיקאגו', 'מרכז ה מידע ל זכויות ה אדם ב ה שטחים', 'ה בית ה לבן', 'איסט סייד ה תחתית', 'ה עולם ה שלישי', 'ה מלחמה ה קרה']
['ה בית ה לבן', 'ה מלחמה ה קרה', 'ה קרב על ירושלים', 'ה קרב על מנזר סן סימון', 'ה קונסוליה ה ישראלית ב שיקאגו', 'מרכז ה מידע ל זכויות ה אדם ב ה שטחים', 'ה בית ה לבן', 'איסט סייד ה תחתית', 'ה עולם ה שלישי', 'ה מלחמה ה קרה', 'ה עבודה ו ה רווחה']

In [683]:
pred_samp = nem.read_file_sents(os.path.join('final_setup/decode_output/morph_dev_gold.morph.char_cnn.glv_yap.44_seed.bmes'))
pred_samp.head()

0    [(עשרות, O), (אנשים, O), (מגיעים, O), (מ, O), ...
1    [(תופעה, O), (זו, O), (התבררה, O), (אתמול, O),...
2    [(יו"ר, O), (ה, O), (וועדה, O), (,, O), (ח"כ, ...
3    [(מ, O), (צד, O), (אחד, O), (רוצה, O), (ה, B-O...
4    [(נמיר, S-PER), (הודיעה, O), (כי, O), (תפנה, O...
dtype: object

In [684]:
# One row per predicted label: (1-based sentence id, 1-based position inside
# the sentence, tag). The token text itself is not needed here.
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent that works on both old and new versions.
for_df = [
    (sent_idx + 1, position, bio)
    for sent_idx, line in pred_samp.items()
    for position, (_tok, bio) in enumerate(line, start=1)
]
ner_samp = pd.DataFrame(for_df, columns=['sent_id', 'id', 'biose_layer0'])
ner_samp.head()

Unnamed: 0,sent_id,id,biose_layer0
0,1,1,O
1,1,2,O
2,1,3,O
3,1,4,O
4,1,5,S-GPE


In [685]:
yap_out = bclm.read_yap_output(treebank_set=None,
                               tokens_path=bclm.TREEBANK_TOKEN_PATHS['dev'], 
                                dep_path=txt_map[('dev', 'char_cnn', 'glv_yap', 'conll')],
                                map_path=txt_map[('dev', 'char_cnn', 'glv_yap', 'map')],)
yap_out.head()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,misc,sent_id,token_id,token_str
0,1,עשרות,עשר,CDT,CDT,gen=F|num=P,2,num,_,_,1,1,עשרות
1,2,אנשים,איש,NN,NN,gen=M|num=P,3,subj,_,_,1,2,אנשים
2,3,מגיעים,הגיע,BN,BN,gen=M|num=P|per=A,14,conj,_,_,1,3,מגיעים
3,4,מ,מ,PREPOSITION,PREPOSITION,,3,comp,_,_,1,4,מתאילנד
4,5,תאילנד,תאילנד,NNP,NNP,gen=F|num=S,4,pobj,_,_,1,4,מתאילנד


In [686]:
full_samp = yap_out.merge(ner_samp)
full_samp = full_samp.merge(dev_unk, how='left')
full_samp.tail()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,misc,sent_id,token_id,token_str,biose_layer0,token_unk,morph_unk,lemma_unk,unk_type,has_ner,morpheme_count
11188,1,מתחיל,התחייל,BNT,BNT,gen=M|num=S|per=A,0,ROOT,_,_,500,1,מתחיל,O,False,False,False,,False,1
11189,2,מסע,מסע,NNT,NNT,gen=M|num=S,1,gobj,_,_,500,2,מסע,O,False,False,False,,False,1
11190,3,ה,ה,DEF,DEF,,4,def,_,_,500,3,הנקמה,O,False,False,False,,False,2
11191,4,נקמה,נקמה,NN,NN,gen=F|num=S,2,gobj,_,_,500,3,הנקמה,O,False,False,False,,False,2
11192,5,.,,yyDOT,yyDOT,,1,punct,_,_,500,4,.,O,False,False,False,,False,1


In [687]:
# Rebuild `full_samp` from the gold dev rows, replacing the YAP-based
# `full_samp` created in the previous cell.
full_samp = dev_gold.copy()
# Overwrite the gold layer-0 labels with the labels held in `ner_samp`.
# Column assignment aligns on the index, not on (sent_id, id).
# NOTE(review): assumes `dev_gold` and `ner_samp` have the same row order and index -- confirm.
full_samp['biose_layer0'] = ner_samp['biose_layer0']
# Left merge keeps every row of `full_samp`; with no `on=` given, pandas joins
# on all columns the two frames have in common.
full_samp = full_samp.merge(dev_unk, how='left')
full_samp.tail()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,token_id,sent_id,token_str,global_sent_id,...,biose_layer0,biose_layer1,biose_layer2,biose_layer3,token_unk,morph_unk,lemma_unk,unk_type,has_ner,morpheme_count
11296,1,מתחיל,התחיל,BN,BN,gen=M|num=S|per=A,1,500,מתחיל,500,...,O,O,O,O,False,False,False,,False,1
11297,2,מסע,מסע,NNT,NNT,gen=M|num=S,2,500,מסע,500,...,O,O,O,O,False,False,False,,False,1
11298,3,ה,ה,DEF,DEF,_,3,500,הנקמה,500,...,O,O,O,O,False,False,False,,False,2
11299,4,נקמה,נקמה,NN,NN,gen=F|num=S,3,500,הנקמה,500,...,O,O,O,O,False,False,False,,False,2
11300,5,.,_,yyDOT,yyDOT,_,4,500,.,500,...,O,O,O,O,False,False,False,,False,1


In [688]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'morpheme_count'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id, group_maker=max) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([2, 3, 1])

In [689]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'morpheme_count'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id, group_maker=max) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([2, 3, 1])

In [690]:
ment_set.keys() | gold_ment_set.keys()

{1, 2, 3}

In [691]:
for k in ment_set.keys() | gold_ment_set.keys():
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k], examples=10)

1
275 mentions, 268 found, 220 correct.
Precision: 0.82
Recall:    0.8
F1:        0.81
FP ex.: ['אוחדה', 'גרוסבורד', 'הדסון', 'רץ', 'קדישא', 'יונה', 'אולין', 'ברוקינגס', 'רצ"ב', 'וייטנאם']
FN ex.: ['קול ישראל', 'מפם', 'טום דאיין', 'פיו', 'מסצוסטס', 'טובייה', 'פיו', 'ריצארדס', 'דה מוין', 'מקארתור']
2
188 mentions, 156 found, 126 correct.
Precision: 0.81
Recall:    0.67
F1:        0.73
FP ex.: ['ליון דן', 'ה משטרה', 'שיקאגו', 'ה ארץ', 'מדינות ה ברית', 'מקורות', 'ה ארץ', 'שמואל ה נביא', 'עלי', 'מושלת טקסס']
FN ex.: ['ה ליגה ל זכויות ה אדם', 'חברה קדישא', 'נווה - מונסון', 'ה שדולה ה פרו - ישראלית', 'קרן פורד', 'ליון', 'ה התאחדות ל עולם טוב יותר', 'ה מצביע ה אמריקאי', 'מלאך ה מוות', 'ה מיליציה ה צרפתית']
3
36 mentions, 28 found, 26 correct.
Precision: 0.93
Recall:    0.72
F1:        0.81
FP ex.: ['ה שטחים', 'ה איסט סייד']
FN ex.: ['ה בית ה לבן', 'ה מלחמה ה קרה', 'ה קרב על ירושלים', 'ה קרב על מנזר סן סימון', 'מרכז ה מידע ל זכויות ה אדם ב ה שטחים', 'ה בית ה לבן', 'איסט סייד ה תחתית', 'ה עולם 

## Check morpheme count together with unk_type

In [238]:
def gm(l):
    """Group key combining unknown-type labels with the largest morpheme count.

    Args:
        l: Non-empty sequence of ``(unk_type, morpheme_count)`` pairs, one per
            token of a mention.

    Returns:
        tuple: ``(ut_gm(all unk_type values), max(all morpheme_count values))``.
    """
    # Transpose the list of pairs into two parallel tuples.
    unk_labels, morph_counts = zip(*l)
    return ut_gm(unk_labels), max(morph_counts)

In [239]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'morpheme_count'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id, group_maker=gm) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([(('T',), 2), ((), 2), ((), 3), ((), 1), (('T',), 1), (('TM',), 1), (('T', 'TM'), 1), (('TM',), 2), (('T', 'TM'), 2), (('TM',), 3), (('T',), 3)])

In [240]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'morpheme_count'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id, group_maker=gm) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([(('T',), 2), ((), 2), ((), 3), ((), 1), (('T',), 3), (('T',), 1), (('TM',), 1), (('T', 'TM'), 3), (('TM',), 2), (('T', 'TM'), 2), (('TM',), 3)])

In [241]:
ment_set.keys() | gold_ment_set.keys()

{((), 1),
 ((), 2),
 ((), 3),
 (('T',), 1),
 (('T',), 2),
 (('T',), 3),
 (('T', 'TM'), 1),
 (('T', 'TM'), 2),
 (('T', 'TM'), 3),
 (('TM',), 1),
 (('TM',), 2),
 (('TM',), 3)}

In [242]:
for k in ment_set.keys() | gold_ment_set.keys():
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k])

(('TM',), 3)
2 mentions, 5 found, 0 correct.
Precision: 0.0
Recall:    0.0
F1:        -1
FP ex.: ['ה קטיף', 'איסט סייד', 'ה קטיף', 'ה קטיף', 'ה קטיף']
FN ex.: ['איסט סייד ה תחתית', 'ה קונסוליה ה ישראלית ב שיקאגו']
(('TM',), 2)
47 mentions, 30 found, 26 correct.
Precision: 0.87
Recall:    0.55
F1:        0.68
FP ex.: ['נווה - מונסון', 'הרי בראדלו', 'קרן פורד שמרנים', 'קטיף ה הדרים']
FN ex.: ['חברה קדישא', 'נווה - מונסון', 'ה שדולה ה פרו - ישראלית', 'ליון', 'קרן פורד']
(('T', 'TM'), 3)
1 mentions, 0 found, 0 correct.
Precision: -1
Recall:    0.0
F1:        0.0
FP ex.: []
FN ex.: ['ה קרב על מנזר סן סימון']
(('T', 'TM'), 2)
5 mentions, 4 found, 3 correct.
Precision: 0.75
Recall:    0.6
F1:        0.67
FP ex.: ['ארגון נפגעי ה משכנתאות']
FN ex.: ['נאום אימפריית ה רשע', 'ארגון נפגעי ה משכנתאות ו חסרי ה דיור']
(('T',), 2)
34 mentions, 20 found, 19 correct.
Precision: 0.95
Recall:    0.56
F1:        0.7
FP ex.: ['ישיבת ה רעיון']
FN ex.: ['ה קרב על סן סימון', 'תנועת ה מושבים', 'ה בית ה לבן', 'ה 

## Check length of mention

In [243]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'morpheme_count'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id, group_maker=lambda x: None, ment_len=True) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([1, 6, 2, 3, 4, 5, 7])

In [244]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'morpheme_count'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id, group_maker=lambda x: None, ment_len=True) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([1, 6, 2, 5, 3, 4, 8, 7, 10, 14])

In [245]:
ment_set.keys() | gold_ment_set.keys()

{1, 2, 3, 4, 5, 6, 7, 8, 10, 14}

In [246]:
for k in sorted(ment_set.keys() | gold_ment_set.keys()):
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k])

1
228 mentions, 207 found, 183 correct.
Precision: 0.88
Recall:    0.8
F1:        0.84
FP ex.: ['גרוסבורד', 'מסצוסטס', 'מסצוסטס', 'קטמון', 'יונה']
FN ex.: ['ארה"ב', 'מפם', 'פיו', 'מסצוסטס', 'ליון']
2
154 mentions, 152 found, 123 correct.
Precision: 0.81
Recall:    0.8
F1:        0.8
FP ex.: ['קרן גון', 'ה שמרנים', 'ל אקספרס', 'ה שטחים', 'איסט סייד']
FN ex.: ['חברה קדישא', 'טום דאיין', 'קרן פורד', 'ה ארץ', 'ה כנסת']
3
48 mentions, 44 found, 30 correct.
Precision: 0.68
Recall:    0.62
F1:        0.65
FP ex.: ['מנזר סן סימון', 'מכון ה דסון', 'סמית - ריצרדסון', 'נווה - מונסון', 'ישיבת ה רעיון']
FN ex.: ['מלחמת ה עצמאות', 'ירושלים ה מערבית', 'נווה - מונסון', 'תנועת ה מושבים', 'אמריקן אקונומיק ריוויו']
4
45 mentions, 29 found, 24 correct.
Precision: 0.83
Recall:    0.53
F1:        0.65
FP ex.: ['שלום ה רב כהנא', 'ה מלחמת ה עצמאות', 'הר - ה בית', 'ארגון נפגעי ה משכנתאות', 'שכונת שמואל ה נביא']
FN ex.: ['ה מצביע ה אמריקאי', 'ה עולם ה שלישי', 'לינד ו הרי בראדלו', 'ה קרב על ירושלים', 'איסט סייד 

## Length+unk_type

In [247]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id, group_maker=ut_gm, ment_len=True) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([(('T',), 1), ((), 1), ((), 6), ((), 2), (('T',), 2), (('T',), 3), ((), 3), ((), 4), ((), 5), (('TM',), 2), (('T', 'TM'), 2), (('TM',), 1), (('TM',), 3), (('T', 'TM'), 5), (('T', 'TM'), 4), ((), 7), (('TM',), 5), (('T',), 5), (('TM',), 4)])

In [248]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id, group_maker=ut_gm, ment_len=True) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([(('T',), 1), ((), 1), ((), 6), ((), 2), (('T',), 5), (('T',), 3), (('T',), 2), ((), 3), ((), 4), ((), 5), (('TM',), 2), (('TM',), 1), (('TM',), 3), (('T', 'TM'), 6), (('TM',), 4), (('T', 'TM'), 8), ((), 7), (('T',), 4), (('TM',), 6), ((), 10), (('T', 'TM'), 2), (('T', 'TM'), 4), (('TM',), 14)])

In [249]:
ment_set.keys() | gold_ment_set.keys()

{((), 1),
 ((), 2),
 ((), 3),
 ((), 4),
 ((), 5),
 ((), 6),
 ((), 7),
 ((), 10),
 (('T',), 1),
 (('T',), 2),
 (('T',), 3),
 (('T',), 4),
 (('T',), 5),
 (('T', 'TM'), 2),
 (('T', 'TM'), 4),
 (('T', 'TM'), 5),
 (('T', 'TM'), 6),
 (('T', 'TM'), 8),
 (('TM',), 1),
 (('TM',), 2),
 (('TM',), 3),
 (('TM',), 4),
 (('TM',), 5),
 (('TM',), 6),
 (('TM',), 14)}

In [250]:
for k in sorted(ment_set.keys() | gold_ment_set.keys()):
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k])

((), 1)
110 mentions, 108 found, 92 correct.
Precision: 0.85
Recall:    0.84
F1:        0.84
FP ex.: ['מסצוסטס', 'הראל', 'גרוסבורד', 'מסצוסטס', 'מסצוסטס']
FN ex.: ['ה', 'כך', 'וייטנאם', 'מסצוסטס', 'פיו']
((), 2)
69 mentions, 69 found, 53 correct.
Precision: 0.77
Recall:    0.77
F1:        0.77
FP ex.: ['מחוז פאריס', 'ה שמרנים', 'סן סימון', 'ה שטחים', 'קרן גון']
FN ex.: ['ה ארץ', 'מחוז פאריס', 'ה גרמנים', 'סנטר 1', 'ניו גרסי']
((), 3)
23 mentions, 23 found, 18 correct.
Precision: 0.78
Recall:    0.78
F1:        0.78
FP ex.: ['רצח ה נזירות', 'מדינות ה ברית', 'רחוב גבעת שאול', 'גורג ה מסכן', 'ה שירות פועל']
FN ex.: ['בית מספר 3', 'ירושלים ה מערבית', 'רחוב גבעת שאול', 'מלחמת ה עצמאות', 'ניו - יורק']
((), 4)
29 mentions, 26 found, 22 correct.
Precision: 0.85
Recall:    0.76
F1:        0.8
FP ex.: ['הר - ה בית', 'שכונת שמואל ה נביא', 'שלום ה רב כהנא', 'ה מלחמת ה עצמאות']
FN ex.: ['חטיבת " הראל "', 'פרס נובל ל שלום', 'ה קרב על ירושלים', 'הר - ה בית', 'שכונת שמואל ה נביא']
((), 5)
6 mentions, 

## Length+unk_type+morpheme_count

In [251]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'morpheme_count'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id, group_maker=gm, ment_len=True) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([((('T',), 2), 1), (((), 2), 1), (((), 3), 6), (((), 2), 2), (((), 1), 2), (((), 1), 1), ((('T',), 1), 2), ((('T',), 2), 3), (((), 2), 3), ((('T',), 2), 2), (((), 2), 4), (((), 3), 5), ((('TM',), 1), 2), (((), 3), 2), ((('T', 'TM'), 1), 2), ((('TM',), 1), 1), ((('TM',), 1), 3), ((('TM',), 2), 3), ((('T', 'TM'), 1), 5), (((), 2), 5), ((('T', 'TM'), 2), 4), (((), 3), 4), ((('TM',), 2), 1), ((('TM',), 2), 2), (((), 2), 7), ((('TM',), 1), 5), ((('TM',), 3), 2), ((('T',), 2), 5), (((), 1), 3), ((('T', 'TM'), 2), 2), ((('T',), 3), 1), ((('TM',), 1), 4), ((('T',), 3), 2), (((), 1), 4)])

In [252]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'morpheme_count'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id, group_maker=gm, ment_len=True) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([((('T',), 2), 1), (((), 2), 1), (((), 3), 6), (((), 2), 2), (((), 1), 2), (((), 1), 1), ((('T',), 3), 5), ((('T',), 2), 3), ((('T',), 1), 2), (((), 2), 3), ((('T',), 2), 2), (((), 2), 4), (((), 3), 5), ((('TM',), 1), 2), ((('TM',), 1), 1), (((), 3), 2), ((('T',), 2), 5), (((), 3), 4), ((('TM',), 1), 3), (((), 1), 4), ((('T', 'TM'), 3), 6), ((('TM',), 2), 3), ((('TM',), 2), 2), ((('TM',), 1), 4), (((), 2), 5), ((('T', 'TM'), 2), 8), ((('TM',), 2), 1), (((), 2), 7), ((('T',), 2), 4), ((('TM',), 2), 6), ((('TM',), 3), 6), (((), 3), 10), ((('TM',), 2), 4), (((), 2), 6), (((), 1), 3), ((('T', 'TM'), 2), 2), ((('T',), 3), 1), ((('TM',), 3), 4), ((('T', 'TM'), 2), 4), ((('T',), 3), 4), ((('TM',), 2), 14)])

In [253]:
ment_set.keys() | gold_ment_set.keys()

{(((), 1), 1),
 (((), 1), 2),
 (((), 1), 3),
 (((), 1), 4),
 (((), 2), 1),
 (((), 2), 2),
 (((), 2), 3),
 (((), 2), 4),
 (((), 2), 5),
 (((), 2), 6),
 (((), 2), 7),
 (((), 3), 2),
 (((), 3), 4),
 (((), 3), 5),
 (((), 3), 6),
 (((), 3), 10),
 ((('T',), 1), 2),
 ((('T',), 2), 1),
 ((('T',), 2), 2),
 ((('T',), 2), 3),
 ((('T',), 2), 4),
 ((('T',), 2), 5),
 ((('T',), 3), 1),
 ((('T',), 3), 2),
 ((('T',), 3), 4),
 ((('T',), 3), 5),
 ((('T', 'TM'), 1), 2),
 ((('T', 'TM'), 1), 5),
 ((('T', 'TM'), 2), 2),
 ((('T', 'TM'), 2), 4),
 ((('T', 'TM'), 2), 8),
 ((('T', 'TM'), 3), 6),
 ((('TM',), 1), 1),
 ((('TM',), 1), 2),
 ((('TM',), 1), 3),
 ((('TM',), 1), 4),
 ((('TM',), 1), 5),
 ((('TM',), 2), 1),
 ((('TM',), 2), 2),
 ((('TM',), 2), 3),
 ((('TM',), 2), 4),
 ((('TM',), 2), 6),
 ((('TM',), 2), 14),
 ((('TM',), 3), 2),
 ((('TM',), 3), 4),
 ((('TM',), 3), 6)}

In [254]:
for k in sorted(ment_set.keys() | gold_ment_set.keys()):
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k])

(((), 1), 1)
73 mentions, 68 found, 56 correct.
Precision: 0.82
Recall:    0.77
F1:        0.79
FP ex.: ['מסצוסטס', 'גרוסבורד', 'הראל', 'מסצוסטס', 'מסצוסטס']
FN ex.: ['ה', 'כך', 'וייטנאם', 'מסצוסטס', 'פיו']
(((), 1), 2)
39 mentions, 42 found, 36 correct.
Precision: 0.86
Recall:    0.92
F1:        0.89
FP ex.: ['מחוז פאריס', 'קרן גון', 'סן סימון', 'נא מילוא', 'שלום עכשיו']
FN ex.: ['מחוז פאריס', 'בית נבחרים', 'סנטר 1']
(((), 1), 3)
4 mentions, 3 found, 3 correct.
Precision: 1.0
Recall:    0.75
F1:        0.86
FP ex.: []
FN ex.: ['בית מספר 3']
(((), 1), 4)
2 mentions, 1 found, 1 correct.
Precision: 1.0
Recall:    0.5
F1:        0.67
FP ex.: []
FN ex.: ['חטיבת " הראל "']
(((), 2), 1)
37 mentions, 40 found, 36 correct.
Precision: 0.9
Recall:    0.97
F1:        0.94
FP ex.: ['ירושלים', 'מקורות', 'אושוויץ', 'שיקאגו']
FN ex.: ['אושוויץ']
(((), 2), 2)
19 mentions, 17 found, 9 correct.
Precision: 0.53
Recall:    0.47
F1:        0.5
FP ex.: ['ה שמרנים', 'ה ארץ', 'ה שמרנים', 'ה ארץ', 'ה קב"ה']
FN

## Check OOEV

In [269]:
full_samp['ooev'] = ~full_samp.form.isin(emb_words['tok'])
full_samp.ooev.value_counts()

False    11127
True       174
Name: ooev, dtype: int64

In [270]:
ooev_tok_part = (full_samp.groupby(['sent_id', 'token_id'])
                 .ooev.apply(lambda x: x.any())
                 .reset_index().rename(columns={'ooev': 'ooev_tok_part'}))
ooev_tok_part.ooev_tok_part.value_counts()

False    8357
True      174
Name: ooev_tok_part, dtype: int64

In [271]:
full_samp = full_samp.merge(ooev_tok_part, how='left')
dev_gold = dev_gold.merge(ooev_tok_part, how='left')
full_samp.shape, dev_gold.shape

((11301, 22), (11301, 34))

In [272]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'ooev_tok_part'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([(False,), (False, True), (True,)])

In [273]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'ooev_tok_part'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([(False,), (False, True), (True,)])

In [274]:
ment_set.keys() | gold_ment_set.keys()

{(False,), (False, True), (True,)}

In [275]:
for k in ment_set.keys() | gold_ment_set.keys():
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k])

(False, True)
24 mentions, 24 found, 17 correct.
Precision: 0.71
Recall:    0.71
F1:        0.71
FP ex.: ['סמית - ריצרדסון', 'אייפא"ק טום דאיין את הארקין', 'אמריקן אקונומיק ריוויו', 'הרי בראדלו', 'מחוז זירונד']
FN ex.: ['אמריקן אקונומיק ריוויו', 'לינד ו הרי בראדלו', 'שרה סקאיף', 'גנאדאס דל ואיה', 'סמית - ריצרדסון']
(False,)
441 mentions, 389 found, 330 correct.
Precision: 0.85
Recall:    0.75
F1:        0.8
FP ex.: ['שלום ה רב כהנא', 'קרן גון', 'סן סימון', 'ל אקספרס', 'קטמון']
FN ex.: ['ה ליגה ל זכויות ה אדם', 'טום דאיין', 'טובייה', 'ה ברוקרים של ה רעיונות : צוותות חשיבה ו עלייתה של עלית מדיניות חדשה', 'ה קרב על מנזר סן סימון']
(True,)
34 mentions, 33 found, 24 correct.
Precision: 0.73
Recall:    0.71
F1:        0.72
FP ex.: ['מסצוסטס', 'גרוסבורד', 'פראפראז', 'מסצוסטס', 'מסצוסטס']
FN ex.: ['מסצוסטס', 'נבראסקה', 'מסצוסטס', 'ה קב"ה', 'אייפא"ק']


## OOEV + unk_type

In [281]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'ooev_tok_part'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id, group_maker=gm) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([(('T',), False), ((), False), (('TM',), False), (('T', 'TM'), False), (('TM',), True), ((), True), (('T',), True), (('T', 'TM'), True)])

In [280]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'ooev_tok_part'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id, group_maker=gm) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([(('T',), False), ((), False), (('TM',), False), (('T', 'TM'), False), (('TM',), True), ((), True), (('T',), True), (('T', 'TM'), True)])

In [282]:
ment_set.keys() | gold_ment_set.keys()

{((), False),
 ((), True),
 (('T',), False),
 (('T',), True),
 (('T', 'TM'), False),
 (('T', 'TM'), True),
 (('TM',), False),
 (('TM',), True)}

In [283]:
for k in ment_set.keys() | gold_ment_set.keys():
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k])

(('TM',), False)
170 mentions, 146 found, 128 correct.
Precision: 0.88
Recall:    0.75
F1:        0.81
FP ex.: ['מנזר סן סימון', 'מכון ה דסון', 'נווה - מונסון', 'ל אקספרס', 'קטמון']
FN ex.: ['חברה קדישא', 'נווה - מונסון', 'מפם', 'טום דאיין', 'ה שדולה ה פרו - ישראלית']
(('T', 'TM'), True)
1 mentions, 1 found, 1 correct.
Precision: 1.0
Recall:    1.0
F1:        1.0
FP ex.: []
FN ex.: []
(('T', 'TM'), False)
5 mentions, 5 found, 2 correct.
Precision: 0.4
Recall:    0.4
F1:        0.4
FP ex.: ['שכונת קטמון', 'ארגון נפגעי ה משכנתאות', 'כביש באב אל - ואד']
FN ex.: ['נאום אימפריית ה רשע', 'ארגון נפגעי ה משכנתאות ו חסרי ה דיור', 'ה קרב על מנזר סן סימון']
(('T',), False)
43 mentions, 24 found, 22 correct.
Precision: 0.92
Recall:    0.51
F1:        0.66
FP ex.: ['ישיבת ה רעיון', 'ה ארה"ב']
FN ex.: ['ארה"ב', 'תנועת ה מושבים', 'ה מצביע ה אמריקאי', 'ה קרב על סן סימון', 'תנועת ה מושבים']
(('T',), True)
2 mentions, 2 found, 1 correct.
Precision: 0.5
Recall:    0.5
F1:        0.5
FP ex.: ['ה מסצוסטס']

## Everything combined: unk_type + morpheme_count + OOEV + mention length

In [284]:
token_unk_sents = full_samp.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'morpheme_count', 'ooev_tok_part'])
token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
ments = [sent_to_mentions_dict(sent, sent_id, ment_len=True) for sent_id, sent in token_unk_sents_fixed]
ment_set = get_ment_set(ments)
ment_set.keys()

dict_keys([((('T', 2, False),), 1), ((('', 2, False),), 1), ((('', 2, False), ('', 3, False)), 6), ((('', 2, False),), 2), ((('', 1, False),), 2), ((('', 1, False),), 1), ((('', 1, False), ('', 2, False), ('', 3, False)), 6), ((('', 1, False), ('T', 1, False)), 2), ((('', 2, False), ('T', 2, False)), 3), ((('', 1, False), ('', 2, False)), 3), ((('', 1, False), ('T', 2, False)), 2), ((('', 2, False),), 4), ((('', 2, False), ('', 3, False)), 5), ((('', 1, False), ('TM', 1, False)), 2), ((('', 3, False),), 2), ((('T', 1, False), ('TM', 1, False)), 2), ((('TM', 1, False),), 1), ((('', 1, False), ('TM', 1, False)), 3), ((('', 1, False), ('TM', 1, False), ('TM', 2, False)), 3), ((('TM', 1, False),), 2), ((('', 1, False), ('T', 1, False), ('TM', 1, False)), 5), ((('', 1, False), ('', 2, False)), 5), ((('TM', 1, False), ('TM', 1, True)), 2), ((('', 1, False), ('T', 2, False), ('TM', 1, False)), 4), ((('', 2, False), ('', 3, False)), 4), ((('TM', 2, False),), 1), ((('TM', 1, False), ('TM', 2, F

In [285]:
gold_dev_sents = dev_gold.groupby('sent_id').apply(get_sent, ['form', 'biose_layer0', 'unk_type', 'morpheme_count', 'ooev_tok_part'])
gold_dev_sents_fixed = get_sents_fixed(gold_dev_sents)
gold_ments = [sent_to_mentions_dict(sent, sent_id, ment_len=True) for sent_id, sent in gold_dev_sents_fixed]
gold_ment_set = get_ment_set(gold_ments)
gold_ment_set.keys()

dict_keys([((('T', 2, False),), 1), ((('', 2, False),), 1), ((('', 2, False), ('', 3, False)), 6), ((('', 2, False),), 2), ((('', 1, False),), 2), ((('', 1, False),), 1), ((('', 1, False), ('', 2, False), ('', 3, False)), 6), ((('', 3, False), ('T', 3, False)), 5), ((('', 1, False), ('T', 2, False)), 3), ((('', 1, False), ('T', 1, False)), 2), ((('', 2, False), ('T', 2, False)), 3), ((('', 1, False), ('', 2, False)), 3), ((('', 1, False), ('T', 2, False)), 2), ((('', 2, False),), 4), ((('', 2, False), ('', 3, False)), 5), ((('', 1, False), ('TM', 1, False)), 2), ((('TM', 1, False),), 1), ((('', 3, False),), 2), ((('', 1, False), ('T', 2, False)), 5), ((('', 2, False),), 3), ((('', 1, False), ('', 3, False)), 4), ((('', 1, False), ('TM', 1, False)), 3), ((('', 1, False),), 4), ((('', 1, False), ('T', 3, False), ('TM', 1, False)), 6), ((('', 1, False), ('TM', 2, False)), 3), ((('TM', 2, False),), 2), ((('', 1, False), ('TM', 1, False), ('TM', 2, False)), 3), ((('TM', 1, False),), 2), (((

In [286]:
ment_set.keys() | gold_ment_set.keys()

{((('', 1, False),), 1),
 ((('', 1, False),), 2),
 ((('', 1, False),), 3),
 ((('', 1, False),), 4),
 ((('', 1, False), ('', 1, True)), 2),
 ((('', 1, False), ('', 2, False)), 2),
 ((('', 1, False), ('', 2, False)), 3),
 ((('', 1, False), ('', 2, False)), 4),
 ((('', 1, False), ('', 2, False)), 5),
 ((('', 1, False), ('', 2, False)), 6),
 ((('', 1, False), ('', 2, False)), 7),
 ((('', 1, False), ('', 2, False), ('', 3, False)), 6),
 ((('', 1, False), ('', 2, False), ('', 3, False)), 10),
 ((('', 1, False), ('', 2, False), ('T', 1, False)), 4),
 ((('', 1, False), ('', 2, False), ('T', 2, False)), 5),
 ((('', 1, False), ('', 2, False), ('T', 2, False), ('TM', 1, False)), 8),
 ((('', 1, False), ('', 2, False), ('TM', 1, False)), 3),
 ((('', 1, False), ('', 2, False), ('TM', 1, False), ('TM', 2, False)), 14),
 ((('', 1, False), ('', 2, False), ('TM', 2, False)), 6),
 ((('', 1, False), ('', 3, False)), 4),
 ((('', 1, False), ('T', 1, False)), 2),
 ((('', 1, False), ('T', 1, False), ('TM', 1,

In [287]:
for k in ment_set.keys() | gold_ment_set.keys():
    print(k)
    nem.evaluate_mentions(gold_ment_set[k], ment_set[k])

((('', 2, False),), 4)
12 mentions, 12 found, 11 correct.
Precision: 0.92
Recall:    0.92
F1:        0.92
FP ex.: ['ה מלחמת ה עצמאות']
FN ex.: ['ה עולם ה שלישי']
((('', 1, False), ('TM', 1, False)), 4)
1 mentions, 0 found, 0 correct.
Precision: -1
Recall:    0.0
F1:        0.0
FP ex.: []
FN ex.: ['באב אל - ואד']
((('', 1, False),), 2)
34 mentions, 37 found, 31 correct.
Precision: 0.84
Recall:    0.91
F1:        0.87
FP ex.: ['מחוז פאריס', 'קרן גון', 'סן סימון', 'נא מילוא', 'שלום עכשיו']
FN ex.: ['מחוז פאריס', 'בית נבחרים', 'סנטר 1']
((('', 1, False), ('', 2, False)), 4)
7 mentions, 7 found, 4 correct.
Precision: 0.57
Recall:    0.57
F1:        0.57
FP ex.: ['הר - ה בית', 'שכונת שמואל ה נביא', 'שלום ה רב כהנא']
FN ex.: ['הר - ה בית', 'שכונת שמואל ה נביא', 'פרס נובל ל שלום']
((('', 1, False), ('', 2, False), ('', 3, False)), 10)
1 mentions, 0 found, 0 correct.
Precision: -1
Recall:    0.0
F1:        0.0
FP ex.: []
FN ex.: ['מרכז ה מידע ל זכויות ה אדם ב ה שטחים']
((('T', 2, False), ('TM',

## Evaluate all

In [167]:
mev = pd.read_pickle('final_setup/mev2.pkl')
mev.head()

Unnamed: 0,gold_name,unit,arch,w_embed,seed_num,p_m,r_m,f_m,pred_set,acc,...,seg,model_file_name,dset_file_name,relevant_score,input_unit,embed_unit,embed_type,cm,pred_set_sub,pred_set_main
0,morph_dev_gold,morph,char_cnn,ft_yap,44_seed,0.859155,0.733467,0.791351,dev_gold,0.9667,...,,morph.char_cnn.ft_yap.44_seed.83.model,morph.char_cnn.ft_yap.44_seed.dset,0.7905,morph,morph,ft,Match,gold,dev
1,morph_dev_yap,morph,char_cnn,ft_yap,44_seed,0.780193,0.647295,0.707558,dev_yap,0.9667,...,,morph.char_cnn.ft_yap.44_seed.83.model,morph.char_cnn.ft_yap.44_seed.dset,0.7905,morph,morph,ft,Match,yap,dev
2,morph_test_gold,morph,char_cnn,ft_yap,44_seed,0.80485,0.747854,0.775306,test_gold,0.9667,...,,morph.char_cnn.ft_yap.44_seed.83.model,morph.char_cnn.ft_yap.44_seed.dset,0.7905,morph,morph,ft,Match,gold,test
3,morph_dev_gold,morph,char_cnn,ft_oov_tok,44_seed,0.843612,0.767535,0.803778,dev_gold,0.969,...,,morph.char_cnn.ft_oov_tok.44_seed.183.model,morph.char_cnn.ft_oov_tok.44_seed.dset,0.8038,morph,token,ft_oov,Clash,gold,dev
4,morph_test_yap,morph,char_cnn,ft_yap,44_seed,0.721411,0.636266,0.676169,test_yap,0.9667,...,,morph.char_cnn.ft_yap.44_seed.83.model,morph.char_cnn.ft_yap.44_seed.dset,0.7905,morph,morph,ft,Match,yap,test


In [189]:
# Split the full frame into its dev / test parts and attach the per-token
# unknown-word columns from dev_unk / test_unk (left merge on the shared columns).
dev_gold = spdf[spdf.set=='dev']
dev_gold = dev_gold.merge(dev_unk, how='left')
test_gold = spdf[spdf.set=='test']
test_gold = test_gold.merge(test_unk, how='left')

# Map every original test sent_id to its 1-based rank among the test sentences.
# The lambda must pick the 'index' column explicitly: `x + 1` is the whole
# two-column frame, and assigning a multi-column frame to a single column
# raises on current pandas.
# NOTE(review): presumably this renumbering lines the gold sentences up with the
# 1-based sent_id produced by get_df_from_sents - confirm.
test_sent_id_map = (test_gold.groupby('sent_id').size()
                    .reset_index().drop(0, axis=1).reset_index()
                    .assign(index=lambda x: x['index']+1).set_index('sent_id')['index'])

# NOTE(review): test_unk is remapped in place, so this cell is not idempotent -
# re-running it without rebuilding test_unk pushes the already-remapped ids
# through the map a second time.
test_gold['sent_id'] = test_gold.sent_id.map(test_sent_id_map)
test_unk['sent_id'] = test_unk.sent_id.map(test_sent_id_map)
test_gold.tail()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,token_id,sent_id,token_str,global_sent_id,...,biose_layer1,biose_layer2,biose_layer3,token_unk,morph_unk,lemma_unk,unk_type,has_ner,morpheme_count,token_id_rep
16823,5,אך,אך,CC,CC,_,4,706,אך,6153,...,O,O,O,False,False,False,,False,1,4
16824,6,לא,לא,RB,RB,_,5,706,לא,6153,...,O,O,O,False,False,False,,False,1,5
16825,7,ה,ה,DEF,DEF,_,6,706,התרופה,6153,...,O,O,O,False,False,False,,False,2,6
16826,8,תרופה,תרופה,NN,NN,gen=F|num=S,6,706,התרופה,6153,...,O,O,O,False,False,False,,False,2,6
16827,9,.,_,yyDOT,yyDOT,_,7,706,.,6153,...,O,O,O,False,False,False,,False,1,7


In [190]:
test_unk.tail()

Unnamed: 0,sent_id,token_id,token_str,token_unk,morph_unk,lemma_unk,unk_type,has_ner,morpheme_count,token_id_rep
12614,706,3,טובה,False,False,False,,False,1,3
12615,706,4,אך,False,False,False,,False,1,4
12616,706,5,לא,False,False,False,,False,1,5
12617,706,6,התרופה,False,False,False,,False,2,6
12618,706,7,.,False,False,False,,False,1,7


In [191]:
dev_gold['ooev_tok'] = ~dev_gold.form.isin(emb_words['tok'])
dev_gold['ooev_yap'] = ~dev_gold.form.isin(emb_words['yap'])

test_gold['ooev_tok'] = ~test_gold.form.isin(emb_words['tok'])
test_gold['ooev_yap'] = ~test_gold.form.isin(emb_words['yap'])

In [192]:
dev_gold['ooev_word'] = True
test_gold['ooev_word'] = True


In [193]:
def get_df_from_sents(sents):
    """Flatten tagged sentences into a long ``(sent_id, id, biose_layer0)`` frame.

    Parameters
    ----------
    sents : pandas.Series or dict
        Maps an integer key ``i`` to a sequence of ``(token, tag)`` pairs.

    Returns
    -------
    pandas.DataFrame
        One row per pair, with ``sent_id = i + 1``, ``id`` = 1-based position
        of the pair inside its sentence, and ``biose_layer0`` = the tag.
        The token text itself is not kept.
    """
    # Series.iteritems() was removed in pandas 2.0; .items() is available on
    # both old and new pandas and also lets a plain dict be passed in.
    rows = [
        (i + 1, j + 1, bio)
        for i, line in sents.items()
        for j, (_tok, bio) in enumerate(line)
    ]
    return pd.DataFrame(rows, columns=['sent_id', 'id', 'biose_layer0'])

In [194]:
yap_output_folder = 'final_setup/pruned/yap_output'

# (ds, arch, w_embed, output_type) -> path of the matching file in yap_output_folder
txt_map = defaultdict(dict)

for file in os.scandir(yap_output_folder):
    # Entries with one of these exact names are not parsed.
    if file.name not in ('.ipynb_checkpoints', '.conll', '.seg', '.map'):
        # Every remaining name must consist of exactly six '.'-separated fields.
        ds, unit, arch, w_embed, seed_num, output_type = file.name.split('.')
        # The '_tok' / '_yap' markers are exchanged before the key is built.
        # NOTE(review): looks like a deliberate swap to match how the files were
        # named on disk - confirm.
        w_embed = (w_embed.replace('_tok', '_yap') if '_tok' in w_embed
                   else w_embed.replace('_yap', '_tok'))
        txt_map[(ds, arch, w_embed, output_type)] = file.path


In [195]:
from functools import lru_cache

@lru_cache(32)
def get_matching_yap_out(ds_sub, ds, arch, w_embed):
    """Return the frame read by ``bclm.read_yap_output`` for one prediction variant.

    Parameters
    ----------
    ds_sub : str
        ``'yap'`` reads the output for treebank set ``ds``;
        ``'pruned'`` reads the conll / map files registered in ``txt_map``
        under ``(ds, arch, w_embed, <type>)`` together with the treebank
        token file for ``ds``.
    ds, arch, w_embed : str
        Lookup keys (``arch`` and ``w_embed`` are only used for ``'pruned'``).

    Returns
    -------
    Whatever ``bclm.read_yap_output`` returns. The result is memoised by
    ``lru_cache``, so every caller with the same arguments receives the SAME
    object - in-place changes made by a caller are visible to later callers.

    Raises
    ------
    ValueError
        If ``ds_sub`` is neither ``'yap'`` nor ``'pruned'`` (previously this
        surfaced as an UnboundLocalError on the return statement).
    """
    if ds_sub=='yap':
        return bclm.read_yap_output(treebank_set=ds)
    if ds_sub=='pruned':
        return bclm.read_yap_output(treebank_set=None,
                                    tokens_path=bclm.TREEBANK_TOKEN_PATHS[ds],
                                    dep_path=txt_map[(ds, arch, w_embed, 'conll')],
                                    map_path=txt_map[(ds, arch, w_embed, 'map')],)
    raise ValueError("ds_sub must be 'yap' or 'pruned', got %r" % (ds_sub,))

    

In [196]:
gold_dfs = {'dev': dev_gold, 'test': test_gold}

In [197]:
unks = {'dev': dev_unk, 'test': test_unk}

In [198]:
emb_words['word'] = []

In [519]:
# pd.read_pickle opens and closes the file itself (the previous
# pickle.load(open(...)) left the handle open) and does not depend on the
# `import pickle` that only appears in a later cell.
# NOTE: unpickling executes code stored in the file - only load trusted pickles.
scores2 = pd.read_pickle('final_setup/oov_scores.pkl')

In [521]:
new_scores2 = {}
for sc_id in scores2:
    if not 'test' in sc_id[0]:
        new_scores2[sc_id] = scores2[sc_id]

In [524]:
scores2 = new_scores2

## TOKEN

In [199]:
tok_dev_gold = (bclm
                .get_token_df(spdf[spdf.set=='dev'], biose=['biose_layer0'], add_set=False))
tok_dev_gold = tok_dev_gold.merge(dev_unk, how='left')
tok_test_gold = bclm.get_token_df(spdf[spdf.set=='test'], biose=['biose_layer0'], add_set=False)
tok_test_gold['sent_id'] = tok_test_gold.sent_id.map(test_sent_id_map)
tok_test_gold = tok_test_gold.merge(test_unk, how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [200]:
tok_dev_gold['ooev_tok'] = ~tok_dev_gold.token_str.isin(emb_words['tok'])
tok_dev_gold['ooev_yap'] = ~tok_dev_gold.token_str.isin(emb_words['yap'])

tok_test_gold['ooev_tok'] = ~tok_test_gold.token_str.isin(emb_words['tok'])
tok_test_gold['ooev_yap'] = ~tok_test_gold.token_str.isin(emb_words['yap'])

In [201]:
tok_dev_gold['ooev_word'] = True
tok_test_gold['ooev_word'] = True


In [202]:
tok_gold_dfs = {'dev': tok_dev_gold, 'test': tok_test_gold}

In [204]:
tok_cols = ['token_str', 'biose_layer0', 'unk_type', 'morpheme_count', 'ooev', 'token_id_rep']

# Gold mention sets for the token-level data, one per (split, embedding vocabulary).
tok_gold_ment_sets = {}
for pred_set, df in tok_gold_dfs.items():
    for emb_unit in ('tok', 'yap', 'word'):
        # `df` is the very frame stored in tok_gold_dfs, so this overwrites its
        # 'ooev' column in place with the flags of the current vocabulary.
        df['ooev'] = df['ooev_'+emb_unit]
        gold_sents = df.groupby('sent_id').apply(get_sent, tok_cols)
        gold_sents_fixed = get_sents_fixed(gold_sents)
        tok_gold_ments = [
            sent_to_mentions_dict(sent, sent_id, ment_len=True, ment_cat=True)
            for sent_id, sent in gold_sents_fixed
        ]
        gold_ment_set = get_ment_set(tok_gold_ments)
        tok_gold_ment_sets[(pred_set, emb_unit)] = gold_ment_set

In [222]:
scores5 = {}

In [None]:
output_folder = 'final_setup/decode_output'
cols = ['form', 'biose_layer0', 'unk_type', 'morpheme_count', 'ooev_tok_part', 'token_id_rep']
tok_cols = ['token_str', 'biose_layer0', 'unk_type', 'morpheme_count', 'ooev', 'token_id_rep']

# Score every decoded output file against gold, per mention group.
# `scores5` doubles as a resume cache: a file whose id is already a key is skipped.
for file in os.scandir(output_folder):
    if file.name=='.ipynb_checkpoints':
        continue

    # The file name minus its last '.'-separated field must have exactly five fields.
    sc_id = tuple(file.name.split('.')[:-1])
    gold_name, inp, arch, w_embed, seed_num = sc_id
    # Last '_'-separated piece of w_embed selects the vocabulary in emb_words
    # and the matching 'ooev_<unit>' column.
    emb_unit = w_embed.split('_')[-1]

    if sc_id in scores5:
        continue

    print(file.path)
    # Collect into a local dict and publish it only after the file was fully
    # evaluated. Writing an empty scores5[sc_id] up front would make the resume
    # guard above skip this file forever if anything below raised.
    file_scores = {}
    ner_df = get_df_from_sents(nem.read_file_sents(file.path))

    if len(gold_name.split('_'))>2:
        # Three-part gold_name: <unit>_<pred_set>_<ps_sub>
        unit, pred_set, ps_sub = gold_name.split('_')

        if ps_sub in ('pruned', 'yap'):
            yap_out = get_matching_yap_out(ps_sub, pred_set, arch, w_embed)
            # get_matching_yap_out is lru_cached, so this column is also written
            # onto the cached frame; it is overwritten on every use.
            yap_out['ooev'] = ~yap_out.form.isin(emb_words[emb_unit])
            full_df = yap_out.merge(ner_df)

        elif ps_sub == 'gold':
            full_df = gold_dfs[pred_set].copy()
            # Index-aligned assignment of the predicted tags onto the gold rows.
            full_df['biose_layer0'] = ner_df.biose_layer0
            full_df['ooev'] = full_df['ooev_'+emb_unit]

        else:
            # Without this branch an unexpected suffix would silently reuse the
            # `full_df` left over from the previous file.
            raise ValueError('unexpected prediction subset %r in %s' % (ps_sub, file.name))

        full_df = full_df.merge(unks[pred_set], how='left')
        # A token is flagged when ANY row sharing its (sent_id, token_id) is flagged.
        ooev_tok_part = (full_df.groupby(['sent_id', 'token_id'])
                         .ooev.apply(lambda x: x.any())
                         .reset_index().rename(columns={'ooev': 'ooev_tok_part'}))

        full_df = full_df.merge(ooev_tok_part, how='left')
        # The gold rows receive the same token-level flags as the predictions.
        gold_ooev = gold_dfs[pred_set].merge(ooev_tok_part, how='left')
        token_unk_sents = full_df.groupby('sent_id').apply(get_sent, cols)
        token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
        ments = [sent_to_mentions_dict(sent, sent_id,
                                       ment_len=True, ment_cat=True)
                 for sent_id, sent
                 in token_unk_sents_fixed]
        ment_set = get_ment_set(ments)
        gold_sents = gold_ooev.groupby('sent_id').apply(get_sent, cols)
        gold_sents_fixed = get_sents_fixed(gold_sents)
        gold_ments = [sent_to_mentions_dict(sent, sent_id,
                                            ment_len=True, ment_cat=True)
                      for sent_id, sent
                      in gold_sents_fixed]
        gold_ment_set = get_ment_set(gold_ments)
        # NOTE(review): indexing both sets with every key of the union assumes
        # get_ment_set returns a mapping with a default for missing keys - confirm.
        for k in ment_set.keys() | gold_ment_set.keys():
            file_scores[k] = nem.evaluate_mentions(gold_ment_set[k],
                                                   ment_set[k],
                                                   verbose=False,
                                                   return_tpc=True)

    ##TOKEN
    else:
        # Two-part gold_name: <unit>_<pred_set>
        ner_df = ner_df.rename(columns= {'id': 'token_id'})
        unit, pred_set = gold_name.split('_')
        full_df = tok_gold_dfs[pred_set].copy()
        full_df['biose_layer0'] = ner_df.biose_layer0
        full_df['ooev'] = full_df['ooev_'+emb_unit]

        token_unk_sents = full_df.groupby('sent_id').apply(get_sent, tok_cols)
        token_unk_sents_fixed = get_sents_fixed(token_unk_sents)
        ments = [sent_to_mentions_dict(sent, sent_id,
                                       ment_len=True, ment_cat=True)
                 for sent_id, sent
                 in token_unk_sents_fixed]
        ment_set = get_ment_set(ments)

        # Gold mention sets for token-level data were precomputed per (split, vocabulary).
        gold_ment_set = tok_gold_ment_sets[(pred_set, emb_unit)]
        for k in ment_set.keys() | gold_ment_set.keys():
            file_scores[k] = nem.evaluate_mentions(gold_ment_set[k],
                                                   ment_set[k],
                                                   verbose=False,
                                                   return_tpc=True)

    scores5[sc_id] = file_scores
            

    

final_setup/decode_output/morph_dev_gold.morph.char_cnn.ft_yap.44_seed.bmes
final_setup/decode_output/morph_dev_yap.morph.char_cnn.ft_yap.44_seed.bmes
final_setup/decode_output/morph_test_gold.morph.char_cnn.ft_yap.44_seed.bmes
final_setup/decode_output/morph_dev_gold.morph.char_cnn.ft_oov_tok.44_seed.bmes
final_setup/decode_output/morph_test_yap.morph.char_cnn.ft_yap.44_seed.bmes
final_setup/decode_output/morph_dev_yap.morph.char_cnn.ft_oov_tok.44_seed.bmes
final_setup/decode_output/morph_test_gold.morph.char_cnn.ft_oov_tok.44_seed.bmes
final_setup/decode_output/morph_dev_gold.morph.char_cnn.glv_tok.45_seed.bmes
final_setup/decode_output/morph_dev_yap.morph.char_cnn.glv_tok.45_seed.bmes
final_setup/decode_output/morph_test_yap.morph.char_cnn.ft_oov_tok.44_seed.bmes
final_setup/decode_output/morph_test_gold.morph.char_cnn.glv_tok.45_seed.bmes
final_setup/decode_output/token_dev.multitok.char_cnn.glv_yap.44_seed.bmes
final_setup/decode_output/token_test.multitok.char_cnn.glv_yap.44_seed

In [225]:
import pickle

# Use a context manager so the pickle is flushed and the handle closed as soon
# as the dump finishes, instead of whenever the file object is garbage-collected.
with open('final_setup/oov_scores5.pkl', 'wb') as scores_file:
    pickle.dump(scores5, scores_file)

In [224]:
len(scores5)

2100

In [226]:
def prf_from_tpc(t, p, c):
    """Compute precision, recall and F1 from raw counts.

    Parameters
    ----------
    t : int
        Number of true (gold) items.
    p : int
        Number of predicted items.
    c : int
        Number of correct predictions.

    Returns
    -------
    tuple
        ``(prec, recall, f1)``. A quantity that is undefined is reported as
        the sentinel ``-1``: precision when ``p == 0``, recall when ``t == 0``,
        and F1 only when both are undefined (``t == 0 and p == 0``).
        Otherwise F1 is ``0.0`` whenever nothing was correct or one side is
        empty, and the harmonic mean of precision and recall in the regular case.
    """
    prec = c / p if p != 0 else -1
    recall = c / t if t != 0 else -1

    if p == 0 and t == 0:
        # Nothing to find and nothing predicted: F1 is genuinely undefined.
        f1 = -1
    elif p == 0 or t == 0 or c == 0:
        # Keep the -1 sentinels out of the harmonic mean, and report a real
        # zero (not "undefined") when predictions exist but none is correct.
        f1 = 0.0
    else:
        f1 = 2*prec*recall/(prec+recall)
    return prec, recall, f1

In [238]:
def same(x):
    """Identity grouper: hand the value back unchanged."""
    return x


def ignore(x):
    """Collapse every value into the single bucket ``'all'``."""
    return 'all'


def four_up(x):
    """Keep values up to 3 as they are and fold anything larger into 4."""
    if x <= 3:
        return x
    return 4


def ut_gm(x):
    """Return the distinct non-empty-string entries of ``x`` as a sorted tuple."""
    distinct = {a for a in x if a != ''}
    return tuple(sorted(distinct))


def lenset(x):
    """Number of distinct entries in ``x``, capped at 4."""
    n_distinct = len(set(x))
    return min(n_distinct, 4)

In [259]:
scid_names = ['gold_name', 'unit', 'arch', 'w_embed', 'seed_num']
def group_scores(scs, groupers, names = ['unk_type', 'morpheme_count', 'ooev', 'token_id_rep'], 
                 ment_grouper=same, ment_cat_grouper=ignore):
    """Re-aggregate per-mention-group counts into coarser groups and score them.

    Parameters
    ----------
    scs : dict
        ``{sc_id: {key: score}}`` where ``key`` unpacks as
        ``((umo, ment_len), ment_cat)`` and ``score[3]``, ``score[4]``,
        ``score[5]`` are added up as the true / predicted / correct counts.
    groupers : sequence of callables
        One callable per feature; ``umo`` is transposed with ``zip(*umo)`` and
        each callable receives the tuple of values of its feature.
    names : list of str
        Column names for the grouped features, in the order of ``groupers``.
    ment_grouper, ment_cat_grouper : callable
        Applied to ``ment_len`` and ``ment_cat`` respectively.

    Returns
    -------
    pandas.DataFrame
        One row per (sc_id, group) with the ``scid_names`` columns, ``names``,
        ``ment_len``, ``ment_cat``, the summed counts ``true``/``pred``/``correct``
        and ``p``/``r``/``f`` from ``prf_from_tpc`` (an F1 of -1 is stored as 0).
    """
    rows = []
    for sc_id, per_key in scs.items():
        # group key -> running [true, pred, correct] totals
        totals = defaultdict(lambda: [0, 0, 0])
        for key in per_key:
            score = scs[sc_id][key]
            (umo, ment_len), ment_cat = key
            ment_len = ment_grouper(ment_len)
            ment_cat = ment_cat_grouper(ment_cat)
            # Transpose so every grouper sees all values of one feature.
            feature_columns = list(zip(*umo))
            group_key = tuple(
                [grouper(col) for grouper, col in zip(groupers, feature_columns)]
                + [ment_len, ment_cat]
            )
            bucket = totals[group_key]
            for slot, score_pos in enumerate((3, 4, 5)):
                bucket[slot] += score[score_pos]

        for group_key, (t, p, c) in totals.items():
            prec, recall, f1 = prf_from_tpc(t, p, c)
            if f1 == -1:
                f1 = 0
            rows.append((*sc_id, *group_key, t, p, c, prec, recall, f1))

    columns = (scid_names + names + ['ment_len', 'ment_cat']
               + ['true', 'pred', 'correct', 'p', 'r', 'f'])
    return pd.DataFrame(rows, columns=columns)



In [248]:
gsc = group_scores(scores5, groupers=[ignore, ignore, ignore, lenset], ment_grouper=ignore)
gsc.head()

Unnamed: 0,gold_name,unit,arch,w_embed,seed_num,unk_type,morpheme_count,ooev,token_id_rep,ment_len,ment_cat,true,pred,correct,p,r,f
0,morph_dev_gold,morph,char_cnn,ft_yap,44_seed,all,all,all,1,all,all,255,224,194,0.866071,0.760784,0.810021
1,morph_dev_gold,morph,char_cnn,ft_yap,44_seed,all,all,all,2,all,all,183,159,141,0.886792,0.770492,0.824561
2,morph_dev_gold,morph,char_cnn,ft_yap,44_seed,all,all,all,4,all,all,15,8,6,0.75,0.4,0.521739
3,morph_dev_gold,morph,char_cnn,ft_yap,44_seed,all,all,all,3,all,all,46,35,25,0.714286,0.543478,0.617284
4,morph_dev_yap,morph,char_cnn,ft_yap,44_seed,all,all,all,1,all,all,255,222,166,0.747748,0.65098,0.696017


In [249]:
gsc.groupby('w_embed').size()

w_embed
ft_oov_tok    1200
ft_oov_yap    1200
ft_tok        1200
ft_yap        1200
glv_tok       1200
glv_yap       1200
no_word       1200
dtype: int64

In [250]:
gsc = group_scores(scores5, groupers=[ignore, ignore, ignore, lenset], ment_grouper=ignore, ment_cat_grouper=ignore)
(gsc[gsc.gold_name.str.contains('dev')]
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'token_id_rep'])
 .f.mean().mul(100).round(2).unstack())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token_id_rep,1,2,3,4
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph_dev_gold,morph,char_cnn,ft_oov_tok,81.62,83.52,61.51,44.72
morph_dev_gold,morph,char_cnn,ft_oov_yap,81.71,83.81,61.97,51.07
morph_dev_gold,morph,char_cnn,ft_tok,80.98,83.42,61.36,48.6
morph_dev_gold,morph,char_cnn,ft_yap,80.02,83.43,61.79,48.8
morph_dev_gold,morph,char_cnn,glv_tok,80.26,81.46,59.17,41.3
morph_dev_gold,morph,char_cnn,glv_yap,80.14,81.09,60.94,42.8
morph_dev_gold,morph,char_cnn,no_word,61.72,69.73,49.43,38.58
morph_dev_gold,morph,char_lstm,ft_oov_tok,81.15,83.06,61.48,45.7
morph_dev_gold,morph,char_lstm,ft_oov_yap,82.21,83.06,59.19,44.97
morph_dev_gold,morph,char_lstm,ft_tok,80.37,82.91,60.72,47.9


## OOTV

In [251]:
gsc = group_scores(scores5, groupers=[ut_gm, ignore, ignore, ignore], ment_grouper=ignore, ment_cat_grouper=ignore)
(gsc[gsc.gold_name.str.contains('morph_dev')]
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'unk_type'])
 .f.mean().unstack())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unk_type,(),"(T,)","(T, TM)","(TM,)"
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph_dev_gold,morph,char_cnn,ft_oov_tok,0.822178,0.687515,0.533535,0.793864
morph_dev_gold,morph,char_cnn,ft_oov_yap,0.830871,0.694968,0.457172,0.790695
morph_dev_gold,morph,char_cnn,ft_tok,0.821035,0.681985,0.469091,0.789779
morph_dev_gold,morph,char_cnn,ft_yap,0.832321,0.693843,0.418535,0.761516
morph_dev_gold,morph,char_cnn,glv_tok,0.818826,0.659414,0.211555,0.766338
morph_dev_gold,morph,char_cnn,glv_yap,0.820197,0.685386,0.342424,0.757847
morph_dev_gold,morph,char_cnn,no_word,0.787278,0.648385,-0.519596,0.35099
morph_dev_gold,morph,char_lstm,ft_oov_tok,0.815305,0.686438,0.511717,0.79543
morph_dev_gold,morph,char_lstm,ft_oov_yap,0.825925,0.696072,0.401616,0.790227
morph_dev_gold,morph,char_lstm,ft_tok,0.817964,0.686331,0.47596,0.777748


In [260]:
gsc = group_scores(scores5, groupers=[ut_gm, ignore, ignore, ignore], ment_grouper=ignore, ment_cat_grouper=ignore)
(gsc[((gsc.gold_name.isin(['morph_dev_gold','morph_dev_pruned']))
      & (gsc.arch=='char_cnn')
      & (gsc.w_embed=='ft_oov_yap')

     )
    |
     (
        (gsc.gold_name.isin(['token_dev']))
      & (gsc.arch=='char_lstm')
      & (gsc.w_embed=='ft_oov_tok')

     )]
 .groupby(['gold_name', 'unit', 'unk_type'])
 .f.mean().mul(100).round(2).unstack())

Unnamed: 0_level_0,unk_type,(),"(T,)","(T, TM)","(TM,)"
gold_name,unit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
morph_dev_gold,morph,83.09,69.5,45.72,79.07
morph_dev_pruned,morph,81.39,66.48,45.72,77.03
token_dev,multitok,80.06,63.4,32.36,78.23
token_dev,token,81.64,64.21,26.87,77.67


In [267]:
def unk_type_replace(t):
    """Translate an unknown-type tuple into its short display label.

    () -> 'Known', ('T',) -> 'Comp', ('TM',) -> 'Real', ('T', 'TM') -> 'R+C'.
    Any other value falls through and yields None.
    """
    # Checked in order with ==, so the result does not depend on `t` being hashable.
    labels = (
        (('T',), 'Comp'),
        ((), 'Known'),
        (('TM',), 'Real'),
        (('T', 'TM'), 'R+C'),
    )
    for key, label in labels:
        if t == key:
            return label
    return None

In [265]:
# Inspect the column labels of `x`.
# NOTE(review): in this part of the notebook `x` is only assigned in a cell further down —
# confirm it is defined before this point on a fresh kernel.
x.columns

Index([(), ('T',), ('T', 'TM'), ('TM',)], dtype='object', name='unk_type')

In [274]:
# Build the per-unknown-type results table for the selected configurations
# (mean ± std of `f`, scaled by 100) and print it as LaTeX.
gsc = (group_scores(scores5, groupers=[ut_gm, ignore, ignore, ignore], ment_grouper=ignore, ment_cat_grouper=ignore)
      .assign(unk_type = lambda d: d.unk_type.apply(unk_type_replace)))
# Lambda parameters are named `d` so they do not shadow the `x` being assigned here.
x = (gsc[((gsc.gold_name.isin(['morph_dev_gold','morph_dev_pruned']))
      & (gsc.arch=='char_cnn')
      & (gsc.w_embed=='ft_oov_yap')

     )
    |
     (
        (gsc.gold_name.isin(['token_dev']))
      & (gsc.arch=='char_lstm')
      & (gsc.w_embed=='ft_oov_tok')

     )]
 .groupby(['gold_name', 'unit', 'unk_type'])
 .f.agg(['mean', 'std']).mul(100).round(2)
         .assign(mean = lambda d: '$'+d['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ d['std'].round(1).astype(str)+'$')[['mean']]
         .unstack())
# Drop the outer 'mean' level so the columns are just the unk_type labels.
x.columns = x.columns.droplevel(0)
# Backslashes are written explicitly: '\p' and '\$' are invalid escape sequences in a
# normal string literal. The runtime strings are unchanged (backslash + 'pm', backslash + '$').
print(x[['Known', 'Real', 'Comp', 'R+C']].to_latex(bold_rows=True).replace('±', '\\pm').replace('\\$', '$'))

\begin{tabular}{llllll}
\toprule
          & \textbf{unk\_type} &          Known &           Real &           Comp &            R+C \\
\textbf{gold\_name} & \textbf{unit} &                &                &                &                \\
\midrule
\textbf{morph\_dev\_gold} & \textbf{morph} &  $83.09 \pm 0.9$ &  $79.07 \pm 1.4$ &  $69.50 \pm 1.3$ &  $45.72 \pm 9.9$ \\
\textbf{morph\_dev\_pruned} & \textbf{morph} &  $81.39 \pm 1.0$ &  $77.03 \pm 1.5$ &  $66.48 \pm 1.2$ &  $45.72 \pm 9.9$ \\
\textbf{token\_dev} & \textbf{multitok} &  $80.06 \pm 1.0$ &  $78.23 \pm 1.2$ &  $63.40 \pm 2.6$ &  $32.36 \pm 7.3$ \\
          & \textbf{token} &  $81.64 \pm 0.9$ &  $77.67 \pm 0.8$ &  $64.21 \pm 2.5$ &  $26.87 \pm 8.5$ \\
\bottomrule
\end{tabular}



In [275]:
from scipy.stats import ttest_ind

In [283]:
# Independent two-sample t-test on `f` for 'Comp' rows:
# morph_dev_pruned (char_cnn + ft_oov_yap) vs. token_dev (char_lstm + ft_oov_tok), unit 'token'.
tt = gsc.loc[
    (gsc.unk_type == 'Comp')
    & (
        ((gsc.gold_name == 'morph_dev_pruned')
         & (gsc.arch == 'char_cnn')
         & (gsc.w_embed == 'ft_oov_yap'))
        | (gsc.gold_name.isin(['token_dev'])
           & (gsc.arch == 'char_lstm')
           & (gsc.w_embed == 'ft_oov_tok'))
    )
]

ttest_ind(tt.loc[tt.unit == 'morph', 'f'], tt.loc[tt.unit == 'token', 'f'])

Ttest_indResult(statistic=2.5674395676052857, pvalue=0.01937938613779738)

In [286]:
# Independent two-sample t-test on `f` for 'Real' rows:
# morph_dev_pruned (char_cnn + ft_oov_yap) vs. token_dev (char_lstm + ft_oov_tok), unit 'multitok'.
tt = gsc.loc[
    (gsc.unk_type == 'Real')
    & (
        ((gsc.gold_name == 'morph_dev_pruned')
         & (gsc.arch == 'char_cnn')
         & (gsc.w_embed == 'ft_oov_yap'))
        | (gsc.gold_name.isin(['token_dev'])
           & (gsc.arch == 'char_lstm')
           & (gsc.w_embed == 'ft_oov_tok'))
    )
]

ttest_ind(tt.loc[tt.unit == 'morph', 'f'], tt.loc[tt.unit == 'multitok', 'f'])

Ttest_indResult(statistic=-1.9369654553486633, pvalue=0.06860642276609745)

In [140]:
# Mean of `f` per configuration on token_dev, one column per `unk_type` value.
gsc = group_scores(
    scores4,
    groupers=[ut_gm, ignore, ignore],
    ment_grouper=ignore,
    ment_cat_grouper=ignore,
)
(gsc
 .loc[lambda d: d.gold_name.str.contains('token_dev')]
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'unk_type'])
 .f.mean()
 .unstack())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unk_type,(),"(T,)","(T, TM)","(TM,)"
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
token_dev,multitok,char_cnn,ft_oov_tok,0.797595,0.630318,0.308345,0.771243
token_dev,multitok,char_cnn,ft_oov_yap,0.796952,0.597179,0.249293,0.727613
token_dev,multitok,char_cnn,ft_tok,0.800908,0.623261,0.142172,0.715171
token_dev,multitok,char_cnn,ft_yap,0.790428,0.457796,-1.0,0.661115
token_dev,multitok,char_cnn,glv_tok,0.782409,0.544636,-0.046713,0.703852
token_dev,multitok,char_cnn,glv_yap,0.782695,0.340578,-0.875,0.659396
token_dev,multitok,char_cnn,no_word,0.754914,0.086791,-0.7,0.285574
token_dev,multitok,char_lstm,ft_oov_tok,0.800648,0.633958,0.323636,0.782296
token_dev,multitok,char_lstm,ft_oov_yap,0.799922,0.593703,0.291515,0.736229
token_dev,multitok,char_lstm,ft_tok,0.801149,0.622028,0.02803,0.722791


Maybe T has more ORGs, and that's why it's harder?

In [141]:
# Use the fully qualified option name: the bare "max_rows" pattern also matches
# "styler.render.max_rows" on recent pandas and raises OptionError.
pd.set_option("display.max_rows", 300)


In [142]:
# Mean of the `true` column per mention category and unk_type on the morph dev sets.
gsc = group_scores(
    scores4,
    groupers=[ut_gm, ignore, ignore],
    ment_grouper=ignore,
    ment_cat_grouper=same,
)
xx = (
    gsc
    .loc[lambda d: d.gold_name.str.contains('morph_dev')]
    .groupby(['ment_cat', 'unk_type'])['true']
    .mean()
    .unstack()
)
xx

unk_type,(),"(T,)","(T, TM)","(TM,)"
ment_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANG,3.0,0.0,,0.0
DUC,1.0,0.0,0.0,1.0
EVE,5.0,5.0,2.0,0.0
FAC,5.0,5.0,0.0,2.0
GPE,76.0,15.0,,30.0
LOC,20.0,2.0,0.0,6.0
ORG,61.0,11.0,4.0,43.0
PER,70.0,4.0,0.0,119.0
WOA,3.0,3.0,0.0,3.0


In [143]:
# Normalise each mention-category row of `xx` by its row total (NaNs are skipped in the sum).
xx.div(xx.sum(axis=1), axis=0)

unk_type,(),"(T,)","(T, TM)","(TM,)"
ment_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANG,1.0,0.0,,0.0
DUC,0.5,0.0,0.0,0.5
EVE,0.416667,0.416667,0.166667,0.0
FAC,0.416667,0.416667,0.0,0.166667
GPE,0.628099,0.123967,,0.247934
LOC,0.714286,0.071429,0.0,0.214286
ORG,0.512605,0.092437,0.033613,0.361345
PER,0.362694,0.020725,0.0,0.61658
WOA,0.333333,0.333333,0.0,0.333333


In [144]:
# Same unk_type breakdown of mean `f`, restricted to ORG mentions on the morph dev sets.
gsc = group_scores(
    scores4,
    groupers=[ut_gm, ignore, ignore],
    ment_grouper=ignore,
    ment_cat_grouper=same,
)
(gsc
 .loc[lambda d: d.gold_name.str.contains('morph_dev') & (d.ment_cat == 'ORG')]
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'unk_type'])
 .f.mean()
 .unstack())
#0.805956	0.665775	0.521717	0.779888

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unk_type,(),"(T,)","(T, TM)","(TM,)"
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph_dev_gold,morph,char_cnn,ft_oov_tok,0.754281,0.677696,0.732143,0.530702
morph_dev_gold,morph,char_cnn,ft_oov_yap,0.753601,0.705882,0.628571,0.553281
morph_dev_gold,morph,char_cnn,ft_tok,0.757968,0.696315,0.627778,0.53097
morph_dev_gold,morph,char_cnn,ft_yap,0.759375,0.713072,0.594048,0.534189
morph_dev_gold,morph,char_cnn,glv_tok,0.737404,0.650245,0.422619,0.426602
morph_dev_gold,morph,char_cnn,glv_yap,0.751105,0.683893,0.551429,0.450434
morph_dev_gold,morph,char_cnn,no_word,0.696903,0.595184,-0.389286,0.226856
morph_dev_gold,morph,char_lstm,ft_oov_tok,0.757239,0.692565,0.696429,0.544201
morph_dev_gold,morph,char_lstm,ft_oov_yap,0.754131,0.713072,0.571429,0.545209
morph_dev_gold,morph,char_lstm,ft_tok,0.757545,0.700178,0.635714,0.52062


Answer: no. ORG does have more T's, but the score on T and T,TM is actually better for ORG than for the rest.

## Key finding: OOEV vs. IEV
For morph, `ft_oov_yap` is significantly better on OOEV than `ft_oov_tok`, while it is slightly worse on IEV (in-embedding-vocabulary).

In [151]:
# Mean of `f` per configuration on the morph dev sets, split by the `ooev` flag.
gsc = group_scores(scores5, groupers=[ignore, ignore, max, ignore], ment_grouper=ignore)
(gsc
 .loc[lambda d: d.gold_name.str.contains('morph_dev')]
 .fillna('x')
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'ooev'], sort=False)
 .f.mean()
 .unstack()
 .sort_index())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ooev,False,True
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1
morph_dev_gold,morph,char_cnn,ft_oov_tok,0.803294,0.752472
morph_dev_gold,morph,char_cnn,ft_oov_yap,0.800105,0.801235
morph_dev_gold,morph,char_cnn,ft_tok,0.801067,0.738266
morph_dev_gold,morph,char_cnn,ft_yap,0.798384,0.724326
morph_dev_gold,morph,char_cnn,glv_tok,0.781946,0.758756
morph_dev_gold,morph,char_cnn,glv_yap,0.782598,0.763396
morph_dev_gold,morph,char_cnn,no_word,,0.630313
morph_dev_gold,morph,char_lstm,ft_oov_tok,0.799784,0.751506
morph_dev_gold,morph,char_lstm,ft_oov_yap,0.797692,0.792757
morph_dev_gold,morph,char_lstm,ft_tok,0.795577,0.729034


In [295]:
def get_cm(s):
    """Label a score row as 'Match' or 'Clash' from its unit and embedding name.

    Args:
        s: a row-like object exposing `unit` and `w_embed` attributes
           (e.g. a DataFrame row passed by `apply(..., axis=1)`).

    Returns:
        'Match' when `unit` is 'morph' and `w_embed` contains '_yap', or when
        `unit` is anything else and `w_embed` contains '_tok'; otherwise 'Clash'.
        Embedding names with neither suffix (e.g. 'no_word') are therefore 'Clash'.
    """
    # The original ended with an unreachable `return 'na'`: every path already
    # returned inside the if/else, so that dead statement is dropped.
    expected_suffix = '_yap' if s.unit == 'morph' else '_tok'
    return 'Match' if expected_suffix in s.w_embed else 'Clash'

In [297]:
# Scores grouped by the `ooev` flag, annotated with the Match/Clash label and the embedding family.
gsc = (group_scores(scores5, groupers=[ignore, ignore, max, ignore], ment_grouper=ignore, ment_cat_grouper=ignore))
gsc['cm'] = gsc.apply(get_cm, axis=1)
# regex=True is required: '_tok|_yap' is an alternation pattern, and pandas >= 2.0 treats
# the pattern as a literal string by default, which would leave w_embed unchanged.
gsc['embed_type'] = gsc.w_embed.str.replace('_tok|_yap', '', regex=True)

In [302]:
# Mean ± std of `f` (scaled by 100) by Match/Clash label, embedding family and `ooev`,
# for char_cnn on the morph dev sets and char_lstm on token_dev, excluding 'no_word'.
x = (
    gsc
    .loc[lambda d: (d.gold_name.isin(['morph_dev_gold', 'morph_dev_pruned'])
                    & (d.arch == 'char_cnn')
                    & (d.w_embed != 'no_word'))
                   | (d.gold_name.isin(['token_dev'])
                      & (d.arch == 'char_lstm')
                      & (d.w_embed != 'no_word'))]
    .groupby(['gold_name', 'unit', 'cm', 'embed_type', 'ooev'])
    .f.agg(['mean', 'std'])
    .mul(100)
    .round(2)
    .assign(mean=lambda d: '$' + d['mean'].apply('{:,.2f}'.format).astype(str)
                           + ' ± ' + d['std'].round(1).astype(str) + '$')[['mean']]
    .unstack([-2, -1])
)
# Drop the outer 'mean' level, leaving (embed_type, ooev) column pairs.
x.columns = x.columns.droplevel(0)
x[[('glv', False), ('glv', True),
   ('ft', False), ('ft', True),
   ('ft_oov', False), ('ft_oov', True)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,embed_type,glv,glv,ft,ft,ft_oov,ft_oov
Unnamed: 0_level_1,Unnamed: 1_level_1,ooev,False,True,False,True,False,True
gold_name,unit,cm,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
morph_dev_gold,morph,Clash,$78.19 ± 0.6$,$75.88 ± 2.3$,$80.11 ± 0.8$,$73.83 ± 3.8$,$80.33 ± 0.6$,$75.25 ± 1.8$
morph_dev_gold,morph,Match,$78.23 ± 0.6$,$76.34 ± 3.3$,$79.84 ± 0.4$,$72.43 ± 2.2$,$80.01 ± 0.6$,$80.12 ± 2.3$
morph_dev_pruned,morph,Clash,$75.51 ± 0.9$,$72.72 ± 2.6$,$76.93 ± 1.0$,$71.06 ± 3.7$,$78.11 ± 0.7$,$73.50 ± 2.3$
morph_dev_pruned,morph,Match,$76.21 ± 0.5$,$74.59 ± 3.0$,$78.08 ± 0.3$,$68.62 ± 1.9$,$78.13 ± 0.6$,$78.14 ± 2.2$
token_dev,multitok,Clash,$74.62 ± 0.9$,$56.17 ± 2.1$,$78.13 ± 1.3$,$56.18 ± 1.1$,$78.16 ± 1.2$,$69.71 ± 1.4$
token_dev,multitok,Match,$74.26 ± 0.8$,$62.79 ± 2.4$,$77.02 ± 0.8$,$62.90 ± 3.1$,$77.52 ± 0.9$,$77.58 ± 2.6$
token_dev,token,Clash,$75.37 ± 1.1$,$55.97 ± 2.0$,$77.67 ± 1.1$,$54.67 ± 1.7$,$78.92 ± 0.7$,$69.26 ± 1.3$
token_dev,token,Match,$74.95 ± 0.7$,$68.80 ± 2.4$,$78.16 ± 0.6$,$67.76 ± 4.2$,$78.44 ± 0.6$,$76.36 ± 2.7$


In [303]:
# Print the OOEV table as LaTeX. Backslashes are written explicitly because '\p' and '\$'
# are invalid escape sequences in a normal string literal; the runtime strings are unchanged.
print(x[[('glv', False), ('glv', True), ('ft', False),
       ('ft', True), ('ft_oov', False), ('ft_oov', True)]]
       .to_latex(bold_rows=True).replace('±', '\\pm').replace('\\$', '$'))

\begin{tabular}{lllllllll}
\toprule
          &       & \textbf{embed\_type} & \multicolumn{2}{l}{glv} & \multicolumn{2}{l}{ft} & \multicolumn{2}{l}{ft\_oov} \\
          &       & \textbf{ooev} &          False &          True  &          False &          True  &          False &          True  \\
\textbf{gold\_name} & \textbf{unit} & \textbf{cm} &                &                &                &                &                &                \\
\midrule
\textbf{morph\_dev\_gold} & \textbf{morph} & \textbf{Clash} &  $78.19 \pm 0.6$ &  $75.88 \pm 2.3$ &  $80.11 \pm 0.8$ &  $73.83 \pm 3.8$ &  $80.33 \pm 0.6$ &  $75.25 \pm 1.8$ \\
          &       & \textbf{Match} &  $78.23 \pm 0.6$ &  $76.34 \pm 3.3$ &  $79.84 \pm 0.4$ &  $72.43 \pm 2.2$ &  $80.01 \pm 0.6$ &  $80.12 \pm 2.3$ \\
\textbf{morph\_dev\_pruned} & \textbf{morph} & \textbf{Clash} &  $75.51 \pm 0.9$ &  $72.72 \pm 2.6$ &  $76.93 \pm 1.0$ &  $71.06 \pm 3.7$ &  $78.11 \pm 0.7$ &  $73.50 \pm 2.3$ \\
          &       & \textbf{

In [275]:
from scipy.stats import ttest_ind

In [283]:
# This test filters on the relabelled `unk_type` value 'Comp', but the `gsc` assigned just
# above in the notebook is grouped by `ooev`. Build the unk-type frame here so the cell does
# not depend on kernel history; a separate name keeps `gsc` intact for the cells that follow.
gsc_unk = (group_scores(scores5, groupers=[ut_gm, ignore, ignore, ignore], ment_grouper=ignore, ment_cat_grouper=ignore)
           .assign(unk_type = lambda d: d.unk_type.apply(unk_type_replace)))
tt = (gsc_unk[(gsc_unk.unk_type=='Comp') & ( ((gsc_unk.gold_name=='morph_dev_pruned')
      & (gsc_unk.arch=='char_cnn')
      & (gsc_unk.w_embed=='ft_oov_yap')

     )
    |
     (
        (gsc_unk.gold_name.isin(['token_dev']))
      & (gsc_unk.arch=='char_lstm')
      & (gsc_unk.w_embed=='ft_oov_tok')

     ))]
)

# Independent two-sample t-test on `f`: unit 'morph' vs. unit 'token'.
ttest_ind(tt[tt.unit=='morph'].f, tt[tt.unit=='token'].f)

Ttest_indResult(statistic=2.5674395676052857, pvalue=0.01937938613779738)

In [152]:
# Mean `true` / `pred` / `correct` values per configuration on the morph dev sets, split by `ooev`.
(gsc
 .loc[lambda d: d.gold_name.str.contains('morph_dev')]
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'ooev'], sort=False)
 [['true', 'pred', 'correct']]
 .mean()
 .unstack()
 .sort_index()
 .round(1))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,true,true,pred,pred,correct,correct
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,ooev,False,True,False,True,False,True
gold_name,unit,arch,w_embed,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
morph_dev_gold,morph,char_cnn,ft_oov_tok,441.0,58.0,399.8,56.3,337.7,43.0
morph_dev_gold,morph,char_cnn,ft_oov_yap,434.0,65.0,385.9,63.3,328.0,51.4
morph_dev_gold,morph,char_cnn,ft_tok,441.0,58.0,395.9,50.3,335.2,40.0
morph_dev_gold,morph,char_cnn,ft_yap,434.0,65.0,379.4,50.1,324.7,41.7
morph_dev_gold,morph,char_cnn,glv_tok,441.0,58.0,391.8,50.0,325.6,41.0
morph_dev_gold,morph,char_cnn,glv_yap,434.0,65.0,386.6,54.4,321.1,45.6
morph_dev_gold,morph,char_cnn,no_word,,499.0,,331.4,,261.7
morph_dev_gold,morph,char_lstm,ft_oov_tok,441.0,58.0,404.5,55.9,338.1,42.8
morph_dev_gold,morph,char_lstm,ft_oov_yap,434.0,65.0,383.6,62.4,326.1,50.5
morph_dev_gold,morph,char_lstm,ft_tok,441.0,58.0,400.2,48.4,334.6,38.8


In [153]:
# Mean of `f` per configuration on token_dev, split by `ooev`.
(gsc
 .loc[lambda d: d.gold_name.str.contains('token_dev')]
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'ooev'], sort=False)
 .f.mean()
 .sort_index()
 .unstack()
 .sort_index())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ooev,False,True
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1
token_dev,multitok,char_cnn,ft_oov_tok,0.773465,0.741613
token_dev,multitok,char_cnn,ft_oov_yap,0.781569,0.681281
token_dev,multitok,char_cnn,ft_tok,0.768752,0.612234
token_dev,multitok,char_cnn,ft_yap,0.773227,0.56252
token_dev,multitok,char_cnn,glv_tok,0.74092,0.62776
token_dev,multitok,char_cnn,glv_yap,0.752973,0.572257
token_dev,multitok,char_cnn,no_word,,0.55015
token_dev,multitok,char_lstm,ft_oov_tok,0.775198,0.775788
token_dev,multitok,char_lstm,ft_oov_yap,0.781631,0.697106
token_dev,multitok,char_lstm,ft_tok,0.770221,0.628964


In [154]:
# Mean of `f` (scaled by 100, 2 decimals) per configuration on every dev set, split by `ooev`.
(gsc
 .loc[lambda d: d.gold_name.str.contains('dev')]
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'ooev'], sort=False)
 .f.mean()
 .mul(100)
 .round(2)
 .sort_index()
 .unstack()
 .sort_index())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ooev,False,True
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1
morph_dev_gold,morph,char_cnn,ft_oov_tok,80.33,75.25
morph_dev_gold,morph,char_cnn,ft_oov_yap,80.01,80.12
morph_dev_gold,morph,char_cnn,ft_tok,80.11,73.83
morph_dev_gold,morph,char_cnn,ft_yap,79.84,72.43
morph_dev_gold,morph,char_cnn,glv_tok,78.19,75.88
morph_dev_gold,morph,char_cnn,glv_yap,78.26,76.34
morph_dev_gold,morph,char_cnn,no_word,,63.03
morph_dev_gold,morph,char_lstm,ft_oov_tok,79.98,75.15
morph_dev_gold,morph,char_lstm,ft_oov_yap,79.77,79.28
morph_dev_gold,morph,char_lstm,ft_tok,79.56,72.9


## morpheme count
GloVe is consistently better on tokens that were segmented into more morphemes. It is actually the only one better on 3-morpheme than 2-morpheme. 

In [595]:
# How many rows of dev_unk fall under each morpheme_count value.
dev_unk['morpheme_count'].value_counts()

1    6078
2    2143
3     303
4       7
Name: morpheme_count, dtype: int64

In [596]:
# Show the rows of dev_unk whose morpheme_count is 4.
dev_unk.loc[dev_unk['morpheme_count'] == 4]

Unnamed: 0,sent_id,token_id,token_str,token_unk,morph_unk,lemma_unk,unk_type,has_ner,morpheme_count
583,27,22,שבמנזר,True,True,True,TM,False,4
1581,74,3,ולאינפלציה,True,False,False,T,False,4
1981,98,19,ושהאותוריטה,True,True,True,TM,False,4
4853,259,24,שבעתיד,True,False,False,T,False,4
5265,279,3,שבהצבעה,True,False,False,T,False,4
6494,351,23,ובמקום,True,False,False,T,False,4
8391,487,12,וברגליים,True,False,False,T,False,4


In [159]:
# Mean of `f` (scaled by 100, 2 decimals) per configuration on the dev sets,
# one column per morpheme_count value, columns in sorted order.
gsc = group_scores(scores5, groupers=[ignore, max, ignore, ignore], ment_grouper=ignore)
g = (
    gsc
    .loc[lambda d: d.gold_name.str.contains('dev')]
    .fillna('x')
    .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'morpheme_count'], sort=False)
    .f.mean()
    .mul(100)
    .round(2)
    .unstack()
    .sort_index()
)
g.reindex(sorted(g.columns), axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,morpheme_count,1,2,3,4
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph_dev_gold,morph,char_cnn,ft_oov_tok,83.11,75.21,75.89,0.0
morph_dev_gold,morph,char_cnn,ft_oov_yap,83.55,75.19,76.7,
morph_dev_gold,morph,char_cnn,ft_tok,82.74,74.77,76.48,
morph_dev_gold,morph,char_cnn,ft_yap,82.33,74.17,76.46,
morph_dev_gold,morph,char_cnn,glv_tok,81.18,72.79,78.35,
morph_dev_gold,morph,char_cnn,glv_yap,80.77,73.37,80.08,
morph_dev_gold,morph,char_cnn,no_word,62.45,60.84,78.14,
morph_dev_gold,morph,char_lstm,ft_oov_tok,82.13,76.01,74.67,
morph_dev_gold,morph,char_lstm,ft_oov_yap,83.25,74.75,76.54,
morph_dev_gold,morph,char_lstm,ft_tok,81.77,74.74,76.19,


In [315]:
# Scores grouped by morpheme count capped at 2 (min(max(x), 2)), annotated with the
# Match/Clash label and the embedding family.
gsc = group_scores(scores5, groupers=[ignore, lambda x: min(max(x), 2), ignore, ignore], ment_grouper=ignore, ment_cat_grouper=ignore)
gsc['cm'] = gsc.apply(get_cm, axis=1)
# regex=True is required: '_tok|_yap' is an alternation pattern, and pandas >= 2.0 treats
# the pattern as a literal string by default, which would leave w_embed unchanged.
gsc['embed_type'] = gsc.w_embed.str.replace('_tok|_yap', '', regex=True)


In [317]:
# Mean ± std of `f` (scaled by 100) by morpheme_count and embedding family, for 'Match'
# rows only: char_cnn on the morph dev sets and char_lstm on token_dev, excluding 'no_word'.
x = (
    gsc
    .loc[lambda d: (d.w_embed != 'no_word')
                   & (d.cm == 'Match')
                   & ((d.gold_name.isin(['morph_dev_gold', 'morph_dev_pruned'])
                       & (d.arch == 'char_cnn'))
                      | (d.gold_name.isin(['token_dev'])
                         & (d.arch == 'char_lstm')))]
    .groupby(['gold_name', 'unit', 'morpheme_count', 'embed_type'])
    .f.agg(['mean', 'std'])
    .mul(100)
    .round(2)
    .assign(mean=lambda d: '$' + d['mean'].apply('{:,.2f}'.format).astype(str)
                           + ' ± ' + d['std'].round(1).astype(str) + '$')[['mean']]
    .unstack([-2, -1])
)
# Drop the outer 'mean' level, leaving (morpheme_count, embed_type) column pairs.
x.columns = x.columns.droplevel(0)
x[[(1, 'glv'), (1, 'ft'), (1, 'ft_oov'),
   (2, 'glv'), (2, 'ft'), (2, 'ft_oov')]]

Unnamed: 0_level_0,morpheme_count,1,1,1,2,2,2
Unnamed: 0_level_1,embed_type,glv,ft,ft_oov,glv,ft,ft_oov
gold_name,unit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
morph_dev_gold,morph,$80.77 ± 0.8$,$82.33 ± 0.7$,$83.55 ± 1.2$,$74.37 ± 0.7$,$74.53 ± 0.6$,$75.43 ± 0.5$
morph_dev_pruned,morph,$79.32 ± 0.9$,$81.52 ± 0.5$,$82.46 ± 1.2$,$71.66 ± 0.5$,$70.83 ± 0.7$,$72.36 ± 0.3$
token_dev,multitok,$76.53 ± 1.0$,$78.76 ± 1.3$,$80.93 ± 0.7$,$67.47 ± 1.0$,$70.09 ± 1.1$,$72.89 ± 1.1$
token_dev,token,$79.79 ± 0.8$,$81.11 ± 0.4$,$81.96 ± 0.8$,$66.43 ± 1.4$,$70.91 ± 0.9$,$72.97 ± 0.9$


In [318]:
# Print the morpheme-count table as LaTeX. Backslashes are written explicitly because '\p'
# and '\$' are invalid escape sequences in a normal string literal; the runtime strings are unchanged.
print(x[[(1, 'glv'), (1, 'ft'), (1, 'ft_oov'),
   (2, 'glv'), (2, 'ft'), (2, 'ft_oov'),
]]
       .to_latex(bold_rows=True).replace('±', '\\pm').replace('\\$', '$'))

\begin{tabular}{llllllll}
\toprule
          & \textbf{morpheme\_count} & \multicolumn{3}{l}{1} & \multicolumn{3}{l}{2} \\
          & \textbf{embed\_type} &            glv &             ft &         ft\_oov &            glv &             ft &         ft\_oov \\
\textbf{gold\_name} & \textbf{unit} &                &                &                &                &                &                \\
\midrule
\textbf{morph\_dev\_gold} & \textbf{morph} &  $80.77 \pm 0.8$ &  $82.33 \pm 0.7$ &  $83.55 \pm 1.2$ &  $74.37 \pm 0.7$ &  $74.53 \pm 0.6$ &  $75.43 \pm 0.5$ \\
\textbf{morph\_dev\_pruned} & \textbf{morph} &  $79.32 \pm 0.9$ &  $81.52 \pm 0.5$ &  $82.46 \pm 1.2$ &  $71.66 \pm 0.5$ &  $70.83 \pm 0.7$ &  $72.36 \pm 0.3$ \\
\textbf{token\_dev} & \textbf{multitok} &  $76.53 \pm 1.0$ &  $78.76 \pm 1.3$ &  $80.93 \pm 0.7$ &  $67.47 \pm 1.0$ &  $70.09 \pm 1.1$ &  $72.89 \pm 1.1$ \\
          & \textbf{token} &  $79.79 \pm 0.8$ &  $81.11 \pm 0.4$ &  $81.96 \pm 0.8$ &  $66.43 \pm 1.4$ &  $7

In [275]:
from scipy.stats import ttest_ind

In [283]:
# This test filters on the relabelled `unk_type` value 'Comp', but the `gsc` assigned just
# above in the notebook is grouped by morpheme count. Build the unk-type frame here so the cell
# does not depend on kernel history; a separate name keeps `gsc` intact for other cells.
gsc_unk = (group_scores(scores5, groupers=[ut_gm, ignore, ignore, ignore], ment_grouper=ignore, ment_cat_grouper=ignore)
           .assign(unk_type = lambda d: d.unk_type.apply(unk_type_replace)))
tt = (gsc_unk[(gsc_unk.unk_type=='Comp') & ( ((gsc_unk.gold_name=='morph_dev_pruned')
      & (gsc_unk.arch=='char_cnn')
      & (gsc_unk.w_embed=='ft_oov_yap')

     )
    |
     (
        (gsc_unk.gold_name.isin(['token_dev']))
      & (gsc_unk.arch=='char_lstm')
      & (gsc_unk.w_embed=='ft_oov_tok')

     ))]
)

# Independent two-sample t-test on `f`: unit 'morph' vs. unit 'token'.
ttest_ind(tt[tt.unit=='morph'].f, tt[tt.unit=='token'].f)

Ttest_indResult(statistic=2.5674395676052857, pvalue=0.01937938613779738)

In [None]:
# Mean `true` / `pred` / `correct` values per configuration on the morph dev sets,
# split by morpheme_count, with columns in sorted order.
gsc = group_scores(scores2, groupers=[ignore, max, ignore], ment_grouper=ignore)
g = (
    gsc
    .loc[lambda d: d.gold_name.str.contains('morph_dev')]
    .fillna('x')
    .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'morpheme_count'], sort=False)
    [['true', 'pred', 'correct']]
    .mean()
    .unstack()
    .sort_index()
    .round(1)
)
g.reindex(sorted(g.columns), axis=1)

In [None]:
# Mean of `f` per configuration on token_dev, one column per morpheme_count, columns sorted.
g = (
    gsc
    .loc[lambda d: d.gold_name.str.contains('token_dev')]
    .fillna('x')
    .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'morpheme_count'], sort=False)
    .f.mean()
    .sort_index()
    .unstack()
)
g.reindex(sorted(g.columns), axis=1)

## Mention length

In [161]:
# Mean of `f` (scaled by 100, 2 decimals) per configuration on the dev sets,
# one column per `ment_len` bucket produced by the `four_up` mention grouper.
gsc = group_scores(scores4, groupers=[ignore, ignore, ignore], ment_grouper=four_up)
(gsc
 .loc[lambda d: d.gold_name.str.contains('dev')]
 .fillna('x')
 .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'ment_len'])
 .f.mean()
 .mul(100)
 .round(2)
 .unstack())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ment_len,1,2,3,4
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph_dev_gold,morph,char_cnn,ft_oov_tok,84.66,82.72,64.99,63.72
morph_dev_gold,morph,char_cnn,ft_oov_yap,84.18,83.9,65.69,64.77
morph_dev_gold,morph,char_cnn,ft_tok,83.72,82.87,64.78,64.67
morph_dev_gold,morph,char_cnn,ft_yap,82.41,82.99,66.12,64.93
morph_dev_gold,morph,char_cnn,glv_tok,82.38,81.61,65.77,60.04
morph_dev_gold,morph,char_cnn,glv_yap,81.95,81.56,66.25,61.99
morph_dev_gold,morph,char_cnn,no_word,62.13,69.09,55.15,56.93
morph_dev_gold,morph,char_lstm,ft_oov_tok,83.95,82.32,64.62,64.76
morph_dev_gold,morph,char_lstm,ft_oov_yap,84.74,82.63,64.58,63.43
morph_dev_gold,morph,char_lstm,ft_tok,83.17,82.02,64.2,64.88


In [582]:
# Mean of `f` per configuration on token_dev, one column per `ment_len`, columns sorted.
g = (
    gsc
    .loc[lambda d: d.gold_name.str.contains('token_dev')]
    .fillna('x')
    .groupby(['gold_name', 'unit', 'arch', 'w_embed', 'ment_len'], sort=False)
    .f.mean()
    .unstack()
)
g.reindex(sorted(g.columns), axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ment_len,1,2,3,4
gold_name,unit,arch,w_embed,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
token_dev,multitok,char_lstm,glv_yap,0.699029,0.733733,0.555272,0.4132
token_dev,multitok,char_lstm,ft_oov_yap,0.761828,0.788465,0.620608,0.489524
token_dev,multitok,char_lstm,ft_yap,0.725362,0.757029,0.552,0.450476
token_dev,multitok,char_lstm,ft_oov_tok,0.784379,0.810604,0.615246,0.464762
token_dev,multitok,char_lstm,no_word,0.546456,0.608189,0.455459,0.292277
token_dev,multitok,char_lstm,glv_tok,0.726478,0.773129,0.58693,0.389474
token_dev,multitok,char_lstm,ft_tok,0.746806,0.804697,0.611426,0.473734
token_dev,multitok,no_char,glv_yap,0.657524,0.700506,0.498668,0.188712
token_dev,multitok,no_char,ft_oov_yap,0.745697,0.77527,0.562606,0.447049
token_dev,multitok,no_char,ft_yap,0.695767,0.726564,0.536864,0.255766


In [None]:
# Use the fully qualified option name: the bare "max_rows" pattern also matches
# "styler.render.max_rows" on recent pandas and raises OptionError.
pd.set_option("display.max_rows", 100)
# Mean of `f` per configuration on the morph dev sets, with (ooev, unk_type) column pairs.
# NOTE(review): requires a `gsc` that carries both `unk_type` and `ooev` groupings — confirm
# which group_scores call is meant to precede this cell.
gsc[gsc.gold_name.str.contains('morph_dev')].fillna('x').groupby(['gold_name', 'unit', 'arch', 'w_embed', 'unk_type', 'ooev'], sort=False).f.mean().unstack([-1, -2])