In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

## Get token data

In [5]:
import sys
sys.path.append('/home/nlp/danb')
sys.path.append('/home/nlp/danb/NER')

import bclm
import ne_evaluate_mentions as nem

In [6]:
dev_lat = bclm.read_lattices(bclm.LATTICES_PATHS['dev'])
dev_lat.head(20)

Unnamed: 0,ID1,ID2,form,lemma,upostag,xpostag,feats,token_id,sent_id
0,0,1,עשרות,עשר,CDT,CDT,gen=F|num=P,1,1
1,0,1,עשרות,עשר,CD,CD,gen=F|num=P,1,1
2,1,2,אנשים,הנשים,VB,VB,gen=F|gen=M|num=S|per=1|tense=FUTURE,2,1
3,1,2,אנשים,איש,NN,NN,gen=M|num=P,2,1
4,2,3,מגיעים,הגיע,VB,VB,gen=M|num=P|per=A|tense=BEINONI,3,1
5,2,3,מגיעים,הגיע,BN,BN,gen=M|num=P|per=A,3,1
6,3,4,מ,מ,PREPOSITION,PREPOSITION,_,4,1
7,3,5,מתאילנד,מתאילנד,NNP,NNP,gen=M|num=S,4,1
8,3,5,מתאילנד,מתאילנד,NN,NN,gen=M|num=P|num=S,4,1
9,3,5,מתאילנד,מתאילנד,NN,NN,gen=M|num=S,4,1


In [7]:
test_lat = bclm.read_lattices(bclm.LATTICES_PATHS['test'])
test_lat.head(20)

Unnamed: 0,ID1,ID2,form,lemma,upostag,xpostag,feats,token_id,sent_id
0,0,1,ה,ה,DEF,DEF,_,1,1
1,0,2,ה,ה,REL,REL,_,1,1
2,0,3,הכל,הכיל,VB,VB,gen=M|num=S|per=2|tense=IMPERATIVE,1,1
3,1,3,כל,כול,DTT,DTT,_,1,1
4,2,3,כל,כול,DTT,DTT,_,1,1
5,3,4,נושאים,נשא,VB,VB,gen=M|num=P|per=A|tense=BEINONI,2,1
6,3,4,נושאים,נושא,NN,NN,gen=M|num=P,2,1
7,3,4,נושאים,נשא,BN,BN,gen=M|num=P|per=A,2,1
8,4,5,עם,עם,IN,IN,_,3,1
9,4,6,עמם,עימם,VB,VB,gen=M|num=S|per=2|tense=IMPERATIVE,3,1


In [8]:
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')
spdf = spdf[(~spdf.sent_id.isin(dropped))]
tokens_ner_with_upos = bclm.get_token_df(spdf, fields = ['biose_layer0', 'upostag'])
tokens_ner_with_upos.head()

Unnamed: 0,sent_id,token_id,token_str,biose_layer0,upostag,set
0,1,1,עשרות,O,CDT,dev
1,1,2,אנשים,O,NN,dev
2,1,3,מגיעים,O,BN,dev
3,1,4,מתאילנד,O^S-GPE,PREPOSITION^NNP,dev
4,1,5,לישראל,O^S-GPE,PREPOSITION^NNP,dev


In [9]:
spdf.groupby('set').sent_id.min()

set
dev         1
test     5439
train     501
Name: sent_id, dtype: int64

In [10]:
tok_gold_sents =   tokens_ner_with_upos.groupby('sent_id')[['token_str', 'biose_layer0']].apply(lambda x: x.values.tolist())


In [11]:
def get_biose_count(path, sent_id_shift=1):
    sents = nem.read_file_sents(path, fix_multi_tag=False, sent_id_shift=sent_id_shift)
    bc = []
    for i, sent in sents.iteritems():
        for j, (tok, bio) in enumerate(sent):
            bc.append([i, j+1, tok, bio, len(bio.split('^'))])

    bc = pd.DataFrame(bc, columns=['sent_id', 'token_id', 'token_str', 
                                   'biose', 'biose_count'])
    return bc

In [12]:
import networkx as nx

In [13]:
def get_valid_edges(lattices, bc,
                    non_o_only=True, keep_all_if_no_valid=True):
    valid_edges = []
    for (i, df), (_, biose, biose_count) in zip(lattices.groupby(['sent_id', 'token_id']), 
                                                bc[['biose', 'biose_count']].itertuples()):
        el = df[['ID1', 'ID2']].rename(columns={'ID1': 'source', 'ID2': 'target'})
        #min_node = [n for n,v in G.nodes(data=True) if v['since'] == 'December 2008'][0]

        g = nx.from_pandas_edgelist(el, create_using=nx.DiGraph)
        min_node = el.source.min()
        max_node = el.target.max()
        #print(min_node,max_node)
        #print(biose_count)
        if non_o_only and not '-' in biose:
            vp = list(nx.all_simple_paths(g, min_node, max_node))
        else:
            vp = [path for path in nx.all_simple_paths(g, min_node, max_node, cutoff=biose_count+1) if len(path)==biose_count+1]
        if keep_all_if_no_valid and len(vp)==0:
             vp = nx.all_simple_paths(g, min_node, max_node)
        for path in vp:
            for source, target in zip(path[:-1], path[1:]):
                valid_edges.append((i[0], i[1], source, target))
                
    return valid_edges

In [24]:
def to_lattices(df, path, cols = ['ID1', 'ID2', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'token_id']):
    with open(path, 'w', encoding='utf8') as of:
        for _, sent in df.groupby('sent_id'):
            for _, row in sent[cols].iterrows():
                of.write('\t'.join(row.astype(str).tolist())+'\n')
            of.write('\n')
            
    

In [17]:
erdf = pd.read_pickle('final_setup/ooo_erdf.pkl')
best_multi = (erdf.loc[(erdf.unit=='multitok') 
      & (erdf
         .groupby(['unit', 'arch', 'embed_type', 'cm'])
         .relevant_score
         .transform(max)==erdf.relevant_score),
         ['unit', 'arch', 'embed_type', 'cm', 'acc', 'model_base_name']]
 .sort_values('acc', ascending=False)
)
best_multi.head()


Unnamed: 0,unit,arch,embed_type,cm,acc,model_base_name
1128,multitok,char_lstm,ft_oov,Match,0.9743,multitok.char_lstm.ft_oov_tok.52_seed


In [18]:
import os

In [22]:
cols = ['sent_id', 'token_id', 'ID1', 'ID2']
def get_pruned_lattice(lattices, bc, non_o_only=False):
    valid_edges = get_valid_edges(lattices, bc, non_o_only=non_o_only)
    pruned_lat = lattices[lattices[cols]
                         .apply(lambda x: tuple(x) in valid_edges,
                                axis=1)]
    return pruned_lat

In [19]:
output_folder = 'final_setup/ooo_decode_output'
models_folder = 'final_setup/ooo_models'
pruned_folder = 'final_setup/ooo_pruned/lattices'
    

In [25]:
for i, row in best_multi.iterrows():
    pruned_dev_path =  os.path.join(pruned_folder, 
                                    'dev.'+row.model_base_name+'.lattices')
    if not os.path.exists(pruned_dev_path):
        print(pruned_dev_path)
        dev_path = os.path.join(output_folder, 
                                'token_dev.'+row.model_base_name+'.bmes')
        dev_bc = get_biose_count(dev_path, sent_id_shift=1)
        pdev_lat = get_pruned_lattice(dev_lat, dev_bc)
        to_lattices(pdev_lat, pruned_dev_path)

    pruned_test_path = os.path.join(pruned_folder, 
                                    'test.'+row.model_base_name+'.lattices')    
    if not os.path.exists(pruned_test_path):
        print(pruned_test_path)
        test_path = os.path.join(output_folder, 
                                 'token_test.'+row.model_base_name+'.bmes')
        test_bc = get_biose_count(test_path, sent_id_shift=5439)   
        ptest_lat = get_pruned_lattice(test_lat, test_bc)
        to_lattices(ptest_lat, pruned_test_path)

final_setup/ooo_pruned/lattices/dev.multitok.char_lstm.ft_oov_tok.52_seed.lattices
final_setup/ooo_pruned/lattices/test.multitok.char_lstm.ft_oov_tok.52_seed.lattices


## Run YAP

In [34]:
yap_path = '/home/nlp/danb/yapproj/src/yap/yap'

In [35]:
!export GOPATH=/home/nlp/danb/yapproj

In [36]:
!{yap_path}

/home/nlp/danb/yapproj/src/yap/yap - invoke yap as a standalone app or as an api server

Commands:

    api         start api server
    dep         runs dependency training/parsing
    hebma       run lexicon-based morphological analyzer on raw input
    joint       runs joint morpho-syntactic training and parsing
    ma          run data-driven morphological analyzer on raw input
    md          runs standalone morphological disambiguation training and parsing

Use "/home/nlp/danb/yapproj/src/yap/yap help <command>" for more information about a command.



In [37]:
import os

In [38]:
pruned_folder = 'final_setup/ooo_pruned/lattices'
yap_output_folder = 'final_setup/ooo_pruned/yap_output'

In [39]:
for file in os.scandir(pruned_folder):
    #ds, unit, arch, w_embed, seed_num, _
    base_out = '.'.join(file.name.split('.')[:-1])
    seg_out, map_out, conll_out = [os.path.join(yap_output_folder, base_out+suf)
                                   for suf in ['.seg', '.map', '.conll']]
    if not os.path.exists(seg_out):
        !{yap_path} joint -in {file.path} -os {seg_out} -om {map_out} -oc {conll_out}

2020/01/24 17:27:10.375153 GOMAXPROCS:	40
2020/01/24 17:27:10.375357 
2020/01/24 17:27:10.401402 *** CONFIGURATION ***
2020/01/24 17:27:10.401435 Beam:             	Standard Beam [Not Aligned & Not Averaged]
2020/01/24 17:27:10.401504 Transition System:	Joint Morpho-Syntactic [MD:Morpheme-Based Morphological Disambiguator, ArcSys:Arc Zeager (zpar acl '11) [a.k.a. ArcZEager]] - Strategy: ArcGreedy
2020/01/24 17:27:10.401527 Transition Oracle:	Joint Morpho-Syntactic - Strategy: ArcGreedy
2020/01/24 17:27:10.401544 Iterations:		1
2020/01/24 17:27:10.401564 Beam Size:		64
2020/01/24 17:27:10.401583 Beam Concurrent:	true
2020/01/24 17:27:10.401599 Parameter Func:	Funcs_Main_POS_Both_Prop
2020/01/24 17:27:10.401623 Use Lemmas:		false
2020/01/24 17:27:10.401642 Use POP:		true
2020/01/24 17:27:10.401666 Infuse Gold Dev:	false
2020/01/24 17:27:10.401689 Limit (thousands):	0
2020/01/24 17:27:10.401712 Use CoNLL-U:		false
2020/01/24 17:27:10.401755 
2020/01/24 17:27:10.401781 Features File:	joint

## Create input files for NCRF
with dummy O

In [11]:
from collections import defaultdict
txt_folder = 'final_setup/ooo_pruned/txt'
txt_map = defaultdict(list)

In [128]:
for file in os.scandir(yap_output_folder):
    if file.name.endswith('conll') and file.name!='.conll':
        ds, unit, arch, w_embed, seed_num, _ = file.name.split('.')
        out_name = '.'.join(file.name.split('.')[:-1])+'.txt'
        out_path = os.path.join(txt_folder, out_name)
        if '_tok' in w_embed:
            w_embed = w_embed.replace('_tok', '_yap')
        else:
            w_embed = w_embed.replace('_yap', '_tok')
        txt_map[(arch, w_embed)].append((ds, out_path))
        with open(out_path, 'w') as of:
            for line in open(file.path, 'r'):
                if line=='\n':
                    of.write('\n')
                else:
                    w = line.split('\t')[1]
                    of.write(w+' O\n')

In [12]:
for file in os.scandir(yap_output_folder):
    if file.name.endswith('conll') and file.name!='.conll':
        ds, unit, arch, w_embed, seed_num, _ = file.name.split('.')
        out_name = '.'.join(file.name.split('.')[:-1])+'.txt'
        out_path = os.path.join(txt_folder, out_name)
        if '_tok' in w_embed:
            w_embed = w_embed.replace('_tok', '_yap')
        else:
            w_embed = w_embed.replace('_yap', '_tok')
        txt_map[(arch, w_embed)].append((ds, out_path))

## Create configs for NCRF decode

In [17]:
output_folder = 'final_setup/ooo_decode_output'
decode_conf_folder = 'final_setup/ooo_decode_conf'

params = { 'status': 'decode' }

erdf = pd.read_pickle('final_setup/ooo_erdf.pkl')

for i, row in erdf[(erdf.unit=='morph')].iterrows():
    unit = row['unit']
    for ds, set_path in txt_map[(row.arch, row.w_embed)]:
        name = 'morph_'+ds+'_pruned'
        row_par = params.copy()
        row_par['load_model_dir'] = os.path.join(models_folder, row['model_file_name'])
        row_par['dset_dir'] = os.path.join(models_folder, row['dset_file_name'])
        row_par['decode_dir'] = os.path.join(output_folder, name+'.'+row['model_base_name']+'.bmes')
        row_par['raw_dir'] = set_path
        
        conf_path = os.path.join(decode_conf_folder, name+'.'+row['model_base_name']+'.decode.conf')
        if not os.path.exists(conf_path):
            with open(conf_path, 'w', encoding='utf8') as of:
                for k, v in row_par.items():
                    of.write(k+'='+str(v)+'\n')        
 

## Evaluate segmentation accuracy

In [26]:
dev_gold = spdf[spdf.set=='dev']
test_gold = spdf[spdf.set=='test']

In [27]:
dg = (dev_gold.groupby(['sent_id', 'token_id', 'token_str'])
      .size().reset_index().rename(columns={0: 'morpheme_count'}))
tempn = (dev_gold.groupby(['sent_id', 'token_id', 'token_str'])
         .biose_layer0.apply(lambda x: (x!='O').any()).reset_index()[['biose_layer0']])
dg['ner'] = tempn
tg = (test_gold.groupby(['sent_id', 'token_id', 'token_str']).size().reset_index()
      .rename(columns={0: 'morpheme_count'}))
tempn = (test_gold.groupby(['sent_id', 'token_id', 'token_str'])
         .biose_layer0.apply(lambda x: (x!='O').any()).reset_index()[['biose_layer0']])
tg['ner'] = tempn

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
acc_scores = []
for i, row in erdf[erdf.unit=='multitok'].iterrows():
    dev_path = os.path.join(output_folder, 
                            'token_dev.'+row.model_base_name+'.bmes')
    dev_bc = get_biose_count(dev_path, sent_id_shift=1)
    sc = { 'arch': row.arch,
           'w_embed': row.w_embed,
           'model_base_name': row.model_base_name,
           'pred_set': 'dev'}
    sc['all'] = accuracy_score(dev_bc.biose_count, dg.morpheme_count)
    sc['ner'] = accuracy_score(dev_bc[dg.ner].biose_count, dg[dg.ner].morpheme_count)
    sc['non'] = accuracy_score(dev_bc[~dg.ner].biose_count, dg[~dg.ner].morpheme_count)
    acc_scores.append(sc)
    test_path = os.path.join(output_folder, 
                             'token_test.'+row.model_base_name+'.bmes')
    test_bc = get_biose_count(test_path, sent_id_shift=5439)   
    sc = { 'arch': row.arch,
           'w_embed': row.w_embed,
           'model_base_name': row.model_base_name,
           'pred_set': 'test'}
    sc['all'] = accuracy_score(test_bc.biose_count, tg.morpheme_count)
    sc['ner'] = accuracy_score(test_bc[tg.ner].biose_count, tg[tg.ner].morpheme_count)
    sc['non'] = accuracy_score(test_bc[~tg.ner].biose_count, tg[~tg.ner].morpheme_count)
    acc_scores.append(sc)
    
acc_scores = pd.DataFrame(acc_scores)
acc_scores.head()

Unnamed: 0,arch,w_embed,model_base_name,pred_set,all,ner,non
0,char_lstm,ft_oov_tok,multitok.char_lstm.ft_oov_tok.45_seed,dev,0.973626,0.963855,0.974679
1,char_lstm,ft_oov_tok,multitok.char_lstm.ft_oov_tok.45_seed,test,0.973057,0.964048,0.974274
2,char_lstm,ft_oov_tok,multitok.char_lstm.ft_oov_tok.50_seed,dev,0.97386,0.96506,0.974808
3,char_lstm,ft_oov_tok,multitok.char_lstm.ft_oov_tok.50_seed,test,0.972977,0.960719,0.974633
4,char_lstm,ft_oov_tok,multitok.char_lstm.ft_oov_tok.44_seed,dev,0.973274,0.960241,0.974679


In [32]:
acc_scores = pd.read_pickle('final_setup/ooo_acc_scores.pkl')

In [33]:
x = acc_scores.groupby(['pred_set','w_embed', 'arch']).mean().mul(100).round(2).unstack()
x.columns = x.columns.reorder_levels([1,0])
x.sort_index(axis=1)



Unnamed: 0_level_0,arch,char_lstm,char_lstm,char_lstm
Unnamed: 0_level_1,Unnamed: 1_level_1,all,ner,non
pred_set,w_embed,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
dev,ft_oov_tok,97.35,96.41,97.45
test,ft_oov_tok,97.3,96.43,97.41


In [190]:
ys = bclm.get_token_df(bclm.read_yap_output(treebank_set='dev'), fields=['upostag'])
ys['morpheme_count'] = ys.upostag.apply(lambda x: len(x.split('^')))

In [191]:
accuracy_score(ys.morpheme_count, dg.morpheme_count)

0.9597936935880905

In [192]:
accuracy_score(ys[dg.ner].morpheme_count, dg[dg.ner].morpheme_count)


0.9313253012048193

In [193]:
accuracy_score(ys[~dg.ner].morpheme_count, dg[~dg.ner].morpheme_count)


0.9628619659784443

In [194]:
yt = bclm.get_token_df(bclm.read_yap_output(treebank_set='test'), fields=['upostag'])
yt['morpheme_count'] = yt.upostag.apply(lambda x: len(x.split('^')))

In [196]:
accuracy_score(yt.morpheme_count, tg.morpheme_count)

0.9547507726444251

In [199]:
accuracy_score(yt[tg.ner].morpheme_count, tg[tg.ner].morpheme_count)


0.8961384820239681

In [200]:
accuracy_score(yt[~tg.ner].morpheme_count, tg[~tg.ner].morpheme_count)


0.9626697850139426

In [41]:
yap_out_non_o_keep = bclm.read_yap_output(treebank_set=None, tokens_path=bclm.TREEBANK_TOKEN_PATHS['dev'], 
                                     dep_path='final_setup/ooo_pruned/yap_output/dev.multitok.char_lstm.ft_oov_tok.52_seed.conll',
                                     map_path='final_setup/ooo_pruned/yap_output/dev.multitok.char_lstm.ft_oov_tok.52_seed.map')

In [46]:
dev_gold = bclm.read_dataframe('spmrl', subset='dev')

In [47]:
yap_dev_regular = bclm.read_yap_output()

In [48]:
bclm.evaluate_dfs(dev_gold, yap_dev_regular)

11301 gold tokens/morphems, 11426 predicted, 10445 correct.
Precision: 91.41
Recall:    92.43
F1:        91.92
FP ex.: [(1, 5, 'לישראל', 'NNP'), (1, 8, 'ה', 'DEF'), (2, 11, 'דנה', 'BN'), (3, 4, 'ח"כ', 'NNT'), (3, 28, 'הם', 'S-PRN')]
FN ex.: [(1, 5, 'ישראל', 'NNP'), (1, 5, 'ל', 'PREPOSITION'), (2, 11, 'דנה', 'VB'), (3, 4, 'ח"כ', 'NN'), (3, 28, 'המ', 'S-PRN')]


(91.41431822159986, 92.42544907530306, 91.91710300523606)

In [49]:
bclm.evaluate_dfs(dev_gold, yap_out_non_o_keep)

11301 gold tokens/morphems, 11292 predicted, 10517 correct.
Precision: 93.14
Recall:    93.06
F1:        93.1
FP ex.: [(2, 11, 'דנה', 'BN'), (3, 4, 'ח"כ', 'NNT'), (3, 28, 'הם', 'S-PRN'), (4, 1, 'מצד', 'IN'), (5, 13, 'ה', 'DEF')]
FN ex.: [(2, 11, 'דנה', 'VB'), (3, 4, 'ח"כ', 'NN'), (3, 28, 'המ', 'S-PRN'), (4, 1, 'מ', 'PREPOSITION'), (4, 1, 'צד', 'NN')]


(93.13673397095289, 93.0625608353243, 93.09963262957554)

## No FORM

In [50]:
cols = ['sent_id', 'token_id', 'upostag']

In [51]:
bclm.evaluate_dfs(dev_gold, yap_dev_regular, cols=cols)

11301 gold tokens/morphems, 11426 predicted, 10541 correct.
Precision: 92.25
Recall:    93.27
F1:        92.76
FP ex.: [(1, 8, 'DEF'), (2, 11, 'BN'), (3, 4, 'NNT'), (4, 19, 'NN'), (5, 9, 'DEF')]
FN ex.: [(1, 5, 'PREPOSITION'), (2, 11, 'VB'), (3, 4, 'NN'), (4, 19, 'RB'), (5, 9, 'NNT')]


(92.25450726413443, 93.27493142199805, 92.76191314295771)

In [52]:
bclm.evaluate_dfs(dev_gold, yap_out_non_o_keep, cols=cols)

11301 gold tokens/morphems, 11292 predicted, 10603 correct.
Precision: 93.9
Recall:    93.82
F1:        93.86
FP ex.: [(2, 11, 'BN'), (3, 4, 'NNT'), (4, 1, 'IN'), (5, 13, 'DEF'), (6, 24, 'REL')]
FN ex.: [(2, 11, 'VB'), (3, 4, 'NN'), (4, 1, 'NN'), (4, 1, 'PREPOSITION'), (6, 24, 'DEF')]


(93.89833510449876, 93.8235554375719, 93.86093037666534)

## No POS, Segmentation only

In [53]:
cols = ['sent_id', 'token_id', 'form']

In [54]:
bclm.evaluate_dfs(dev_gold, yap_dev_regular, cols=cols)

11301 gold tokens/morphems, 11426 predicted, 10923 correct.
Precision: 95.6
Recall:    96.66
F1:        96.12
FP ex.: [(1, 5, 'לישראל'), (1, 8, 'ה'), (3, 28, 'הם'), (5, 9, 'ה'), (5, 22, 'ה')]
FN ex.: [(1, 5, 'ישראל'), (1, 5, 'ל'), (3, 28, 'המ'), (6, 25, 'ב'), (6, 25, 'מקום')]


(95.59775949588658, 96.65516325988851, 96.1235534826418)

In [55]:
bclm.evaluate_dfs(dev_gold, yap_out_non_o_keep, cols=cols)

11301 gold tokens/morphems, 11292 predicted, 11022 correct.
Precision: 97.61
Recall:    97.53
F1:        97.57
FP ex.: [(3, 28, 'הם'), (4, 1, 'מצד'), (5, 13, 'ה'), (8, 9, 'ה'), (8, 17, 'ה')]
FN ex.: [(3, 28, 'המ'), (4, 1, 'מ'), (4, 1, 'צד'), (8, 17, 'הפעילה'), (15, 5, 'מפם')]


(97.60892667375133, 97.53119192991771, 97.57004381888196)

## Evluate Token Means

In [61]:
bclm.evaluate_means(dev_gold, yap_dev_regular)

(11301, 27) (11426, 13)
(8531,) (8531,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gold_df['upostag'] = gold_df.upostag.str.replace('_','-')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df['upostag'] = pred_df.upostag.str.replace('_','-')


(92.39245106083693, 92.66107920134412, 92.47467220389504)

In [63]:
bclm.evaluate_means(dev_gold, yap_out_non_o_keep)

(11301, 27) (11405, 13)
(8531,) (8531,)


(92.84276950728714, 93.06548665650764, 92.90810545294192)

In [64]:
bclm.evaluate_means(dev_gold, yap_out_all_keep)

(11301, 27) (11301, 13)
(8531,) (8531,)


(93.27550502090416, 93.33313796741297, 93.27243498501264)

### No FORM

In [65]:
bclm.evaluate_means(dev_gold, yap_dev_regular, cols=cols)

(11301, 27) (11426, 13)
(8531,) (8531,)


(93.07232446372053, 93.42691360919001, 93.13696267394545)

In [66]:
bclm.evaluate_means(dev_gold, yap_out_non_o_keep, cols=cols)

(11301, 27) (11405, 13)
(8531,) (8531,)


(93.40346969874575, 93.73754542257649, 93.47466662201161)

In [67]:
bclm.evaluate_means(dev_gold, yap_out_all_keep, cols=cols)

(11301, 27) (11301, 13)
(8531,) (8531,)


(93.8264369163443, 93.94463329816746, 93.82024102572689)

## #Segment accuracy

In [69]:
gs = dev_gold.groupby(['sent_id', 'token_id', 'token_str']).size().reset_index().rename(columns={0: 'morpheme_count'})

In [70]:
gs.head()

Unnamed: 0,sent_id,token_id,token_str,morpheme_count
0,1,1,עשרות,1
1,1,2,אנשים,1
2,1,3,מגיעים,1
3,1,4,מתאילנד,2
4,1,5,לישראל,2


In [71]:
len(ps), len(gs)

(8531, 8531)

In [72]:
from sklearn.metrics import accuracy_score
accuracy_score(ps.biose_count, gs.morpheme_count)

0.974328918063533

In [73]:
accuracy_score(ps[(ps.biose.str.contains('-'))].biose_count, gs[(ps.biose.str.contains('-'))].morpheme_count)


0.9835616438356164

In [74]:
accuracy_score(ps[~(ps.biose.str.contains('-'))].biose_count, gs[~(ps.biose.str.contains('-'))].morpheme_count)


0.9734649403922574

In [75]:
gs.morpheme_count.value_counts()

1    6078
2    2143
3     303
4       7
Name: morpheme_count, dtype: int64

In [76]:
ps.biose_count.value_counts()

1    6056
2    2156
3     316
4       3
Name: biose_count, dtype: int64

In [77]:
ps[(ps.biose_count!=gs.morpheme_count) & (ps.biose.str.contains('-'))]

Unnamed: 0,sent_id,token_id,token_str,biose,biose_count
633,30,1,במלחמת,O^B-LOC^I-LOC,3
641,30,9,שכונת,O^B-LOC,2
2075,103,16,הארקין,B-ORG^E-ORG,2
2212,110,6,הארקין,B-ORG^E-ORG,2
4133,225,13,הדסון,I-ORG^E-ORG,2
4180,227,6,לנקובסקי,O^S-PER,2
4232,229,8,שקריסטול,S-PER,1
4478,239,26,באיסט,O^B-GPE,2
6254,337,1,בארץ,O^B-GPE^E-GPE,3
6573,356,11,בשן,S-PER,1


In [78]:
ps[(ps.biose_count==gs.morpheme_count) & (ps.biose.str.contains('-'))].iloc[140:190]

Unnamed: 0,sent_id,token_id,token_str,biose,biose_count
1693,82,6,"ארה""ב",S-GPE,1
1700,82,13,כברית,O^B-GPE,2
1701,82,14,המועצות,I-GPE^E-GPE,2
1705,83,3,איובה,S-GPE,1
1712,83,10,במערב,O^B-LOC^I-LOC,3
1713,83,11,התיכון,I-LOC^E-LOC,2
1719,83,17,גימי,B-PER,1
1720,83,18,קרטר,E-PER,1
1730,83,28,לאפגניסטן,O^S-GPE,2
1732,84,1,איובה,S-GPE,1


In [80]:
ys = bclm.get_token_df(bclm.read_yap_output(treebank_set='dev'), fields=['upostag'])
ys.head()

Unnamed: 0,sent_id,token_id,token_str,upostag
0,1,1,עשרות,CDT
1,1,2,אנשים,NN
2,1,3,מגיעים,BN
3,1,4,מתאילנד,PREPOSITION^NNP
4,1,5,לישראל,NNP


In [81]:
ys['morpheme_count'] = ys.upostag.apply(lambda x: len(x.split('^')))
ys.head()

Unnamed: 0,sent_id,token_id,token_str,upostag,morpheme_count
0,1,1,עשרות,CDT,1
1,1,2,אנשים,NN,1
2,1,3,מגיעים,BN,1
3,1,4,מתאילנד,PREPOSITION^NNP,2
4,1,5,לישראל,NNP,1


In [82]:
accuracy_score(ys.morpheme_count, gs.morpheme_count)

0.9597936935880905

In [83]:
accuracy_score(ys[(ps.biose.str.contains('-'))].morpheme_count, gs[(ps.biose.str.contains('-'))].morpheme_count)


0.9260273972602739

In [84]:
accuracy_score(ys[~(ps.biose.str.contains('-'))].morpheme_count, gs[~(ps.biose.str.contains('-'))].morpheme_count)


0.9629534675041661

In [85]:
ys[(ys.morpheme_count!=gs.morpheme_count) & (ps.biose.str.contains('-'))]

Unnamed: 0,sent_id,token_id,token_str,upostag,morpheme_count
4,1,5,לישראל,NNP,1
766,37,3,"שצה""ל",NNP,1
819,40,9,כמנזר,PREPOSITION^DEF^NN,3
1168,56,30,ביחד,PREPOSITION^RB,2
1182,57,12,לירושלים,NNP,1
1193,57,23,ואד,CONJ^NN,2
1199,57,29,אלי,IN^S_PRN,2
1222,58,12,לירושלים,PREPOSITION^DEF^NNP,3
1730,83,28,לאפגניסטן,PREPOSITION^DEF^NNP,3
1900,95,4,לטום,NNP,1
