# Re-evaluation
Perform two evaluations:
1. Strict morpheme evaluation
1. Token evaluation (morpheme labels are extended to the token level heuristically)

In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
%matplotlib inline

In [7]:
import pandas as pd
import numpy as np

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

In [9]:
import os

In [10]:
import sys
sys.path.append('/home/nlp/danb/NER')

import bclm
import ne_evaluate_mentions as nem

In [11]:
def get_biose_count(path, sent_id_shift=1):
    """Read a BIOSE prediction file and count the labels attached to each token.

    Parameters
    ----------
    path : str
        File handed to ``nem.read_file_sents``.
    sent_id_shift : int, default 1
        Forwarded unchanged to ``nem.read_file_sents``.

    Returns
    -------
    pandas.DataFrame
        One row per token with columns ``sent_id``, ``token_id`` (1-based
        position inside the sentence), ``token_str``, ``biose`` and
        ``biose_count`` (the number of ``^``-separated labels in ``biose``).
    """
    sents = nem.read_file_sents(path, fix_multi_tag=False, sent_id_shift=sent_id_shift)
    # `.items()` rather than `.iteritems()`: the latter was removed in pandas 2.0,
    # while `.items()` works on every pandas version (and on plain dicts).
    rows = [
        [sent_id, position, tok, bio, len(bio.split('^'))]
        for sent_id, sent in sents.items()
        for position, (tok, bio) in enumerate(sent, start=1)
    ]
    return pd.DataFrame(rows, columns=['sent_id', 'token_id', 'token_str',
                                       'biose', 'biose_count'])

In [12]:
import networkx as nx

In [13]:
def get_valid_edges(lattices, bc,
                    non_o_only=True, keep_all_if_no_valid=True):
    """Select the lattice edges compatible with the predicted label count of each token.

    ``lattices`` is grouped by ``(sent_id, token_id)`` and the groups are paired
    positionally with the rows of ``bc`` (columns ``biose`` / ``biose_count``).
    For every token a directed graph is built from its ``ID1 -> ID2`` edges and
    the paths from the smallest source node to the largest target node are
    enumerated:

    * if ``non_o_only`` is set and the token's label string contains no ``-``,
      every path is kept;
    * otherwise only paths with exactly ``biose_count + 1`` nodes are kept;
    * if nothing survives and ``keep_all_if_no_valid`` is set, every path is kept.

    Returns a list of ``(sent_id, token_id, source, target)`` tuples, one per
    edge of every kept path (an edge shared by several paths appears repeatedly).
    """
    token_groups = lattices.groupby(['sent_id', 'token_id'])
    label_rows = bc[['biose', 'biose_count']].itertuples()

    kept_edges = []
    for (key, token_lat), (_, labels, n_labels) in zip(token_groups, label_rows):
        edge_list = token_lat[['ID1', 'ID2']].rename(columns={'ID1': 'source', 'ID2': 'target'})
        graph = nx.from_pandas_edgelist(edge_list, create_using=nx.DiGraph)
        start = edge_list.source.min()
        end = edge_list.target.max()

        if non_o_only and '-' not in labels:
            paths = list(nx.all_simple_paths(graph, start, end))
        else:
            wanted_nodes = n_labels + 1
            paths = [
                candidate
                for candidate in nx.all_simple_paths(graph, start, end, cutoff=wanted_nodes)
                if len(candidate) == wanted_nodes
            ]

        if keep_all_if_no_valid and len(paths) == 0:
            paths = nx.all_simple_paths(graph, start, end)

        for candidate in paths:
            kept_edges.extend(
                (key[0], key[1], src, dst)
                for src, dst in zip(candidate[:-1], candidate[1:])
            )

    return kept_edges

In [14]:
def to_lattices(df, path, cols = ['ID1', 'ID2', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'token_id']):
    """Dump ``df`` to ``path`` as tab-separated lattice rows.

    Rows are grouped by ``sent_id``; within a group each row is written as the
    string values of ``cols`` joined by tabs, and every group is terminated by
    an empty line. The file is written as UTF-8.
    """
    with open(path, 'w', encoding='utf8') as out:
        for _, sentence in df.groupby('sent_id'):
            out.writelines(
                '\t'.join(record.astype(str).tolist()) + '\n'
                for _, record in sentence[cols].iterrows()
            )
            out.write('\n')
            
    

In [15]:
def prune_lattices(lattices_path, ner_pred_path, output_path, keep_all_if_no_valid=True):
    """Write a copy of a lattice file restricted to edges allowed by NER predictions.

    The lattices are read with ``bclm.read_lattices``, the per-token label
    counts with ``get_biose_count`` and the admissible edges are computed by
    ``get_valid_edges`` (called with ``non_o_only=False``). Rows whose
    ``(sent_id, token_id, ID1, ID2)`` is among those edges are written to
    ``output_path`` via ``to_lattices``.
    """
    lattices = bclm.read_lattices(lattices_path)
    label_counts = get_biose_count(ner_pred_path, sent_id_shift=1)
    allowed = get_valid_edges(lattices, label_counts,
                              non_o_only=False,
                              keep_all_if_no_valid=keep_all_if_no_valid)

    key_cols = ['sent_id', 'token_id', 'ID1', 'ID2']
    edge_keys = lattices[key_cols].apply(lambda row: tuple(row), axis=1)
    keep_mask = edge_keys.isin(allowed)
    to_lattices(lattices[keep_mask], output_path)

## Evaluate Segmentation

In [16]:
# Sentence ids excluded from the SPMRL dataframe before any evaluation.
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl').loc[lambda frame: ~frame.sent_id.isin(dropped)]

In [17]:
dev_gold = bclm.read_dataframe('spmrl', subset='dev')
# Explicit copy: writing a column into a filtered slice of `spdf` is what
# triggered the SettingWithCopyWarning.
test_gold = spdf[spdf.set=='test'].copy()
# Renumber the surviving test sentences 1..N in ascending sent_id order, so ids
# stay consecutive after the dropped sentences were removed. The map is built
# directly instead of through `assign(index=lambda x: x+1)`, which added 1 to
# the whole frame rather than to the positional index only.
test_sent_ids = np.sort(test_gold.sent_id.unique())
test_sent_id_map = pd.Series(np.arange(1, len(test_sent_ids) + 1),
                             index=pd.Index(test_sent_ids, name='sent_id'),
                             name='index')
test_gold['sent_id'] = test_gold.sent_id.map(test_sent_id_map)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
# Columns passed as `cols=` to bclm.evaluate_dfs for the second evaluation
# (reported as `f_seg_only`): the POS tag is left out of the comparison.
cols = ['sent_id', 'token_id', 'form']

In [19]:
# Score the pruned YAP output of every `multi_*` model against the gold test
# morphemes: once with evaluate_dfs' default columns (f_sp) and once on
# sent_id/token_id/form only (f_so).
res = []
for folder in os.scandir('output/predict'):
    # Same filter as the later per-model loops: real directories only,
    # skipping Jupyter checkpoint entries.
    if not (os.path.isdir(folder) and 'multi' in folder.name and '.ipynb' not in folder.name):
        continue

    test_lfo = bclm.read_yap_output(treebank_set=None,
                                    tokens_path=bclm.TREEBANK_TOKEN_PATHS['test'],
                                    dep_path=os.path.join(folder.path, 'test_pruned.conll'),
                                    map_path=os.path.join(folder.path, 'test_pruned.map'))
    p, r, f_sp = bclm.evaluate_dfs(test_gold, test_lfo)
    p, r, f_so = bclm.evaluate_dfs(test_gold, test_lfo, cols=cols)

    res.append((folder.name, f_sp, f_so))

# A snippet that used to live here (disabled) called, for every '.bmes' file in
# the folder, prune_lattices(bclm.LATTICES_PATHS[<set>], file.path, ...) to write
# 'dev_pruned.lat' / 'test_pruned.lat' into the model folder.
                    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gold_df['upostag'] = gold_df.upostag.str.replace('_','-')


16828 gold tokens/morphems, 16857 predicted, 15399 correct.
Precision: 91.35
Recall:    91.51
F1:        91.43
FP ex.: [(1, 1, 'הכל', 'VB'), (1, 3, 'עם', 'IN'), (1, 17, 'החל', 'VB'), (1, 20, 'כלה', 'NN'), (2, 1, 'אומר', 'NNT')]
FN ex.: [(1, 1, 'הכל', 'NN'), (1, 3, 'עמ', 'IN'), (1, 17, 'החל', 'IN'), (1, 20, 'כלה', 'IN'), (2, 1, 'אומר', 'BN')]
16828 gold tokens/morphems, 16857 predicted, 16421 correct.
Precision: 97.41
Recall:    97.58
F1:        97.5
FP ex.: [(1, 3, 'עם'), (2, 3, 'ה'), (2, 3, 'תק"ם'), (4, 10, 'לנו'), (4, 10, 'ש')]
FN ex.: [(1, 3, 'עמ'), (2, 3, 'התק"ם'), (4, 10, 'אנחנו'), (4, 10, 'של'), (6, 9, 'O')]
16828 gold tokens/morphems, 16842 predicted, 15394 correct.
Precision: 91.4
Recall:    91.48
F1:        91.44
FP ex.: [(1, 1, 'הכל', 'VB'), (1, 3, 'עם', 'IN'), (1, 17, 'החל', 'VB'), (1, 20, 'כלה', 'NN'), (2, 1, 'אומר', 'NNT')]
FN ex.: [(1, 1, 'הכל', 'NN'), (1, 3, 'עמ', 'IN'), (1, 17, 'החל', 'IN'), (1, 20, 'כלה', 'IN'), (2, 1, 'אומר', 'BN')]
16828 gold tokens/morphems, 16842 p

In [20]:
# One row per model folder: F1 from the default evaluate_dfs call (`f_seg_pos`)
# and F1 from the call restricted to sent_id/token_id/form (`f_seg_only`).
seg_res_df = pd.DataFrame(res, columns=['model', 'f_seg_pos', 'f_seg_only'])
seg_res_df

Unnamed: 0,model,f_seg_pos,f_seg_only
0,multi_54360,91.42942,97.497402
1,multi_44184,91.440451,97.451737
2,multi_20423,91.500861,97.511433
3,multi_80520,91.363231,97.493317
4,multi_27916,91.394659,97.465875
5,multi_63795,91.493911,97.576478
6,multi_30528,91.46403,97.473353
7,multi_78160,91.41026,97.498739
8,multi_12345,91.455377,97.500148
9,multi_95148,91.391909,97.504901


In [21]:
# Average both F1 columns over the models. `numeric_only=True` skips the string
# `model` column explicitly; without it pandas >= 2.0 raises a TypeError.
seg_res_df.mean(numeric_only=True)

f_seg_pos     91.434411
f_seg_only    97.497338
dtype: float64

## Align Multitok

In [22]:
def soft_merge_bio_labels(multitok_sents, tokmorph_sents, verbose=False):
    """Attach token-level multi-labels to the morphological forms of each token.

    Parameters
    ----------
    multitok_sents : mapping (e.g. pandas.Series or dict)
        sentence id -> sequence of ``(token, labels)`` where ``labels`` is a
        ``^``-separated string of labels predicted for that token.
    tokmorph_sents : mapping (e.g. pandas.Series or dict)
        sentence id -> sequence of ``(token_id, token_str, forms)`` where
        ``forms`` is a ``^``-separated string of the token's forms.
    verbose : bool, default False
        Print ``(case, pairs)`` whenever the counts differ (case 2: more forms
        than labels, case 3: fewer forms than labels).

    Returns
    -------
    list of list of (form, label)
        One list per sentence. Sentences and tokens of the two inputs are paired
        positionally; ``zip`` stops at the shorter input, so the inputs are
        expected to be aligned.

    Alignment inside a token is right-anchored: the last label goes to the last
    form. Surplus leading forms get ``'O'``; surplus leading labels are dropped.
    """
    new_sents = []
    # `.items()` instead of the pandas-only `.iteritems()` (removed in pandas 2.0).
    for (_, mt_sent), (_, mor_sent) in zip(multitok_sents.items(), tokmorph_sents.items()):
        new_sent = []
        for (_, bio), (_, _, forms) in zip(mt_sent, mor_sent):
            forms = forms.split('^')
            bio = bio.split('^')
            surplus_forms = len(forms) - len(bio)
            if surplus_forms == 0:
                case, pairs = 1, list(zip(forms, bio))
            elif surplus_forms > 0:
                # pad the leading forms with 'O', then pair the remaining forms 1:1
                padding = list(zip(forms[:surplus_forms], ['O'] * surplus_forms))
                case, pairs = 2, padding + list(zip(forms[surplus_forms:], bio))
            else:
                # keep only the trailing labels, one per form
                case, pairs = 3, list(zip(forms, bio[-len(forms):]))
            if verbose and case != 1:
                print((case, pairs))
            new_sent.extend(pairs)
        new_sents.append(new_sent)
    return new_sents

In [23]:
def align_multitok(ner_pred_path, tokens_path, conll_path, map_path, output_path):
    """Project token-level multi-label predictions onto a YAP segmentation.

    Reads the predictions in ``ner_pred_path`` with ``nem.read_file_sents`` and
    the segmentation described by ``tokens_path`` / ``conll_path`` / ``map_path``
    with ``bclm.read_yap_output``, aligns them with ``soft_merge_bio_labels`` and
    writes one ``form label`` line per form to ``output_path``, with an empty
    line after every sentence.
    """
    token_preds = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)
    prun_yo = bclm.read_yap_output(treebank_set=None, tokens_path=tokens_path,
                                   dep_path=conll_path, map_path=map_path)
    prun_yo = bclm.get_token_df(prun_yo, fields=['form'])
    prun_sents = bclm.get_sentences_list(prun_yo, fields=['token_id', 'token_str', 'form'])
    new_sents = soft_merge_bio_labels(token_preds, prun_sents, verbose=False)

    # Explicit UTF-8 (as in to_lattices): the forms are Hebrew text and the
    # platform default encoding is not guaranteed to be able to represent them.
    with open(output_path, 'w', encoding='utf8') as of:
        for sent in new_sents:
            of.writelines(form + ' ' + bio + '\n' for form, bio in sent)
            of.write('\n')


In [24]:
# Gold BIOSE files used as the reference when scoring predictions, keyed first
# by evaluation kind and then by data split.
# NOTE(review): the 'token' entries point at the same morph_gold_* files as
# 'multitok' — possibly a copy-paste slip. Only decode_sets['multitok']['test']
# is read in the cells shown here; confirm before using the 'token' entries.
decode_sets = {
    'token': {
        'dev': '../NER/data/for_ncrf/morph_gold_dev.bmes',
        'test': '../NER/data/for_ncrf/morph_gold_test.bmes',
    },
    'multitok': {
        'dev': '../NER/data/for_ncrf/morph_gold_dev.bmes',
        'test': '../NER/data/for_ncrf/morph_gold_test.bmes',
    }
}

In [25]:
# Single-model sanity check: align the token-level predictions of multi_44184
# with its pruned segmentation, then score the result against the gold file.
example_dir = 'output/predict/multi_44184'
example_out = example_dir + '/morph_pruned_test.bmes'
align_multitok(
    example_dir + '/token_gold_test_dummy_o.bmes',
    bclm.TREEBANK_TOKEN_PATHS['test'],
    example_dir + '/test_pruned.conll',
    example_dir + '/test_pruned.map',
    example_out,
)
p, r, f = nem.evaluate_files(decode_sets['multitok']['test'], example_out, str_join_char='')
p, r, f

(0.75, 0.7628755364806867, 0.7563829787234043)

In [26]:
# Token-level score of the same model (multi_44184): its raw token predictions
# against the token-level gold file, without any alignment step.
nem.evaluate_files('../NER/data/for_ncrf/token_gold_test_fix.bmes', 
                   'output/predict/multi_44184/token_gold_test_dummy_o.bmes', str_join_char='')

(0.7789473684210526, 0.7939914163090128, 0.7863974495217853)

In [27]:
# Repeat the align-and-score step for every `multi_*` model folder.
res = []
for folder in os.scandir('output/predict'):
    is_model_dir = (os.path.isdir(folder)
                    and 'multi' in folder.name
                    and '.ipynb_checkpoints' not in folder.name)
    if not is_model_dir:
        continue

    print (folder.name)
    pruned_ner_path = os.path.join(folder.path, 'morph_pruned_test.bmes')

    # writes `pruned_ner_path`, which is then scored against the gold file
    align_multitok(
        os.path.join(folder.path, 'token_gold_test_dummy_o.bmes'),
        bclm.TREEBANK_TOKEN_PATHS['test'],
        os.path.join(folder.path, 'test_pruned.conll'),
        os.path.join(folder.path, 'test_pruned.map'),
        pruned_ner_path,
    )
    p, r, f = nem.evaluate_files(decode_sets['multitok']['test'], pruned_ner_path, str_join_char='')
    res.append((folder.name, p, r, f))


multi_54360
multi_44184
multi_20423
multi_80520
multi_27916
multi_63795
multi_30528
multi_78160
multi_12345
multi_95148


In [28]:
# Show the collected (model folder, p, r, f) tuples as returned by nem.evaluate_files.
res

[('multi_54360', 0.7497371188222923, 0.7650214592274678, 0.7573021773765267),
 ('multi_44184', 0.75, 0.7628755364806867, 0.7563829787234043),
 ('multi_20423', 0.75, 0.7660944206008584, 0.7579617834394905),
 ('multi_80520', 0.7368972746331237, 0.7542918454935622, 0.7454931071049842),
 ('multi_27916', 0.7466666666666667, 0.7811158798283262, 0.7635028841111694),
 ('multi_63795', 0.7357512953367875, 0.7618025751072961, 0.7485503426462835),
 ('multi_30528', 0.7427685950413223, 0.7714592274678111, 0.7568421052631579),
 ('multi_78160', 0.7515856236786469, 0.7628755364806867, 0.7571884984025559),
 ('multi_12345', 0.7306122448979592, 0.7682403433476395, 0.7489539748953974),
 ('multi_95148', 0.7569296375266524, 0.7618025751072961, 0.7593582887700534)]

In [29]:
# Tabulate the per-model scores of the aligned multi-label predictions.
ne_morph_df = pd.DataFrame(res, columns=['model', 'precision', 'recall', 'f'])
ne_morph_df

Unnamed: 0,model,precision,recall,f
0,multi_54360,0.749737,0.765021,0.757302
1,multi_44184,0.75,0.762876,0.756383
2,multi_20423,0.75,0.766094,0.757962
3,multi_80520,0.736897,0.754292,0.745493
4,multi_27916,0.746667,0.781116,0.763503
5,multi_63795,0.735751,0.761803,0.74855
6,multi_30528,0.742769,0.771459,0.756842
7,multi_78160,0.751586,0.762876,0.757188
8,multi_12345,0.730612,0.76824,0.748954
9,multi_95148,0.75693,0.761803,0.759358


In [30]:
def biose_to_o(in_path, out_path):
    """Copy a ``word tag`` file, replacing every tag with ``O``.

    Blank lines (sentence boundaries) are copied as empty lines and counted; the
    number of boundaries seen is printed at the end. Every other line must hold
    exactly two whitespace-separated fields, otherwise ``ValueError`` is raised.

    Both files are handled as UTF-8: the input used to be read with the platform
    default encoding while the output was already written as UTF-8, and the
    input handle was never closed.
    """
    sents = 0
    with open(in_path, 'r', encoding='utf8') as src, \
         open(out_path, 'w', encoding='utf8') as of:
        for line in src:
            stripped = line.strip()
            if not stripped:
                # sentence boundary (also tolerates whitespace-only lines)
                of.write('\n')
                sents += 1
                continue
            word, _ = stripped.split()
            of.write(word + ' O\n')

    print (sents)
    
# For every `multi_*` model folder, write an all-'O' copy of its aligned
# predictions next to the original, with the '.bioul' extension.
for folder in os.scandir('output/predict'):
    if not os.path.isdir(folder):
        continue
    if 'multi' not in folder.name or '.ipynb_checkpoints' in folder.name:
        continue
    pruned_ner_path = os.path.join(folder.path, 'morph_pruned_test.bmes')
    bioul_path = pruned_ner_path.replace('.bmes', '.bioul')
    biose_to_o(pruned_ner_path, bioul_path)


706
706
706
706
706
706
706
706
706
706


In [32]:
import json

def jsonl_to_biose(in_path, out_path, bioul_to_biose=True):
    """Convert a JSON-lines prediction file to a ``word tag`` file.

    Every non-blank input line must be a JSON object with parallel ``words`` and
    ``tags`` lists. One ``word tag`` line is written per pair, followed by an
    empty line per sentence. The number of sentences is printed at the end.

    Parameters
    ----------
    in_path, out_path : str
        Input JSON-lines file and output file (both handled as UTF-8).
    bioul_to_biose : bool, default True
        Rewrite the BIOUL prefixes ``L-`` / ``U-`` as ``E-`` / ``S-``. Only the
        tag prefix is rewritten, so a category that happens to contain ``L-`` or
        ``U-`` is left intact.
    """
    prefix_map = {'L-': 'E-', 'U-': 'S-'}
    sents = 0
    with open(in_path, 'r', encoding='utf8') as src, \
         open(out_path, 'w', encoding='utf8') as of:
        for line in src:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            sent = json.loads(line)
            for word, tag in zip(sent['words'], sent['tags']):
                if bioul_to_biose and tag[:2] in prefix_map:
                    tag = prefix_map[tag[:2]] + tag[2:]
                of.write(word + ' ' + tag + '\n')
            of.write('\n')
            sents += 1
    print (sents)


In [33]:
# Convert the JSON-lines predictions of every `morph_*` model folder to a
# '.bmes' file stored next to them.
for folder in os.scandir('output/predict'):
    if not os.path.isdir(folder):
        continue
    if 'morph' not in folder.name or '.ipynb_checkpoints' in folder.name:
        continue
    pruned_ner_path = os.path.join(folder.path, 'morph_pruned_test.json')
    bmes_path = pruned_ner_path.replace('.json', '.bmes')
    jsonl_to_biose(pruned_ner_path, bmes_path)

706
706
706
706
706
706
706
706
706
706


## SINGLE + MULTI

In [34]:
# Score every model folder under output/predict against the matching gold file.
gold_dir = '../NER/data/for_ncrf/'
token_gold_path = gold_dir + 'token_gold_test_fix.bmes'
morph_gold_path = gold_dir + 'morph_gold_test.bmes'

# (marker looked up in the folder name, gold file, prediction file inside the
#  folder, eval_unit, prediction label) — evaluated in this order per folder
eval_specs = [
    ('single', token_gold_path, 'token_gold_test_fix.bmes', 'token', 'tokens'),
    ('multi', token_gold_path, 'token_gold_test_dummy_o.bmes', 'token', 'tokens'),
    ('morph', morph_gold_path, 'morph_gold_test.bmes', 'morph', 'gold'),
    ('morph', morph_gold_path, 'morph_yap_test.bmes', 'morph', 'yap'),
    ('morph', morph_gold_path, 'morph_pruned_test.bmes', 'morph', 'hybrid'),
]

res = []
for folder in os.scandir('output/predict'):
    if '.ipynb' in folder.name:
        continue

    # folder names are expected to look like '<variant>_<seed>'
    variant, seed = folder.name.split('_')

    for marker, gold_path, pred_name, eval_unit, prediction in eval_specs:
        if marker not in folder.name:
            continue
        p, r, f = nem.evaluate_files(gold_path,
                                     os.path.join(folder.path, pred_name),
                                     str_join_char='')
        res.append(('test', eval_unit, variant, prediction, '-', seed, p, r, f))

ne_df = pd.DataFrame(res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

ne_df.groupby(['eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p,r,f
eval_unit,variant,prediction,align,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
morph,morph,gold,-,0.760813,0.796888,0.778388
morph,morph,hybrid,-,0.734401,0.761588,0.74771
morph,morph,yap,-,0.661856,0.673391,0.667525
token,multi,tokens,-,0.774972,0.797747,0.78616
token,single,tokens,-,0.781663,0.814914,0.797819


In [35]:
# Same evaluation for the models under output/predict_char; the hybrid
# (pruned) morpheme predictions are not scored for these models.
gold_dir = '../NER/data/for_ncrf/'
token_gold_path = gold_dir + 'token_gold_test_fix.bmes'
morph_gold_path = gold_dir + 'morph_gold_test.bmes'

# (marker looked up in the folder name, gold file, prediction file inside the
#  folder, eval_unit, prediction label) — evaluated in this order per folder
char_eval_specs = [
    ('single', token_gold_path, 'token_gold_test_fix.bmes', 'token', 'tokens'),
    ('multi', token_gold_path, 'token_gold_test_dummy_o.bmes', 'token', 'tokens'),
    ('morph', morph_gold_path, 'morph_gold_test.bmes', 'morph', 'gold'),
    ('morph', morph_gold_path, 'morph_yap_test.bmes', 'morph', 'yap'),
]

res = []
for folder in os.scandir('output/predict_char'):
    if '.ipynb' in folder.name:
        continue

    # folder names are expected to look like '<variant>_<seed>'
    variant, seed = folder.name.split('_')

    for marker, gold_path, pred_name, eval_unit, prediction in char_eval_specs:
        if marker not in folder.name:
            continue
        p, r, f = nem.evaluate_files(gold_path,
                                     os.path.join(folder.path, pred_name),
                                     str_join_char='')
        res.append(('test', eval_unit, variant, prediction, '-', seed, p, r, f))

ne_c_df = pd.DataFrame(res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

ne_c_df.groupby(['eval_unit','variant', 'prediction', 'align']).f.mean()

eval_unit  variant  prediction  align
morph      morph    gold        -        0.784453
                    yap         -        0.673834
token      multi    tokens      -        0.787413
           single   tokens      -        0.794716
Name: f, dtype: float64

## Add Alignments

### Token Level Eval

In [36]:
import re
from functools import lru_cache

# Patterns over the string of BIOSE letters of one token's morphemes
# (e.g. 'OBI'); each one maps a well-formed letter string to a single letter.
o_re = re.compile('^O+$')
s_re = re.compile('^O*SO*$|^O*BI*EO*$')
b_re = re.compile('^O*BI*$')
i_re = re.compile('^I+$')
e_re = re.compile('^I*EO*$')

# Union of the five patterns above. '^O+$' is included so that an all-'O' token
# counts as well formed (it used to be missing, which reported such tokens as
# invalid and left the `o_re` branch below unreachable).
valid_bio_re = re.compile('^O+$|^O*BI*$|^O*BI*EO*$|^I+$|^I*EO*$|^O*SO*$')


def get_fixed_for_valid_biose(bio_seq):
    """Collapse a well-formed BIOSE letter string into one letter.

    Raises ``ValueError`` when ``bio_seq`` matches none of the patterns.
    """
    if o_re.match(bio_seq):
        return 'O'
    if s_re.match(bio_seq):
        return 'S'
    if b_re.match(bio_seq):
        return 'B'
    if i_re.match(bio_seq):
        return 'I'
    if e_re.match(bio_seq):
        return 'E'
    raise ValueError


def get_fixed_for_invalid_biose(parts):
    """Pick one letter for a malformed BIOSE letter string.

    Priority: ``S`` if an ``S`` is present or both ``B`` and ``E`` are present,
    then ``E``, ``B``, ``I``; ``O`` when none of these letters occurs.
    """
    if 'S' in parts:
        return 'S'
    if 'B' in parts and 'E' in parts:
        return 'S'
    for letter in ('E', 'B', 'I'):
        if letter in parts:
            return letter
    return 'O'


def validate_biose_sequence(full_bio_seq):
    """Check the morpheme labels of one token and collapse them to one label.

    Parameters
    ----------
    full_bio_seq : sequence of str
        Labels such as ``'O'`` or ``'B-PER'``. Each label is split at its first
        ``-`` only, so categories containing a hyphen are kept whole.

    Returns
    -------
    (bool, bool, str)
        Whether the letter sequence is well formed, whether at most one distinct
        category occurs, and the collapsed label. A non-``O`` collapsed label
        carries the category of the first non-``O`` label.

    Raises
    ------
    ValueError
        If ``full_bio_seq`` is empty.
    """
    if len(full_bio_seq) == 0:
        raise ValueError('empty BIOSE sequence')

    letters, categories = [], []
    for label in full_bio_seq:
        if label == 'O':
            letters.append('O')
            continue
        letter, _, category = label.partition('-')
        letters.append(letter)
        if category:
            categories.append(category)

    bio_seq = ''.join(letters)
    is_valid = valid_bio_re.match(bio_seq) is not None
    if is_valid:
        fixed_bio = get_fixed_for_valid_biose(bio_seq)
    else:
        # rough BIOSE letter; the category is still taken from the first label
        fixed_bio = get_fixed_for_invalid_biose(bio_seq)

    if fixed_bio != 'O' and categories:
        fixed_bio += '-' + categories[0]

    return is_valid, len(set(categories)) <= 1, fixed_bio


@lru_cache(1000)
def get_fixed_bio_sequence(full_bio_seq):
    """Cached shortcut returning only the collapsed label; needs a hashable (tuple) argument."""
    return validate_biose_sequence(full_bio_seq)[2]

In [37]:
# Reload the gold data and the YAP output, and build per-sentence
# (token_id, token_str) lists for both the dev and the test split.
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')
spdf = spdf[(~spdf.sent_id.isin(dropped))]
# Explicit copies: these frames are modified below / downstream, and writing
# into a filtered slice of `spdf` raises SettingWithCopyWarning.
dev_gold = spdf[spdf.set=='dev'].copy()
test_gold = spdf[spdf.set=='test'].copy()
# renumber the remaining test sentences 1..N (dense rank of the original ids)
test_gold['sent_id'] = test_gold.sent_id.rank(method='dense').astype(int)
dev_yap = bclm.read_yap_output(treebank_set='dev')
test_yap = bclm.read_yap_output(treebank_set='test')
dev_gold_sents = bclm.get_sentences_list(dev_gold, fields=['token_id', 'token_str'])
test_gold_sents = bclm.get_sentences_list(test_gold, fields=['token_id', 'token_str'])
dev_yap_sents = bclm.get_sentences_list(dev_yap, fields=['token_id', 'token_str'])
test_yap_sents = bclm.get_sentences_list(test_yap, fields=['token_id', 'token_str'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [38]:
# Token-level gold labels: the `biose_layer0` column returned by
# bclm.get_token_df is exposed under the name `fixed_bio`, the column the
# prediction frames use.
label_rename = {'biose_layer0': 'fixed_bio'}
dev_gold_tok = bclm.get_token_df(dev_gold, biose=['biose_layer0']).rename(columns=label_rename)
test_gold_tok = bclm.get_token_df(test_gold, biose=['biose_layer0']).rename(columns=label_rename)
test_gold_tok = test_gold_tok.assign(
    sent_id=lambda frame: frame.sent_id.rank(method='dense').astype(int)
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [39]:
path = 'output/predict/morph_12345/morph_yap_test.bmes'
def get_fixed_tok(path, orig_sents=dev_yap_sents):
    """Collapse morpheme-level predictions into one label per token.

    Parameters
    ----------
    path : str
        Prediction file read with ``nem.read_file_sents``; each sentence is a
        sequence of ``(form, label)`` pairs.
    orig_sents : mapping
        sentence id -> sequence of ``(token_id, token_str)``, one entry per
        predicted form, in the same order as the predictions (sentences and
        entries are paired positionally). The default is bound to the value of
        ``dev_yap_sents`` at definition time.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``bclm.get_token_df(..., fields=['bio'])`` with an
        extra ``fixed_bio`` column: the single label computed by
        ``get_fixed_bio_sequence`` from the ``^``-separated ``bio`` value.
    """
    preds = nem.read_file_sents(path, fix_multi_tag=False)
    rows = []
    # `.items()` instead of `.iteritems()`, which was removed in pandas 2.0
    for (_, ner_sent), (sent_id, tok_sent) in zip(preds.items(), orig_sents.items()):
        for (form, bio), (token_id, token_str) in zip(ner_sent, tok_sent):
            rows.append((sent_id, token_id, token_str, form, bio))
    morph_df = pd.DataFrame(rows, columns=['sent_id', 'token_id', 'token_str', 'form', 'bio'])
    new_toks = bclm.get_token_df(morph_df, fields=['bio'])
    new_toks['fixed_bio'] = new_toks.bio.apply(lambda labels: get_fixed_bio_sequence(tuple(labels.split('^'))))
    return new_toks

new_toks = get_fixed_tok(path, orig_sents=test_yap_sents)
new_toks.head(20)

Unnamed: 0,sent_id,token_id,token_str,bio,fixed_bio
0,1,1,הכל,O^O,O
1,1,2,נושאים,O,O
2,1,3,עמם,O,O
3,1,4,את,O,O
4,1,5,כישלונות,O,O
5,1,6,הקליטה,O^O,O
6,1,7,בעליות,O^O^O,O
7,1,8,הקודמות,O^O,O
8,1,9,",",O,O
9,1,10,את,O,O


In [40]:
def sents_from_df(df, sent_id_col='sent_id', 
                  group_cols=['token_str'], 
                  val_cols=['fixed_bio']):
    """Return ``bclm.get_sentences_list`` over ``group_cols + val_cols``.

    ``sent_id_col`` is accepted but not used by this function.
    """
    return bclm.get_sentences_list(df, fields=group_cols+val_cols)

def evaluate_dataframes(gold_df, pred_df, fix_multi_tag_pred=True, truncate=None, ignore_cat=False, str_join_char=' '):
    """Score predicted mentions against gold mentions, both given as token frames.

    Each frame is turned into sentences with ``sents_from_df`` and into mentions
    with ``nem.sents_to_mentions`` (same ``truncate`` / ``ignore_cat`` /
    ``str_join_char`` for both). Returns the result of
    ``nem.evaluate_mentions(gold, pred, verbose=False)``.
    ``fix_multi_tag_pred`` is accepted but not used by this function.
    """
    mention_kwargs = dict(truncate=truncate, ignore_cat=ignore_cat, str_join_char=str_join_char)
    gold_sents = sents_from_df(gold_df)
    pred_sents = sents_from_df(pred_df)
    gold_mentions = nem.sents_to_mentions(gold_sents, **mention_kwargs)
    pred_mentions = nem.sents_to_mentions(pred_sents, **mention_kwargs)
    return nem.evaluate_mentions(gold_mentions, pred_mentions, verbose=False)

In [41]:
# Inspect the per-sentence [token_str, fixed_bio] lists built from the token frame.
sents_from_df(new_toks)

sent_id
1      [[הכל, O], [נושאים, O], [עמם, O], [את, O], [כי...
2      [[אומר, O], [מזכיר, O], [התק"ם, S-ORG], [,, O]...
3      [[לא, O], [ייתכן, O], [שעולה, O], [יבוא, O], [...
4      [[לא, O], [ייתכן, O], [שהוא, O], [יירד, O], [מ...
5      [[לכן, O], [קבענו, O], [עיקרון, O], [שצריכה, O...
                             ...                        
702    [[האנטיפסטו, O], [של, O], [מאכלי, O], [ים, O],...
703    [[אף, O], [שהמנה, O], [היתה, O], [טעימה, O], [...
704    [[כמו, O], [שיעולים, O], [,, O], [שלשול, O], [...
705    [[הוריהם, O], [האמינו, O], [ברפואה, O], [המודר...
706    [[העצה, O], [היתה, O], [טובה, O], [אך, O], [לא...
Length: 706, dtype: object

In [42]:
# Token-level score of the collapsed predictions against the gold token labels.
evaluate_dataframes(test_gold_tok, new_toks, str_join_char='')

(0.7568421052631579, 0.7714592274678111, 0.7640807651434643)

#### Hybrid


In [43]:
# Token files per data split, looked up by get_prun_yo through `token_paths`.
out_folder = '../NER/data/tokens_for_ncrf'
token_paths = {split: os.path.join(out_folder, split + '_tokens.txt') for split in ('dev', 'test')}
dev_out = token_paths['dev']
test_out = token_paths['test']

In [44]:
@lru_cache(512)
def get_prun_yo(ds, dep_path, map_path):
    """Read YAP output for data split ``ds`` from the given conll / map files.

    The token file is taken from the module-level ``token_paths[ds]``. Results
    are memoised per ``(ds, dep_path, map_path)``, so repeated calls return the
    very same object — callers should not modify it in place.
    """
    return bclm.read_yap_output(
        treebank_set=None,
        tokens_path=token_paths[ds],
        dep_path=dep_path,
        map_path=map_path,
    )

In [45]:
# Single-model example of the hybrid setting: the morpheme predictions of
# morph_12345 are collapsed to tokens using the pruned segmentation produced
# for multi_12345, then scored against the gold token labels.
dep_path='output/predict/multi_12345/test_pruned.conll'
map_path='output/predict/multi_12345/test_pruned.map'

prun_yo = get_prun_yo('test', dep_path, map_path)
prun_sents = bclm.get_sentences_list(prun_yo, fields=['token_id', 'token_str'])
path =  'output/predict/morph_12345/morph_pruned_test.bmes'
new_toks = get_fixed_tok(path, orig_sents=prun_sents)
evaluate_dataframes(test_gold_tok, new_toks, str_join_char='')

(0.7664609053497943, 0.7993562231759657, 0.7825630252100841)

In [46]:
# Morpheme-level score of the same prediction file, for comparison.
# NOTE(review): unlike every other nem.evaluate_files call in this notebook,
# this one does not pass str_join_char='' — confirm whether that is intended.
nem.evaluate_files('../NER/data/for_ncrf/morph_gold_test.bmes',
                  path)

(0.7438271604938271, 0.7757510729613734, 0.7594537815126049)

#### Run on all pruned

In [47]:
@lru_cache(512)
def get_sent_list(ds, dp, mp):
    """Memoised per-sentence (token_id, token_str) lists of the YAP output
    read by ``get_prun_yo(ds, dp, mp)``."""
    yap_df = get_prun_yo(ds, dp, mp)
    sent_list = bclm.get_sentences_list(yap_df, fields=['token_id', 'token_str'])
    return sent_list

In [48]:
# Hybrid setting for every `morph_*` model: collapse its predictions on the
# pruned segmentation (taken from the sibling `multi_*` folder with the same
# seed) to token level, save them once, and score them against the gold tokens.
align_tok_res = []
for folder in os.scandir('output/predict'):
    if 'morph' in folder.name and not '.ipynb' in folder.name:
        ## test 
        variant, seed = folder.name.split('_')
        file = os.path.join(folder.path,'morph_pruned_test.bmes')
        multi_folder = folder.path.replace('morph_', 'multi_')
        dep_path = os.path.join(multi_folder, 'test_pruned.conll')
        map_path = os.path.join(multi_folder, 'test_pruned.map')
        out_path = os.path.join(folder.path, 'morph_pruned_test_align_tokens.bmes')
        
        prun_sents = get_sent_list('test',dep_path , map_path)
        new_toks = get_fixed_tok(file, orig_sents=prun_sents)
        
        # An existing output file is left untouched (it is never refreshed).
        if not os.path.exists(out_path):
            new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
            # explicit UTF-8: the tokens are Hebrew text
            with open(out_path, 'w', encoding='utf8') as of:
                # `.items()` instead of `.iteritems()` (removed in pandas 2.0)
                for i, sent in new_sents.items():
                    for tok, bio in sent:
                        of.write(tok+' '+bio+'\n')
                    of.write('\n')
                    
        p, r, f = evaluate_dataframes(test_gold_tok, new_toks, str_join_char='')

        align_tok_res.append(('test', 'token', 'morph', 'hybrid', 'tokens', seed, p, r, f))
        

In [49]:
# Mean token-level F1 of the hybrid setting over the model seeds.
at_df = pd.DataFrame(align_tok_res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

at_df.groupby(['eval_unit','variant', 'prediction', 'align']).f.mean()

eval_unit  variant  prediction  align 
token      morph    hybrid      tokens    0.77026
Name: f, dtype: float64

#### Run all gold and YAP

In [50]:
# Same token-level collapse and scoring for the predictions made on the gold
# and on the YAP segmentation of every `morph_*` model.
align_tok_res_yg = []
# (prediction label, token lists of the segmentation the predictions were made on)
segmentations = [('gold', test_gold_sents), ('yap', test_yap_sents)]
for folder in os.scandir('output/predict'):
    if 'morph' in folder.name and not '.ipynb' in folder.name:
        ## test 
        variant, seed = folder.name.split('_')
        for prediction, orig_sents in segmentations:
            file = os.path.join(folder.path, 'morph_' + prediction + '_test.bmes')
            out_path = os.path.join(folder.path, 'morph_' + prediction + '_test_align_tokens.bmes')

            new_toks = get_fixed_tok(file, orig_sents=orig_sents)

            # An existing output file is left untouched (it is never refreshed).
            if not os.path.exists(out_path):
                new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
                # explicit UTF-8: the tokens are Hebrew text
                with open(out_path, 'w', encoding='utf8') as of:
                    # `.items()` instead of `.iteritems()` (removed in pandas 2.0)
                    for i, sent in new_sents.items():
                        for tok, bio in sent:
                            of.write(tok+' '+bio+'\n')
                        of.write('\n')

            p, r, f = evaluate_dataframes(test_gold_tok, new_toks, str_join_char='')

            align_tok_res_yg.append(('test', 'token', 'morph', prediction, 'tokens', seed, p, r, f))

In [51]:
# Mean token-level precision / recall / F1 for the hybrid, gold and YAP settings.
at_df = pd.DataFrame(align_tok_res+align_tok_res_yg, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

at_df.groupby(['eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p,r,f
eval_unit,variant,prediction,align,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
token,morph,gold,tokens,0.764682,0.800536,0.782153
token,morph,hybrid,tokens,0.756655,0.784442,0.77026
token,morph,yap,tokens,0.737297,0.750107,0.743593


### Morpheme Level Eval

#### Token Multi

#### Hybrid

In [52]:
# Morpheme-level score of the aligned (hybrid) multi-label predictions of
# every `multi_*` model.
align_morph_res_hyb = []
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
        print (folder.name)
        # Take the seed from this folder's name ('<variant>_<seed>'). It used to
        # be whatever value an earlier cell's loop had left in `seed`, so every
        # row carried the same, unrelated seed.
        seed = folder.name.split('_')[1]

        pruned_ner_path=os.path.join(folder.path, 'morph_pruned_test.bmes')
        
        p, r, f = nem.evaluate_files(decode_sets['multitok']['test'], pruned_ner_path, str_join_char='')
        align_morph_res_hyb.append(('test', 'morph', 'multi', 'tokens', 'hybrid', seed, p, r, f))

multi_54360
multi_44184
multi_20423
multi_80520
multi_27916
multi_63795
multi_30528
multi_78160
multi_12345
multi_95148


In [53]:
# Token-level results plus the morpheme-level hybrid results of the multi models.
at_df = pd.DataFrame(align_tok_res+align_tok_res_yg+align_morph_res_hyb, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

at_df.groupby(['eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p,r,f
eval_unit,variant,prediction,align,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
morph,multi,tokens,hybrid,0.745095,0.765558,0.755154
token,morph,gold,tokens,0.764682,0.800536,0.782153
token,morph,hybrid,tokens,0.756655,0.784442,0.77026
token,morph,yap,tokens,0.737297,0.750107,0.743593


#### YAP + GOLD

In [54]:
def align_multitok_yg(ner_pred_path, prun_sents, output_path):
    """Project token-level multi-label predictions onto given token/form lists.

    Works like ``align_multitok`` but takes the already-built sentence lists
    (``prun_sents``, as expected by ``soft_merge_bio_labels``) instead of reading
    a YAP output. Writes one ``form label`` line per form to ``output_path``,
    with an empty line after every sentence.
    """
    token_preds = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)

    new_sents = soft_merge_bio_labels(token_preds, prun_sents, verbose=False)

    # Explicit UTF-8 (as in to_lattices): the forms are Hebrew text and the
    # platform default encoding is not guaranteed to be able to represent them.
    with open(output_path, 'w', encoding='utf8') as of:
        for sent in new_sents:
            of.writelines(form + ' ' + bio + '\n' for form, bio in sent)
            of.write('\n')


In [55]:
# Gold morpheme frames per data split, used when `gold=True` below.
gold_morph = {'dev': dev_gold, 'test': test_gold}
def get_sents_for_mult(treebank_set, gold=False, pred_set=None, 
                       dep_path=None, map_path=None):
    """Build per-sentence (token_id, token_str, form) lists for one segmentation.

    The morpheme frame comes from one of three sources:

    * ``treebank_set is None``: ``get_prun_yo(pred_set, dep_path, map_path)``;
    * ``gold`` truthy: the gold frame ``gold_morph[treebank_set]``;
    * otherwise: ``bclm.read_yap_output(treebank_set=treebank_set)``.

    It is then passed through ``bclm.get_token_df(..., fields=['form'])`` and
    ``bclm.get_sentences_list``.
    """
    if treebank_set is None:
        morph_df = get_prun_yo(pred_set, dep_path, map_path)
    elif gold:
        morph_df = gold_morph[treebank_set]
    else:
        morph_df = bclm.read_yap_output(treebank_set=treebank_set)

    token_df = bclm.get_token_df(morph_df, fields=['form'])
    return bclm.get_sentences_list(token_df, fields=['token_id', 'token_str', 'form'])

dev_yap_sents_m = get_sents_for_mult('dev')
test_yap_sents_m = get_sents_for_mult('test')
dev_gold_sents_m = get_sents_for_mult('dev', gold=True)
test_gold_sents_m = get_sents_for_mult('test', gold=True)

In [56]:
# Align the token-level predictions of every `multi_*` model with the YAP
# segmentation and score them at morpheme level.
align_morph_res_yap = []
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
        print (folder.name)
        # Take the seed from this folder's name ('<variant>_<seed>'). It used to
        # be whatever value an earlier cell's loop had left in `seed`, so every
        # row carried the same, unrelated seed.
        seed = folder.name.split('_')[1]

        yap_ner_path=os.path.join(folder.path, 'morph_yap_test.bmes')
        
        align_multitok_yg(os.path.join(folder.path, 'token_gold_test_dummy_o.bmes'), 
                           test_yap_sents_m,
                           yap_ner_path
                          )
        p, r, f = nem.evaluate_files(decode_sets['multitok']['test'], yap_ner_path, str_join_char='')
        align_morph_res_yap.append(('test', 'morph', 'multi', 'tokens', 'yap', seed, p, r, f))


multi_54360
multi_44184
multi_20423
multi_80520
multi_27916
multi_63795
multi_30528
multi_78160
multi_12345
multi_95148


In [57]:
# Align the token-level predictions of every `multi_*` model with the gold
# segmentation and score them at morpheme level.
align_morph_res_gold = []
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
        print (folder.name)
        # Take the seed from this folder's name ('<variant>_<seed>'). It used to
        # be whatever value an earlier cell's loop had left in `seed`, so every
        # row carried the same, unrelated seed.
        seed = folder.name.split('_')[1]

        gold_ner_path=os.path.join(folder.path, 'morph_gold_test.bmes')
        
        align_multitok_yg(os.path.join(folder.path, 'token_gold_test_dummy_o.bmes'), 
                           test_gold_sents_m,
                           gold_ner_path
                          )
        p, r, f = nem.evaluate_files(decode_sets['multitok']['test'], gold_ner_path, str_join_char='')
        align_morph_res_gold.append(('test', 'morph', 'multi', 'tokens', 'gold', seed, p, r, f))

multi_54360
multi_44184
multi_20423
multi_80520
multi_27916
multi_63795
multi_30528
multi_78160
multi_12345
multi_95148


In [58]:
# All aligned results (token-level and morpheme-level) in one table of means.
at_df = pd.DataFrame(align_tok_res+align_tok_res_yg+align_morph_res_hyb+align_morph_res_yap+align_morph_res_gold, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

at_df.groupby(['eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p,r,f
eval_unit,variant,prediction,align,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
morph,multi,tokens,gold,0.770651,0.788841,0.779604
morph,multi,tokens,hybrid,0.745095,0.765558,0.755154
morph,multi,tokens,yap,0.704023,0.701824,0.702888
token,morph,gold,tokens,0.764682,0.800536,0.782153
token,morph,hybrid,tokens,0.756655,0.784442,0.77026
token,morph,yap,tokens,0.737297,0.750107,0.743593


In [59]:
at_df = pd.DataFrame(align_tok_res+align_tok_res_yg+align_morph_res_hyb+align_morph_res_yap+align_morph_res_gold, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

# Mean F1 (in %) with a ±1.96·std/sqrt(n) half-width, formatted for LaTeX.
# n is the actual number of runs in each group instead of a hard-coded 10, so
# the interval stays right if a group has a different number of model folders.
# NOTE(review): 1.96 is the normal-approximation value; with about 10 runs a
# Student-t critical value would give a wider interval — confirm the intent.
f_stats = at_df.groupby(['eval_unit','variant', 'prediction', 'align']).f.agg(['mean', 'std', 'count'])
f_pct = f_stats[['mean', 'std']].mul(100).round(2)
ci_half_width = (1.96*(f_pct['std']/np.sqrt(f_stats['count']))).round(1)
(f_pct.assign(mean = '$'+f_pct['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ci_half_width.astype(str)+'$')[['mean']])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean
eval_unit,variant,prediction,align,Unnamed: 4_level_1
morph,multi,tokens,gold,$77.96 ± 0.3$
morph,multi,tokens,hybrid,$75.52 ± 0.3$
morph,multi,tokens,yap,$70.29 ± 0.3$
token,morph,gold,tokens,$78.22 ± 0.9$
token,morph,hybrid,tokens,$77.03 ± 0.8$
token,morph,yap,tokens,$74.36 ± 0.8$


In [60]:
# Same formatting for the unaligned results in `ne_df`: mean F1 (in %) with a
# ±1.96·std/sqrt(n) half-width, n being the number of runs in each group
# rather than a hard-coded 10.
ne_f_stats = ne_df.groupby(['eval_unit','variant', 'prediction', 'align']).f.agg(['mean', 'std', 'count'])
ne_f_pct = ne_f_stats[['mean', 'std']].mul(100).round(2)
ne_ci_half_width = (1.96*(ne_f_pct['std']/np.sqrt(ne_f_stats['count']))).round(1)
(ne_f_pct.assign(mean = '$'+ne_f_pct['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ne_ci_half_width.astype(str)+'$')[['mean']])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean
eval_unit,variant,prediction,align,Unnamed: 4_level_1
morph,morph,gold,-,$77.84 ± 0.9$
morph,morph,hybrid,-,$74.77 ± 0.9$
morph,morph,yap,-,$66.75 ± 0.8$
token,multi,tokens,-,$78.62 ± 0.4$
token,single,tokens,-,$79.78 ± 0.7$


#### Token Single