# Re-evaluation
Perform two evaluations:
1. Strict morpheme evaluation
1. Token evaluation (morpheme labels are extended to the token level heuristically)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

## Create BIOSE files


In [14]:
import json

def jsonl_to_biose(in_path, out_path, bioul_to_biose=True):
    """Convert a JSON-lines tagging file into a two-column ``word tag`` file.

    Every non-blank line of ``in_path`` must be a JSON object holding two
    parallel lists under the keys ``words`` and ``tags``. Each pair is written
    to ``out_path`` as ``word tag`` on its own line, and every sentence is
    terminated by an empty line.

    Args:
        in_path: path of the JSON-lines input file.
        out_path: path of the output file (overwritten).
        bioul_to_biose: when true, the tag prefixes ``L-`` and ``U-`` are
            rewritten to ``E-`` and ``S-`` respectively.

    Prints the number of sentences written.
    """
    sents = 0
    # Both handles are managed by the ``with`` block so the input file is closed
    # as well, and both use UTF-8 explicitly instead of the locale default.
    with open(in_path, 'r', encoding='utf8') as in_f, \
            open(out_path, 'w', encoding='utf8') as of:
        for line in in_f:
            if not line.strip():
                # tolerate blank (e.g. trailing) lines instead of failing in json.loads
                continue
            sent = json.loads(line)
            for word, tag in zip(sent['words'], sent['tags']):
                if bioul_to_biose:
                    tag = tag.replace('L-', 'E-').replace('U-', 'S-')
                of.write(word+' '+tag+'\n')
            of.write('\n')
            sents+=1
    print (sents)
jsonl_to_biose('output/predict/multi_12345/token_gold_dev_dummy_o.json', 'output/predict/multi_12345/token_gold_dev.bmes')

    

500


In [15]:
import os

In [16]:
# Convert every JSON-lines prediction file under output/predict to a .bmes
# file written next to it (notebook checkpoint entries are skipped).
for folder in os.scandir('output/predict'):
    if '.ipynb' in folder.name:
        continue
    for file in os.scandir(folder):
        if '.json' not in file.name or '.ipynb' in file.name:
            continue
        jsonl_to_biose(file.path, file.path.replace('.json', '.bmes'))



500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706


In [162]:
# Convert every JSON-lines prediction file under output/predict_char to a
# .bmes file written next to it (notebook checkpoint entries are skipped).
for folder in os.scandir('output/predict_char'):
    if '.ipynb' in folder.name:
        continue
    for file in os.scandir(folder):
        if '.json' not in file.name or '.ipynb' in file.name:
            continue
        jsonl_to_biose(file.path, file.path.replace('.json', '.bmes'))



500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706
500
706


## Tasks
1. Strict morpheme evaluation
   1. morph - as they are now
   1. token - evaluate token mentions against gold morpheme mentions
   1. multi - a) token vs gold morph b) yap/pruned vs gold morph
1. Token evaluation 
   1. morph - extend heuristically and evaluate against gold token mentions
   1. token - as it is now
   1. multi - as it is now

In [17]:
import sys
sys.path.append('/home/nlp/danb/NER')

import bclm
import ne_evaluate_mentions as nem

In [18]:
def get_biose_count(path, sent_id_shift=1):
    """Tabulate, per token, how many '^'-separated labels its tag contains.

    Sentences are read with ``nem.read_file_sents(path, fix_multi_tag=False,
    sent_id_shift=sent_id_shift)``; the result is iterated as
    ``(sentence id, [(token, tag), ...])`` pairs.

    Returns:
        A DataFrame with one row per token and the columns ``sent_id``,
        ``token_id`` (1-based position inside the sentence), ``token_str``,
        ``biose`` (the raw tag) and ``biose_count`` (number of '^'-separated
        parts in the tag).
    """
    sents = nem.read_file_sents(path, fix_multi_tag=False, sent_id_shift=sent_id_shift)
    bc = []
    # ``.items()`` instead of ``.iteritems()``: the latter was removed in pandas 2.0.
    for i, sent in sents.items():
        for j, (tok, bio) in enumerate(sent):
            bc.append([i, j+1, tok, bio, len(bio.split('^'))])

    return pd.DataFrame(bc, columns=['sent_id', 'token_id', 'token_str',
                                     'biose', 'biose_count'])

In [19]:
import networkx as nx

In [20]:
def get_valid_edges(lattices, bc,
                    non_o_only=True, keep_all_if_no_valid=True):
    """Select the lattice edges that lie on paths compatible with the tag counts.

    ``lattices`` is grouped by ``(sent_id, token_id)`` and the groups are paired
    POSITIONALLY (via ``zip``) with the rows of ``bc``; nothing here checks that
    the two are aligned, and ``zip`` silently stops at the shorter one.

    For each token a directed graph is built from its ``ID1 -> ID2`` edges and
    simple paths from the smallest ``ID1`` to the largest ``ID2`` are enumerated:

    * if ``non_o_only`` is true and the tag contains no ``'-'``, every path is kept;
    * otherwise only paths with exactly ``biose_count`` edges are kept;
    * if nothing was kept and ``keep_all_if_no_valid`` is true, every path is kept.

    Returns:
        A list of ``(sent_id, token_id, source, target)`` tuples, one per edge of
        every kept path (an edge shared by several paths appears several times).
    """
    valid_edges = []
    for (i, df), (_, biose, biose_count) in zip(lattices.groupby(['sent_id', 'token_id']), 
                                                bc[['biose', 'biose_count']].itertuples()):
        # edge list in the column naming expected by nx.from_pandas_edgelist
        el = df[['ID1', 'ID2']].rename(columns={'ID1': 'source', 'ID2': 'target'})

        g = nx.from_pandas_edgelist(el, create_using=nx.DiGraph)
        # start / end nodes of this token's sub-lattice
        min_node = el.source.min()
        max_node = el.target.max()
        if non_o_only and not '-' in biose:
            vp = list(nx.all_simple_paths(g, min_node, max_node))
        else:
            # a path with biose_count edges visits biose_count+1 nodes
            vp = [path for path in nx.all_simple_paths(g, min_node, max_node, cutoff=biose_count+1) if len(path)==biose_count+1]
        if keep_all_if_no_valid and len(vp)==0:
             vp = nx.all_simple_paths(g, min_node, max_node)
        for path in vp:
            for source, target in zip(path[:-1], path[1:]):
                # i is the (sent_id, token_id) group key
                valid_edges.append((i[0], i[1], source, target))
                
    return valid_edges

In [21]:
def to_lattices(df, path, cols = ['ID1', 'ID2', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'token_id']):
    """Write ``df`` to ``path`` as tab-separated rows, one blank-line-terminated
    block per ``sent_id`` group.

    Only the columns listed in ``cols`` are written, in that order, each value
    converted to ``str``.
    """
    with open(path, 'w', encoding='utf8') as out_file:
        for _sent_id, sent_rows in df.groupby('sent_id'):
            out_file.writelines(
                '\t'.join(record.astype(str).tolist()) + '\n'
                for _row_idx, record in sent_rows[cols].iterrows()
            )
            # empty line closes the sentence block
            out_file.write('\n')
            
    

In [30]:
def prune_lattices(lattices_path, ner_pred_path, output_path, keep_all_if_no_valid=True):
    """Keep only the lattice edges selected by ``get_valid_edges`` and write
    the reduced lattice to ``output_path``.

    The lattice is read with ``bclm.read_lattices``, the per-token tag counts
    come from ``get_biose_count(ner_pred_path, sent_id_shift=1)``, and edges are
    matched on ``(sent_id, token_id, ID1, ID2)``.
    """
    key_cols = ['sent_id', 'token_id', 'ID1', 'ID2']
    lattice_df = bclm.read_lattices(lattices_path)
    tag_counts = get_biose_count(ner_pred_path, sent_id_shift=1)
    kept_edges = get_valid_edges(lattice_df, tag_counts,
                                 non_o_only=False,
                                 keep_all_if_no_valid=keep_all_if_no_valid)
    # one tuple key per lattice row, compared against the kept edge tuples
    edge_keys = lattice_df[key_cols].apply(lambda row: tuple(row), axis=1)
    to_lattices(lattice_df[edge_keys.isin(kept_edges)], output_path)

In [23]:
prune_lattices(bclm.LATTICES_PATHS['dev'], 
               'output/predict/multi_12345/token_gold_dev_dummy_o.bmes',
               'output/predict/multi_12345/pruned.lat')

In [31]:
# Prune the dev / test lattices with the .bmes predictions of every 'multi' model.
# NOTE(review): every .bmes file whose name contains 'dev' (or 'test') writes to the
# same <subset>_pruned.lat, so any additional .bmes files placed in these folders
# will overwrite the result on a re-run — confirm which file is the intended input.
for folder in os.scandir('output/predict'):
    if 'multi' not in folder.name or '.ipynb' in folder.name:
        continue
    for file in os.scandir(folder):
        if '.bmes' not in file.name or '.ipynb' in file.name:
            continue
        # 'dev' is checked first, so it wins when both substrings occur in the name
        for subset in ('dev', 'test'):
            if subset in file.name:
                prune_lattices(bclm.LATTICES_PATHS[subset],
                               file.path,
                               os.path.join(folder.path, subset + '_pruned.lat'))
                break
                    

## Run YAP

In [33]:
yap_path = '/home/nlp/danb/yapproj/src/yap/yap'

In [34]:
# `!export ...` runs in a throw-away subshell, so the variable never reaches the
# later `!{yap_path}` calls. Setting it on the kernel process makes every
# subsequently spawned shell command inherit it.
os.environ['GOPATH'] = '/home/nlp/danb/yapproj'

In [35]:
!{yap_path}

/home/nlp/danb/yapproj/src/yap/yap - invoke yap as a standalone app or as an api server

Commands:

    api         start api server
    dep         runs dependency training/parsing
    hebma       run lexicon-based morphological analyzer on raw input
    joint       runs joint morpho-syntactic training and parsing
    ma          run data-driven morphological analyzer on raw input
    md          runs standalone morphological disambiguation training and parsing

Use "/home/nlp/danb/yapproj/src/yap/yap help <command>" for more information about a command.



In [None]:
# Run `yap joint` on every .lat file found in the 'multi' model folders.
# For an input <name>.lat the outputs are <name>.seg, <name>.map and <name>.conll
# in the same folder; a lattice is skipped when its .seg output already exists.
for folder in os.scandir('output/predict'):
    if 'multi' in folder.name and not '.ipynb' in folder.name:
        for file in os.scandir(folder):
            if '.lat' in file.name and not '.ipynb' in file.name:
                # file name without its last extension
                base_out = '.'.join(file.name.split('.')[:-1])
                seg_out, map_out, conll_out = [os.path.join(folder.path, base_out+suf)
                                               for suf in ['.seg', '.map', '.conll']]
                if not os.path.exists(seg_out):
                    !{yap_path} joint -in {file.path} -os {seg_out} -om {map_out} -oc {conll_out}
                    

## Evaluate Segmentation

In [45]:
# Sentence ids removed from the SPMRL frame before any evaluation.
# NOTE(review): the reason these ten ids are excluded is not recorded in this
# notebook — confirm and document it.
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')
spdf = spdf[(~spdf.sent_id.isin(dropped))]

In [46]:
dev_gold = bclm.read_dataframe('spmrl', subset='dev')
# Explicit copy: test_gold is modified below, and assigning into a slice of spdf
# raises SettingWithCopyWarning.
test_gold = spdf[spdf.set=='test'].copy()
# Map the original sentence ids to consecutive ids 1..n in ascending id order.
test_ids = sorted(test_gold.sent_id.unique())
test_sent_id_map = pd.Series(range(1, len(test_ids)+1),
                             index=pd.Index(test_ids, name='sent_id'),
                             name='index')
test_gold['sent_id'] = test_gold.sent_id.map(test_sent_id_map)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [52]:
cols = ['sent_id', 'token_id', 'form']

In [64]:
# Evaluate the YAP output obtained from the pruned dev lattices of every 'multi'
# model against dev_gold: once with evaluate_dfs' default columns (recorded as
# f_sp) and once restricted to `cols` (recorded as f_so).
res = []
for folder in os.scandir('output/predict'):
    if 'multi' not in folder.name or '.ipynb' in folder.name:
        continue
    dev_lfo = bclm.read_yap_output(
        treebank_set=None,
        tokens_path=bclm.TREEBANK_TOKEN_PATHS['dev'],
        dep_path=os.path.join(folder.path, 'dev_pruned.conll'),
        map_path=os.path.join(folder.path, 'dev_pruned.map'),
    )
    p, r, f_sp = bclm.evaluate_dfs(dev_gold, dev_lfo)
    p, r, f_so = bclm.evaluate_dfs(dev_gold, dev_lfo, cols=cols)
    res.append((folder.name, f_sp, f_so))
                    

11301 gold tokens/morphems, 11290 predicted, 10533 correct.
Precision: 93.29
Recall:    93.2
F1:        93.25
FP ex.: [(2, 11, 'דנה', 'BN'), (3, 4, 'ח"כ', 'NNT'), (3, 28, 'הם', 'S-PRN'), (5, 13, 'ה', 'DEF'), (6, 24, 'ה', 'REL')]
FN ex.: [(2, 11, 'דנה', 'VB'), (3, 4, 'ח"כ', 'NN'), (3, 28, 'המ', 'S-PRN'), (6, 24, 'ה', 'DEF'), (8, 1, 'מרגלית', 'NNP')]
11301 gold tokens/morphems, 11290 predicted, 11034 correct.
Precision: 97.73
Recall:    97.64
F1:        97.68
FP ex.: [(3, 28, 'הם'), (5, 13, 'ה'), (8, 7, 'ה'), (8, 7, 'יתרי'), (8, 9, 'ה')]
FN ex.: [(3, 28, 'המ'), (8, 7, 'היתרי'), (17, 11, 'מחפיר'), (18, 8, 'ה'), (23, 8, 'העסקת')]
11301 gold tokens/morphems, 11275 predicted, 10534 correct.
Precision: 93.43
Recall:    93.21
F1:        93.32
FP ex.: [(2, 11, 'דנה', 'BN'), (3, 4, 'ח"כ', 'NNT'), (3, 28, 'הם', 'S-PRN'), (5, 13, 'ה', 'DEF'), (6, 24, 'ה', 'REL')]
FN ex.: [(2, 11, 'דנה', 'VB'), (3, 4, 'ח"כ', 'NN'), (3, 28, 'המ', 'S-PRN'), (6, 24, 'ה', 'DEF'), (8, 1, 'מרגלית', 'NNP')]
11301 gold tok

In [65]:
seg_res_df = pd.DataFrame(res, columns=['model', 'f_seg_pos', 'f_seg_only'])
seg_res_df

Unnamed: 0,model,f_seg_pos,f_seg_only
0,multi_54360,93.249524,97.684919
1,multi_44184,93.32034,97.785259
2,multi_20423,93.288472,97.715601
3,multi_80520,93.307348,97.665766
4,multi_27916,93.348675,97.835996
5,multi_63795,93.166216,97.621684
6,multi_30528,93.283153,97.702015
7,multi_78160,93.218914,97.615783
8,multi_12345,93.319757,97.693568
9,multi_95148,93.317972,97.704715


In [165]:
seg_res_df.mean()

f_seg_pos     93.282037
f_seg_only    97.702530
dtype: float64

## Align Multitok

In [61]:
def soft_merge_bio_labels(multitok_sents, tokmorph_sents, verbose=False):
    """Distribute token-level multi-labels over the morphemes of each token.

    Both arguments are iterated with ``.items()`` (so a pandas Series or a dict
    works) and paired positionally, sentence by sentence and token by token.

    Args:
        multitok_sents: mapping ``id -> [(form, bio), ...]`` where ``bio`` is a
            '^'-separated label string for the token.
        tokmorph_sents: mapping ``id -> [(token_id, token_str, forms), ...]``
            where ``forms`` is a '^'-separated string of the token's morphemes.
        verbose: print the merged pairs of every token whose number of labels
            differs from its number of morphemes.

    Returns:
        A list with one entry per sentence, each a flat list of
        ``(morpheme_form, label)`` pairs. When counts differ, labels are
        aligned to the END of the token: surplus leading morphemes get ``'O'``,
        surplus leading labels are dropped.
    """
    new_sents = []
    # ``.items()`` instead of ``.iteritems()``: the latter was removed in pandas 2.0.
    for (_, mt_sent), (_, mor_sent) in zip(multitok_sents.items(), tokmorph_sents.items()):
        new_sent = []
        for (form, bio), (token_id, token_str, forms) in zip(mt_sent, mor_sent):
            forms = forms.split('^')
            bio = bio.split('^')
            if len(forms) == len(bio):
                new_forms = (1, list(zip(forms, bio)))
            elif len(forms) > len(bio):
                # more morphemes than labels: pad the leading morphemes with 'O'
                dif = len(forms) - len(bio)
                new_forms = (2, list(zip(forms[:dif], ['O']*dif)) + list(zip(forms[dif:], bio)))
                if verbose:
                    print(new_forms)
            else:
                # more labels than morphemes: keep only the trailing labels
                new_forms = (3, list(zip(forms, bio[-len(forms):])))
                if verbose:
                    print(new_forms)
            new_sent.extend(new_forms[1])
        new_sents.append(new_sent)
    return new_sents

In [62]:
def align_multitok(ner_pred_path, tokens_path, conll_path, map_path, output_path):
    """Project token-level multi-label predictions onto a YAP segmentation.

    The predictions in ``ner_pred_path`` are read with ``nem.read_file_sents``,
    the segmentation is loaded with ``bclm.read_yap_output`` from
    ``tokens_path`` / ``conll_path`` / ``map_path``, and the two are merged with
    ``soft_merge_bio_labels``. The result is written to ``output_path`` as
    ``form label`` lines with an empty line after every sentence.
    """
    x = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)
    prun_yo = bclm.read_yap_output(treebank_set=None, tokens_path=tokens_path, dep_path=conll_path, map_path=map_path)
    prun_yo = bclm.get_token_df(prun_yo, fields=['form'])
    prun_sents = bclm.get_sentences_list(prun_yo, fields=['token_id', 'token_str', 'form'])
    new_sents = soft_merge_bio_labels(x, prun_sents, verbose=False)

    # Explicit UTF-8, like the other writers in this notebook: the forms are
    # Hebrew text and must not depend on the locale's default encoding.
    with open(output_path, 'w', encoding='utf8') as of:
        for sent in new_sents:
            for form, bio in sent:
                of.write(form+' '+bio+'\n')
            of.write('\n')


In [67]:
# Gold .bmes files used as the reference in nem.evaluate_files, keyed by
# model kind and data split.
# NOTE(review): 'token' and 'multitok' point at the same morph_gold_* files;
# only the 'multitok' entries are read in the cells visible here — confirm that
# the 'token' paths are intended.
decode_sets = {
    'token': {
        'dev': '../NER/data/for_ncrf/morph_gold_dev.bmes',
        'test': '../NER/data/for_ncrf/morph_gold_test.bmes',
    },
    'multitok': {
        'dev': '../NER/data/for_ncrf/morph_gold_dev.bmes',
        'test': '../NER/data/for_ncrf/morph_gold_test.bmes',
    }
}

In [78]:
align_multitok('output/predict/multi_44184/token_gold_dev_dummy_o.bmes', 
               bclm.TREEBANK_TOKEN_PATHS['dev'], 
               'output/predict/multi_44184/dev_pruned.conll',
               'output/predict/multi_44184/dev_pruned.map',
               'output/predict/multi_44184/morph_pruned_dev.bmes'
              )
p, r, f = nem.evaluate_files(decode_sets['multitok']['dev'], 'output/predict/multi_44184/morph_pruned_dev.bmes', str_join_char='')
p,r,f

(0.8050847457627118, 0.7615230460921844, 0.7826982492276005)

In [79]:
nem.evaluate_files('../NER/data/for_ncrf/token_gold_dev_fix.bmes', 
                   'output/predict/multi_44184/token_gold_dev_dummy_o.bmes', str_join_char='')

(0.826271186440678, 0.781563126252505, 0.8032955715756952)

In [80]:
# For every 'multi' model: align its dev token predictions to the pruned YAP
# segmentation (writing morph_pruned_dev.bmes into the model folder) and score
# the result against the gold file in decode_sets['multitok']['dev'].
res = []
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
        print (folder.name)

        pruned_ner_path=os.path.join(folder.path, 'morph_pruned_dev.bmes')
        
        align_multitok(os.path.join(folder.path, 'token_gold_dev_dummy_o.bmes'), 
                       bclm.TREEBANK_TOKEN_PATHS['dev'], 
                       os.path.join(folder.path, 'dev_pruned.conll'),
                       os.path.join(folder.path, 'dev_pruned.map'),
                       pruned_ner_path
                      )
        p, r, f = nem.evaluate_files(decode_sets['multitok']['dev'], pruned_ner_path, str_join_char='')
        # one (model, precision, recall, f) row per model folder
        res.append((folder.name, p, r, f))


multi_54360
multi_44184
multi_20423
multi_80520
multi_27916
multi_63795
multi_30528
multi_78160
multi_12345
multi_95148


In [81]:
res

[('multi_54360', 0.7749469214437368, 0.7314629258517034, 0.7525773195876289),
 ('multi_44184', 0.8050847457627118, 0.7615230460921844, 0.7826982492276005),
 ('multi_20423', 0.789587852494577, 0.7294589178356713, 0.7583333333333332),
 ('multi_80520', 0.778705636743215, 0.7474949899799599, 0.7627811860940694),
 ('multi_27916', 0.7896995708154506, 0.7374749498997996, 0.7626943005181347),
 ('multi_63795', 0.7836134453781513, 0.7474949899799599, 0.7651282051282051),
 ('multi_30528', 0.7906976744186046, 0.749498997995992, 0.7695473251028806),
 ('multi_78160', 0.8065217391304348, 0.7434869739478958, 0.7737226277372262),
 ('multi_12345', 0.8021505376344086, 0.7474949899799599, 0.7738589211618256),
 ('multi_95148', 0.7956521739130434, 0.7334669338677354, 0.7632950990615224)]

In [83]:
ne_morph_df = pd.DataFrame(res, columns=['model', 'precision', 'recall', 'f'])
ne_morph_df

Unnamed: 0,model,precision,recall,f
0,multi_54360,0.774947,0.731463,0.752577
1,multi_44184,0.805085,0.761523,0.782698
2,multi_20423,0.789588,0.729459,0.758333
3,multi_80520,0.778706,0.747495,0.762781
4,multi_27916,0.7897,0.737475,0.762694
5,multi_63795,0.783613,0.747495,0.765128
6,multi_30528,0.790698,0.749499,0.769547
7,multi_78160,0.806522,0.743487,0.773723
8,multi_12345,0.802151,0.747495,0.773859
9,multi_95148,0.795652,0.733467,0.763295


In [85]:
def biose_to_o(in_path, out_path):
    """Copy a two-column ``word tag`` file, replacing every tag with ``O``.

    Lines that consist of a bare newline are copied unchanged and counted as
    sentence boundaries; every other line must split into exactly two
    whitespace-separated fields (otherwise ``ValueError`` is raised).

    Prints the number of sentence boundaries seen.
    """
    sents = 0
    # Both files are closed by the ``with`` block and read / written as UTF-8.
    with open(in_path, 'r', encoding='utf8') as in_f, \
            open(out_path, 'w', encoding='utf8') as of:
        for line in in_f:
            if line=='\n':
                of.write(line)
                sents+=1
            else:
                # the original tag is validated by the unpacking but discarded
                word, _tag = line.strip().split()
                of.write(word+' '+'O'+'\n')

    print (sents)
    
# For every 'multi' model, write an all-'O' copy of morph_pruned_dev.bmes next
# to it with the extension .bioul.
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
        pruned_ner_path=os.path.join(folder.path, 'morph_pruned_dev.bmes')
        biose_to_o(pruned_ner_path, pruned_ner_path.replace('.bmes', '.bioul'))


500
500
500
500
500
500
500
500
500
500


In [86]:
# Convert morph_pruned_dev.json of every 'morph' model folder to .bmes.
# NOTE(review): the .json files are presumably produced outside this notebook
# (model prediction on the pruned segmentation) — confirm.
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'morph' in folder.name and not '.ipynb_checkpoints' in folder.name:
        pruned_ner_path=os.path.join(folder.path, 'morph_pruned_dev.json')
        jsonl_to_biose(pruned_ner_path, pruned_ner_path.replace('.json', '.bmes'))

500
500
500
500
500
500
500
500
500
500


## SINGLE + MULTI

In [109]:
# Dev evaluations per model kind. A folder is matched when the key occurs in
# its name; each entry is (gold file, predicted file inside the model folder,
# eval unit, prediction label).
dev_eval_specs = {
    'single': [
        ('../NER/data/for_ncrf/token_gold_dev_fix.bmes', 'token_gold_dev_fix.bmes', 'token', 'tokens'),
    ],
    'multi': [
        ('../NER/data/for_ncrf/token_gold_dev_fix.bmes', 'token_gold_dev_dummy_o.bmes', 'token', 'tokens'),
    ],
    'morph': [
        ('../NER/data/for_ncrf/morph_gold_dev.bmes', 'morph_gold_dev.bmes', 'morph', 'gold'),
        ('../NER/data/for_ncrf/morph_gold_dev.bmes', 'morph_yap_dev.bmes', 'morph', 'yap'),
        ('../NER/data/for_ncrf/morph_gold_dev.bmes', 'morph_pruned_dev.bmes', 'morph', 'hybrid'),
    ],
}

res = []
for folder in os.scandir('output/predict'):
    if '.ipynb' in folder.name:
        continue

    # folder names have the form <variant>_<seed>
    variant, seed = folder.name.split('_')

    for name_key, specs in dev_eval_specs.items():
        if name_key not in folder.name:
            continue
        for gold_path, pred_name, eval_unit, prediction in specs:
            p,r,f = nem.evaluate_files(gold_path,
                                       os.path.join(folder.path, pred_name), str_join_char='')
            res.append(('dev', eval_unit, variant, prediction, '-', seed, p, r, f))

ne_df = pd.DataFrame(res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

# mean precision / recall / F over seeds
ne_df.groupby(['eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p,r,f
eval_unit,variant,prediction,align,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
morph,morph,gold,-,0.827206,0.782565,0.804233
morph,morph,hybrid,-,0.805129,0.766132,0.785106
morph,morph,yap,-,0.744964,0.703407,0.723552
token,multi,tokens,-,0.807553,0.758116,0.782016
token,single,tokens,-,0.809084,0.783166,0.795844


In [164]:
# Dev evaluations for the models under output/predict_char. A folder is matched
# when the key occurs in its name; each entry is (gold file, predicted file
# inside the model folder, eval unit, prediction label). The 'hybrid'
# (morph_pruned_dev.bmes) evaluation is not run for these models.
char_eval_specs = {
    'single': [
        ('../NER/data/for_ncrf/token_gold_dev_fix.bmes', 'token_gold_dev_fix.bmes', 'token', 'tokens'),
    ],
    'multi': [
        ('../NER/data/for_ncrf/token_gold_dev_fix.bmes', 'token_gold_dev_dummy_o.bmes', 'token', 'tokens'),
    ],
    'morph': [
        ('../NER/data/for_ncrf/morph_gold_dev.bmes', 'morph_gold_dev.bmes', 'morph', 'gold'),
        ('../NER/data/for_ncrf/morph_gold_dev.bmes', 'morph_yap_dev.bmes', 'morph', 'yap'),
    ],
}

res = []
for folder in os.scandir('output/predict_char'):
    if '.ipynb' in folder.name:
        continue

    # folder names have the form <variant>_<seed>
    variant, seed = folder.name.split('_')

    for name_key, specs in char_eval_specs.items():
        if name_key not in folder.name:
            continue
        for gold_path, pred_name, eval_unit, prediction in specs:
            p,r,f = nem.evaluate_files(gold_path,
                                       os.path.join(folder.path, pred_name), str_join_char='')
            res.append(('dev', eval_unit, variant, prediction, '-', seed, p, r, f))

ne_c_df = pd.DataFrame(res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

# mean F over seeds
ne_c_df.groupby(['eval_unit','variant', 'prediction', 'align']).f.mean()

eval_unit  variant  prediction  align
morph      morph    gold        -        0.801981
                    yap         -        0.721779
token      multi    tokens      -        0.781968
           single   tokens      -        0.796968
Name: f, dtype: float64

## Add Alignments

### Token Level Eval

In [111]:
import re

# Patterns over the string of BIOSE letters of one token's morphemes
# (e.g. 'OBI'), used to collapse the sequence into a single token-level letter.
o_re = re.compile('^O+$')
s_re = re.compile('^O*SO*$|^O*BI*EO*$')
b_re = re.compile('^O*BI*$')
i_re = re.compile('^I+$')
e_re = re.compile('^I*EO*$')
def get_fixed_for_valid_biose(bio_seq):
    """Collapse a well-formed letter sequence into one BIOSE letter.

    Args:
        bio_seq: string of BIOSE letters, one per morpheme.

    Returns:
        'O', 'S', 'B', 'I' or 'E' (patterns are tried in that order).

    Raises:
        ValueError: if ``bio_seq`` matches none of the patterns.
    """
    if o_re.match(bio_seq):
        return 'O'
    if s_re.match(bio_seq):
        return 'S'
    if b_re.match(bio_seq):
        return 'B'
    if i_re.match(bio_seq):
        return 'I'
    if e_re.match(bio_seq):
        return 'E'
    raise ValueError


def get_fixed_for_invalid_biose(parts):
    """Best-effort collapse of an ill-formed letter sequence.

    Precedence: any 'S' -> 'S'; both 'B' and 'E' -> 'S'; then 'E', 'B', 'I';
    otherwise 'O'.
    """
    bio = 'O'
    if 'S' in parts:
        bio = 'S'
    elif 'B' in parts and 'E' in parts:
        bio='S'
    elif 'E' in parts:
        bio = 'E'
    elif 'B' in parts:
        bio = 'B'
    elif 'I' in parts:
        bio = 'I'
    return bio

# Union of the patterns above. '^O+$' is included so that an all-'O' token is
# reported as valid (it used to be routed through the invalid fallback, which
# left the o_re branch of get_fixed_for_valid_biose unreachable).
valid_bio_re = re.compile('^O+$|^O*BI*$|^O*BI*EO*$|^I+$|^I*EO*$|^O*SO*$')

from functools import lru_cache


def validate_biose_sequence(full_bio_seq):
    """Validate one token's morpheme tags and reduce them to a single tag.

    Args:
        full_bio_seq: non-empty sequence of tags, each either ``'O'`` or
            ``'<letter>-<category>'``.

    Returns:
        ``(is_valid, single_category, fixed_tag)`` where ``is_valid`` tells
        whether the letter sequence matches ``valid_bio_re``,
        ``single_category`` whether at most one distinct category occurs, and
        ``fixed_tag`` is the collapsed tag (``'O'`` or ``'<letter>-<category>'``
        using the FIRST category in the sequence).
    """
    # split on the first '-' only, so categories that contain '-' stay intact
    bio_seq, type_seq = zip(*[('O', None) if b=='O' else b.split('-', 1) for b in full_bio_seq])
    bio_seq = ''.join(bio_seq)
    valid_bio = valid_bio_re.match(bio_seq)
    type_seq = [t for t in type_seq if t is not None]
    type_seq_set = set(type_seq)

    if valid_bio:
        fixed_bio = get_fixed_for_valid_biose(bio_seq)
    else:
        # rough BIOSE letter for an ill-formed sequence
        fixed_bio = get_fixed_for_invalid_biose(bio_seq)
    if fixed_bio!='O':
        # a non-'O' result implies at least one typed tag, so type_seq is non-empty
        fixed_bio += '-' + type_seq[0]

    return valid_bio is not None, len(type_seq_set)<=1, fixed_bio


@lru_cache(1000)
def get_fixed_bio_sequence(full_bio_seq):
    """Cached shortcut returning only the collapsed tag for a TUPLE of tags."""
    return validate_biose_sequence(full_bio_seq)[2]

In [112]:
# Gold and YAP morpheme frames plus their per-sentence token lists.
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')
spdf = spdf[(~spdf.sent_id.isin(dropped))]
# Explicit copies: test_gold is modified right below, and assigning into a
# slice of spdf raises SettingWithCopyWarning.
dev_gold = spdf[spdf.set=='dev'].copy()
test_gold = spdf[spdf.set=='test'].copy()
# renumber test sentences 1..n in ascending order of their original id
test_gold['sent_id'] = test_gold.sent_id.rank(method='dense').astype(int)
dev_yap = bclm.read_yap_output(treebank_set='dev')
test_yap = bclm.read_yap_output(treebank_set='test')
dev_gold_sents = bclm.get_sentences_list(dev_gold, fields=['token_id', 'token_str'])
test_gold_sents = bclm.get_sentences_list(test_gold, fields=['token_id', 'token_str'])
dev_yap_sents = bclm.get_sentences_list(dev_yap, fields=['token_id', 'token_str'])
test_yap_sents = bclm.get_sentences_list(test_yap, fields=['token_id', 'token_str'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [118]:
# Token-level gold frames; the gold label column is renamed to 'fixed_bio' so
# it lines up with the column produced for the predictions.
dev_gold_tok = (bclm.get_token_df(dev_gold, biose=['biose_layer0'])
                .rename(columns={'biose_layer0': 'fixed_bio'}))
# .assign works on a copy, so the renumbering no longer triggers
# SettingWithCopyWarning.
test_gold_tok = (bclm.get_token_df(test_gold, biose=['biose_layer0'])
                 .rename(columns={'biose_layer0': 'fixed_bio'})
                 .assign(sent_id=lambda d: d.sent_id.rank(method='dense').astype(int)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [115]:
path = 'output/predict/morph_12345/morph_yap_dev.bmes'
def get_fixed_tok(path, orig_sents=dev_yap_sents):
    """Collapse morpheme-level predictions in ``path`` to one tag per token.

    The predictions are read with ``nem.read_file_sents`` and paired
    positionally with ``orig_sents`` (``id -> [(token_id, token_str), ...]``,
    one entry per morpheme) to recover which token each morpheme belongs to.

    Note: the default for ``orig_sents`` is bound when this function is
    defined, i.e. to whatever ``dev_yap_sents`` was at that moment.

    Returns:
        The token-level frame from ``bclm.get_token_df(..., fields=['bio'])``
        with an added ``fixed_bio`` column holding the collapsed tag of the
        '^'-joined ``bio`` value.
    """
    x = nem.read_file_sents(path, fix_multi_tag=False)
    new_sents = []
    # ``.items()`` instead of ``.iteritems()``: the latter was removed in pandas 2.0.
    for (i, ner_sent), (sent_id, yap_sent) in zip(x.items(), orig_sents.items()):
        for (form, bio), (token_id, token_str) in zip(ner_sent, yap_sent):
            new_sents.append((sent_id, token_id, token_str, form, bio))
    new_sents = pd.DataFrame(new_sents, columns=['sent_id', 'token_id', 'token_str', 'form', 'bio'])
    new_toks = bclm.get_token_df(new_sents, fields=['bio'])
    # tuple() makes the tag sequence hashable for the lru_cache on get_fixed_bio_sequence
    new_toks['fixed_bio'] = new_toks.bio.apply(lambda tags: get_fixed_bio_sequence(tuple(tags.split('^'))))
    return new_toks

new_toks = get_fixed_tok(path)
new_toks.head(20)

Unnamed: 0,sent_id,token_id,token_str,bio,fixed_bio
0,1,1,עשרות,O,O
1,1,2,אנשים,O,O
2,1,3,מגיעים,O,O
3,1,4,מתאילנד,O^S-GPE,S-GPE
4,1,5,לישראל,O,O
5,1,6,כשהם,O^O,O
6,1,7,נרשמים,O,O
7,1,8,כמתנדבים,O^O^O,O
8,1,9,",",O,O
9,1,10,אך,O,O


In [116]:
def sents_from_df(df, sent_id_col='sent_id', 
                  group_cols=['token_str'], 
                  val_cols=['fixed_bio']):
    """Return ``bclm.get_sentences_list`` of ``df`` over ``group_cols + val_cols``.

    ``sent_id_col`` is accepted but not used by this function.
    """
    return bclm.get_sentences_list(df, fields=group_cols+val_cols)

def evaluate_dataframes(gold_df, pred_df, fix_multi_tag_pred=True, truncate=None, ignore_cat=False, str_join_char=' '):
    """Score predicted mentions in ``pred_df`` against those in ``gold_df``.

    Both frames are turned into sentence lists with ``sents_from_df`` and then
    into mentions with ``nem.sents_to_mentions`` (forwarding ``truncate``,
    ``ignore_cat`` and ``str_join_char``); the value of
    ``nem.evaluate_mentions(gold, pred, verbose=False)`` is returned.
    ``fix_multi_tag_pred`` is accepted but not used by this function.
    """
    gold_sents = sents_from_df(gold_df)
    pred_sents = sents_from_df(pred_df)

    def _mentions(sents):
        # same keyword arguments for the gold and the predicted side
        return nem.sents_to_mentions(sents, truncate=truncate,
                                     ignore_cat=ignore_cat,
                                     str_join_char=str_join_char)

    return nem.evaluate_mentions(_mentions(gold_sents), _mentions(pred_sents), verbose=False)

In [117]:
sents_from_df(new_toks)

sent_id
1      [[עשרות, O], [אנשים, O], [מגיעים, O], [מתאילנד...
2      [[תופעה, O], [זו, O], [התבררה, O], [אתמול, O],...
3      [[יו"ר, O], [הוועדה, O], [,, O], [ח"כ, O], [או...
4      [[מצד, O], [אחד, O], [רוצה, O], [האוצר, S-ORG]...
5      [[נמיר, S-PER], [הודיעה, O], [כי, O], [תפנה, O...
                             ...                        
496    [[מוות, O], [לערבים, O], [,, O], [מוות, O], [ל...
497    [[קשה, O], [להוציא, O], [את, O], [הארון, O], [...
498    [[עכשיו, O], [קוראים, O], [את, O], [הקדיש, O],...
499           [[תם, O], [מסע, O], [ההלווייה, O], [., O]]
500           [[מתחיל, O], [מסע, O], [הנקמה, O], [., O]]
Length: 500, dtype: object

In [119]:
evaluate_dataframes(dev_gold_tok, new_toks, str_join_char='')

(0.7742616033755274, 0.7354709418837675, 0.7543679342240494)

#### Hybrid


In [120]:
out_folder = '../NER/data/tokens_for_ncrf'
dev_out = os.path.join(out_folder, 'dev_tokens.txt')
test_out = os.path.join(out_folder, 'test_tokens.txt')
token_paths = {'dev': dev_out, 'test': test_out}

In [122]:
@lru_cache(512)
def get_prun_yo(ds, dep_path, map_path):
    """Load YAP output for the given dependency / mapping files (cached).

    ``ds`` selects the token file through ``token_paths[ds]``. Because of the
    cache, repeated calls with the same arguments return the SAME object, so
    callers should not modify the result in place.
    """
    return bclm.read_yap_output(
        treebank_set=None,
        tokens_path=token_paths[ds],
        dep_path=dep_path,
        map_path=map_path,
    )

In [127]:
dep_path='output/predict/multi_12345/dev_pruned.conll'
map_path='output/predict/multi_12345/dev_pruned.map'

prun_yo = get_prun_yo('dev', dep_path, map_path)
prun_sents = bclm.get_sentences_list(prun_yo, fields=['token_id', 'token_str'])
path =  'output/predict/morph_12345/morph_pruned_dev.bmes'
new_toks = get_fixed_tok(path, orig_sents=prun_sents)
evaluate_dataframes(dev_gold_tok, new_toks, str_join_char='')

(0.805439330543933, 0.7715430861723447, 0.7881269191402253)

In [128]:
nem.evaluate_files('../NER/data/for_ncrf/morph_gold_dev.bmes',
                  path)

(0.7949790794979079, 0.7615230460921844, 0.7778915046059366)

#### Run on all pruned

In [129]:
@lru_cache(512)
def get_sent_list(ds, dp, mp):
    """Cached ``(token_id, token_str)`` sentence list of ``get_prun_yo(ds, dp, mp)``."""
    return bclm.get_sentences_list(get_prun_yo(ds, dp, mp),
                                   fields=['token_id', 'token_str'])

In [140]:
# Token-level evaluation of the 'hybrid' predictions of every 'morph' model:
# morpheme tags are collapsed to tokens using the pruned segmentation of the
# 'multi' model with the same seed.
align_tok_res = []
for folder in os.scandir('output/predict'):
    if 'morph' in folder.name and not '.ipynb' in folder.name:
        ## dev 
        variant, seed = folder.name.split('_')
        file = os.path.join(folder.path,'morph_pruned_dev.bmes')
        # sibling folder of the 'multi' model trained with the same seed
        multi_folder = folder.path.replace('morph_', 'multi_')
        dep_path = os.path.join(multi_folder, 'dev_pruned.conll')
        map_path = os.path.join(multi_folder, 'dev_pruned.map')
        out_path = os.path.join(folder.path, 'morph_pruned_dev_align_tokens.bmes')
        
        prun_sents = get_sent_list('dev',dep_path , map_path)
        new_toks = get_fixed_tok(file, orig_sents=prun_sents)
        
        # the aligned file is only written once; delete it to regenerate
        if not os.path.exists(out_path):
            new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
            # explicit UTF-8: the tokens are Hebrew text
            with open(out_path, 'w', encoding='utf8') as of:
                # .items() instead of .iteritems(), which was removed in pandas 2.0
                for i, sent in new_sents.items():
                    for tok, bio in sent:
                        of.write(tok+' '+bio+'\n')
                    of.write('\n')
                    
        p, r, f = evaluate_dataframes(dev_gold_tok, new_toks, str_join_char='')

        align_tok_res.append(('dev', 'token', 'morph', 'hybrid', 'tokens', seed, p, r, f))
        

In [141]:
at_df = pd.DataFrame(align_tok_res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

at_df.groupby(['eval_unit','variant', 'prediction', 'align']).f.mean()

eval_unit  variant  prediction  align 
token      morph    hybrid      tokens    0.791924
Name: f, dtype: float64

#### Run all gold and YAP

In [142]:
# Token-level evaluation of the 'gold' and 'yap' predictions of every 'morph'
# model; for each folder the gold row is appended first, then the yap row.
align_tok_res_yg = []
for folder in os.scandir('output/predict'):
    if 'morph' in folder.name and not '.ipynb' in folder.name:
        ## dev 
        variant, seed = folder.name.split('_')
        for prediction, orig_sents in (('gold', dev_gold_sents), ('yap', dev_yap_sents)):
            file = os.path.join(folder.path, 'morph_' + prediction + '_dev.bmes')
            out_path = os.path.join(folder.path, 'morph_' + prediction + '_dev_align_tokens.bmes')

            new_toks = get_fixed_tok(file, orig_sents=orig_sents)

            # the aligned file is only written once; delete it to regenerate
            if not os.path.exists(out_path):
                new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
                # explicit UTF-8: the tokens are Hebrew text
                with open(out_path, 'w', encoding='utf8') as of:
                    # .items() instead of .iteritems(), which was removed in pandas 2.0
                    for i, sent in new_sents.items():
                        for tok, bio in sent:
                            of.write(tok+' '+bio+'\n')
                        of.write('\n')

            p, r, f = evaluate_dataframes(dev_gold_tok, new_toks, str_join_char='')

            align_tok_res_yg.append(('dev', 'token', 'morph', prediction, 'tokens', seed, p, r, f))

In [144]:
at_df = pd.DataFrame(align_tok_res+align_tok_res_yg, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

at_df.groupby(['eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p,r,f
eval_unit,variant,prediction,align,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
token,morph,gold,tokens,0.828991,0.783768,0.805714
token,morph,hybrid,tokens,0.81238,0.772545,0.791924
token,morph,yap,tokens,0.785387,0.741082,0.762553


### Morpheme Level Eval

#### Token Multi

#### Hybrid

In [145]:
# Morpheme-level evaluation of the 'multi' token predictions after alignment
# to the pruned (hybrid) segmentation.
align_morph_res_hyb = []
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
        print (folder.name)

        # Take the seed from THIS folder's name (<variant>_<seed>); previously a
        # stale `seed` left over from an earlier cell was recorded for every row.
        variant, seed = folder.name.split('_')

        pruned_ner_path=os.path.join(folder.path, 'morph_pruned_dev.bmes')
        
        p, r, f = nem.evaluate_files(decode_sets['multitok']['dev'], pruned_ner_path, str_join_char='')
        align_morph_res_hyb.append(('dev', 'morph', 'multi', 'tokens', 'hybrid', seed, p, r, f))

multi_54360
multi_44184
multi_20423
multi_80520
multi_27916
multi_63795
multi_30528
multi_78160
multi_12345
multi_95148


In [148]:
at_df = pd.DataFrame(align_tok_res+align_tok_res_yg+align_morph_res_hyb, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

at_df.groupby(['eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p,r,f
eval_unit,variant,prediction,align,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
morph,multi,tokens,hybrid,0.791666,0.742886,0.766464
token,morph,gold,tokens,0.828991,0.783768,0.805714
token,morph,hybrid,tokens,0.81238,0.772545,0.791924
token,morph,yap,tokens,0.785387,0.741082,0.762553


#### YAP + GOLD

In [156]:
def align_multitok_yg(ner_pred_path, prun_sents, output_path):
    """Merge predicted labels with the given sentences and write them out.

    Reads the predictions at ``ner_pred_path`` with ``nem.read_file_sents``
    (``fix_multi_tag=False``), combines them with ``prun_sents`` through
    ``soft_merge_bio_labels`` and writes the result to ``output_path``.

    Args:
        ner_pred_path: path of the prediction file to read.
        prun_sents: sentences passed to ``soft_merge_bio_labels`` as the
            second argument.
        output_path: destination file; it is overwritten.

    The output has one ``"<form> <label>"`` line per item and an empty line
    after every sentence.
    """
    pred_sents = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)

    new_sents = soft_merge_bio_labels(pred_sents, prun_sents, verbose=False)

    # Explicit UTF-8, matching how jsonl_to_biose writes its .bmes files, so the
    # output does not depend on the platform's default encoding.
    with open(output_path, 'w', encoding='utf8') as of:
        for sent in new_sents:
            for form, bio in sent:
                of.write(form+' '+bio+'\n')
            of.write('\n')


In [157]:
# Gold morphology tables, looked up by treebank split name.
gold_morph = {'dev': dev_gold, 'test': test_gold}


def get_sents_for_mult(treebank_set, gold=False, pred_set=None,
                       dep_path=None, map_path=None):
    """Return the sentence list used to align multi-token predictions.

    The morphology source is chosen as follows:
      * ``treebank_set is None``: ``get_prun_yo(pred_set, dep_path, map_path)``
      * ``gold`` is truthy: ``gold_morph[treebank_set]``
      * otherwise: ``bclm.read_yap_output(treebank_set=treebank_set)``

    The selected table goes through ``bclm.get_token_df`` (``fields=['form']``)
    and then ``bclm.get_sentences_list`` with the fields ``token_id``,
    ``token_str`` and ``form``; that sentence list is returned.
    """
    if treebank_set is None:
        morph_source = get_prun_yo(pred_set, dep_path, map_path)
    elif gold:
        morph_source = gold_morph[treebank_set]
    else:
        morph_source = bclm.read_yap_output(treebank_set=treebank_set)

    token_df = bclm.get_token_df(morph_source, fields=['form'])
    return bclm.get_sentences_list(token_df, fields=['token_id', 'token_str', 'form'])

# Sentences built from the YAP output (gold=False is the default) ...
dev_yap_sents_m = get_sents_for_mult(treebank_set='dev', gold=False)
test_yap_sents_m = get_sents_for_mult(treebank_set='test', gold=False)
# ... and from the gold morphology tables.
dev_gold_sents_m = get_sents_for_mult(treebank_set='dev', gold=True)
test_gold_sents_m = get_sents_for_mult(treebank_set='test', gold=True)

In [160]:
# For every multi-token run: merge its token-level predictions with the YAP dev
# sentences, write the merged labels to 'morph_yap_dev.bmes' inside the run
# folder, and score that file against the 'multitok' dev reference.
align_morph_res_yap = []
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
        print (folder.name)

        # Run folders are named 'multi_<seed>'. Read the seed from the folder so
        # each row is tagged with its own run instead of a value left over from
        # an earlier cell.
        # NOTE(review): kept as a string; cast to int if the other result lists
        # store integer seeds - confirm against the earlier loops.
        seed = folder.name.rsplit('_', 1)[-1]

        yap_ner_path = os.path.join(folder.path, 'morph_yap_dev.bmes')

        align_multitok_yg(os.path.join(folder.path, 'token_gold_dev_dummy_o.bmes'),
                          dev_yap_sents_m,
                          yap_ner_path)
        p, r, f = nem.evaluate_files(decode_sets['multitok']['dev'], yap_ner_path, str_join_char='')
        align_morph_res_yap.append(('dev', 'morph', 'multi', 'tokens', 'yap', seed, p, r, f))


multi_54360
multi_44184
multi_20423
multi_80520
multi_27916
multi_63795
multi_30528
multi_78160
multi_12345
multi_95148


In [158]:
# For every multi-token run: merge its token-level predictions with the gold dev
# sentences, write the merged labels to 'morph_gold_dev.bmes' inside the run
# folder, and score that file against the 'multitok' dev reference.
align_morph_res_gold = []
for folder in os.scandir('output/predict'):
    if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
        print (folder.name)

        # Run folders are named 'multi_<seed>'. Read the seed from the folder so
        # each row is tagged with its own run instead of a value left over from
        # an earlier cell.
        # NOTE(review): kept as a string; cast to int if the other result lists
        # store integer seeds - confirm against the earlier loops.
        seed = folder.name.rsplit('_', 1)[-1]

        gold_ner_path = os.path.join(folder.path, 'morph_gold_dev.bmes')

        align_multitok_yg(os.path.join(folder.path, 'token_gold_dev_dummy_o.bmes'),
                          dev_gold_sents_m,
                          gold_ner_path)
        p, r, f = nem.evaluate_files(decode_sets['multitok']['dev'], gold_ner_path, str_join_char='')
        align_morph_res_gold.append(('dev', 'morph', 'multi', 'tokens', 'gold', seed, p, r, f))

multi_54360
multi_44184
multi_20423
multi_80520
multi_27916
multi_63795
multi_30528
multi_78160
multi_12345
multi_95148


In [161]:
# Full results table: token-level rows followed by the hybrid / YAP / gold
# morpheme-level rows.
at_df = pd.DataFrame(
    align_tok_res
    + align_tok_res_yg
    + align_morph_res_hyb
    + align_morph_res_yap
    + align_morph_res_gold,
    columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'],
)

# Average the p / r / f columns within each evaluation configuration.
(
    at_df
    .groupby(['eval_unit', 'variant', 'prediction', 'align'])[['p', 'r', 'f']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p,r,f
eval_unit,variant,prediction,align,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
morph,multi,tokens,gold,0.803848,0.752705,0.777399
morph,multi,tokens,hybrid,0.791666,0.742886,0.766464
morph,multi,tokens,yap,0.75507,0.696593,0.724617
token,morph,gold,tokens,0.828991,0.783768,0.805714
token,morph,hybrid,tokens,0.81238,0.772545,0.791924
token,morph,yap,tokens,0.785387,0.741082,0.762553


In [168]:
# Rebuild the full results table from all alignment result lists.
at_df = pd.DataFrame(align_tok_res+align_tok_res_yg+align_morph_res_hyb+align_morph_res_yap+align_morph_res_gold, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'seed', 'p', 'r', 'f'])

# Per-configuration statistics of the f column. 'count' is the number of rows in
# each group, so the interval below uses the real sample size instead of a
# hard-coded 10 (output is unchanged whenever every group has 10 rows).
at_f_stats = at_df.groupby(['eval_unit','variant', 'prediction', 'align']).f.agg(['mean', 'std', 'count'])

# Format as "$mean ± 1.96 * std / sqrt(n)$", in percent.
(at_f_stats[['mean', 'std']].mul(100).round(2)
         .assign(mean = lambda x: '$'+x['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ (1.96*(x['std']/np.sqrt(at_f_stats['count']))).round(1).astype(str)+'$')[['mean']])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean
eval_unit,variant,prediction,align,Unnamed: 4_level_1
morph,multi,tokens,gold,$77.74 ± 0.5$
morph,multi,tokens,hybrid,$76.65 ± 0.5$
morph,multi,tokens,yap,$72.46 ± 0.5$
token,morph,gold,tokens,$80.57 ± 0.3$
token,morph,hybrid,tokens,$79.19 ± 0.4$
token,morph,yap,tokens,$76.26 ± 0.4$


In [169]:
# Per-configuration statistics of the f column in ne_df. 'count' is the number
# of rows in each group, so the interval below uses the real sample size instead
# of a hard-coded 10 (output is unchanged whenever every group has 10 rows).
ne_f_stats = ne_df.groupby(['eval_unit','variant', 'prediction', 'align']).f.agg(['mean', 'std', 'count'])

# Format as "$mean ± 1.96 * std / sqrt(n)$", in percent.
(ne_f_stats[['mean', 'std']].mul(100).round(2)
         .assign(mean = lambda x: '$'+x['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ (1.96*(x['std']/np.sqrt(ne_f_stats['count']))).round(1).astype(str)+'$')[['mean']])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean
eval_unit,variant,prediction,align,Unnamed: 4_level_1
morph,morph,gold,-,$80.42 ± 0.3$
morph,morph,hybrid,-,$78.51 ± 0.3$
morph,morph,yap,-,$72.36 ± 0.4$
token,multi,tokens,-,$78.20 ± 0.6$
token,single,tokens,-,$79.58 ± 0.3$


In [166]:
# Sanity check: number of result rows collected for each evaluation configuration.
(
    at_df
    .groupby(by=['eval_unit', 'variant', 'prediction', 'align'])
    .size()
)

eval_unit  variant  prediction  align 
morph      multi    tokens      gold      10
                                hybrid    10
                                yap       10
token      morph    gold        tokens    10
                    hybrid      tokens    10
                    yap         tokens    10
dtype: int64