# Re-evaluation
Perform two evaluations:
1. Strict morpheme evaluation
1. Token evaluation (morpheme labels are extended to the token level heuristically)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

In [5]:
import os

In [6]:
import sys
sys.path.append('/home/nlp/danb/NER')

import bclm
import ne_evaluate_mentions as nem

## Create BIOSE files


In [80]:
import json

def jsonl_to_biose(in_path, out_path, bioul_to_biose=True):
    """Convert a JSON-lines prediction file to a two-column ``word tag`` file.

    Every non-blank line of ``in_path`` is parsed as a JSON object whose
    ``words`` and ``tags`` lists are zipped and written one ``word tag`` pair
    per line, followed by an empty line that terminates the sentence.

    Parameters
    ----------
    in_path : str
        Path of the JSON-lines input file (read as UTF-8).
    out_path : str
        Path of the output file (written as UTF-8, overwritten if present).
    bioul_to_biose : bool, default True
        When true, the BIOUL prefixes ``L-`` and ``U-`` are rewritten to the
        BIOSE prefixes ``E-`` and ``S-``.

    The number of converted sentences is printed; nothing is returned.
    """
    # BIOUL -> BIOSE prefix mapping; only the leading prefix is rewritten so a
    # category name that happens to contain 'L-' or 'U-' is left untouched.
    prefix_map = {'L-': 'E-', 'U-': 'S-'}
    sents = 0
    # Both handles are managed by `with` so the input file is closed as well.
    with open(in_path, 'r', encoding='utf8') as inf, \
            open(out_path, 'w', encoding='utf8') as of:
        for line in inf:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            sent = json.loads(line)
            for word, tag in zip(sent['words'], sent['tags']):
                if bioul_to_biose and tag[:2] in prefix_map:
                    tag = prefix_map[tag[:2]] + tag[2:]
                of.write(word+' '+tag+'\n')
            of.write('\n')
            sents+=1
    print (sents)
    

In [81]:
import os

In [133]:
# Walk output/predict_alephbert/<model>/<run>/ and convert every JSON
# prediction file that does not yet have a .bmes sibling.
for trans in os.scandir('output/predict_alephbert'):
    for folder in os.scandir(trans):
        # skip anything whose name contains '.ipynb' (e.g. checkpoint folders)
        if not '.ipynb' in folder.name:
            for file in os.scandir(folder):
                if '.json' in file.name and not '.ipynb' in file.name:
                    output_path = file.path.replace('.json', '.bmes')
                    # convert only when the output is missing, so re-running the cell is cheap
                    if not os.path.exists(output_path):
                        print (folder.path)
                        jsonl_to_biose(file.path, output_path)



output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/single_54360
500
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/single_54360
706
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/morph_54360
500
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/morph_54360
706
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/morph_54360
500
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/morph_54360
706
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/single_44184
500
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/single_44184
706
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/single_20423
500
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/single_20423
706
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/morph_44184
500
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/morph_44184
706
output/predict_alephbert/bert-small-wordpiece-oscar-52000-10/morph_441

## Tasks
1. Strict morpheme evaluation
   1. morph - as they are now
   1. token - evaluate token mentions against gold morpheme mentions
   1. multi - a) token vs gold morph b) yap/pruned vs gold morph
1. Token evaluation 
   1. morph - extend heuristically and evaluate against gold token mentions
   1. token - as it is now
   1. multi - as it is now

In [134]:
def get_biose_count(path, sent_id_shift=1):
    """Build a per-token table of predicted labels and their label counts.

    The file at ``path`` is read with ``nem.read_file_sents`` (multi-tag
    fixing disabled). For each ``(token, label)`` pair a row is produced with
    the sentence key, the 1-based position of the token in its sentence, the
    token string, the raw label string and the number of ``'^'``-separated
    parts in that label string.

    Parameters
    ----------
    path : str
        Path of the prediction file.
    sent_id_shift : int, default 1
        Forwarded unchanged to ``nem.read_file_sents``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sent_id``, ``token_id``, ``token_str``, ``biose``,
        ``biose_count``.
    """
    sents = nem.read_file_sents(path, fix_multi_tag=False, sent_id_shift=sent_id_shift)
    bc = []
    # `.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0
    # (and also works if a plain dict is returned).
    for i, sent in sents.items():
        for j, (tok, bio) in enumerate(sent):
            bc.append([i, j+1, tok, bio, len(bio.split('^'))])

    bc = pd.DataFrame(bc, columns=['sent_id', 'token_id', 'token_str', 
                                   'biose', 'biose_count'])
    return bc

In [135]:
import networkx as nx

In [136]:
def get_valid_edges(lattices, bc,
                    non_o_only=True, keep_all_if_no_valid=True):
    """Select the lattice edges that lie on label-compatible paths.

    ``lattices`` is grouped by ``(sent_id, token_id)`` and the groups are
    paired positionally with the rows of ``bc[['biose', 'biose_count']]``.
    For each token a directed graph is built from its ``ID1 -> ID2`` edges and
    the simple paths from the smallest ``ID1`` to the largest ``ID2`` are
    enumerated:

    * if ``non_o_only`` is true and the label string contains no ``'-'``,
      every path is kept;
    * otherwise only paths with exactly ``biose_count + 1`` nodes are kept.

    When nothing survives and ``keep_all_if_no_valid`` is true, all paths are
    kept instead.

    Returns a list of ``(sent_id, token_id, source, target)`` tuples, one per
    edge of every kept path (an edge shared by several paths is repeated).
    """
    token_groups = lattices.groupby(['sent_id', 'token_id'])
    label_rows = bc[['biose', 'biose_count']].itertuples()

    kept_edges = []
    for (key, token_df), (_, labels, n_labels) in zip(token_groups, label_rows):
        edge_list = token_df[['ID1', 'ID2']].rename(columns={'ID1': 'source', 'ID2': 'target'})
        graph = nx.from_pandas_edgelist(edge_list, create_using=nx.DiGraph)
        start = edge_list.source.min()
        end = edge_list.target.max()

        if non_o_only and '-' not in labels:
            paths = list(nx.all_simple_paths(graph, start, end))
        else:
            wanted_nodes = n_labels + 1
            paths = [
                candidate
                for candidate in nx.all_simple_paths(graph, start, end, cutoff=wanted_nodes)
                if len(candidate) == wanted_nodes
            ]

        # fall back to every path when the label count matches no analysis
        if keep_all_if_no_valid and not paths:
            paths = nx.all_simple_paths(graph, start, end)

        kept_edges.extend(
            (key[0], key[1], u, v)
            for candidate in paths
            for u, v in zip(candidate[:-1], candidate[1:])
        )

    return kept_edges

In [137]:
def to_lattices(df, path, cols = ['ID1', 'ID2', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'token_id']):
    """Write ``df`` to ``path`` as tab-separated rows grouped by sentence.

    Rows are grouped by ``sent_id``; for each group the columns ``cols`` are
    written one row per line (values converted to ``str`` and joined with
    tabs), and an empty line is written after every group. The file is
    written as UTF-8.
    """
    with open(path, 'w', encoding='utf8') as out:
        for _sent_id, sent_df in df.groupby('sent_id'):
            out.writelines(
                '\t'.join(record.astype(str).tolist()) + '\n'
                for _idx, record in sent_df[cols].iterrows()
            )
            # blank line terminates the sentence
            out.write('\n')
            
    

In [138]:
def prune_lattices(lattices_path, ner_pred_path, output_path, keep_all_if_no_valid=True):
    """Keep only the lattice edges compatible with the NER label counts.

    Reads the lattices with ``bclm.read_lattices`` and the per-token label
    counts with ``get_biose_count``, asks ``get_valid_edges`` (with
    ``non_o_only=False``) which edges to keep, filters the lattice rows whose
    ``(sent_id, token_id, ID1, ID2)`` is among them and writes the result to
    ``output_path`` via ``to_lattices``.
    """
    lattice_df = bclm.read_lattices(lattices_path)
    label_counts = get_biose_count(ner_pred_path, sent_id_shift=1)
    edges_to_keep = get_valid_edges(
        lattice_df,
        label_counts,
        non_o_only=False,
        keep_all_if_no_valid=keep_all_if_no_valid,
    )

    key_cols = ['sent_id', 'token_id', 'ID1', 'ID2']
    edge_keys = lattice_df[key_cols].apply(lambda row: tuple(row), axis=1)
    keep_mask = edge_keys.isin(edges_to_keep)
    to_lattices(lattice_df[keep_mask], output_path)

In [139]:
os.listdir('output/predict_alephbert/')

['52k',
 '.ipynb_checkpoints',
 '2k',
 '8k',
 '32k',
 '4k',
 '16k',
 '64k',
 '128k',
 'heBERT',
 'unichar_improved_52k',
 'unichar_improved_with_hash_52k',
 'bert-distilled-wordpiece-oscar-52000',
 'bert-basic-wordpiece-owt-52000-10',
 'heBERT2',
 'bert-small-wordpiece-oscar-52000-10']

In [140]:
# Names of the sub-folders of output/predict_alephbert that the loops in the
# following cells process; every other folder is skipped.
include_only = ['bert-small-wordpiece-oscar-52000-10',]
print(include_only)

['bert-small-wordpiece-oscar-52000-10']


In [141]:
# For each selected model folder, prune the dev/test lattices of every
# `multi*` run using its 'dummy' .bmes prediction file. A pruned lattice is
# only (re)built when the corresponding *_pruned.lat file does not exist yet.
for trans in os.scandir('output/predict_alephbert'):
    print(trans.name)
    if trans.name not in include_only:
        continue
    for folder in os.scandir(trans):
        if 'multi' in folder.name and not '.ipynb' in folder.name:
            #print(folder.name)
            for file in os.scandir(folder):
                if 'dummy' in file.name and '.bmes' in file.name and not '.ipynb' in file.name:
                    # the dataset split is inferred from the prediction file name
                    if 'dev' in file.name:
                        output_path = os.path.join(folder.path, 'dev_pruned.lat')
                        if not os.path.exists(output_path):
                            print(folder.name)
                            prune_lattices(bclm.LATTICES_PATHS['dev'], 
                               file.path,
                               output_path)
                    elif 'test' in file.name:
                        output_path = os.path.join(folder.path, 'test_pruned.lat')
                        if not os.path.exists(output_path):
                            print(folder.name)
                            prune_lattices(bclm.LATTICES_PATHS['test'], 
                               file.path,
                               output_path)


52k
.ipynb_checkpoints
2k
8k
32k
4k
16k
64k
128k
heBERT
unichar_improved_52k
unichar_improved_with_hash_52k
bert-distilled-wordpiece-oscar-52000
bert-basic-wordpiece-owt-52000-10
heBERT2
bert-small-wordpiece-oscar-52000-10
multi_54360
multi_54360
multi_44184
multi_44184
multi_20423
multi_20423
multi_80520
multi_80520
multi_27916
multi_27916


## Run YAP

In [142]:
yap_path = '/home/nlp/danb/yapproj/src/yap/yap'

In [143]:
!export GOPATH=/home/nlp/danb/yapproj

In [144]:
!{yap_path}

/home/nlp/danb/yapproj/src/yap/yap - invoke yap as a standalone app or as an api server

Commands:

    api         start api server
    dep         runs dependency training/parsing
    hebma       run lexicon-based morphological analyzer on raw input
    joint       runs joint morpho-syntactic training and parsing
    ma          run data-driven morphological analyzer on raw input
    md          runs standalone morphological disambiguation training and parsing

Use "/home/nlp/danb/yapproj/src/yap/yap help <command>" for more information about a command.



In [145]:
# Run `yap joint` on every .lat file found in the `multi*` run folders of the
# selected models, writing <base>.seg / <base>.map / <base>.conll next to it.
for trans in os.scandir('output/predict_alephbert'):
    if trans.name not in include_only:
        continue
    print(trans.name)
    for folder in os.scandir(trans):
        if 'multi' in folder.name and not '.ipynb' in folder.name:
            print(folder.name)
            for file in os.scandir(folder):
                if '.lat' in file.name and not '.ipynb' in file.name:
                    # file name without its last extension
                    base_out = '.'.join(file.name.split('.')[:-1])
                    seg_out, map_out, conll_out = [os.path.join(folder.path, base_out+suf)
                                                   for suf in ['.seg', '.map', '.conll']]
                    # NOTE: the existence check is disabled, so YAP is re-run on every execution;
                    # stdout and stderr of YAP are discarded.
                    if True:#not os.path.exists(map_out):
                        !{yap_path} joint -in {file.path} -os {seg_out} -om {map_out} -oc {conll_out} > /dev/null 2>&1
                    

bert-small-wordpiece-oscar-52000-10
multi_54360
multi_44184
multi_20423
multi_80520
multi_27916


## Evaluate Segmentation

In [146]:
# Sentence ids removed from the SPMRL dataframe before evaluation.
# NOTE(review): the reason these ten ids are dropped is not stated here — confirm.
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')
spdf = spdf[(~spdf.sent_id.isin(dropped))]

In [147]:
dev_gold = bclm.read_dataframe('spmrl', subset='dev')
# .copy() gives an independent frame, so assigning `sent_id` below neither
# triggers SettingWithCopyWarning nor risks writing through to `spdf`.
test_gold = spdf[spdf.set=='test'].copy()
# Map the original test sent_ids (in sorted order) to consecutive ids 1..n.
# The positional index column is selected explicitly before adding 1, rather
# than adding 1 to the whole frame and assigning a DataFrame to one column.
test_sent_id_map = (test_gold.groupby('sent_id').size()
                    .reset_index().drop(0, axis=1).reset_index()
                    .assign(index=lambda x: x['index']+1).set_index('sent_id')['index'])
test_gold['sent_id'] = test_gold.sent_id.map(test_sent_id_map)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [148]:
cols = ['sent_id', 'token_id', 'form']

In [149]:
# Evaluate the YAP output produced from the pruned lattices against the gold
# dataframes, once with bclm.evaluate_dfs' default columns (stored as f_sp)
# and once restricted to `cols` (stored as f_so). One row per split and run
# is appended to `res`.
res = []
use_filter = True
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if use_filter and trans.name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if 'multi' in folder.name and not '.ipynb' in folder.name:
            # --- dev ---
            dev_lfo = bclm.read_yap_output(treebank_set=None, tokens_path=bclm.TREEBANK_TOKEN_PATHS['dev'], 
                                                 dep_path=os.path.join(folder, 'dev_pruned.conll'),
                                                 map_path=os.path.join(folder, 'dev_pruned.map'))
            p,r,f_sp = bclm.evaluate_dfs(dev_gold, dev_lfo, verbose=False)

            p,r,f_so = bclm.evaluate_dfs(dev_gold, dev_lfo, cols=cols, verbose=False)
            res.append(('dev', trans_name, folder.name, f_sp, f_so))

            # --- test ---
            test_lfo = bclm.read_yap_output(treebank_set=None, tokens_path=bclm.TREEBANK_TOKEN_PATHS['test'], 
                                                 dep_path=os.path.join(folder, 'test_pruned.conll'),
                                                 map_path=os.path.join(folder, 'test_pruned.map'))
            p,r,f_sp = bclm.evaluate_dfs(test_gold, test_lfo, verbose=False)

            p,r,f_so = bclm.evaluate_dfs(test_gold, test_lfo, cols=cols, verbose=False)

            res.append(('test', trans_name, folder.name, f_sp, f_so))
#         for file in os.scandir(folder):
#             if '.bmes' in file.name and not '.ipynb' in file.name:
#                 if 'dev' in file.name:
#                     prune_lattices(bclm.LATTICES_PATHS['dev'], 
#                        file.path,
#                        os.path.join(folder.path, 'dev_pruned.lat'))
#                 elif 'test' in file.name:
#                     prune_lattices(bclm.LATTICES_PATHS['test'], 
#                        file.path,
#                        os.path.join(folder.path, 'test_pruned.lat'))
                    

bert-small-wordpiece-oscar-52000-10


In [150]:
seg_res_df = pd.DataFrame(res, columns=['pred_set', 'trans_name', 'model', 'f_seg_pos', 'f_seg_only'])

In [151]:
seg_res_df.groupby(['pred_set', 'trans_name']).mean().unstack()

Unnamed: 0_level_0,f_seg_pos,f_seg_only
trans_name,bert-small-wordpiece-oscar-52000-10,bert-small-wordpiece-oscar-52000-10
pred_set,Unnamed: 1_level_2,Unnamed: 2_level_2
dev,93.425476,97.889414
test,91.622853,97.71575


## Align Multitok

In [152]:
def soft_merge_bio_labels(multitok_sents, tokmorph_sents, verbose=False):
    """Distribute per-token multi-labels over the token's predicted forms.

    The two inputs are iterated in parallel, sentence by sentence and token by
    token. For each token the ``'^'``-joined label string and the
    ``'^'``-joined form string are split and paired:

    1. same number of forms and labels: paired one to one;
    2. more forms than labels: the leading surplus forms get ``'O'`` and the
       remaining forms are paired with the labels from the end;
    3. more labels than forms: forms are paired with the labels from the end,
       so the leading surplus labels are dropped.

    Parameters
    ----------
    multitok_sents : mapping of sentence key -> list of ``(form, labels)``
    tokmorph_sents : mapping of sentence key -> list of
        ``(token_id, token_str, forms)``
        Both may be a ``dict`` or a ``pandas.Series``; only ``.items()`` is used.
    verbose : bool, default False
        Print the pairing of every token that falls into case 2 or 3.

    Returns
    -------
    list of list of ``(form, label)`` tuples, one inner list per sentence.
    """
    new_sents = []
    # `.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
    for (i, mt_sent), (sent_id, mor_sent) in zip(multitok_sents.items(), tokmorph_sents.items()):
        new_sent = []
        for (form, bio), (token_id, token_str, forms) in zip(mt_sent, mor_sent):
            forms = forms.split('^')
            bio = bio.split('^')
            # the first tuple element is the case number described in the docstring
            if len(forms) == len(bio):
                new_forms = (1, list(zip(forms,bio)))
            elif len(forms)>len(bio):
                dif = len(forms) - len(bio)
                # zipping the reversed lists aligns forms and labels from the end
                new_forms = (2, list(zip(forms[:dif],['O']*dif)) + list(zip(forms[::-1], bio[::-1]))[::-1])
                if verbose:
                    print(new_forms)
            else:
                new_forms = (3, list(zip(forms[::-1], bio[::-1]))[::-1])
                if verbose:
                    print(new_forms)
            new_sent.extend(new_forms[1])
        new_sents.append(new_sent)
    return new_sents

In [153]:
def align_multitok(ner_pred_path, tokens_path, conll_path, map_path, output_path):
    """Project token-level multi-label predictions onto YAP's predicted forms.

    Reads the predictions from ``ner_pred_path`` with ``nem.read_file_sents``,
    reads the YAP output described by ``tokens_path`` / ``conll_path`` /
    ``map_path``, reduces it to one row per token with ``bclm.get_token_df``,
    pairs forms and labels with ``soft_merge_bio_labels`` and writes the
    result to ``output_path`` as ``form label`` lines with an empty line after
    each sentence.
    """
    x = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)
    prun_yo = bclm.read_yap_output(treebank_set=None, tokens_path=tokens_path, dep_path=conll_path, map_path=map_path)
    prun_yo = bclm.get_token_df(prun_yo, fields=['form'])
    prun_sents = bclm.get_sentences_list(prun_yo, fields=['token_id', 'token_str', 'form'])
    new_sents = soft_merge_bio_labels(x, prun_sents, verbose=False)

    # Explicit UTF-8, consistent with the other writers in this notebook, so
    # the output does not depend on the platform's default encoding.
    with open(output_path, 'w', encoding='utf8') as of:
        for sent in new_sents:
            for form, bio in sent:
                of.write(form+' '+bio+'\n')
            of.write('\n')


In [154]:
# Gold files used as the reference when evaluating aligned predictions,
# keyed by prediction kind and then by dataset split.
# NOTE(review): the 'token' entries point at the same morph_gold_* files as
# 'multitok' — confirm this is intended and not a copy-paste slip.
decode_sets = {
    'token': {
        'dev': '../NER/data/for_ncrf/morph_gold_dev.bmes',
        'test': '../NER/data/for_ncrf/morph_gold_test.bmes',
    },
    'multitok': {
        'dev': '../NER/data/for_ncrf/morph_gold_dev.bmes',
        'test': '../NER/data/for_ncrf/morph_gold_test.bmes',
    }
}

In [155]:
# For every `multi*` run of the selected models: align the token-level
# predictions with the pruned YAP segmentation, write them as
# morph_pruned_{dev,test}.bmes and evaluate them against the gold files in
# decode_sets['multitok']. Results are collected in `res`.
res = []
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans.name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
            print (folder.name)
            # --- dev ---
            pruned_ner_path=os.path.join(folder.path, 'morph_pruned_dev.bmes')
            # NOTE: the existence check is disabled, so the file is rewritten on every run
            if True: #not os.path.exists(pruned_ner_path):
                align_multitok(os.path.join(folder.path, 'token_gold_dev_dummy_o.bmes'), 
                               bclm.TREEBANK_TOKEN_PATHS['dev'], 
                               os.path.join(folder.path, 'dev_pruned.conll'),
                               os.path.join(folder.path, 'dev_pruned.map'),
                               pruned_ner_path
                              )
                p, r, f = nem.evaluate_files(decode_sets['multitok']['dev'], pruned_ner_path, str_join_char='')
                res.append(('dev', trans_name, folder.name, p, r, f))

            # --- test ---
            pruned_ner_path=os.path.join(folder.path, 'morph_pruned_test.bmes')
            if True: #not os.path.exists(pruned_ner_path):
                align_multitok(os.path.join(folder.path, 'token_gold_test_dummy_o.bmes'), 
                               bclm.TREEBANK_TOKEN_PATHS['test'], 
                               os.path.join(folder.path, 'test_pruned.conll'),
                               os.path.join(folder.path, 'test_pruned.map'),
                               pruned_ner_path
                              )
                p, r, f = nem.evaluate_files(decode_sets['multitok']['test'], pruned_ner_path, str_join_char='')
                res.append(('test', trans_name, folder.name, p, r, f))


bert-small-wordpiece-oscar-52000-10
multi_54360
multi_44184
multi_20423
multi_80520
multi_27916


In [156]:
def biose_to_o(in_path, out_path):
    """Copy a two-column ``word tag`` file, replacing every tag with ``'O'``.

    Lines consisting of a single newline (sentence separators) are copied
    unchanged. Every other line must contain exactly two whitespace-separated
    fields; the first is written back followed by ``' O'``.

    Parameters
    ----------
    in_path : str
        Path of the input file (read as UTF-8).
    out_path : str
        Path of the output file (written as UTF-8, overwritten if present).

    Raises
    ------
    ValueError
        If a non-separator line does not split into exactly two fields.
    """
    # Both handles are managed by `with` so the input file is closed too, and
    # both use UTF-8 so reading does not depend on the platform default.
    with open(in_path, 'r', encoding='utf8') as inf, \
            open(out_path, 'w', encoding='utf8') as of:
        for line in inf:
            if line=='\n':
                of.write(line)
            else:
                # unpacking enforces the two-column format; the tag is discarded
                word, _tag = line.strip().split()
                of.write(word+' O\n')
            
    
# For every `multi*` run of the selected models, write an all-'O' copy of
# morph_pruned_{dev,test}.bmes next to it with the extension .bioul.
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans.name not in include_only:
        continue    
    print(trans.name)
    for folder in os.scandir(trans):
        if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
            pruned_ner_path=os.path.join(folder.path, 'morph_pruned_dev.bmes')
            output_path = pruned_ner_path.replace('.bmes', '.bioul')
            # NOTE: the existence check is disabled, so the file is rewritten on every run
            if True: #not os.path.exists(output_path):
                biose_to_o(pruned_ner_path, output_path)
            pruned_ner_path=os.path.join(folder.path, 'morph_pruned_test.bmes')
            output_path = pruned_ner_path.replace('.bmes', '.bioul')
            if True: #not os.path.exists(output_path):
                biose_to_o(pruned_ner_path, output_path)


bert-small-wordpiece-oscar-52000-10


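## <-- NOW RUN PREDICT ON PRUNED (manual step outside this notebook; the next cell expects `morph_pruned_dev.json` and `morph_pruned_test.json` in every `morph_*` folder)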

In [158]:
# Convert the JSON predictions made on the pruned segmentation
# (morph_pruned_{test,dev}.json in every `morph*` run folder) to .bmes files.
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans.name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if os.path.isdir(folder) and 'morph' in folder.name and not '.ipynb_checkpoints' in folder.name:
            pruned_ner_path=os.path.join(folder.path, 'morph_pruned_test.json')
            output_path = pruned_ner_path.replace('.json', '.bmes')
            # NOTE: the existence check is disabled, so the file is rewritten on every run
            if True:#not os.path.exists(output_path):
                jsonl_to_biose(pruned_ner_path, output_path)
            pruned_ner_path=os.path.join(folder.path, 'morph_pruned_dev.json')
            output_path = pruned_ner_path.replace('.json', '.bmes')
            if True:#not os.path.exists(output_path):
                jsonl_to_biose(pruned_ner_path, output_path)

bert-small-wordpiece-oscar-52000-10
706
500
706
500
706
500
706
500
706
500


## SINGLE + MULTI

In [159]:
# Evaluate every run folder of the selected models against the gold files.
# The folder name is expected to be '<variant>_<seed>'; which prediction
# files are evaluated depends on whether the name contains 'single', 'multi'
# or 'morph'. Each result is appended to `res` as
# (set, eval_unit, variant, prediction, align, trans_name, seed, p, r, f).
res = []
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans.name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if '.ipynb' in folder.name:
            continue

        # raises ValueError if the folder name does not contain exactly one '_'
        variant, seed = folder.name.split('_')

        # --- test ---
        if 'single' in folder.name:    
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/token_gold_test_fix.bmes', 
                                       os.path.join(folder.path,'token_gold_test_fix.bmes'), str_join_char='')
            res.append(('test', 'token', variant, 'tokens', '-', trans_name, seed, p, r, f))


        if 'multi' in folder.name:
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/token_gold_test_fix.bmes', 
                                       os.path.join(folder.path,'token_gold_test_dummy_o.bmes'), str_join_char='')
            res.append(('test', 'token', variant, 'tokens', '-', trans_name, seed, p, r, f))

        if 'morph' in folder.name:
            # predictions on gold, YAP and pruned ('hybrid') segmentation, all scored against gold morphemes
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/morph_gold_test.bmes', 
                                       os.path.join(folder.path,'morph_gold_test.bmes'), str_join_char='')
            res.append(('test', 'morph', variant, 'gold', '-', trans_name, seed, p, r, f))
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/morph_gold_test.bmes', 
                                       os.path.join(folder.path,'morph_yap_test.bmes'), str_join_char='')
            res.append(('test', 'morph', variant, 'yap', '-', trans_name, seed, p, r, f))
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/morph_gold_test.bmes', 
                                       os.path.join(folder.path,'morph_pruned_test.bmes'), str_join_char='')
            res.append(('test', 'morph', variant, 'hybrid', '-', trans_name, seed, p, r, f))

        #dev

        if 'single' in folder.name:    
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/token_gold_dev_fix.bmes', 
                                       os.path.join(folder.path,'token_gold_dev_fix.bmes'), str_join_char='')
            res.append(('dev', 'token', variant, 'tokens', '-', trans_name, seed, p, r, f))


        if 'multi' in folder.name:
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/token_gold_dev_fix.bmes', 
                                       os.path.join(folder.path,'token_gold_dev_dummy_o.bmes'), str_join_char='')
            res.append(('dev', 'token', variant, 'tokens', '-', trans_name, seed, p, r, f))

        if 'morph' in folder.name:
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/morph_gold_dev.bmes', 
                                       os.path.join(folder.path,'morph_gold_dev.bmes'), str_join_char='')
            res.append(('dev', 'morph', variant, 'gold', '-', trans_name, seed, p, r, f))
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/morph_gold_dev.bmes', 
                                       os.path.join(folder.path,'morph_yap_dev.bmes'), str_join_char='')
            res.append(('dev', 'morph', variant, 'yap', '-', trans_name, seed, p, r, f))
            p,r,f = nem.evaluate_files('../NER/data/for_ncrf/morph_gold_dev.bmes', 
                                       os.path.join(folder.path,'morph_pruned_dev.bmes'), str_join_char='')
            res.append(('dev', 'morph', variant, 'hybrid', '-', trans_name, seed, p, r, f))
    
    

# One row per (split, run, prediction kind) collected in `res` above.
ne_df = pd.DataFrame(res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name', 'seed', 'p', 'r', 'f'])

# Mean precision / recall / F1 over the rows of each configuration (i.e. over seeds and models).
ne_df.groupby(['set', 'eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

bert-small-wordpiece-oscar-52000-10


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,p,r,f
set,eval_unit,variant,prediction,align,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dev,morph,morph,gold,-,0.816269,0.777956,0.796625
dev,morph,morph,hybrid,-,0.802461,0.761924,0.781648
dev,morph,morph,yap,-,0.724738,0.698597,0.711411
dev,token,multi,tokens,-,0.839158,0.762725,0.799088
dev,token,single,tokens,-,0.823865,0.792385,0.807789
test,morph,morph,gold,-,0.771591,0.790773,0.781056
test,morph,morph,hybrid,-,0.741636,0.753863,0.747688
test,morph,morph,yap,-,0.664694,0.669099,0.66688
test,token,multi,tokens,-,0.791587,0.779185,0.785329
test,token,single,tokens,-,0.762308,0.790129,0.775935


## Add Alignments

### Token Level Eval

In [160]:
import re

# Patterns over the string of BIOSE prefixes of one token's morphemes
# (one character per morpheme). Each pattern names the single token-level
# prefix the sequence collapses to.
o_re = re.compile('^O+$') 
s_re = re.compile('^O*SO*$|^O*BI*EO*$')
b_re = re.compile('^O*BI*$')
i_re = re.compile('^I+$')
e_re = re.compile('^I*EO*$')
def get_fixed_for_valid_biose(bio_seq):
    """Collapse a well-formed prefix string to one of ``O``/``S``/``B``/``I``/``E``.

    ``bio_seq`` is a string with one BIOSE prefix character per morpheme.
    Raises ``ValueError`` if it matches none of the well-formed patterns.
    """
    if o_re.match(bio_seq):
        return 'O'
    if s_re.match(bio_seq):
        return 'S'
    if b_re.match(bio_seq):
        return 'B'
    if i_re.match(bio_seq):
        return 'I'
    if e_re.match(bio_seq):
        return 'E'
    raise ValueError('not a valid BIOSE prefix sequence: %r' % (bio_seq,))
    

def get_fixed_for_invalid_biose(parts):
    """Heuristically collapse a malformed prefix sequence to a single prefix.

    Precedence: ``S`` if present; ``S`` if both ``B`` and ``E`` are present;
    then ``E``, ``B``, ``I``; otherwise ``O``.
    """
    bio = 'O'
    if 'S' in parts:
        bio = 'S'
    elif 'B' in parts and 'E' in parts:
        bio='S'
    elif 'E' in parts:
        bio = 'E'
    elif 'B' in parts:
        bio = 'B'
    elif 'I' in parts:
        bio = 'I'
    return bio

# Union of the well-formed patterns above. '^O+$' is included so that an
# all-'O' token is reported as valid and actually reaches the `o_re` branch of
# get_fixed_for_valid_biose (without it that branch was unreachable).
valid_bio_re = re.compile('^O+$|^O*BI*$|^O*BI*EO*$|^I+$|^I*EO*$|^O*SO*$')

from functools import lru_cache


def validate_biose_sequence(full_bio_seq):
    """Validate one token's morpheme labels and collapse them to one label.

    Parameters
    ----------
    full_bio_seq : sequence of str
        Labels of the token's morphemes, each either ``'O'`` or
        ``'<prefix>-<category>'``.

    Returns
    -------
    tuple ``(is_valid, single_category, fixed_label)``
        ``is_valid`` - the prefix sequence matches ``valid_bio_re``;
        ``single_category`` - at most one distinct category occurs;
        ``fixed_label`` - ``'O'`` or the collapsed prefix joined with the
        first category in the sequence.
    """
    #print(full_bio_seq)
    # split on the first '-' only, so a category that itself contains '-' stays intact
    bio_seq, type_seq = zip(*[('O', None) if b=='O' else b.split('-', 1) for b in full_bio_seq])
    bio_seq = ''.join(bio_seq)
    valid_bio = valid_bio_re.match(bio_seq)
    type_seq = list(filter(lambda x: x is not None, type_seq))
    type_seq_set = set(type_seq)

    if valid_bio:
        fixed_bio = get_fixed_for_valid_biose(bio_seq)
    else:
        # rough BIOSE prefix for a malformed sequence
        fixed_bio = get_fixed_for_invalid_biose(bio_seq)

    # in both cases the first category encountered is used
    if fixed_bio!='O':
        fixed_bio += '-' + type_seq[0]
        
    return valid_bio is not None, len(type_seq_set)<=1, fixed_bio


@lru_cache(1000)
def get_fixed_bio_sequence(full_bio_seq):
    """Cached shortcut returning only the collapsed label.

    ``full_bio_seq`` must be hashable (e.g. a tuple) because of the cache.
    """
    return validate_biose_sequence(full_bio_seq)[2]

In [161]:
# Reload the SPMRL dataframe without the dropped sentences and split it into
# dev/test gold frames.
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')
spdf = spdf[(~spdf.sent_id.isin(dropped))]
dev_gold = spdf[spdf.set=='dev']
# .copy() gives an independent frame, so assigning `sent_id` below neither
# triggers SettingWithCopyWarning nor risks writing through to `spdf`.
test_gold = spdf[spdf.set=='test'].copy()
# renumber test sentences consecutively from 1, in order of their original ids
test_gold['sent_id'] = test_gold.sent_id.rank(method='dense').astype(int)
dev_yap = bclm.read_yap_output(treebank_set='dev')
test_yap = bclm.read_yap_output(treebank_set='test')
# per-sentence (token_id, token_str) lists for the gold and YAP segmentations
dev_gold_sents = bclm.get_sentences_list(dev_gold, fields=['token_id', 'token_str'])
test_gold_sents = bclm.get_sentences_list(test_gold, fields=['token_id', 'token_str'])
dev_yap_sents = bclm.get_sentences_list(dev_yap, fields=['token_id', 'token_str'])
test_yap_sents = bclm.get_sentences_list(test_yap, fields=['token_id', 'token_str'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [162]:
# Token-level gold frames: the 'biose_layer0' column returned by
# bclm.get_token_df is renamed to 'fixed_bio', the column name that
# sents_from_df reads by default.
dev_gold_tok = (bclm.get_token_df(dev_gold, biose=['biose_layer0'])
                .rename(columns={'biose_layer0': 'fixed_bio'}))
test_gold_tok = (bclm.get_token_df(test_gold, biose=['biose_layer0'])
                .rename(columns={'biose_layer0': 'fixed_bio'}))
# renumber test sentences consecutively from 1, in order of their current ids
test_gold_tok['sent_id'] = test_gold_tok.sent_id.rank(method='dense').astype(int)


In [163]:
def get_fixed_tok(path, orig_sents=dev_yap_sents):
    """Collapse morpheme-level predictions in ``path`` to one label per token.

    The predictions are read with ``nem.read_file_sents`` and paired
    positionally (sentence by sentence, item by item) with ``orig_sents``,
    which supplies ``(token_id, token_str)`` for every predicted form. The
    paired rows are reduced to one row per token with ``bclm.get_token_df``
    and the ``'^'``-joined labels of each token are collapsed with
    ``get_fixed_bio_sequence``.

    Parameters
    ----------
    path : str
        Prediction file.
    orig_sents : mapping of sentence id -> list of ``(token_id, token_str)``
        Defaults to the value ``dev_yap_sents`` had when this function was
        defined.

    Returns
    -------
    pandas.DataFrame
        The token frame returned by ``bclm.get_token_df`` with an added
        ``fixed_bio`` column.
    """
    x = nem.read_file_sents(path, fix_multi_tag=False)
    new_sents = []
    # `.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
    for (i, ner_sent), (sent_id, yap_sent) in zip(x.items(), orig_sents.items()):
        for (form, bio), (token_id, token_str) in zip(ner_sent, yap_sent):
            new_sents.append((sent_id, token_id, token_str, form, bio))
    new_sents = pd.DataFrame(new_sents, columns=['sent_id', 'token_id', 'token_str', 'form', 'bio'])
    new_toks = bclm.get_token_df(new_sents, fields=['bio'])
    # a tuple is passed because get_fixed_bio_sequence is lru_cache'd and needs a hashable argument
    new_toks['fixed_bio'] = new_toks.bio.apply(lambda x: get_fixed_bio_sequence(tuple(x.split('^'))))
    return new_toks


In [164]:
def sents_from_df(df, sent_id_col='sent_id', 
                  group_cols=['token_str'], 
                  val_cols=['fixed_bio']):
    """Return ``bclm.get_sentences_list`` of ``df`` over ``group_cols + val_cols``.

    ``sent_id_col`` is accepted but not used by this function.
    """
    wanted_fields = group_cols + val_cols
    return bclm.get_sentences_list(df, fields=wanted_fields)

def evaluate_dataframes(gold_df, pred_df, fix_multi_tag_pred=True, truncate=None, ignore_cat=False, str_join_char=' '):
    """Score the mentions of ``pred_df`` against those of ``gold_df``.

    Both frames are converted to sentence lists with ``sents_from_df`` and to
    mentions with ``nem.sents_to_mentions`` (``truncate``, ``ignore_cat`` and
    ``str_join_char`` are forwarded). The result of
    ``nem.evaluate_mentions(gold, pred, verbose=False)`` is returned.
    ``fix_multi_tag_pred`` is accepted but not used by this function.
    """
    mention_options = dict(truncate=truncate, ignore_cat=ignore_cat, str_join_char=str_join_char)
    gold_sents, pred_sents = [sents_from_df(frame) for frame in (gold_df, pred_df)]
    gold_mentions, pred_mentions = [
        nem.sents_to_mentions(sents, **mention_options)
        for sents in (gold_sents, pred_sents)
    ]
    return nem.evaluate_mentions(gold_mentions, pred_mentions, verbose=False)

#### Hybrid


In [165]:
# Token files per dataset split; get_prun_yo looks them up by split name
# and passes them to bclm.read_yap_output as `tokens_path`.
out_folder = '../NER/data/tokens_for_ncrf'
dev_out = os.path.join(out_folder, 'dev_tokens.txt')
test_out = os.path.join(out_folder, 'test_tokens.txt')
token_paths = {'dev': dev_out, 'test': test_out}

In [166]:
def get_prun_yo(ds, dep_path, map_path):
    """Read the YAP output at ``dep_path`` / ``map_path`` for split ``ds``.

    ``ds`` is a key of the module-level ``token_paths`` dict and selects the
    tokens file handed to ``bclm.read_yap_output``.
    """
    return bclm.read_yap_output(
        treebank_set=None,
        tokens_path=token_paths[ds],
        dep_path=dep_path,
        map_path=map_path,
    )

#### Run on all pruned

In [167]:
def get_sent_list(ds, dp, mp):
    """Return the `(token_id, token_str)` sentence list for the output loaded by `get_prun_yo(ds, dp, mp)`."""
    return bclm.get_sentences_list(get_prun_yo(ds, dp, mp), fields=['token_id', 'token_str'])

In [168]:
# Token-level evaluation of the morph models on the pruned ("hybrid") segmentation.
# For every included transformer and every morph_<seed> folder, and for both splits:
# align the predictions to tokens, write the aligned file, and score it against gold.
align_tok_res = []
gold_tok_by_set = {'dev': dev_gold_tok, 'test': test_gold_tok}
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans_name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if 'morph' not in folder.name or '.ipynb' in folder.name:
            continue
        variant, seed = folder.name.split('_')
        # The pruned lattice files live in the sibling multi_<seed> folder.
        multi_folder = folder.path.replace('morph_', 'multi_')
        for ds in ('dev', 'test'):
            file = os.path.join(folder.path, 'morph_pruned_{}.bmes'.format(ds))
            dep_path = os.path.join(multi_folder, '{}_pruned.conll'.format(ds))
            map_path = os.path.join(multi_folder, '{}_pruned.map'.format(ds))
            out_path = os.path.join(folder.path, 'morph_pruned_{}_align_tokens.bmes'.format(ds))

            prun_sents = get_sent_list(ds, dep_path, map_path)
            new_toks = get_fixed_tok(file, orig_sents=prun_sents)

            # Always (re)write the aligned file. Explicit utf8 matches jsonl_to_biose
            # and avoids depending on the locale; `.items()` replaces the
            # `.iteritems()` that was removed in pandas 2.0.
            new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
            with open(out_path, 'w', encoding='utf8') as of:
                for _, sent in new_sents.items():
                    for tok, bio in sent:
                        of.write(tok + ' ' + bio + '\n')
                    of.write('\n')

            p, r, f = evaluate_dataframes(gold_tok_by_set[ds], new_toks, str_join_char='')
            align_tok_res.append((ds, 'token', 'morph', 'hybrid', 'tokens', trans_name, seed, p, r, f))


bert-small-wordpiece-oscar-52000-10


In [169]:
# Mean F per configuration for the hybrid token-level results collected so far.
at_df = pd.DataFrame(
    align_tok_res,
    columns=['set', 'eval_unit', 'variant', 'prediction', 'align',
             'trans_name', 'seed', 'p', 'r', 'f'],
)

(at_df
 .groupby(['set', 'trans_name', 'eval_unit', 'variant', 'prediction', 'align'])['f']
 .mean())

set   trans_name                           eval_unit  variant  prediction  align 
dev   bert-small-wordpiece-oscar-52000-10  token      morph    hybrid      tokens    0.795792
test  bert-small-wordpiece-oscar-52000-10  token      morph    hybrid      tokens    0.770039
Name: f, dtype: float64

#### Run all gold and YAP

In [170]:
# Token-level evaluation of the morph models on gold and YAP segmentations.
# Each (split, prediction) pair is processed the same way: align to tokens,
# write the aligned file, score against the gold token frame of that split.
align_tok_res_yg = []
gold_tok_by_set = {'dev': dev_gold_tok, 'test': test_gold_tok}
# Order matters only for the row order of the results: dev gold, dev yap, test gold, test yap.
eval_configs = [
    ('dev', 'gold', dev_gold_sents),
    ('dev', 'yap', dev_yap_sents),
    ('test', 'gold', test_gold_sents),
    ('test', 'yap', test_yap_sents),
]
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans_name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if 'morph' not in folder.name or '.ipynb' in folder.name:
            continue
        variant, seed = folder.name.split('_')
        for ds, pred, orig_sents in eval_configs:
            file = os.path.join(folder.path, 'morph_{}_{}.bmes'.format(pred, ds))
            out_path = os.path.join(folder.path, 'morph_{}_{}_align_tokens.bmes'.format(pred, ds))

            new_toks = get_fixed_tok(file, orig_sents=orig_sents)

            # Always (re)write the aligned file. Explicit utf8 matches jsonl_to_biose;
            # `.items()` replaces the `.iteritems()` removed in pandas 2.0.
            new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
            with open(out_path, 'w', encoding='utf8') as of:
                for _, sent in new_sents.items():
                    for tok, bio in sent:
                        of.write(tok + ' ' + bio + '\n')
                    of.write('\n')

            p, r, f = evaluate_dataframes(gold_tok_by_set[ds], new_toks, str_join_char='')
            align_tok_res_yg.append((ds, 'token', 'morph', pred, 'tokens', trans_name, seed, p, r, f))

bert-small-wordpiece-oscar-52000-10


In [171]:
# Mean P/R/F per configuration for all token-level results (hybrid + gold/YAP).
at_df = pd.DataFrame(
    align_tok_res + align_tok_res_yg,
    columns=['set', 'eval_unit', 'variant', 'prediction', 'align',
             'trans_name', 'seed', 'p', 'r', 'f'],
)

(at_df
 .groupby(['set', 'trans_name', 'eval_unit', 'variant', 'prediction', 'align'])[['p', 'r', 'f']]
 .mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,p,r,f
set,trans_name,eval_unit,variant,prediction,align,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
dev,bert-small-wordpiece-oscar-52000-10,token,morph,gold,tokens,0.825018,0.785972,0.804998
dev,bert-small-wordpiece-oscar-52000-10,token,morph,hybrid,tokens,0.817154,0.775551,0.795792
dev,bert-small-wordpiece-oscar-52000-10,token,morph,yap,tokens,0.770481,0.742685,0.756311
test,bert-small-wordpiece-oscar-52000-10,token,morph,gold,tokens,0.775941,0.795064,0.785378
test,bert-small-wordpiece-oscar-52000-10,token,morph,hybrid,tokens,0.76381,0.776395,0.770039
test,bert-small-wordpiece-oscar-52000-10,token,morph,yap,tokens,0.736611,0.740987,0.738783


### Morpheme Level Eval

#### Token Multi

#### Hybrid

In [172]:
# Morpheme-level evaluation of the multi (token) models on the pruned ("hybrid")
# segmentation: score morph_pruned_<split>.bmes in every multi_<seed> folder
# against decode_sets['multitok'][<split>].
align_morph_res_hyb = []
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans_name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if not (os.path.isdir(folder) and 'multi' in folder.name
                and '.ipynb_checkpoints' not in folder.name):
            continue
        # Take the seed from this folder's name. Previously `seed` was never
        # assigned in this loop, so every row carried whatever value an earlier
        # cell had left in the kernel.
        seed = folder.name.rsplit('_', 1)[-1]
        for ds in ('dev', 'test'):
            pruned_ner_path = os.path.join(folder.path, 'morph_pruned_{}.bmes'.format(ds))

            p, r, f = nem.evaluate_files(decode_sets['multitok'][ds], pruned_ner_path, str_join_char='')
            align_morph_res_hyb.append((ds, 'morph', 'multi', 'tokens', 'hybrid', trans_name, seed, p, r, f))

bert-small-wordpiece-oscar-52000-10


In [173]:
# Mean P/R/F per configuration: token-level results plus hybrid morpheme-level results.
at_df = pd.DataFrame(
    align_tok_res + align_tok_res_yg + align_morph_res_hyb,
    columns=['set', 'eval_unit', 'variant', 'prediction', 'align',
             'trans_name', 'seed', 'p', 'r', 'f'],
)

(at_df
 .groupby(['set', 'trans_name', 'eval_unit', 'variant', 'prediction', 'align'])[['p', 'r', 'f']]
 .mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,p,r,f
set,trans_name,eval_unit,variant,prediction,align,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
dev,bert-small-wordpiece-oscar-52000-10,morph,multi,tokens,hybrid,0.819594,0.744289,0.780101
dev,bert-small-wordpiece-oscar-52000-10,token,morph,gold,tokens,0.825018,0.785972,0.804998
dev,bert-small-wordpiece-oscar-52000-10,token,morph,hybrid,tokens,0.817154,0.775551,0.795792
dev,bert-small-wordpiece-oscar-52000-10,token,morph,yap,tokens,0.770481,0.742685,0.756311
test,bert-small-wordpiece-oscar-52000-10,morph,multi,tokens,hybrid,0.762503,0.749571,0.755973
test,bert-small-wordpiece-oscar-52000-10,token,morph,gold,tokens,0.775941,0.795064,0.785378
test,bert-small-wordpiece-oscar-52000-10,token,morph,hybrid,tokens,0.76381,0.776395,0.770039
test,bert-small-wordpiece-oscar-52000-10,token,morph,yap,tokens,0.736611,0.740987,0.738783


#### YAP + GOLD

In [174]:
def align_multitok_yg(ner_pred_path, prun_sents, output_path):
    """Merge the labels read from `ner_pred_path` onto `prun_sents` and write them to `output_path`.

    The predictions are read with `nem.read_file_sents(..., fix_multi_tag=False)` and
    combined with `prun_sents` by `soft_merge_bio_labels`. Each resulting sentence is
    written as one `form bio` pair per line, followed by a blank line.
    Returns None; the only result is the file at `output_path` (overwritten if present).
    """
    token_sents = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)

    merged_sents = soft_merge_bio_labels(token_sents, prun_sents, verbose=False)

    # Explicit utf8 so the output does not depend on the locale's default encoding,
    # consistent with how jsonl_to_biose writes its .bmes files.
    with open(output_path, 'w', encoding='utf8') as of:
        for sent in merged_sents:
            for form, bio in sent:
                of.write(form + ' ' + bio + '\n')
            of.write('\n')


In [175]:
gold_morph = {'dev': dev_gold, 'test': test_gold}
def get_sents_for_mult(treebank_set, gold=False, pred_set=None, 
                       dep_path=None, map_path=None):
    """Build the `(token_id, token_str, form)` sentence list for one segmentation source.

    Source selection:
    - `treebank_set is None`: `get_prun_yo(pred_set, dep_path, map_path)`
    - `gold` is truthy:       `gold_morph[treebank_set]`
    - otherwise:              `bclm.read_yap_output(treebank_set=treebank_set)`

    The chosen frame is passed through `bclm.get_token_df(..., fields=['form'])`
    and then `bclm.get_sentences_list`.
    """
    if treebank_set is None:
        morph_source = get_prun_yo(pred_set, dep_path, map_path)
    elif gold:
        morph_source = gold_morph[treebank_set]
    else:
        morph_source = bclm.read_yap_output(treebank_set=treebank_set)
    token_forms = bclm.get_token_df(morph_source, fields=['form'])
    return bclm.get_sentences_list(token_forms, fields=['token_id', 'token_str', 'form'])

# Sentence lists for the YAP output first, then for gold, each for dev and test.
dev_yap_sents_m, test_yap_sents_m = (
    get_sents_for_mult(split) for split in ('dev', 'test')
)
dev_gold_sents_m, test_gold_sents_m = (
    get_sents_for_mult(split, gold=True) for split in ('dev', 'test')
)

In [176]:
# Morpheme-level evaluation of the multi (token) models on the YAP segmentation:
# project token_gold_<split>_dummy_o.bmes onto the YAP sentences, write the result
# to morph_yap_<split>.bmes, and score it against decode_sets['multitok'][<split>].
align_morph_res_yap = []
yap_sents_by_set = {'dev': dev_yap_sents_m, 'test': test_yap_sents_m}
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans_name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if not (os.path.isdir(folder) and 'multi' in folder.name
                and '.ipynb_checkpoints' not in folder.name):
            continue
        # Take the seed from this folder's name. Previously `seed` was never
        # assigned in this loop, so every row carried a stale value from an earlier cell.
        seed = folder.name.rsplit('_', 1)[-1]
        for ds in ('dev', 'test'):
            yap_ner_path = os.path.join(folder.path, 'morph_yap_{}.bmes'.format(ds))

            align_multitok_yg(os.path.join(folder.path, 'token_gold_{}_dummy_o.bmes'.format(ds)),
                              yap_sents_by_set[ds],
                              yap_ner_path)
            p, r, f = nem.evaluate_files(decode_sets['multitok'][ds], yap_ner_path, str_join_char='')
            align_morph_res_yap.append((ds, 'morph', 'multi', 'tokens', 'yap', trans_name, seed, p, r, f))


bert-small-wordpiece-oscar-52000-10


In [177]:
# Morpheme-level evaluation of the multi (token) models on the gold segmentation:
# project token_gold_<split>_dummy_o.bmes onto the gold sentences, write the result
# to morph_gold_<split>.bmes, and score it against decode_sets['multitok'][<split>].
align_morph_res_gold = []
gold_sents_by_set = {'dev': dev_gold_sents_m, 'test': test_gold_sents_m}
for trans in os.scandir('output/predict_alephbert'):
    trans_name = trans.name
    if trans_name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if not (os.path.isdir(folder) and 'multi' in folder.name
                and '.ipynb_checkpoints' not in folder.name):
            continue
        # Take the seed from this folder's name. Previously `seed` was never
        # assigned in this loop, so every row carried a stale value from an earlier cell.
        seed = folder.name.rsplit('_', 1)[-1]
        for ds in ('dev', 'test'):
            gold_ner_path = os.path.join(folder.path, 'morph_gold_{}.bmes'.format(ds))

            align_multitok_yg(os.path.join(folder.path, 'token_gold_{}_dummy_o.bmes'.format(ds)),
                              gold_sents_by_set[ds],
                              gold_ner_path)
            p, r, f = nem.evaluate_files(decode_sets['multitok'][ds], gold_ner_path, str_join_char='')
            align_morph_res_gold.append((ds, 'morph', 'multi', 'tokens', 'gold', trans_name, seed, p, r, f))

bert-small-wordpiece-oscar-52000-10


## ALL SCORES

In [178]:
# One frame with every aligned result row: token-level and morpheme-level.
at_df = pd.DataFrame(
    align_tok_res
    + align_tok_res_yg
    + align_morph_res_hyb
    + align_morph_res_yap
    + align_morph_res_gold,
    columns=['set', 'eval_unit', 'variant', 'prediction', 'align',
             'trans_name', 'seed', 'p', 'r', 'f'],
)

In [179]:
# Load the results CSV into all_df; a later cell writes all_df back to this same path.
ALL_DF_PATH = 'output/all_results_alephbert.csv'
all_df = pd.read_csv(ALL_DF_PATH)

In [180]:
# Sanity check: smallest and largest number of rows per configuration.
# String aliases are used instead of the builtins min/max, which pandas >= 2.1
# deprecates as arguments to agg; the result and its labels are the same.
(all_df
 .groupby(['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name'])
 .size()
 .agg(['min', 'max']))

min    5
max    5
dtype: int64

In [181]:
# Append the newly computed results (at_df, ne_df) to the loaded ones.
# NOTE: this cell is not idempotent -- all_df appears on both sides, so running it
# twice in the same kernel appends at_df and ne_df twice.
all_df = pd.concat([all_df, at_df, ne_df])
# Same group-size sanity check as before the concat. String aliases replace the
# builtins min/max, which pandas >= 2.1 deprecates as arguments to agg.
(all_df
 .groupby(['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name'])
 .size()
 .agg(['min', 'max']))

min    5
max    5
dtype: int64

In [182]:
# Mean F (scaled by 100) per configuration, one column per transformer in include_only.
mean_scores = (
    all_df
    .groupby(['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name'])['f']
    .mean()
    .unstack()
    .mul(100)[include_only]
)
mean_scores

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,trans_name,bert-small-wordpiece-oscar-52000-10
set,eval_unit,variant,prediction,align,Unnamed: 5_level_1
dev,morph,morph,gold,-,79.662546
dev,morph,morph,hybrid,-,78.164751
dev,morph,morph,yap,-,71.141128
dev,morph,multi,tokens,gold,78.991742
dev,morph,multi,tokens,hybrid,78.010109
dev,morph,multi,tokens,yap,73.582826
dev,token,morph,gold,tokens,80.499798
dev,token,morph,hybrid,tokens,79.579169
dev,token,morph,yap,tokens,75.631056
dev,token,multi,tokens,-,79.90881


In [183]:
# Write the combined results to ALL_DF_PATH (the same file all_df was read from), without the index.
all_df.to_csv(ALL_DF_PATH, index=False)

In [184]:
# Save the mean-score table; reset_index() turns the group keys into ordinary columns first.
mean_scores.reset_index().to_csv('output/mean_results_alephbert.csv', index=False)


In [185]:
#seg_res_df.to_csv('output/all_seg_results_alephbert.csv', index=False)

In [186]:
# Mean f_seg_pos per prediction set, one column per transformer in include_only.
(
    seg_res_df
    .groupby(['pred_set', 'trans_name'])['f_seg_pos']
    .mean()
    .unstack()[include_only]
)

trans_name,bert-small-wordpiece-oscar-52000-10
pred_set,Unnamed: 1_level_1
dev,93.425476
test,91.622853


In [187]:
# Mean f_seg_only per prediction set, one column per transformer in include_only.
(
    seg_res_df
    .groupby(['pred_set', 'trans_name'])['f_seg_only']
    .mean()
    .unstack()[include_only]
)

trans_name,bert-small-wordpiece-oscar-52000-10
pred_set,Unnamed: 1_level_1
dev,97.889414
test,97.71575


In [7]:
# Ad-hoc check: compare the from_amit test labels with morph_gold_test.bmes.
# Elsewhere in this notebook the result of evaluate_files is unpacked as (p, r, f).
nem.evaluate_files('../NER/data/for_ncrf/morph_gold_test.bmes', '../NER/data/from_amit/morph_label_test.bmes', str_join_char='')

(0.7950108459869848, 0.7864806866952789, 0.7907227615965479)

In [8]:
# Ad-hoc check: compare the from_amit dev labels with morph_gold_dev.bmes.
# Elsewhere in this notebook the result of evaluate_files is unpacked as (p, r, f).
nem.evaluate_files('../NER/data/for_ncrf/morph_gold_dev.bmes', '../NER/data/from_amit/morph_label_dev.bmes', str_join_char='')

(0.801255230125523, 0.7675350701402806, 0.7840327533265097)

In [79]:
# Ad-hoc check: compare an external run's test_predictions.txt with token_gold_test_fix.bmes.
# NOTE(review): the second path is an absolute path under another user's home directory,
# so this cell is only reproducible on a machine where that path exists.
nem.evaluate_files('../NER/data/for_ncrf/token_gold_test_fix.bmes', '/home/nlp/shaked571/NerBert/ft_ner_alefbert/seed1_BaseBert10e_maxlen150_batch4_single_random/test_predictions.txt', str_join_char='')

(0.8597826086956522, 0.8487124463519313, 0.8542116630669546)