# Re-evaluation
Perform two evaluations:
1. Strict morpheme evaluation
1. Token evaluation (morpheme labels are extended to the token level heuristically)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

In [5]:
import os

In [6]:
import sys
sys.path.append('/home/nlp/danb/NER')

import bclm
import ne_evaluate_mentions as nem

## Create input files

In [7]:
import sys
sys.path.append('../wiki_dump')

In [8]:
import json

In [9]:
import unicodedata

clean_nikud=True

In [10]:
from hebtokenizer import tokenize
from hebtokenizer import alternative_scanner
alt_scan = True
clean_junk = True

In [14]:
def clean_hebchars(text):
    """Return `text` without combining marks and with ASCII dashes/quotes.

    The text is NFKD-normalised, every character that `unicodedata.combining`
    reports as a combining mark is dropped, and the Hebrew maqaf, geresh and
    gershayim plus the en and em dashes are mapped to '-', "'" and '"'.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Single-character substitutions; none of the targets is itself a source,
    # so one translate pass is equivalent to applying them one after another.
    ascii_punct = {
        0x05be: '-',    # maqaf
        0x05f3: '\'',   # geresh
        0x05f4: '"',    # gershayim
        0x2013: '-',    # en dash
        0x2014: '-',    # em dash
    }
    return stripped.translate(ascii_punct)

In [15]:
def tokenize_sentences(sents):
    """Tokenize each sentence string and return a list of token-string lists.

    Reads the notebook-level flags `alt_scan` (tokenize with
    `alternative_scanner`) and `clean_junk` (drop tokens typed 'JUNK').
    The tokenizer output is treated as a list of (type, form) pairs.

    If the last token is not typed 'PUNCT' but ends in '?', '!' or '.', that
    final character is split off into its own 'PUNCT' token.

    A sentence for which the tokenizer returns no tokens yields an empty list
    (previously this raised IndexError on `tok[-1]`), so the output stays
    aligned one-to-one with `sents`.
    """
    tok_sents = []
    for sent in sents:
        if alt_scan:
            tok = tokenize(sent, alternative_scanner)
        else:
            tok = tokenize(sent)

        # Only inspect the last token when there is one.
        if tok:
            last_type, last_form = tok[-1]
            if len(last_form)>1 and last_type!='PUNCT' and last_form[-1] in ('?', '!', '.'):
                tok[-1] = (last_type, last_form[:-1])
                tok.append(('PUNCT', last_form[-1]))

        # Keep only the token strings, optionally skipping JUNK tokens.
        tok_sents.append([t for c, t in tok if not (clean_junk and c=='JUNK')])
    return tok_sents

In [23]:
sents = []
# Read with an explicit encoding and a context manager: the file holds Hebrew
# text, so it must not depend on the platform's default encoding, and the
# handle is closed deterministically.
# NOTE(review): assumes the TSV is UTF-8, like the files this notebook writes.
with open('../NER/data/sinai/HZF-sentences.tsv', encoding='utf8') as sent_file:
    for line in sent_file:
        sent = clean_hebchars(line.strip())
        # Skip the header row, which contains only the column name.
        if sent=='text':
            continue
        sents.append(sent)

sents[:10]


['התתר אשר היה במעקקא והשתטח על קבר היותו במעקקא יעטוף על מצנפתו חגור לבן או אדום  וישאהו כל הימים',
 'כל איש אשר התבונן בעין בחונה אל הנעשה, והנשמע במשך השנים האחרונות  בארצות הדאנוי, ראה כי דעת הקהל התהפכה שם  כחומר חותם ורוח אחרת היתה אתה',
 'גם זה היה לטובה להאורחה ההיא, כי נספחו אליה שני שעריפים (צאצאי מחמד) מעיר אל-שרק הנמצאה בין החוף והמדבר, אשר שבו מדרך הקודש, מערי מעקקא ומודינה, וילכו לשוב אל עירם  ומולדתם',
 'לא טוב גורל האובדים והנדחים בפאריז, ולא טובה ממנו גם מנת חלק רוב הפליטים המתגוררים בארץ הצבי',
 'לפי הודעת מכ"ע האנגלים כבר מצאה הממשלה מספר אניות הדרושות אל החפץ, ובעת הזאת מן גדות הטהעמזע ומן גדות הגאנגעס יעופו צבאות אנגליא המעטים וטובים כעל כנפי נשרים לעשות נפלאות בארץ חם,  בפרט ע"ד צבאות הודו לא ייעפו ולא ייגעו מכה"ע האנגלים  לתנות תקף מעשיהם וגבורתם הנפלאה',
 "גם ביחוס פלגות הארץ לפי מזגה יחלקה ה' בורטאן לשני חלקים: החלק הצפוני עם החוף מאקווא עוד לא נחקר ולא נדרש כל צרכו; ובחלק הדרומי אשד לו החוף וועדי, תתנכרנה עקבות עמי קדם אשר היו שם",
 'בראשית מסענו לא פגשנו איש,

In [24]:
# Replace the sentence strings with their token lists.
# NOTE(review): this rebinds `sents`, so re-running this cell without first
# re-running the loading cell feeds token lists back into the tokenizer.
sents = tokenize_sentences(sents)
sents[:10]

[['התתר',
  'אשר',
  'היה',
  'במעקקא',
  'והשתטח',
  'על',
  'קבר',
  'היותו',
  'במעקקא',
  'יעטוף',
  'על',
  'מצנפתו',
  'חגור',
  'לבן',
  'או',
  'אדום',
  'וישאהו',
  'כל',
  'הימים'],
 ['כל',
  'איש',
  'אשר',
  'התבונן',
  'בעין',
  'בחונה',
  'אל',
  'הנעשה',
  ',',
  'והנשמע',
  'במשך',
  'השנים',
  'האחרונות',
  'בארצות',
  'הדאנוי',
  ',',
  'ראה',
  'כי',
  'דעת',
  'הקהל',
  'התהפכה',
  'שם',
  'כחומר',
  'חותם',
  'ורוח',
  'אחרת',
  'היתה',
  'אתה'],
 ['גם',
  'זה',
  'היה',
  'לטובה',
  'להאורחה',
  'ההיא',
  ',',
  'כי',
  'נספחו',
  'אליה',
  'שני',
  'שעריפים',
  '(',
  'צאצאי',
  'מחמד',
  ')',
  'מעיר',
  'אל',
  '-',
  'שרק',
  'הנמצאה',
  'בין',
  'החוף',
  'והמדבר',
  ',',
  'אשר',
  'שבו',
  'מדרך',
  'הקודש',
  ',',
  'מערי',
  'מעקקא',
  'ומודינה',
  ',',
  'וילכו',
  'לשוב',
  'אל',
  'עירם',
  'ומולדתם'],
 ['לא',
  'טוב',
  'גורל',
  'האובדים',
  'והנדחים',
  'בפאריז',
  ',',
  'ולא',
  'טובה',
  'ממנו',
  'גם',
  'מנת',
  'חלק',
  'רוב',
  'הפליטים',
  '

In [25]:
# Plain token file: one token per line, blank line after every sentence.
with open('../NER/data/sinai/HZF-sentences_tokenized.txt', 'w', encoding='utf8') as tok_file:
    for sent in sents:
        tok_file.writelines(tok + '\n' for tok in sent)
        tok_file.write('\n')

# Same layout with a constant ' O' tag appended to every token.
with open('../NER/data/sinai/HZF-sentences_tokenized_dummy_o.bmes', 'w', encoding='utf8') as tag_file:
    for sent in sents:
        tag_file.writelines(tok + ' O\n' for tok in sent)
        tag_file.write('\n')

## Create BIOSE files


In [26]:
import json

def jsonl_to_biose(in_path, out_path, bioul_to_biose=True):
    """Convert a JSON-lines file of tagged sentences to a two-column file.

    Each non-blank input line must be a JSON object with parallel 'words' and
    'tags' lists. The output has one "word tag" pair per line and a blank line
    after every sentence.

    Parameters
    ----------
    in_path : path of the JSON-lines input (read as UTF-8).
    out_path : path of the file to write (UTF-8).
    bioul_to_biose : when true, rewrite 'L-' as 'E-' and 'U-' as 'S-' in tags.

    Prints the number of sentences written; returns None.
    """
    sents = 0
    # Open the input first: if it is missing, no empty output file is left
    # behind for the callers' `os.path.exists(output_path)` guards to mistake
    # for a finished conversion. Both handles are closed on exit.
    with open(in_path, 'r', encoding='utf8') as in_file, \
         open(out_path, 'w', encoding='utf8') as of:
        for line in in_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            sent = json.loads(line)
            for word, tag in zip(sent['words'], sent['tags']):
                if bioul_to_biose:
                    tag = tag.replace('L-', 'E-').replace('U-', 'S-')
                of.write(word+' '+tag+'\n')
            of.write('\n')
            sents+=1
    print (sents)
    

In [27]:
import os

In [30]:
# Convert the JSON-lines predictions to a .bmes file next to them.
# The guard skips the conversion when the .bmes file already exists.
json_path = 'output/predict_sinai/multi_heBERT_53460.json'
output_path = json_path.replace('.json', '.bmes')
if not os.path.exists(output_path):
    jsonl_to_biose(json_path, output_path)



101


In [46]:
# NOTE(review): `output_path` is rebound by later cells in this notebook, so
# this cell only picks up the intended file when run right after the
# conversion cell that set it.
ner_multi_path = output_path

## Prune

### Run YAP MA

In [36]:
# Location of the YAP executable used by the `!{yap_path}` shell cells.
# NOTE(review): absolute, machine-specific path; adjust per environment.
yap_path = '/home/nlp/danb/yapproj/src/yap/yap'

In [37]:
!export GOPATH=/home/nlp/danb/yapproj

In [38]:
# Run YAP with no arguments; it prints its list of commands.
!{yap_path}

/home/nlp/danb/yapproj/src/yap/yap - invoke yap as a standalone app or as an api server

Commands:

    api         start api server
    dep         runs dependency training/parsing
    hebma       run lexicon-based morphological analyzer on raw input
    joint       runs joint morpho-syntactic training and parsing
    ma          run data-driven morphological analyzer on raw input
    md          runs standalone morphological disambiguation training and parsing

Use "/home/nlp/danb/yapproj/src/yap/yap help <command>" for more information about a command.



In [39]:
# Input: the one-token-per-line file written earlier in this notebook.
tokens_path = '../NER/data/sinai/HZF-sentences_tokenized.txt'
# Output: lattices produced by YAP's `hebma` command.
lattices_path = '../NER/data/sinai/HZF-sentences_tokenized.lattices'
# stdout and stderr are discarded, so a failure here is silent.
!{yap_path} hebma -raw {tokens_path} -out {lattices_path} > /dev/null 2>&1
                    

In [47]:
def get_biose_count(path, sent_id_shift=1):
    """Return one row per token with its label string and label count.

    Sentences are read with `nem.read_file_sents(path, fix_multi_tag=False,
    sent_id_shift=sent_id_shift)`. Each sentence is iterated as (token, label)
    pairs; `biose_count` is the number of '^'-separated parts in the label and
    `token_id` is the 1-based position of the token in its sentence.

    Returns a DataFrame with columns
    ['sent_id', 'token_id', 'token_str', 'biose', 'biose_count'].
    """
    sents = nem.read_file_sents(path, fix_multi_tag=False, sent_id_shift=sent_id_shift)
    # `.items()` instead of `.iteritems()`: the latter was removed from pandas
    # in 2.0, while `.items()` yields the same (key, value) pairs on every
    # version. (assumes `sents` is a pandas Series — TODO confirm in `nem`)
    rows = [[sent_id, j+1, tok, bio, len(bio.split('^'))]
            for sent_id, sent in sents.items()
            for j, (tok, bio) in enumerate(sent)]

    return pd.DataFrame(rows, columns=['sent_id', 'token_id', 'token_str',
                                       'biose', 'biose_count'])

In [48]:
import networkx as nx

In [49]:
def get_valid_edges(lattices, bc,
                    non_o_only=True, keep_all_if_no_valid=True):
    """Collect the lattice edges that lie on paths compatible with the labels.

    `lattices` is grouped by (sent_id, token_id) and the groups are paired
    positionally with the rows of `bc` (columns 'biose' and 'biose_count').
    For every token a directed graph is built from its ID1 -> ID2 edges and
    simple paths from the smallest ID1 to the largest ID2 are enumerated:

    * if `non_o_only` is set and the label string contains no '-', every path
      is kept;
    * otherwise only paths with exactly `biose_count` edges are kept;
    * if nothing was kept and `keep_all_if_no_valid` is set, every path is
      kept.

    Returns a list of (sent_id, token_id, ID1, ID2) tuples (may repeat).
    """
    valid_edges = []
    token_groups = lattices.groupby(['sent_id', 'token_id'])
    label_rows = bc[['biose', 'biose_count']].itertuples()
    for (key, token_df), (_, biose, biose_count) in zip(token_groups, label_rows):
        graph = nx.from_pandas_edgelist(token_df, source='ID1', target='ID2',
                                        create_using=nx.DiGraph)
        start = token_df['ID1'].min()
        end = token_df['ID2'].max()

        # A path with `biose_count` edges visits `biose_count + 1` nodes.
        n_nodes = biose_count+1
        if non_o_only and '-' not in biose:
            paths = list(nx.all_simple_paths(graph, start, end))
        else:
            paths = [candidate
                     for candidate in nx.all_simple_paths(graph, start, end, cutoff=n_nodes)
                     if len(candidate)==n_nodes]
        if keep_all_if_no_valid and not paths:
            paths = nx.all_simple_paths(graph, start, end)

        sent_id, token_id = key[0], key[1]
        for path in paths:
            valid_edges.extend((sent_id, token_id, source, target)
                               for source, target in zip(path[:-1], path[1:]))

    return valid_edges

In [50]:
def to_lattices(df, path, cols = ['ID1', 'ID2', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'token_id']):
    """Write `df` to `path` as tab-separated rows grouped by sentence.

    Rows are grouped by 'sent_id'; for each group the `cols` values of every
    row are converted to strings and joined with tabs, one row per line, and
    a blank line is written after each group. The file is written as UTF-8.
    """
    with open(path, 'w', encoding='utf8') as out:
        for _, sent_df in df.groupby('sent_id'):
            lines = ['\t'.join(row.astype(str).tolist())
                     for _, row in sent_df[cols].iterrows()]
            out.write(''.join(line + '\n' for line in lines))
            out.write('\n')
            
    

In [51]:
def prune_lattices(lattices_path, ner_pred_path, output_path, keep_all_if_no_valid=True):
    """Write a copy of the lattices restricted to label-compatible edges.

    Reads the lattices with `bclm.read_lattices`, derives per-token label
    counts from `ner_pred_path` via `get_biose_count`, selects edges with
    `get_valid_edges(..., non_o_only=False)` and writes the surviving rows to
    `output_path` with `to_lattices`.
    """
    lattices = bclm.read_lattices(lattices_path)
    label_counts = get_biose_count(ner_pred_path, sent_id_shift=1)
    kept_edges = get_valid_edges(lattices, label_counts, non_o_only=False,
                                 keep_all_if_no_valid=keep_all_if_no_valid)

    # One (sent_id, token_id, ID1, ID2) tuple per lattice row, matching the
    # tuples in `kept_edges`.
    edge_keys = lattices[['sent_id', 'token_id', 'ID1', 'ID2']].apply(lambda row: tuple(row), axis=1)
    to_lattices(lattices[edge_keys.isin(kept_edges)], output_path)

In [69]:
# Prune the lattices using the predictions in `ner_multi_path`; the result is
# written next to the input with a `_pruned` suffix.
pruned_lat_path = lattices_path.replace('.lattices', '_pruned.lattices')
prune_lattices(lattices_path, ner_multi_path, pruned_lat_path)

In [71]:
# Same pruning, driven by a second prediction file; output gets `_pruned_ft`.
pruned_lat_path_ft = lattices_path.replace('.lattices', '_pruned_ft.lattices')
ner_multi_path_ft = '../NCRFpp/data/sinai/decode_output/HZF_sentences.multitok.char_cnn.ft_tok.51_seed.bmes'
prune_lattices(lattices_path, ner_multi_path_ft, pruned_lat_path_ft)

## Run YAP

In [55]:
# Output paths for YAP's `joint` command, derived from the pruned lattice path.
seg_out, map_out, conll_out = [pruned_lat_path.replace('.lattices', suf)
                               for suf in ['.seg', '.map', '.conll']]
# The existence guard is disabled (`if True`), so YAP is re-run every time.
# stdout and stderr are discarded, so a failure here is silent.
if True:#not os.path.exists(map_out):
    !{yap_path} joint -in {pruned_lat_path} -os {seg_out} -om {map_out} -oc {conll_out} > /dev/null 2>&1


In [72]:
# Output paths for YAP's `joint` command on the `_ft` pruned lattices.
seg_out_ft, map_out_ft, conll_out_ft = [pruned_lat_path_ft.replace('.lattices', suf)
                               for suf in ['.seg', '.map', '.conll']]
# The existence guard is disabled (`if True`), so YAP is re-run every time.
# stdout and stderr are discarded, so a failure here is silent.
if True:#not os.path.exists(map_out_ft):
    !{yap_path} joint -in {pruned_lat_path_ft} -os {seg_out_ft} -om {map_out_ft} -oc {conll_out_ft} > /dev/null 2>&1


## Evaluate Segmentation

In [58]:
# Load YAP's output for the pruned lattices (CoNLL + mapping files) together
# with the token file, and preview the first rows.
yap_out = bclm.read_yap_output(treebank_set=None, tokens_path=tokens_path, 
                                     dep_path=conll_out,
                                     map_path=map_out)
yap_out.head()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,misc,sent_id,token_id,token_str
0,1,ה,ה,DEF,DEF,,2,def,_,_,1,1,התתר
1,2,תתר,תתר,NN,NN,gen=M|num=S,14,subj,_,_,1,1,התתר
2,3,אשר,אשר,CC,CC,,2,rcmod,_,_,1,2,אשר
3,4,היה,היה,COP,COP,gen=M|num=S|per=3,7,conj,_,_,1,3,היה
4,5,ב,ב,PREPOSITION,PREPOSITION,,4,prepmod,_,_,1,4,במעקקא


In [73]:
# Load YAP's output for the `_ft` pruned lattices and preview the first rows.
yap_out_ft = bclm.read_yap_output(treebank_set=None, tokens_path=tokens_path, 
                                     dep_path=conll_out_ft,
                                     map_path=map_out_ft)
yap_out_ft.head()

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,head,deprel,deps,misc,sent_id,token_id,token_str
0,1,התתר,התתר,NN,NN,gen=M|num=S,12,subj,_,_,1,1,התתר
1,2,אשר,אשר,CC,CC,,1,rcmod,_,_,1,2,אשר
2,3,היה,היה,COP,COP,gen=M|num=S|per=3,2,relcomp,_,_,1,3,היה
3,4,במעקקא,במעקקא,NNP,NNP,gen=F|gen=M|num=S,5,ROOT,_,_,1,4,במעקקא
4,5,ו,ו,CONJ,CONJ,,3,ROOT,_,_,1,5,והשתטח


## Align Multitok

In [59]:
def soft_merge_bio_labels(multitok_sents, tokmorph_sents, verbose=False):
    """Pair each token's '^'-separated forms with its '^'-separated labels.

    Both arguments are mappings (anything with `.items()`, e.g. a pandas
    Series or a dict) whose values are sentences; they are walked in parallel.
    `multitok_sents` sentences are lists of (form, label_string) and
    `tokmorph_sents` sentences are lists of (token_id, token_str, forms_string).

    Labels are aligned to forms from the right:
      1. equal counts: paired one to one;
      2. more forms than labels: the leading extra forms get 'O';
      3. more labels than forms: the leading extra labels are dropped.
    With `verbose`, cases 2 and 3 are printed as (case, pairs).

    Returns a list of sentences, each a flat list of (form, label) pairs.
    """
    new_sents = []
    # `.items()` instead of `.iteritems()`: the latter was removed from pandas
    # in 2.0; `.items()` works on every pandas version and on plain dicts.
    for (_, mt_sent), (_, mor_sent) in zip(multitok_sents.items(), tokmorph_sents.items()):
        new_sent = []
        for (_, bio), (_, _, forms) in zip(mt_sent, mor_sent):
            forms = forms.split('^')
            bio = bio.split('^')

            n_paired = min(len(forms), len(bio))
            n_padded = len(forms) - n_paired
            # Leading unmatched forms get 'O'; the rest pair with the last
            # `n_paired` labels.
            pairs = ([(form, 'O') for form in forms[:n_padded]]
                     + list(zip(forms[n_padded:], bio[len(bio) - n_paired:])))

            if len(forms) == len(bio):
                case = 1
            elif len(forms) > len(bio):
                case = 2
            else:
                case = 3
            if verbose and case != 1:
                print((case, pairs))
            new_sent.extend(pairs)
        new_sents.append(new_sent)
    return new_sents

In [60]:
def align_multitok(ner_pred_path, tokens_path, conll_path, map_path, output_path):
    """Project token-level predictions onto YAP's forms and write them out.

    Reads the predictions with `nem.read_file_sents`, reads YAP's output
    (`conll_path`, `map_path`, `tokens_path`) with `bclm.read_yap_output`,
    reduces it to one row per token with `bclm.get_token_df`, aligns labels to
    forms with `soft_merge_bio_labels`, and writes "form label" lines with a
    blank line after every sentence to `output_path`.
    """
    x = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)
    prun_yo = bclm.read_yap_output(treebank_set=None, tokens_path=tokens_path, dep_path=conll_path, map_path=map_path)
    prun_yo = bclm.get_token_df(prun_yo, fields=['form'])
    prun_sents = bclm.get_sentences_list(prun_yo, fields=['token_id', 'token_str', 'form'])
    new_sents = soft_merge_bio_labels(x, prun_sents, verbose=False)

    # Explicit UTF-8, like the other writers in this notebook: the forms are
    # Hebrew and must not depend on the platform's default encoding.
    with open(output_path, 'w', encoding='utf8') as of:
        for sent in new_sents:
            for form, bio in sent:
                of.write(form+' '+bio+'\n')
            of.write('\n')


In [62]:
# Write the aligned labels next to the pruned lattices (.bmes).
pruned_ner_path=pruned_lat_path.replace('.lattices', '.bmes')
# The existence guard is disabled (`if True`), so the file is always rewritten.
if True: #not os.path.exists(pruned_ner_path):
    align_multitok(ner_multi_path, 
                   tokens_path, 
                   conll_out,
                   map_out,
                   pruned_ner_path
                  )

In [74]:
# Write the aligned labels next to the `_ft` pruned lattices (.bmes).
pruned_ner_path_ft=pruned_lat_path_ft.replace('.lattices', '.bmes')
# The existence guard is disabled (`if True`), so the file is always rewritten.
if True: #not os.path.exists(pruned_ner_path_ft):
    align_multitok(ner_multi_path_ft, 
                   tokens_path, 
                   conll_out_ft,
                   map_out_ft,
                   pruned_ner_path_ft
                  )

In [76]:
def biose_to_o(in_path, out_path):
    """Copy a two-column tagged file, replacing every tag with 'O'.

    Each non-blank line of `in_path` must contain exactly two
    whitespace-separated fields (word, tag); it is written to `out_path` as
    "word O". Blank or whitespace-only lines are written as an empty line, so
    sentence boundaries are preserved. Both files are handled as UTF-8.

    Raises ValueError if a non-blank line does not have exactly two fields.
    """
    # Both handles are closed on exit; the input is opened first so a missing
    # input does not leave an empty output file behind.
    with open(in_path, 'r', encoding='utf8') as in_file, \
         open(out_path, 'w', encoding='utf8') as of:
        for line in in_file:
            stripped = line.strip()
            if not stripped:
                # Sentence separator (also covers whitespace-only lines, which
                # previously crashed the two-field unpacking below).
                of.write('\n')
                continue
            word, _ = stripped.split()
            of.write(word+' '+'O'+'\n')
            
    

# Produce an all-'O' copy of the aligned file.
# NOTE(review): this rebinds `output_path`, which an earlier cell also set.
output_path = pruned_ner_path.replace('.bmes', '_dummy_o.bmes')
if True: #not os.path.exists(output_path):
    biose_to_o(pruned_ner_path, output_path)


In [75]:
# Produce an all-'O' copy of the `_ft` aligned file.
output_path_ft = pruned_ner_path_ft.replace('.bmes', '_dummy_o.bmes')
if True: #not os.path.exists(output_path_ft):
    biose_to_o(pruned_ner_path_ft, output_path_ft)


## <-- NOW RUN PREDICT ON PRUNED

In [67]:
# Convert the JSON-lines predictions to .bmes; skipped when the .bmes file
# already exists.
# NOTE(review): rebinds `json_path` and `output_path` from earlier cells.
json_path = 'output/predict_sinai/morph_pruned_heBERT_53460.json'
output_path = json_path.replace('.json', '.bmes')
if not os.path.exists(output_path):
    jsonl_to_biose(json_path, output_path)



101


## SINGLE + MULTI

In [35]:
# Evaluate every model folder under output/sinai/<trans>/<variant>_<seed> on
# the test and dev sets. The test and dev passes used to be two copy-pasted
# sections; they are now one loop driven by the set name, with rows appended
# in the same order as before (test first, then dev).
res = []
for trans in os.scandir('output/sinai'):
    trans_name = trans.name
    print(trans_name)
    for folder in os.scandir(trans):
        if '.ipynb' in folder.name:
            continue

        variant, seed = folder.name.split('_')

        for ds in ('test', 'dev'):
            token_gold = '../NER/data/for_ncrf/token_gold_{}_fix.bmes'.format(ds)
            morph_gold = '../NER/data/for_ncrf/morph_gold_{}.bmes'.format(ds)

            # (eval_unit, gold file, prediction file name, prediction label)
            runs = []
            if 'single' in folder.name:
                runs.append(('token', token_gold, 'token_gold_{}_fix.bmes'.format(ds), 'tokens'))
            if 'multi' in folder.name:
                runs.append(('token', token_gold, 'token_gold_{}_dummy_o.bmes'.format(ds), 'tokens'))
            if 'morph' in folder.name:
                runs.append(('morph', morph_gold, 'morph_gold_{}.bmes'.format(ds), 'gold'))
                runs.append(('morph', morph_gold, 'morph_yap_{}.bmes'.format(ds), 'yap'))
                runs.append(('morph', morph_gold, 'morph_pruned_{}.bmes'.format(ds), 'hybrid'))

            for eval_unit, gold_path, pred_name, prediction in runs:
                p,r,f = nem.evaluate_files(gold_path,
                                           os.path.join(folder.path, pred_name), str_join_char='')
                res.append((ds, eval_unit, variant, prediction, '-', trans_name, seed, p, r, f))

ne_df = pd.DataFrame(res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name', 'seed', 'p', 'r', 'f'])

# Mean precision/recall/F over model folders and seeds.
ne_df.groupby(['set', 'eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

52k
.ipynb_checkpoints
2k
8k
32k
4k
16k
64k
128k
heBERT
unichar_improved_52k
unichar_improved_with_hash_52k
distilled_52k_temp


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,p,r,f
set,eval_unit,variant,prediction,align,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dev,morph,morph,gold,-,0.795912,0.752238,0.773388
dev,morph,morph,hybrid,-,0.769143,0.726319,0.747048
dev,morph,morph,yap,-,0.712882,0.677989,0.694896
dev,token,multi,tokens,-,0.789026,0.708016,0.746213
dev,token,single,tokens,-,0.787436,0.740414,0.763101
test,morph,morph,gold,-,0.736141,0.764056,0.74977
test,morph,morph,hybrid,-,0.706601,0.72373,0.715005
test,morph,morph,yap,-,0.633964,0.644725,0.639237
test,token,multi,tokens,-,0.748274,0.718294,0.732886
test,token,single,tokens,-,0.71997,0.738215,0.728903


## Add Alignments

### Token Level Eval

In [36]:
import re

o_re = re.compile('^O+$') 
s_re = re.compile('^O*SO*$|^O*BI*EO*$')
b_re = re.compile('^O*BI*$')
i_re = re.compile('^I+$')
e_re = re.compile('^I*EO*$')
def get_fixed_for_valid_biose(bio_seq):
    """Collapse a string of per-part position letters into a single letter.

    `bio_seq` is matched against the patterns above in the order O, S, B, I,
    E and the letter of the first matching pattern is returned. A complete
    mention inside the string (B...E) collapses to 'S'.

    Raises ValueError when no pattern matches.
    """
    ordered_patterns = ((o_re, 'O'), (s_re, 'S'), (b_re, 'B'), (i_re, 'I'), (e_re, 'E'))
    for pattern, label in ordered_patterns:
        if pattern.match(bio_seq):
            return label
    raise ValueError
    

def get_fixed_for_invalid_biose(parts):
    """Pick a single position letter for a sequence by membership alone.

    Priority: 'S' if the sequence contains 'S' or both 'B' and 'E'; otherwise
    'E', then 'B', then 'I' if present; 'O' if none of them occurs.
    """
    has_b = 'B' in parts
    has_e = 'E' in parts
    if 'S' in parts or (has_b and has_e):
        return 'S'
    if has_e:
        return 'E'
    if has_b:
        return 'B'
    return 'I' if 'I' in parts else 'O'

# Shapes of per-token letter strings treated as valid: a mention start
# (O*BI*), a whole mention (O*BI*EO* or O*SO*), a mention interior (I+) or a
# mention end (I*EO*).
# NOTE(review): an all-'O' string does not match, so validate_biose_sequence
# reports all-O tokens as not valid and routes them through the invalid-
# sequence fallback (which still yields 'O') — confirm this is intended.
valid_bio_re = re.compile('^O*BI*$|^O*BI*EO*$|^I+$|^I*EO*$|^O*SO*$')

from functools import lru_cache


def validate_biose_sequence(full_bio_seq):
    """Validate a token's label sequence and reduce it to one label.

    `full_bio_seq` is an iterable of labels, each either 'O' or
    '<letter>-<type>'. The letters are concatenated and matched against
    `valid_bio_re`; a matching sequence is reduced with
    `get_fixed_for_valid_biose`, any other with `get_fixed_for_invalid_biose`.
    A non-'O' result gets the first non-O label's type appended.

    Returns (letters_are_valid, at_most_one_type, fixed_label).
    """
    pairs = [('O', None) if tag == 'O' else tag.split('-') for tag in full_bio_seq]
    letters, types = zip(*pairs)
    letters = ''.join(letters)
    types = [t for t in types if t is not None]

    is_valid = valid_bio_re.match(letters) is not None
    if is_valid:
        fixed_bio = get_fixed_for_valid_biose(letters)
    else:
        # Rough position letter; the type is taken from the first non-O label.
        fixed_bio = get_fixed_for_invalid_biose(letters)
    if fixed_bio != 'O':
        fixed_bio += '-' + types[0]

    return is_valid, len(set(types)) <= 1, fixed_bio


@lru_cache(1000)
def get_fixed_bio_sequence(full_bio_seq):
    """Return only the reduced label from `validate_biose_sequence`.

    Results are memoised (up to 1000 entries), so `full_bio_seq` must be
    hashable, e.g. a tuple of labels.
    """
    _, _, fixed_bio = validate_biose_sequence(full_bio_seq)
    return fixed_bio

In [37]:
# Sentence ids excluded from the treebank before splitting into dev/test.
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
spdf = bclm.read_dataframe('spmrl')
spdf = spdf[(~spdf.sent_id.isin(dropped))]
dev_gold = spdf[spdf.set=='dev']
# Take an explicit copy before adding/overwriting a column: assigning into a
# filtered slice triggers pandas' SettingWithCopyWarning and is not guaranteed
# to behave consistently.
test_gold = spdf[spdf.set=='test'].copy()
# Renumber test sentence ids densely (1, 2, 3, ...) in their existing order.
test_gold['sent_id'] = test_gold.sent_id.rank(method='dense').astype(int)
dev_yap = bclm.read_yap_output(treebank_set='dev')
test_yap = bclm.read_yap_output(treebank_set='test')
dev_gold_sents = bclm.get_sentences_list(dev_gold, fields=['token_id', 'token_str'])
test_gold_sents = bclm.get_sentences_list(test_gold, fields=['token_id', 'token_str'])
dev_yap_sents = bclm.get_sentences_list(dev_yap, fields=['token_id', 'token_str'])
test_yap_sents = bclm.get_sentences_list(test_yap, fields=['token_id', 'token_str'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [38]:
# Token-level gold frames; the gold label column is renamed to 'fixed_bio' so
# gold and prediction frames share the same column name.
dev_gold_tok = bclm.get_token_df(dev_gold, biose=['biose_layer0']).rename(
    columns={'biose_layer0': 'fixed_bio'})
test_gold_tok = (
    bclm.get_token_df(test_gold, biose=['biose_layer0'])
    .rename(columns={'biose_layer0': 'fixed_bio'})
    # Dense renumbering of the test sentence ids.
    .assign(sent_id=lambda tok_df: tok_df.sent_id.rank(method='dense').astype(int))
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [39]:
def get_fixed_tok(path, orig_sents=dev_yap_sents):
    """Lift labels from the file at `path` to one label per token.

    The sentences read from `path` (lists of (form, label)) are paired
    positionally with `orig_sents` (lists of (token_id, token_str)), which
    supplies the token each form belongs to. The rows are grouped per token
    with `bclm.get_token_df` and each token's '^'-separated label string is
    reduced to a single label with `get_fixed_bio_sequence`.

    Note: the default for `orig_sents` is bound when this function is defined.

    Returns the token DataFrame with an added 'fixed_bio' column.
    """
    ner_sents = nem.read_file_sents(path, fix_multi_tag=False)
    rows = []
    # `.items()` instead of `.iteritems()`: the latter was removed from pandas
    # in 2.0; `.items()` yields the same pairs on every version.
    for (_, ner_sent), (sent_id, orig_sent) in zip(ner_sents.items(), orig_sents.items()):
        for (form, bio), (token_id, token_str) in zip(ner_sent, orig_sent):
            rows.append((sent_id, token_id, token_str, form, bio))
    morph_df = pd.DataFrame(rows, columns=['sent_id', 'token_id', 'token_str', 'form', 'bio'])
    new_toks = bclm.get_token_df(morph_df, fields=['bio'])
    # The sequence is passed as a tuple because get_fixed_bio_sequence is
    # memoised and needs a hashable argument.
    new_toks['fixed_bio'] = new_toks.bio.apply(lambda seq: get_fixed_bio_sequence(tuple(seq.split('^'))))
    return new_toks


In [40]:
def sents_from_df(df, sent_id_col='sent_id', 
                  group_cols=['token_str'], 
                  val_cols=['fixed_bio']):
    """Return `bclm.get_sentences_list(df)` over `group_cols + val_cols`.

    `sent_id_col` is accepted but not used by this function.
    """
    return bclm.get_sentences_list(df, fields=group_cols+val_cols)

def evaluate_dataframes(gold_df, pred_df, fix_multi_tag_pred=True, truncate=None, ignore_cat=False, str_join_char=' '):
    """Score predicted against gold mentions, both given as token DataFrames.

    Each frame is turned into sentences with `sents_from_df`, then into
    mentions with `nem.sents_to_mentions` (forwarding `truncate`, `ignore_cat`
    and `str_join_char`); the result of `nem.evaluate_mentions` is returned.
    `fix_multi_tag_pred` is accepted but not used by this function.
    """
    mention_kwargs = dict(truncate=truncate, ignore_cat=ignore_cat, str_join_char=str_join_char)
    sents_pair = [sents_from_df(frame) for frame in (gold_df, pred_df)]
    gold_mentions, pred_mentions = [nem.sents_to_mentions(sents, **mention_kwargs)
                                    for sents in sents_pair]
    return nem.evaluate_mentions(gold_mentions, pred_mentions, verbose=False)

#### Hybrid


In [41]:
# Token files for the dev and test sets, keyed by set name.
out_folder = '../NER/data/tokens_for_ncrf'
dev_out, test_out = [os.path.join(out_folder, name)
                     for name in ('dev_tokens.txt', 'test_tokens.txt')]
token_paths = dict(dev=dev_out, test=test_out)

In [42]:
@lru_cache(512)
def get_prun_yo(ds, dep_path, map_path):
    """Read YAP output for set `ds` ('dev' or 'test'), memoised per argument triple.

    The token file is looked up in `token_paths[ds]`; `dep_path` and
    `map_path` are forwarded to `bclm.read_yap_output`. Repeated calls with
    the same arguments return the same cached object.
    """
    read_kwargs = dict(treebank_set=None,
                       tokens_path=token_paths[ds],
                       dep_path=dep_path,
                       map_path=map_path)
    return bclm.read_yap_output(**read_kwargs)

#### Run on all pruned

In [43]:
@lru_cache(512)
def get_sent_list(ds, dp, mp):
    """Return the (token_id, token_str) sentence list for a YAP output.

    `ds`, `dp` and `mp` are passed to `get_prun_yo` as set name, dependency
    path and map path. Results are memoised per argument triple.
    """
    yap_df = get_prun_yo(ds, dp, mp)
    sent_fields = ['token_id', 'token_str']
    return bclm.get_sentences_list(yap_df, fields=sent_fields)

In [44]:
# Token-level evaluation of the "hybrid" (pruned) morpheme predictions.
# The dev and test passes used to be two copy-pasted sections; they are now a
# single loop over (set name, gold token frame), in the same order as before.
align_tok_res = []
for trans in os.scandir('output/sinai'): 
    trans_name = trans.name
    print(trans_name)
    for folder in os.scandir(trans):
        if 'morph' not in folder.name or '.ipynb' in folder.name:
            continue

        variant, seed = folder.name.split('_')
        # The YAP outputs live in the sibling 'multi_' folder.
        multi_folder = folder.path.replace('morph_', 'multi_')

        for ds, gold_tok in (('dev', dev_gold_tok), ('test', test_gold_tok)):
            file = os.path.join(folder.path, 'morph_pruned_{}.bmes'.format(ds))
            dep_path = os.path.join(multi_folder, '{}_pruned.conll'.format(ds))
            map_path = os.path.join(multi_folder, '{}_pruned.map'.format(ds))
            out_path = os.path.join(folder.path, 'morph_pruned_{}_align_tokens.bmes'.format(ds))

            prun_sents = get_sent_list(ds, dep_path, map_path)
            new_toks = get_fixed_tok(file, orig_sents=prun_sents)

            # Always rewrite the aligned file. Explicit UTF-8 (the tokens are
            # Hebrew), matching the other writers in this notebook.
            new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
            with open(out_path, 'w', encoding='utf8') as of:
                # `.items()`: `.iteritems()` was removed in pandas 2.0.
                for i, sent in new_sents.items():
                    for tok, bio in sent:
                        of.write(tok+' '+bio+'\n')
                    of.write('\n')

            p, r, f = evaluate_dataframes(gold_tok, new_toks, str_join_char='')

            align_tok_res.append((ds, 'token', 'morph', 'hybrid', 'tokens', trans_name, seed, p, r, f))


52k
.ipynb_checkpoints
2k
8k
32k
4k
16k
64k
128k
heBERT
unichar_improved_52k
unichar_improved_with_hash_52k
distilled_52k_temp


In [45]:
# Collect the hybrid results and show the mean F-score per set and model.
at_df = pd.DataFrame(align_tok_res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name', 'seed', 'p', 'r', 'f'])

at_df.groupby(['set', 'trans_name', 'eval_unit','variant', 'prediction', 'align']).f.mean()

set   trans_name                      eval_unit  variant  prediction  align 
dev   128k                            token      morph    hybrid      tokens    0.792964
      16k                             token      morph    hybrid      tokens    0.758491
      2k                              token      morph    hybrid      tokens    0.698021
      32k                             token      morph    hybrid      tokens    0.757407
      4k                              token      morph    hybrid      tokens    0.727420
      52k                             token      morph    hybrid      tokens    0.743293
      64k                             token      morph    hybrid      tokens    0.786625
      8k                              token      morph    hybrid      tokens    0.733904
      distilled_52k_temp              token      morph    hybrid      tokens    0.768593
      heBERT                          token      morph    hybrid      tokens    0.816921
      unichar_improved_52k       

#### Run all gold and YAP

In [46]:
# Token-level evaluation of the morpheme predictions made on gold and on YAP
# segmentation. The four copy-pasted sections (dev/test x gold/yap) are now a
# single loop over the configurations below, in the same order as before.
# Each entry: (set name, prediction source, sentences supplying token ids,
#              gold token frame to score against)
eval_configs = [
    ('dev', 'gold', dev_gold_sents, dev_gold_tok),
    ('dev', 'yap', dev_yap_sents, dev_gold_tok),
    ('test', 'gold', test_gold_sents, test_gold_tok),
    ('test', 'yap', test_yap_sents, test_gold_tok),
]

align_tok_res_yg = []
for trans in os.scandir('output/sinai'):
    trans_name = trans.name
    print(trans_name)
    for folder in os.scandir(trans):
        if 'morph' not in folder.name or '.ipynb' in folder.name:
            continue

        variant, seed = folder.name.split('_')

        for ds, pred_src, orig_sents, gold_tok in eval_configs:
            file = os.path.join(folder.path, 'morph_{}_{}.bmes'.format(pred_src, ds))
            out_path = os.path.join(folder.path, 'morph_{}_{}_align_tokens.bmes'.format(pred_src, ds))

            new_toks = get_fixed_tok(file, orig_sents=orig_sents)

            # Always rewrite the aligned file. Explicit UTF-8 (the tokens are
            # Hebrew), matching the other writers in this notebook.
            new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
            with open(out_path, 'w', encoding='utf8') as of:
                # `.items()`: `.iteritems()` was removed in pandas 2.0.
                for i, sent in new_sents.items():
                    for tok, bio in sent:
                        of.write(tok+' '+bio+'\n')
                    of.write('\n')

            p, r, f = evaluate_dataframes(gold_tok, new_toks, str_join_char='')

            align_tok_res_yg.append((ds, 'token', 'morph', pred_src, 'tokens', trans_name, seed, p, r, f))

52k
.ipynb_checkpoints
2k
8k
32k
4k
16k
64k
128k
heBERT
unichar_improved_52k
unichar_improved_with_hash_52k
distilled_52k_temp


In [47]:
# Gather the records accumulated in align_tok_res and align_tok_res_yg into one frame.
at_df = pd.DataFrame(
    [*align_tok_res, *align_tok_res_yg],
    columns=['set', 'eval_unit', 'variant', 'prediction', 'align',
             'trans_name', 'seed', 'p', 'r', 'f'],
)

# `seed` is not a grouping key, so this averages p / r / f over seeds.
(at_df
 .groupby(['set', 'trans_name', 'eval_unit', 'variant', 'prediction', 'align'])
 [['p', 'r', 'f']]
 .mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,p,r,f
set,trans_name,eval_unit,variant,prediction,align,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
dev,128k,token,morph,gold,tokens,0.830539,0.779559,0.804221
dev,128k,token,morph,hybrid,tokens,0.820187,0.767535,0.792964
dev,128k,token,morph,yap,tokens,0.775718,0.734269,0.754375
dev,16k,token,morph,gold,tokens,0.810019,0.749900,0.778781
dev,16k,token,morph,hybrid,tokens,0.788561,0.730661,0.758491
...,...,...,...,...,...,...,...,...
test,unichar_improved_52k,token,morph,hybrid,tokens,0.736913,0.767597,0.751924
test,unichar_improved_52k,token,morph,yap,tokens,0.719576,0.737983,0.728650
test,unichar_improved_with_hash_52k,token,morph,gold,tokens,0.745432,0.787768,0.765928
test,unichar_improved_with_hash_52k,token,morph,hybrid,tokens,0.730115,0.759013,0.744231


### Morpheme Level Eval

#### Token Multi

#### Hybrid

In [48]:
# Morpheme-level scores for the multi-token runs, read from each run's
# 'morph_pruned_<set>.bmes' file and recorded with align='hybrid'.
align_morph_res_hyb = []
for trans in os.scandir('output/sinai'):
    trans_name = trans.name
    print(trans_name)
    for folder in os.scandir(trans):
        if os.path.isdir(folder) and 'multi' in folder.name and '.ipynb_checkpoints' not in folder.name:
            # Take the seed from this run's own folder name. Previously `seed` was
            # never assigned in this cell, so every record reused whatever value an
            # earlier cell had left behind in the kernel.
            # NOTE(review): assumes run folders are named '<variant>_<seed>' -- confirm.
            seed = folder.name.rsplit('_', 1)[-1]
            for set_name in ('dev', 'test'):
                pruned_ner_path = os.path.join(folder.path, 'morph_pruned_{}.bmes'.format(set_name))

                p, r, f = nem.evaluate_files(decode_sets['multitok'][set_name], pruned_ner_path, str_join_char='')
                align_morph_res_hyb.append((set_name, 'morph', 'multi', 'tokens', 'hybrid', trans_name, seed, p, r, f))

52k
.ipynb_checkpoints
2k
8k
32k
4k
16k
64k
128k
heBERT
unichar_improved_52k
unichar_improved_with_hash_52k
distilled_52k_temp


In [49]:
# Rebuild the frame, now including the hybrid morpheme-level records as well.
at_df = pd.DataFrame(
    [*align_tok_res, *align_tok_res_yg, *align_morph_res_hyb],
    columns=['set', 'eval_unit', 'variant', 'prediction', 'align',
             'trans_name', 'seed', 'p', 'r', 'f'],
)

# `seed` is not a grouping key, so this averages p / r / f over seeds.
(at_df
 .groupby(['set', 'trans_name', 'eval_unit', 'variant', 'prediction', 'align'])
 [['p', 'r', 'f']]
 .mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,p,r,f
set,trans_name,eval_unit,variant,prediction,align,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
dev,128k,morph,multi,tokens,hybrid,0.814822,0.742285,0.776844
dev,128k,token,morph,gold,tokens,0.830539,0.779559,0.804221
dev,128k,token,morph,hybrid,tokens,0.820187,0.767535,0.792964
dev,128k,token,morph,yap,tokens,0.775718,0.734269,0.754375
dev,16k,morph,multi,tokens,hybrid,0.796010,0.701403,0.745671
...,...,...,...,...,...,...,...,...
test,unichar_improved_52k,token,morph,yap,tokens,0.719576,0.737983,0.728650
test,unichar_improved_with_hash_52k,morph,multi,tokens,hybrid,0.685187,0.626609,0.654505
test,unichar_improved_with_hash_52k,token,morph,gold,tokens,0.745432,0.787768,0.765928
test,unichar_improved_with_hash_52k,token,morph,hybrid,tokens,0.730115,0.759013,0.744231


#### YAP + GOLD

In [50]:
def align_multitok_yg(ner_pred_path, prun_sents, output_path):
    """Merge predicted BIO labels onto `prun_sents` and write the result to disk.

    The predictions are read from `ner_pred_path` with
    `nem.read_file_sents(..., fix_multi_tag=False)` and combined with
    `prun_sents` by `soft_merge_bio_labels`. Every resulting sentence is
    written to `output_path` as one 'form bio' line per item, followed by an
    empty line.
    """
    pred_sents = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)
    merged_sents = soft_merge_bio_labels(pred_sents, prun_sents, verbose=False)

    with open(output_path, 'w') as out_file:
        for merged_sent in merged_sents:
            out_file.writelines(form + ' ' + bio + '\n' for form, bio in merged_sent)
            # Blank line marks the end of the sentence.
            out_file.write('\n')


In [51]:
# Gold morpheme-level data per treebank split, looked up by get_sents_for_mult when gold=True.
gold_morph = dict(dev=dev_gold, test=test_gold)
def get_sents_for_mult(treebank_set, gold=False, pred_set=None,
                       dep_path=None, map_path=None):
    """Return per-sentence (token_id, token_str, form) lists for one data source.

    The source is selected as follows:
      * `treebank_set` is None -> `get_prun_yo(pred_set, dep_path, map_path)`
      * `gold` is truthy       -> `gold_morph[treebank_set]`
      * otherwise              -> `bclm.read_yap_output(treebank_set=treebank_set)`

    The selected data is passed through `bclm.get_token_df(..., fields=['form'])`
    and the output of `bclm.get_sentences_list` over the fields
    'token_id', 'token_str' and 'form' is returned.
    """
    if treebank_set is None:
        morph_source = get_prun_yo(pred_set, dep_path, map_path)
    elif gold:
        morph_source = gold_morph[treebank_set]
    else:
        morph_source = bclm.read_yap_output(treebank_set=treebank_set)

    token_df = bclm.get_token_df(morph_source, fields=['form'])
    return bclm.get_sentences_list(token_df, fields=['token_id', 'token_str', 'form'])

# Sentence lists for both splits: first from the YAP output (gold=False), then from gold_morph.
dev_yap_sents_m, test_yap_sents_m = [get_sents_for_mult(split) for split in ('dev', 'test')]
dev_gold_sents_m, test_gold_sents_m = [get_sents_for_mult(split, gold=True) for split in ('dev', 'test')]

In [52]:
# Morpheme-level scores for the multi-token runs after merging their
# 'token_gold_<set>_dummy_o.bmes' predictions with the YAP sentence lists.
# The merged labels are written to 'morph_yap_<set>.bmes' inside the run folder
# and then scored against decode_sets['multitok'][<set>].
align_morph_res_yap = []
for trans in os.scandir('output/sinai'):
    trans_name = trans.name
    print(trans_name)
    for folder in os.scandir(trans):
        if os.path.isdir(folder) and 'multi' in folder.name and '.ipynb_checkpoints' not in folder.name:
            # Take the seed from this run's own folder name. Previously `seed` was
            # never assigned in this cell, so every record reused whatever value an
            # earlier cell had left behind in the kernel.
            # NOTE(review): assumes run folders are named '<variant>_<seed>' -- confirm.
            seed = folder.name.rsplit('_', 1)[-1]
            for set_name, yap_sents in (('dev', dev_yap_sents_m), ('test', test_yap_sents_m)):
                yap_ner_path = os.path.join(folder.path, 'morph_yap_{}.bmes'.format(set_name))

                align_multitok_yg(os.path.join(folder.path, 'token_gold_{}_dummy_o.bmes'.format(set_name)),
                                  yap_sents,
                                  yap_ner_path)
                p, r, f = nem.evaluate_files(decode_sets['multitok'][set_name], yap_ner_path, str_join_char='')
                align_morph_res_yap.append((set_name, 'morph', 'multi', 'tokens', 'yap', trans_name, seed, p, r, f))


52k
.ipynb_checkpoints
2k
8k
32k
4k
16k
64k
128k
heBERT
unichar_improved_52k
unichar_improved_with_hash_52k
distilled_52k_temp


In [53]:
# Morpheme-level scores for the multi-token runs after merging their
# 'token_gold_<set>_dummy_o.bmes' predictions with the gold sentence lists.
# The merged labels are written to 'morph_gold_<set>.bmes' inside the run folder
# and then scored against decode_sets['multitok'][<set>].
align_morph_res_gold = []
for trans in os.scandir('output/sinai'):
    trans_name = trans.name
    print(trans_name)
    for folder in os.scandir(trans):
        if os.path.isdir(folder) and 'multi' in folder.name and '.ipynb_checkpoints' not in folder.name:
            # Take the seed from this run's own folder name. Previously `seed` was
            # never assigned in this cell, so every record reused whatever value an
            # earlier cell had left behind in the kernel.
            # NOTE(review): assumes run folders are named '<variant>_<seed>' -- confirm.
            seed = folder.name.rsplit('_', 1)[-1]
            for set_name, gold_sents in (('dev', dev_gold_sents_m), ('test', test_gold_sents_m)):
                gold_ner_path = os.path.join(folder.path, 'morph_gold_{}.bmes'.format(set_name))

                align_multitok_yg(os.path.join(folder.path, 'token_gold_{}_dummy_o.bmes'.format(set_name)),
                                  gold_sents,
                                  gold_ner_path)
                p, r, f = nem.evaluate_files(decode_sets['multitok'][set_name], gold_ner_path, str_join_char='')
                align_morph_res_gold.append((set_name, 'morph', 'multi', 'tokens', 'gold', trans_name, seed, p, r, f))

52k
.ipynb_checkpoints
2k
8k
32k
4k
16k
64k
128k
heBERT
unichar_improved_52k
unichar_improved_with_hash_52k
distilled_52k_temp


## ALL SCORES

In [54]:
# One frame holding every alignment-based result list built above.
at_df = pd.DataFrame(
    [
        *align_tok_res,
        *align_tok_res_yg,
        *align_morph_res_hyb,
        *align_morph_res_yap,
        *align_morph_res_gold,
    ],
    columns=['set', 'eval_unit', 'variant', 'prediction', 'align',
             'trans_name', 'seed', 'p', 'r', 'f'],
)

In [55]:
# Stack the rows of at_df on top of the rows of ne_df.
all_df = pd.concat(objs=[at_df, ne_df])

In [60]:
# Mean F1 scaled by 100 per evaluation setting, one column per trans_name in a fixed order.
(all_df
 .groupby(['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name'])['f']
 .mean()
 .unstack()
 .mul(100)
 .loc[:, ['heBERT', '2k', '4k', '8k', '16k', '32k', '52k', '64k', '128k',
          'unichar_improved_52k', 'unichar_improved_with_hash_52k',
          'distilled_52k_temp']])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,trans_name,heBERT,2k,4k,8k,16k,32k,52k,64k,128k,unichar_improved_52k,unichar_improved_with_hash_52k,distilled_52k_temp
set,eval_unit,variant,prediction,align,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
dev,morph,morph,gold,-,81.83569,71.832022,74.332451,75.303345,77.836627,77.005951,76.029966,80.049914,79.38825,78.948506,78.060047,77.442353
dev,morph,morph,hybrid,-,80.452674,68.806994,72.245034,72.690626,74.975752,74.340257,73.421568,77.308677,77.640584,75.317802,73.67135,75.586166
dev,morph,morph,yap,-,73.208328,65.803361,67.441296,67.33867,69.286417,68.596986,67.277702,71.746003,70.618044,72.498128,70.164699,69.895083
dev,morph,multi,tokens,gold,80.180944,71.283581,73.183299,74.678721,75.792306,74.758317,72.184387,76.960361,78.723405,69.006655,67.246843,75.858374
dev,morph,multi,tokens,hybrid,79.195641,69.86458,71.960541,73.460755,74.567084,73.653733,71.356746,75.84978,77.684371,67.896153,66.549135,74.844583
dev,morph,multi,tokens,yap,75.129157,67.069537,68.681349,70.084404,71.517485,70.301198,67.857737,72.118344,73.51868,65.306475,63.556439,71.124813
dev,token,morph,gold,tokens,82.411615,71.945562,74.653507,75.467997,77.878077,77.872582,76.112238,80.58368,80.422059,79.356387,78.100574,78.095895
dev,token,morph,hybrid,tokens,81.692126,69.80214,72.742023,73.390441,75.849078,75.740698,74.329329,78.662476,79.296433,77.253904,75.053053,76.859293
dev,token,morph,yap,tokens,79.021235,68.559928,70.312438,70.193658,72.525007,72.762894,71.012144,75.138418,75.437486,74.869642,72.819262,73.87334
dev,token,multi,tokens,-,81.282045,71.21987,73.148684,74.694507,75.978079,75.769232,72.619302,77.691137,79.638059,69.309411,67.331539,76.773382


In [57]:
# Mean F1 (scaled by 100) with a 1.96 * std / sqrt(n) half-width per configuration,
# rendered as '$mean ± half-width$' strings.
ne_f_stats = (ne_df
              .groupby(['set', 'trans_name', 'eval_unit', 'variant', 'prediction', 'align'])
              .f.agg(['mean', 'std', 'count']))
# Scale and round mean/std exactly as before; the row count stays unscaled.
ne_f_pct = ne_f_stats[['mean', 'std']].mul(100).round(2)
# n is the actual number of rows in each group instead of a hard-coded 10, so
# the interval stays correct when a configuration has a different number of runs.
ne_f_ci = (1.96 * (ne_f_pct['std'] / np.sqrt(ne_f_stats['count']))).round(1)
ne_f_pct.assign(mean='$' + ne_f_pct['mean'].apply('{:,.2f}'.format).astype(str)
                + ' ± ' + ne_f_ci.astype(str) + '$')[['mean']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,mean
set,trans_name,eval_unit,variant,prediction,align,Unnamed: 6_level_1
dev,128k,morph,morph,gold,-,$79.39 ± 0.4$
dev,128k,morph,morph,hybrid,-,$77.64 ± 0.4$
dev,128k,morph,morph,yap,-,$70.62 ± 0.2$
dev,128k,token,multi,tokens,-,$79.64 ± 0.5$
dev,128k,token,single,tokens,-,$81.30 ± 0.2$
...,...,...,...,...,...,...
test,unichar_improved_with_hash_52k,morph,morph,gold,-,$76.13 ± 0.5$
test,unichar_improved_with_hash_52k,morph,morph,hybrid,-,$71.60 ± 0.7$
test,unichar_improved_with_hash_52k,morph,morph,yap,-,$64.91 ± 0.4$
test,unichar_improved_with_hash_52k,token,multi,tokens,-,$67.68 ± 0.7$


In [58]:
# Mean f_seg_pos per pred_set, one column per trans_name in a fixed order.
(seg_res_df
 .groupby(['pred_set', 'trans_name'])['f_seg_pos']
 .mean()
 .unstack()
 .loc[:, ['heBERT', '2k', '4k', '8k', '16k', '32k', '52k', '64k', '128k',
          'unichar_improved_52k', 'unichar_improved_with_hash_52k']])

trans_name,heBERT,2k,4k,8k,16k,32k,52k,64k,128k,unichar_improved_52k,unichar_improved_with_hash_52k
pred_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
dev,93.692111,92.755259,93.003025,93.076573,93.041299,93.306393,93.072591,93.350109,93.267056,92.265538,91.669155
test,91.851358,91.010369,91.439091,91.420828,91.24954,91.523147,91.12684,91.474458,91.650169,90.204324,89.918698


In [59]:
# Mean f_seg_only per pred_set, one column per trans_name in a fixed order.
(seg_res_df
 .groupby(['pred_set', 'trans_name'])['f_seg_only']
 .mean()
 .unstack()
 .loc[:, ['heBERT', '2k', '4k', '8k', '16k', '32k', '52k', '64k', '128k',
          'unichar_improved_52k', 'unichar_improved_with_hash_52k']])

trans_name,heBERT,2k,4k,8k,16k,32k,52k,64k,128k,unichar_improved_52k,unichar_improved_with_hash_52k
pred_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
dev,98.07505,97.266295,97.589115,97.567593,97.490896,97.765845,97.531311,97.734921,97.714928,96.759412,96.351061
test,97.963137,97.080259,97.530438,97.547616,97.386152,97.650438,97.201905,97.646012,97.786412,96.41433,96.112701
