# Re-evaluation
Perform two evaluations:
1. Strict morpheme evaluation
1. Token evaluation (morpheme labels are extended to the token level heuristically)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

In [5]:
import os

In [6]:
import sys
sys.path.append('/home/nlp/danb/NER')

import bclm
import ne_evaluate_mentions as nem

## Create BIOSE files


In [13]:
import json

def jsonl_to_biose(in_path, out_path, bioul_to_biose=True):
    sents = 0
    with open(out_path, 'w', encoding='utf8') as of:
        for line in open(in_path, 'r'):
            sent = json.loads(line)
            for word, tag in zip(sent['words'], sent['tags']):
                if bioul_to_biose:
                    tag = tag.replace('L-', 'E-').replace('U-', 'S-')
                of.write(word+' '+tag+'\n')
            of.write('\n')
            sents+=1
    print (sents)
    

In [14]:
import os

In [15]:
for trans in os.scandir('ud/output/predict_alephbert'):
    for folder in os.scandir(trans):
        if not '.ipynb' in folder.name:
            for file in os.scandir(folder):
                if '.json' in file.name and not '.ipynb' in file.name:
                    output_path = file.path.replace('.json', '.bmes')
                    if not os.path.exists(output_path):
                        print (folder)
                        jsonl_to_biose(file.path, output_path)



<DirEntry 'morph_54360'>
484
<DirEntry 'morph_54360'>
491
<DirEntry 'multi_54360'>
484
<DirEntry 'multi_54360'>
491
<DirEntry 'single_54360'>
484
<DirEntry 'single_54360'>
491
<DirEntry 'morph_44184'>
484
<DirEntry 'morph_44184'>
491
<DirEntry 'multi_44184'>
484
<DirEntry 'multi_44184'>
491
<DirEntry 'single_44184'>
484
<DirEntry 'single_44184'>
491
<DirEntry 'morph_20423'>
484
<DirEntry 'morph_20423'>
491
<DirEntry 'multi_20423'>
484
<DirEntry 'multi_20423'>
491
<DirEntry 'single_20423'>
484
<DirEntry 'single_20423'>
491
<DirEntry 'morph_80520'>
484
<DirEntry 'morph_80520'>
491
<DirEntry 'multi_80520'>
484
<DirEntry 'multi_80520'>
491
<DirEntry 'single_80520'>
484
<DirEntry 'single_80520'>
491
<DirEntry 'morph_27916'>
484
<DirEntry 'morph_27916'>
491
<DirEntry 'multi_27916'>
484
<DirEntry 'multi_27916'>
491
<DirEntry 'single_27916'>
484
<DirEntry 'single_27916'>
491


In [16]:
include_only = ['bert-basic-wordpiece-otw-52000-cp3',]
print(include_only)

['bert-basic-wordpiece-otw-52000-cp3']


## SINGLE + MULTI

In [17]:
res = []
for trans in os.scandir('ud/output/predict_alephbert'):
    trans_name = trans.name
    if trans.name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if '.ipynb' in folder.name:
            continue

        variant, seed = folder.name.split('_')

        if 'single' in folder.name:    
            p,r,f = nem.evaluate_files('../NER/data/ud_ner/token_gold_test_fix.bmes', 
                                       os.path.join(folder.path,'token_gold_test_fix.bmes'), str_join_char='')
            res.append(('test', 'token', variant, 'tokens', '-', trans_name, seed, p, r, f))


        if 'multi' in folder.name:
            p,r,f = nem.evaluate_files('../NER/data/ud_ner/token_gold_test_fix.bmes', 
                                       os.path.join(folder.path,'token_gold_test_dummy_o.bmes'), str_join_char='')
            res.append(('test', 'token', variant, 'tokens', '-', trans_name, seed, p, r, f))

        if 'morph' in folder.name:
            p,r,f = nem.evaluate_files('../NER/data/ud_ner/morph_gold_test.bmes', 
                                       os.path.join(folder.path,'morph_gold_test.bmes'), str_join_char='')
            res.append(('test', 'morph', variant, 'gold', '-', trans_name, seed, p, r, f))


        #dev

        if 'single' in folder.name:    
            p,r,f = nem.evaluate_files('../NER/data/ud_ner/token_gold_dev_fix.bmes', 
                                       os.path.join(folder.path,'token_gold_dev_fix.bmes'), str_join_char='')
            res.append(('dev', 'token', variant, 'tokens', '-', trans_name, seed, p, r, f))


        if 'multi' in folder.name:
            p,r,f = nem.evaluate_files('../NER/data/ud_ner/token_gold_dev_fix.bmes', 
                                       os.path.join(folder.path,'token_gold_dev_dummy_o.bmes'), str_join_char='')
            res.append(('dev', 'token', variant, 'tokens', '-', trans_name, seed, p, r, f))

        if 'morph' in folder.name:
            p,r,f = nem.evaluate_files('../NER/data/ud_ner/morph_gold_dev.bmes', 
                                       os.path.join(folder.path,'morph_gold_dev.bmes'), str_join_char='')
            res.append(('dev', 'morph', variant, 'gold', '-', trans_name, seed, p, r, f))
    
    

ne_df = pd.DataFrame(res, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name', 'seed', 'p', 'r', 'f'])

ne_df.groupby(['set', 'eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

bert-basic-wordpiece-otw-52000-cp3


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,p,r,f
set,eval_unit,variant,prediction,align,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dev,morph,morph,gold,-,0.817602,0.797137,0.8072
dev,token,multi,tokens,-,0.833291,0.793047,0.812664
dev,token,single,tokens,-,0.832301,0.815133,0.823587
test,morph,morph,gold,-,0.767949,0.833761,0.799462
test,token,multi,tokens,-,0.799619,0.813248,0.806368
test,token,single,tokens,-,0.784595,0.823932,0.803713


## Add Alignments

### Token Level Eval

In [18]:
import re

o_re = re.compile('^O+$') 
s_re = re.compile('^O*SO*$|^O*BI*EO*$')
b_re = re.compile('^O*BI*$')
i_re = re.compile('^I+$')
e_re = re.compile('^I*EO*$')
def get_fixed_for_valid_biose(bio_seq):
    if o_re.match(bio_seq):
        return 'O'
    if s_re.match(bio_seq):
        return 'S'
    if b_re.match(bio_seq):
        return 'B'
    if i_re.match(bio_seq):
        return 'I'
    if e_re.match(bio_seq):
        return 'E'
    raise ValueError
    

def get_fixed_for_invalid_biose(parts):
    bio = 'O'
    if 'S' in parts:
        bio = 'S'
    elif 'B' in parts and 'E' in parts:
        bio='S'
    elif 'E' in parts:
        bio = 'E'
    elif 'B' in parts:
        bio = 'B'
    elif 'I' in parts:
        bio = 'I'
    return bio

valid_bio_re = re.compile('^O*BI*$|^O*BI*EO*$|^I+$|^I*EO*$|^O*SO*$')

from functools import lru_cache


def validate_biose_sequence(full_bio_seq):
    #print(full_bio_seq)
    bio_seq, type_seq = zip(*[('O', None) if b=='O' else b.split('-') for b in full_bio_seq])
    bio_seq = ''.join(bio_seq)
    valid_bio = valid_bio_re.match(bio_seq)
    type_seq = list(filter(lambda x: x is not None, type_seq))
    type_seq_set = set(type_seq)

    if valid_bio:
        fixed_bio = get_fixed_for_valid_biose(bio_seq)
        if fixed_bio!='O':
            fixed_bio += '-' + type_seq[0]
            
    else:
        #take the first BIOSE tag which is not O:
        #fixed_bio = list(filter(lambda x: x!='O', full_bio_seq))[0]
        #rough BIOSE and first category:
        fixed_bio = get_fixed_for_invalid_biose(bio_seq)
        if fixed_bio!='O':
            fixed_bio += '-' + type_seq[0]
        
    return valid_bio is not None, len(type_seq_set)<=1, fixed_bio


@lru_cache(1000)
def get_fixed_bio_sequence(full_bio_seq):
    return validate_biose_sequence(full_bio_seq)[2]

In [19]:
import re 
sent_id_re = re.compile('# sent_id = (\d+)')

import bz2
def get_sent_ids_lat(path):
    sent_ids = []
    with bz2.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            if line[0]=='#':
                sid = sent_id_re.match(line)
                if sid:
                    sent_id =  int(sid.group(1))
                sent_ids.append(sent_id)
    return sent_ids

dev_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-dev.heblex.conllul.bz2')
train_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-train.heblex.conllul.bz2')
test_sent_ids = get_sent_ids_lat('../UL_Hebrew-HTB/he_htb-ud-test.heblex.conllul.bz2')

            

In [20]:
dropped = [5438, 5444, 5445, 5446, 5448, 5449, 5450, 5451, 5453, 5459]
uddf = bclm.read_dataframe('ud')
uddf = uddf[(~uddf.global_sent_id.isin(dropped))]

In [21]:
def get_set_from_sent_id(gsi):
    if gsi>=min(dev_sent_ids) and gsi<=max(dev_sent_ids):
        return 'dev'
    elif gsi>=min(train_sent_ids) and gsi<=max(train_sent_ids):
        return 'train'
    elif gsi>=min(test_sent_ids) and gsi<=max(test_sent_ids):
        return 'test'
    
uddf['ud_set'] = uddf.sent_id.apply(get_set_from_sent_id)

uddf.groupby(['ud_set']).sent_id.agg(['min', 'max'])

Unnamed: 0_level_0,min,max
ud_set,Unnamed: 1_level_1,Unnamed: 2_level_1
dev,1,484
test,5726,6216
train,485,5725


In [22]:
dev_gold = uddf[uddf.ud_set=='dev']
test_gold = uddf[uddf.ud_set=='test']
test_gold['sent_id'] = test_gold.sent_id.rank(method='dense').astype(int)
dev_yap = bclm.read_yap_output(treebank_set='dev')
test_yap = bclm.read_yap_output(treebank_set='test')
dev_gold_sents = bclm.get_sentences_list(dev_gold, fields=['token_id', 'token_str'])
test_gold_sents = bclm.get_sentences_list(test_gold, fields=['token_id', 'token_str'])
dev_yap_sents = bclm.get_sentences_list(dev_yap, fields=['token_id', 'token_str'])
test_yap_sents = bclm.get_sentences_list(test_yap, fields=['token_id', 'token_str'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
dev_gold_tok = (bclm.get_token_df(dev_gold, biose=['biose_layer0'])
                .rename(columns={'biose_layer0': 'fixed_bio'}))
test_gold_tok = (bclm.get_token_df(test_gold, biose=['biose_layer0'])
                .rename(columns={'biose_layer0': 'fixed_bio'}))
test_gold_tok['sent_id'] = test_gold_tok.sent_id.rank(method='dense').astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [24]:
def get_fixed_tok(path, orig_sents=dev_yap_sents):
    x = nem.read_file_sents(path, fix_multi_tag=False)
    new_sents = []
    for (i, ner_sent), (sent_id, yap_sent) in zip(x.iteritems(), orig_sents.iteritems()):
        for (form, bio), (token_id, token_str) in zip(ner_sent, yap_sent):
            new_sents.append((sent_id, token_id, token_str, form, bio))
    new_sents = pd.DataFrame(new_sents, columns=['sent_id', 'token_id', 'token_str', 'form', 'bio'])
    new_toks = bclm.get_token_df(new_sents, fields=['bio'])
    new_toks['fixed_bio'] = new_toks.bio.apply(lambda x: get_fixed_bio_sequence(tuple(x.split('^'))))
    return new_toks


In [25]:
def sents_from_df(df, sent_id_col='sent_id', 
                  group_cols=['token_str'], 
                  val_cols=['fixed_bio']):
    sents = bclm.get_sentences_list(df, fields=group_cols+val_cols)
    return sents

def evaluate_dataframes(gold_df, pred_df, fix_multi_tag_pred=True, truncate=None, ignore_cat=False, str_join_char=' '):
    gold_sents = sents_from_df(gold_df)
    pred_sents = sents_from_df(pred_df)
    gold_mentions = nem.sents_to_mentions(gold_sents, truncate=truncate, ignore_cat=ignore_cat, str_join_char=str_join_char)
    pred_mentions = nem.sents_to_mentions(pred_sents, truncate=truncate, ignore_cat=ignore_cat, str_join_char=str_join_char)
    return nem.evaluate_mentions(gold_mentions, pred_mentions, verbose=False)

#### Run on all pruned

In [26]:
@lru_cache(512)
def get_sent_list(ds, dp, mp):
    prun_yo = get_prun_yo(ds, dp, mp)
    return bclm.get_sentences_list(prun_yo, fields=['token_id', 'token_str'])

#### Run all gold and YAP

In [27]:
align_tok_res_yg = []
for trans in os.scandir('ud/output/predict_alephbert'):
    trans_name = trans.name
    if trans.name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if 'morph' in folder.name and not '.ipynb' in folder.name:
            ## dev 
            ## - gold
            variant, seed = folder.name.split('_')
            file = os.path.join(folder.path,'morph_gold_dev.bmes')
            out_path = os.path.join(folder.path, 'morph_gold_dev_align_tokens.bmes')

            new_toks = get_fixed_tok(file, orig_sents=dev_gold_sents)

            if True: #not os.path.exists(out_path):
                new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
                with open(out_path, 'w') as of:
                    for i, sent in new_sents.iteritems():
                        for tok, bio in sent:
                            of.write(tok+' '+bio+'\n')
                        of.write('\n')

            p, r, f = evaluate_dataframes(dev_gold_tok, new_toks, str_join_char='')

            align_tok_res_yg.append(('dev', 'token', 'morph', 'gold', 'tokens', trans_name, seed, p, r, f))


            ## test 
            ## - gold
            variant, seed = folder.name.split('_')
            file = os.path.join(folder.path,'morph_gold_test.bmes')
            out_path = os.path.join(folder.path, 'morph_gold_test_align_tokens.bmes')

            new_toks = get_fixed_tok(file, orig_sents=test_gold_sents)

            if True: #not os.path.exists(out_path):
                new_sents = bclm.get_sentences_list(new_toks, fields=['token_str', 'fixed_bio'])
                with open(out_path, 'w') as of:
                    for i, sent in new_sents.iteritems():
                        for tok, bio in sent:
                            of.write(tok+' '+bio+'\n')
                        of.write('\n')

            p, r, f = evaluate_dataframes(test_gold_tok, new_toks, str_join_char='')

            align_tok_res_yg.append(('test', 'token', 'morph', 'gold', 'tokens', trans_name, seed, p, r, f))


bert-basic-wordpiece-otw-52000-cp3


In [28]:
at_df = pd.DataFrame(align_tok_res_yg, columns=['set', 'eval_unit', 'variant', 'prediction', 'align', 'trans_name', 'seed', 'p', 'r', 'f'])

at_df.groupby(['set', 'trans_name', 'eval_unit','variant', 'prediction', 'align'])[['p', 'r', 'f']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,p,r,f
set,trans_name,eval_unit,variant,prediction,align,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
dev,bert-basic-wordpiece-otw-52000-cp3,token,morph,gold,tokens,0.824728,0.80409,0.814238
test,bert-basic-wordpiece-otw-52000-cp3,token,morph,gold,tokens,0.769358,0.834615,0.800617


### Morpheme Level Eval

#### Token Multi

#### YAP + GOLD

In [29]:
def soft_merge_bio_labels(multitok_sents, tokmorph_sents, verbose=False):
    new_sents = []
    for (i, mt_sent), (sent_id, mor_sent) in zip(multitok_sents.iteritems(), tokmorph_sents.iteritems()):
        new_sent = []
        for (form, bio), (token_id, token_str, forms) in zip(mt_sent, mor_sent):
            forms = forms.split('^')
            bio = bio.split('^')
            if len(forms) == len(bio):
                new_forms = (1, list(zip(forms,bio)))
            elif len(forms)>len(bio):
                dif = len(forms) - len(bio)
                new_forms = (2, list(zip(forms[:dif],['O']*dif)) + list(zip(forms[::-1], bio[::-1]))[::-1])
                if verbose:
                    print(new_forms)
            else:
                new_forms = (3, list(zip(forms[::-1], bio[::-1]))[::-1])
                if verbose:
                    print(new_forms)
            new_sent.extend(new_forms[1])
        new_sents.append(new_sent)
    return new_sents

In [30]:
decode_sets = {
    'token': {
        'dev': '../NER/data/ud_ner/morph_gold_dev.bmes',
        'test': '../NER/data/ud_ner/morph_gold_test.bmes',
    },
    'multitok': {
        'dev': '../NER/data/ud_ner/morph_gold_dev.bmes',
        'test': '../NER/data/ud_ner/morph_gold_test.bmes',
    }
}

In [31]:
def align_multitok_yg(ner_pred_path, prun_sents, output_path):
    x = nem.read_file_sents(ner_pred_path, fix_multi_tag=False)

    new_sents = soft_merge_bio_labels(x, prun_sents, verbose=False)

    with open(output_path, 'w') as of:
        for sent in new_sents:
            for form, bio in sent:
                of.write(form+' '+bio+'\n')
            of.write('\n')


In [32]:
gold_morph = {'dev': dev_gold, 'test': test_gold}
def get_sents_for_mult(treebank_set, gold=False, pred_set=None, 
                       dep_path=None, map_path=None):
    if treebank_set is None:
        prun_yo = get_prun_yo(pred_set, dep_path, map_path)
    else:
        if not gold:
            prun_yo = bclm.read_yap_output(treebank_set=treebank_set)
        else:
            prun_yo = gold_morph[treebank_set]
    prun_yo = bclm.get_token_df(prun_yo, fields=['form'])
    prun_sents = bclm.get_sentences_list(prun_yo, fields=['token_id', 'token_str', 'form'])
    return prun_sents

# dev_yap_sents_m = get_sents_for_mult('dev')
# test_yap_sents_m = get_sents_for_mult('test')
dev_gold_sents_m = get_sents_for_mult('dev', gold=True)
test_gold_sents_m = get_sents_for_mult('test', gold=True)

In [33]:
align_morph_res_gold = []
for trans in os.scandir('ud/output/predict_alephbert'):
    trans_name = trans.name
    if trans.name not in include_only:
        continue
    print(trans_name)
    for folder in os.scandir(trans):
        if os.path.isdir(folder) and 'multi' in folder.name and not '.ipynb_checkpoints' in folder.name:
            #dev
            gold_ner_path=os.path.join(folder.path, 'morph_gold_dev.bmes')

            align_multitok_yg(os.path.join(folder.path, 'token_gold_dev_dummy_o.bmes'), 
                               dev_gold_sents_m,
                               gold_ner_path
                              )
            p, r, f = nem.evaluate_files(decode_sets['multitok']['dev'], gold_ner_path, str_join_char='')
            align_morph_res_gold.append(('dev', 'morph', 'multi', 'tokens', 'gold', trans_name, seed, p, r, f))

            #test
            gold_ner_path=os.path.join(folder.path, 'morph_gold_test.bmes')

            align_multitok_yg(os.path.join(folder.path, 'token_gold_test_dummy_o.bmes'), 
                               test_gold_sents_m,
                               gold_ner_path
                              )
            p, r, f = nem.evaluate_files(decode_sets['multitok']['test'], gold_ner_path, str_join_char='')
            align_morph_res_gold.append(('test', 'morph', 'multi', 'tokens', 'gold', trans_name, seed, p, r, f))

bert-basic-wordpiece-otw-52000-cp3


## ALL SCORES

In [34]:
at_df = pd.DataFrame(align_tok_res_yg+align_morph_res_gold, columns=['set', 'eval_unit', 'variant', 
                                                                                                                           'prediction', 'align', 'trans_name', 
                                                                                                                           'seed', 'p', 'r', 'f'])

In [35]:
ALL_DF_PATH = 'ud/output/all_results_alephbert.csv'

In [36]:
all_df = pd.read_csv(ALL_DF_PATH)

In [37]:
all_df = pd.concat([all_df, at_df, ne_df])
all_df.groupby(['set', 'eval_unit','variant', 
                'prediction', 'align','trans_name']).size().agg([min,max])

min    5
max    5
dtype: int64

In [39]:
checkpoints = ['heBERT', 
#                '2k', '4k', '8k',
#                                              '16k', '32k', '52k', '64k',
#                                              '128k', 'unichar_improved_52k', 
#                                              'unichar_improved_with_hash_52k',
               'bert-distilled-wordpiece-oscar-2000',
                'bert-distilled-wordpiece-oscar-16000',
                'bert-distilled-wordpiece-oscar-52000',
                'bert-basic-wordpiece-otw-52000-cp3',
              ]

In [40]:
mean_scores = (all_df
               .groupby(['set', 'eval_unit','variant', 
                         'prediction', 'align','trans_name'])
               .f.mean().unstack().mul(100)[checkpoints]
              )
mean_scores

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,trans_name,heBERT,bert-distilled-wordpiece-oscar-2000,bert-distilled-wordpiece-oscar-16000,bert-distilled-wordpiece-oscar-52000,bert-basic-wordpiece-otw-52000-cp3
set,eval_unit,variant,prediction,align,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
dev,morph,morph,gold,-,81.247928,74.41867,78.209848,79.302769,80.719959
dev,morph,multi,tokens,gold,80.100053,74.279756,76.922571,80.941309,80.386946
dev,token,morph,gold,tokens,81.669913,74.632728,78.803933,79.933183,81.423755
dev,token,multi,tokens,-,81.012373,74.052065,77.097025,81.770413,81.266358
dev,token,single,tokens,-,83.009847,74.620953,78.509327,83.504978,82.358662
test,morph,morph,gold,-,81.133317,70.221607,77.655903,77.573463,79.94623
test,morph,multi,tokens,gold,79.751971,69.463298,74.504304,76.010495,79.84853
test,token,morph,gold,tokens,81.217558,70.262844,77.925595,78.027602,80.061666
test,token,multi,tokens,-,80.043647,69.515953,74.716462,76.96357,80.636783
test,token,single,tokens,-,79.515853,67.686947,76.241035,76.413808,80.371276


In [41]:
all_df.to_csv(ALL_DF_PATH, index=False)

In [42]:
mean_scores.reset_index().to_csv('ud/output/mean_results_alephbert.csv', index=False)
