In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

In [3]:
import pandas as pd
import configs
import numpy as np

In [4]:
import os
import json

## Create setup dicts
1. Word Unit+Labels: `morph, token, multitok`
1. Char Arch: `char_lstm, char_cnn, no_char`
1. Word Embedding: `ft_tok, ft_yap, ft_oov_tok, ft_oov_yap, w2v_tok, w2v_yap, no_word`

3 x 3 x 7 = **63 dicts**

In [7]:
# Folder that the dataset file names below are joined onto
data_folder = '../NER/data/for_ncrf'

# One entry per input flavour: unit/scheme metadata plus the train/dev/test file names
datasets = dict(
    morph=dict(
        _unit='morpheme',
        _scheme='bioes',
        train_dir='morph_gold_train.bmes',
        dev_dir='morph_gold_dev.bmes',
        test_dir='morph_gold_test.bmes',
    ),
    token=dict(
        _unit='token',
        _scheme='bioes',
        train_dir='token_gold_train_fix.bmes',
        dev_dir='token_gold_dev_fix.bmes',
        test_dir='token_gold_test_fix.bmes',
    ),
    multitok=dict(
        _unit='token',
        _scheme='concat_bioes',
        seg=False,
        train_dir='token_gold_train_concat.bmes',
        dev_dir='token_gold_dev_concat.bmes',
        test_dir='token_gold_test_concat.bmes',
    ),
)

## Create PER-LOC-ORG only datasets

In [8]:
# Category translation for the PER-LOC-ORG datasets:
# a None target means the mention is dropped (its tag becomes 'O').
trans_map = dict(
    ANG=None,
    DUC=None,
    EVE=None,
    FAC='LOC',
    GPE='LOC',
    LOC='LOC',
    ORG='ORG',
    PER='PER',
    WOA=None,
)


In [32]:
import re

# Regex for pulling the category out of a tag. Raw string so the `\-` / `\^`
# escapes reach the regex engine without invalid-escape warnings.
# NOTE(review): not used by the conversion below; kept so the name stays defined.
cat_re = re.compile(r'.*\-([^\^]+)\^?')


def _plo_tag(tag):
    """Rewrite one tag (possibly several sub-tags joined by '^') through trans_map.

    An 'O' sub-tag passes through. A '<prefix>-<category>' sub-tag keeps its
    prefix and takes the category trans_map gives it, or becomes 'O' when
    trans_map maps that category to None.

    Raises:
        ValueError: a sub-tag other than 'O' does not split into exactly two
            parts on '-'.
        KeyError: the category is not a key of trans_map.
    """
    new_tags = []
    for t in tag.split('^'):
        if t == 'O':
            new_tags.append('O')
            continue
        try:
            bio, cat = t.split('-')
        except ValueError as err:
            raise ValueError(
                'unexpected tag %r (expected "O" or "<prefix>-<category>")' % t
            ) from err
        mapped = trans_map[cat]
        new_tags.append('O' if mapped is None else bio + '-' + mapped)
    return '^'.join(new_tags)


# Write a `<name>_plo.bmes` copy of every train/dev/test file listed in `datasets`.
for n, ds in datasets.items():
    for k in ds:
        if not ('train' in k or 'dev' in k or 'test' in k):
            continue
        path = os.path.join(data_folder, ds[k])
        new_path = os.path.join(data_folder, ds[k].split('.')[0]+'_plo.bmes')
        print(path)
        print(new_path)
        # Input is opened first so a missing source does not leave an empty
        # output behind; both handles are closed by the context manager.
        # Explicit UTF-8 matches the encoding used for the generated .conf files.
        with open(path, 'r', encoding='utf8') as inf, \
                open(new_path, 'w', encoding='utf8') as of:
            for line in inf:
                fields = line.split(' ')
                word = fields[0].strip()
                if word == '':
                    # Blank line is written back as a blank line
                    of.write('\n')
                    continue
                try:
                    new_tag = _plo_tag(fields[-1].strip())
                except ValueError:
                    # Show the offending line before propagating the error
                    print(fields)
                    raise
                of.write(word+' '+new_tag+'\n')

../NER/data/for_ncrf/morph_gold_train.bmes
../NER/data/for_ncrf/morph_gold_train_plo.bmes
../NER/data/for_ncrf/morph_gold_dev.bmes
../NER/data/for_ncrf/morph_gold_dev_plo.bmes
../NER/data/for_ncrf/morph_gold_test.bmes
../NER/data/for_ncrf/morph_gold_test_plo.bmes
../NER/data/for_ncrf/token_gold_train_fix.bmes
../NER/data/for_ncrf/token_gold_train_fix_plo.bmes
../NER/data/for_ncrf/token_gold_dev_fix.bmes
../NER/data/for_ncrf/token_gold_dev_fix_plo.bmes
../NER/data/for_ncrf/token_gold_test_fix.bmes
../NER/data/for_ncrf/token_gold_test_fix_plo.bmes
../NER/data/for_ncrf/token_gold_train_concat.bmes
../NER/data/for_ncrf/token_gold_train_concat_plo.bmes
../NER/data/for_ncrf/token_gold_dev_concat.bmes
../NER/data/for_ncrf/token_gold_dev_concat_plo.bmes
../NER/data/for_ncrf/token_gold_test_concat.bmes
../NER/data/for_ncrf/token_gold_test_concat_plo.bmes


In [40]:
# Folder that the dataset file names below are joined onto
data_folder = '../NER/data/for_ncrf'

# Same layout as `datasets`, pointing at the `_plo` (PER-LOC-ORG) files
new_datasets = dict(
    morph=dict(
        _unit='morpheme',
        _scheme='bioes',
        train_dir='morph_gold_train_plo.bmes',
        dev_dir='morph_gold_dev_plo.bmes',
        test_dir='morph_gold_test_plo.bmes',
    ),
    token=dict(
        _unit='token',
        _scheme='bioes',
        train_dir='token_gold_train_fix_plo.bmes',
        dev_dir='token_gold_dev_fix_plo.bmes',
        test_dir='token_gold_test_fix_plo.bmes',
    ),
    multitok=dict(
        _unit='token',
        _scheme='concat_bioes',
        seg=False,
        train_dir='token_gold_train_concat_plo.bmes',
        dev_dir='token_gold_dev_concat_plo.bmes',
        test_dir='token_gold_test_concat_plo.bmes',
    ),
)

In [41]:
# Settings shared by every run (fixed, not searched over)
default_grid = dict(
    word_seq_feature='LSTM',
    word_emb_dim=300,
    char_emb_dim=30,
    iteration=200,
    bilstm=True,
    norm_word_emb=False,
    norm_char_emb=False,
    ave_batch_loss=False,
    use_crf=True,
    l2=1e-8,
    lstm_layer=2,
    batch_size=8,
    number_normalized=True,
    optimizer='SGD',
    lr_decay=0.05,
    momentum=0,
    nbest=1,
    hidden_dim=200,
    dropout=0.5,
)

# Per-dataset overrides
dataset_grids = dict(
    multitok=dict(learning_rate=0.005),
    morph=dict(learning_rate=0.01),
    token=dict(learning_rate=0.01),
)

# Per-character-architecture overrides
arch_grids = dict(
    char_lstm=dict(
        char_seq_feature='LSTM',
        use_char=True,
        char_hidden_dim=70,
    ),
    char_cnn=dict(
        char_seq_feature='CNN',
        use_char=True,
        char_hidden_dim=70,
        char_kernel_size=7,
    ),
    no_char=dict(use_char=False),
)


In [42]:
# Embedding name -> vectors file. Only the two `ft_oov_*` entries are enabled.
# Disabled alternatives, kept for reference in their original `'key': 'path'` form:
#   'ft_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.fasttext_skipgram.model.vec.nofirstline',
#   'ft_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.fasttext_skipgram.model.vec.nofirstline',
#   'w2v_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.word2vec_skipgram.txt.nofirstline',
#   'w2v_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.word2vec_skipgram.txt.nofirstline',
#   'glv_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.glove.txt',
#   'glv_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.glove.txt',
#   'no_word': None,
word_embedding_files = dict(
    ft_oov_yap='data/htb_all_words.wikipedia.alt_tok.yap_form.fasttext_skipgram.txt',
    ft_oov_tok='data/htb_all_words.wikipedia.alt_tok.tokenized.fasttext_skipgram.txt',
)

In [43]:
# Output locations for the PER-LOC-ORG setup
models_folder, conf_folder, json_folder, logs_folder = (
    'final_setup/plo_models',     # base path for each run's model_dir
    'final_setup/plo_conf',       # generated .conf files
    'final_setup/plo_conf_json',  # JSON copies of the same configurations
    'final_setup/plo_logs',       # training logs scanned when reading results
)

In [44]:
# Ten consecutive seed numbers, 44 through 53 inclusive
seed_num_options = np.arange(44, 54, 1)
seed_num_options

array([44, 45, 46, 47, 48, 49, 50, 51, 52, 53])

In [45]:
def create_conf_dict(model_base_name, dataset, arch, emb_name, seed_num):
    """Assemble the flat training configuration for one run.

    Args:
        model_base_name: run name; joined onto `models_folder` for `model_dir`.
        dataset: key into `new_datasets` and `dataset_grids`.
        arch: key into `arch_grids`.
        emb_name: key into `word_embedding_files`, or 'no_word' for no
            pretrained embeddings.
        seed_num: accepted by the signature but not read by this function.

    Returns:
        A new dict, in insertion order: status, model_dir, the dataset's
        non-underscore entries (train/dev/test file names joined onto
        `data_folder`), `word_emb_dir` when an embedding file is configured,
        then `default_grid` overridden by the dataset grid and the arch grid.
    """
    split_keys = ('train_dir', 'dev_dir', 'test_dir')
    conf = {
        'status': 'train',
        'model_dir': os.path.join(models_folder, model_base_name),
    }

    for key, value in new_datasets[dataset].items():
        if key.startswith('_'):
            # underscore-prefixed entries are metadata and stay out of the conf
            continue
        conf[key] = os.path.join(data_folder, value) if key in split_keys else value

    if emb_name != 'no_word' and word_embedding_files[emb_name] is not None:
        conf['word_emb_dir'] = word_embedding_files[emb_name]

    # later grids win over earlier ones
    for overrides in (default_grid, dataset_grids[dataset], arch_grids[arch]):
        conf.update(overrides)

    return conf
    

In [46]:
# Embeddings and character architectures to train for each dataset flavour
ds_embeds = {'morph': ['ft_oov_yap', 'ft_oov_tok'],
             'token': ['ft_oov_tok'],
             'multitok': ['ft_oov_tok']}
ds_archs = {'morph': ['char_cnn'],
            'token': ['char_cnn'],
            'multitok': ['char_lstm'],}

# One configuration per (dataset, arch, embedding, seed), keyed by
# `<dataset>.<arch>.<embedding>.<seed>_seed`.
# Iterates `new_datasets` because that is the dict create_conf_dict reads its
# dataset entries from, so the keys can never drift apart from it.
confs = {}
for dataset in new_datasets:
    for arch in ds_archs[dataset]:
        for emb_name in ds_embeds[dataset]:
            for seed_num in seed_num_options:
                model_base_name = '.'.join([dataset, arch, emb_name, str(seed_num)+'_seed'])
                confs[model_base_name] = create_conf_dict(model_base_name, dataset,
                                                          arch, emb_name, seed_num)
            

In [47]:
len(confs)

40

In [49]:
import pickle
# Persist the generated configurations. This cell runs before the folders are
# created further down, so make sure the parent directory exists; the context
# manager closes the file handle once the dump is done.
os.makedirs('final_setup', exist_ok=True)
with open('final_setup/plo_confs.pkl', 'wb') as confs_file:
    pickle.dump(confs, confs_file)

## Create conf files for setup dicts
1. Random Seed: 10 different `(44, 45, 46...)`
1. `morph.char_lstm.ft_tok.44_seed.conf`
1. `multitok.no_char.no_word.47_seed.conf`

63 * 10 = **630 conf files**

In [50]:
# Create the output folders. makedirs also creates missing parents, and
# exist_ok=True makes the cell safe to re-run without a separate exists() check.
for folder in (models_folder, conf_folder, json_folder, logs_folder):
    os.makedirs(folder, exist_ok=True)

In [51]:
# Write every configuration twice: as a flat `key=value` .conf file and as JSON
for name, conf in confs.items():
    conf_path = os.path.join(conf_folder, name+'.conf')
    with open(conf_path, 'w', encoding='utf8') as of:
        of.writelines('{}={}\n'.format(k, v) for k, v in conf.items())

    json_path = os.path.join(json_folder, name+'.json')
    with open(json_path, 'w') as of:
        json.dump(conf, of)

## Create `main.X.py` files
Only difference is `seed_num`: 
- `main.44.py` will have `seed_num = 44`

## Create `final_setup_run.py`
1. seed_num match: runs `.conf` files with matching `main.X.py` file **only**.
1. Choose device. 
1. Choose conf prefix.
1. Skip confs that are running or ran already (using `.dset` file)

In [7]:
# Every configured embedding name, plus None for "no pretrained embedding"
emb_options = [*word_embedding_files, None]
emb_options

['alt_tok_yap_ft_sg',
 'alt_tok_tokenized_ft_sg',
 'htb_all_alt_tok_yap_ft_sg',
 'htb_all_alt_tok_tokenized_ft_sg',
 'alt_tok_yap_w2v_sg',
 'alt_tok_tokenized_w2v_sg',
 None]

## Read logs

In [50]:
import pickle
# Reload the configurations written earlier; the context manager closes the file.
# NOTE: pickle executes arbitrary code on load, so only load files this
# notebook produced itself.
with open('final_setup/plo_confs.pkl', 'rb') as confs_file:
    confs = pickle.load(confs_file)

In [272]:
import re
import os
# Matches the dev summary line of a training log, e.g.
#   Dev: time: 0.94s speed: 536.09st/s; acc: 0.9043
#   Dev: time: 3.42s, speed: 146.59st/s; acc: 0.9546, p: 0.7577, r: 0.6393, f: 0.6935
# The p/r/f part is optional. Raw string so `\d` and `\.` are passed to the
# regex engine without invalid-escape warnings.
DEV_RES_LINE = re.compile(r'Dev: .*; acc: (?P<acc>[^,]+)(?:, p: (?P<p>[^,]+), r: (?P<r>[^,]+), f: (?P<f>[-\d\.]+))?')

mtimes = []   # modification time of every log file
res = []      # one record per matched dev line
archs = []    # arch component of every log file name
for f in os.scandir(logs_folder):
    if f.name.startswith('.ipy'):
        continue
    mtimes.append(os.path.getmtime(f.path))
    # File name layout: <unit>.<arch>.<w_embed>.<seed_num>.<extension>
    name_parts = f.name.split('.')
    model_base_name = '.'.join(name_parts[:-1])
    model_no_seed = '.'.join(name_parts[:-2])
    unit, arch, w_embed, seed_num = name_parts[:-1]
    archs.append(arch)
    matching_conf = confs[model_base_name]
    params = { 'model_base_name': model_base_name, 'arch': arch,
              'unit': unit, 'w_embed': w_embed, 'seed_num': seed_num,
              'model_no_seed': model_no_seed,}
    params.update(matching_conf)
    with open(f.path, 'r') as fp:
        # `i` numbers the matched dev lines in order; stored as the epoch
        i = 0
        for line in fp:
            m = DEV_RES_LINE.match(line)
            if m:
                # groupdict() already returns a fresh dict; unmatched groups stay None
                r = m.groupdict()
                for k, v in r.items():
                    if v is not None:
                        r[k] = float(v)
                r.update(params)
                r['epoch'] = i
                i += 1
                res.append(r)

# One row per parsed dev line, plus derived file names and the score used for
# model selection (f when present, otherwise acc).
rdf = pd.DataFrame(res).assign(
    model_file_name=lambda d: d.model_base_name + '.' + d.epoch.astype(str) + '.model',
    dset_file_name=lambda d: d.model_base_name +'.dset',
    char_seq_feature=lambda d: d.char_seq_feature.fillna('NoChar'),
    relevant_score=lambda d: d.f.fillna(d.acc),
)

def get_embed_unit(s):
    """Return the unit implied by an embedding name.

    'morph' when the name contains 'yap', otherwise 'token' when it contains
    'tok', otherwise 'na'. 'yap' is checked first.
    """
    for marker, embed_unit in (('yap', 'morph'), ('tok', 'token')):
        if marker in s:
            return embed_unit
    return 'na'

def get_clash_match(s):
    """Label a row by whether its embedding unit agrees with its input unit.

    Reads `s.embed_unit` and `s.input_unit`. Returns 'na' when the embedding
    unit is 'na', 'Match' when the two units are equal, 'Clash' otherwise.
    """
    if s.embed_unit == 'na':
        return 'na'
    return 'Match' if s.embed_unit == s.input_unit else 'Clash'
    
rdf['input_unit'] = rdf.unit.apply(lambda x: 'morph' if x=='morph' else 'token')
rdf['embed_unit'] = rdf.w_embed.apply(get_embed_unit)
# '_tok|_yap' is a regular expression (alternation). regex=True must be explicit:
# newer pandas treats the pattern as a literal string by default, which would
# leave the unit suffix in place.
rdf['embed_type'] = rdf.w_embed.str.replace('_tok|_yap', '', regex=True)
rdf['cm'] = rdf.apply(get_clash_match, axis=1)

# Keep one row per run: the best-scoring epoch, and the earliest one on ties.
# String aggregation names replace the builtin max/min, whose use in
# transform() is deprecated.
run_keys = ['seed_num', 'arch', 'unit', 'w_embed']
erdf = rdf[rdf.groupby(run_keys).relevant_score.transform('max') == rdf.relevant_score]
erdf = erdf[erdf.groupby(run_keys).epoch.transform('min') == erdf.epoch]

In [273]:
erdf.shape

(40, 49)

In [274]:
erdf.groupby(['unit', 'arch', 'w_embed']).seed_num.nunique().unstack()

Unnamed: 0_level_0,w_embed,ft_oov_tok,ft_oov_yap
unit,arch,Unnamed: 2_level_1,Unnamed: 3_level_1
morph,char_cnn,10.0,10.0
multitok,char_lstm,10.0,
token,char_cnn,10.0,


In [275]:
print ('Mean time per run:', round((max(mtimes) - min(mtimes) )/ len(mtimes) / 60, 2), 'minutes')

Mean time per run: 65.55 minutes


In [276]:
erdf.groupby(['unit', 'arch', 'embed_type', 'cm']).relevant_score.mean().unstack([-2,-1]).mul(100).round(2)

Unnamed: 0_level_0,embed_type,ft_oov,ft_oov
Unnamed: 0_level_1,cm,Clash,Match
unit,arch,Unnamed: 2_level_2,Unnamed: 3_level_2
morph,char_cnn,81.9,83.25
multitok,char_lstm,,94.37
token,char_cnn,,80.79


In [277]:
import numpy as np 
def perc(n):
    """Build an aggregation function returning the n-th percentile.

    The returned callable is named 'perc_<n>', which is the label it gets
    when passed to `.agg([...])`.
    """
    def perc_(values):
        """Return the n-th percentile of `values` via np.percentile."""
        return np.percentile(values, n)

    perc_.__name__ = 'perc_%s' % n
    return perc_

In [278]:
erdf.groupby(['unit', 'char_seq_feature']).relevant_score.agg(['max', 'min', 'mean', 'std', 'median', perc(95)]).mul(100).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min,mean,std,median,perc_95
unit,char_seq_feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph,CNN,84.12,80.61,82.58,0.91,82.72,83.96
multitok,LSTM,94.5,94.26,94.37,0.08,94.38,94.48
token,CNN,81.32,80.18,80.79,0.42,80.96,81.24


In [279]:
erdf.groupby(['unit', 'char_seq_feature']).relevant_score.agg(['max', 'min', 'mean', 'std', 'median', perc(95)]).mul(100).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min,mean,std,median,perc_95
unit,char_seq_feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph,CNN,84.12,80.61,82.58,0.91,82.72,83.96
multitok,LSTM,94.5,94.26,94.37,0.08,94.38,94.48
token,CNN,81.32,80.18,80.79,0.42,80.96,81.24


In [280]:
erdf.to_pickle('final_setup/plo_erdf.pkl')

## Decode

In [281]:
# Locations used by the decode step
output_folder, decode_conf_folder = (
    'final_setup/plo_decode_output',  # decoded .bmes files
    'final_setup/plo_decode_conf',    # generated .decode.conf files
)

In [282]:
# makedirs also creates missing parents, and exist_ok=True makes the cell
# safe to re-run without a separate exists() check.
os.makedirs(output_folder, exist_ok=True)
os.makedirs(decode_conf_folder, exist_ok=True)

# unit -> {decode set name -> file passed as `raw_dir` when decoding}
decode_sets = {
    'morph': {
        'morph_dev_gold': '../NER/data/for_ncrf/morph_gold_dev_plo.bmes',
        'morph_dev_yap': '../NER/data/for_ncrf/morph_yap_dev_dummy_o.bmes',
        'morph_test_gold': '../NER/data/for_ncrf/morph_gold_test_plo.bmes',
        'morph_test_yap': '../NER/data/for_ncrf/morph_yap_test_dummy_o.bmes',
    },
    'token': {
        'token_dev': '../NER/data/for_ncrf/token_gold_dev_fix_plo.bmes',
        'token_test': '../NER/data/for_ncrf/token_gold_test_fix_plo.bmes',
    },
    'multitok': {
        'token_dev': '../NER/data/for_ncrf/token_gold_dev_concat_plo.bmes',
        'token_test': '../NER/data/for_ncrf/token_gold_test_concat_plo.bmes',
    }
}

In [283]:
# Settings shared by every decode configuration
params = { 'status': 'decode' }

# One .decode.conf per (selected run, decode set); existing files are left alone
for i, row in erdf.iterrows():
    unit = row['unit']
    base_name = row['model_base_name']
    for name, set_path in decode_sets[unit].items():
        conf_path = os.path.join(decode_conf_folder, name+'.'+base_name+'.decode.conf')
        row_par = dict(
            params,
            load_model_dir=os.path.join(models_folder, row['model_file_name']),
            dset_dir=os.path.join(models_folder, row['dset_file_name']),
            decode_dir=os.path.join(output_folder, name+'.'+base_name+'.bmes'),
            raw_dir=set_path,
        )
        if os.path.exists(conf_path):
            continue
        with open(conf_path, 'w', encoding='utf8') as of:
            of.writelines('{}={}\n'.format(k, v) for k, v in row_par.items())
 

In [161]:
import os, re

In [162]:
# A decode log counts as complete when it contains this line
pred_line = re.compile('Predict raw 1-best result has been written into file.*')

# Names of decode logs that never reported writing their predictions
bads = []
for f in os.scandir('final_setup/plo_decode_logs'):
    if f.name in ('.ipynb_checkpoints', '.log'):
        continue
    with open(f.path, 'r') as fp:
        if pred_line.search(fp.read()) is None:
            bads.append(f.name)
sorted(bads)

[]

In [163]:
from collections import Counter

In [164]:
# Count decoded files whose name contains 'pruned', grouped by the name with
# its last two dot-separated parts removed
skipped_names = ('.ipynb_checkpoints', '.bmes')
xxx = [
    '.'.join(f.name.split('.')[:-2])
    for f in os.scandir('final_setup/plo_decode_output')
    if f.name not in skipped_names and 'pruned' in f.name
]
Counter(xxx).most_common()

[]

## Evaluate decoded folder

In [14]:
erdf = pd.read_pickle('final_setup/plo_erdf.pkl')

In [137]:
import sys
sys.path.append('../NER')
import ne_evaluate_mentions as nem

In [165]:
scores = {}

In [226]:
# Resume from previously computed scores when the cache file exists;
# the context manager closes the file handle after loading.
if os.path.exists('final_setup/plo_scores.pkl'):
    with open('final_setup/plo_scores.pkl', 'rb') as scores_file:
        scores = pickle.load(scores_file)

In [227]:
# Score every decoded file that has no entry in `scores` yet.
# File name layout: <gold_name>.<inp>.<arch>.<w_embed>.<seed_num>.<extension>
for file in os.scandir(output_folder):
    if file.name == '.ipynb_checkpoints':
        continue
    gold_name, inp, arch, w_embed, seed_num = file.name.split('.')[:-1]
    key = (gold_name, inp, arch, w_embed, seed_num)
    if key in scores:
        continue
    # gold_name is '<unit>_<set>' or '<unit>_<set>_<variant>'; three-part names
    # are always scored against the '<unit>_<set>_gold' entry.
    # NOTE(review): the gold file is looked up by the unit in gold_name, so
    # multitok outputs named 'token_*' use decode_sets['token'] — confirm intended.
    name_parts = gold_name.split('_')
    if len(name_parts) > 2:
        unit, pred_set, _ = name_parts
        gold_key = unit+'_'+pred_set+'_gold'
    else:
        unit, pred_set = name_parts
        gold_key = unit+'_'+pred_set
    gold_path = decode_sets[unit][gold_key]
    p, r, f = nem.evaluate_files(gold_path, file)
    scores[key] = (p, r, f)
    

In [228]:
import pickle
# Cache the scores; the context manager closes the file once the dump is done
with open('final_setup/plo_scores.pkl', 'wb') as scores_file:
    pickle.dump(scores, scores_file)

In [229]:
score_tups = [(*k, *v) for k,v in scores.items()]

In [230]:
mev = pd.DataFrame(score_tups, columns=('gold_name', 'unit', 'arch', 
                                        'w_embed', 'seed_num', 
                                        'p_m', 'r_m', 'f_m'))
(mev[mev.gold_name.str.contains('dev')].groupby(['gold_name', 'unit', 'arch'])
 .f_m.agg(['max', 'min', 'mean', 'std', 'median', perc(95)]))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,max,min,mean,std,median,perc_95
gold_name,unit,arch,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
morph_dev_gold,morph,char_cnn,0.841183,0.808324,0.826238,0.008902,0.827349,0.839565
morph_dev_pruned,morph,char_cnn,0.824834,0.807991,0.815201,0.004974,0.814694,0.822162
morph_dev_yap,morph,char_cnn,0.772475,0.730512,0.750409,0.011575,0.748765,0.76404
token_dev,multitok,char_lstm,0.815385,0.79299,0.802283,0.006898,0.800217,0.812771
token_dev,token,char_cnn,0.812775,0.801752,0.807945,0.004199,0.809624,0.812559


In [231]:
(mev[mev.gold_name.str.contains('dev')].groupby(['gold_name', 'unit', 'arch']).size())

gold_name         unit      arch     
morph_dev_gold    morph     char_cnn     20
morph_dev_pruned  morph     char_cnn     10
morph_dev_yap     morph     char_cnn     20
token_dev         multitok  char_lstm    10
                  token     char_cnn     10
dtype: int64

In [232]:
mev.head()

Unnamed: 0,gold_name,unit,arch,w_embed,seed_num,p_m,r_m,f_m
0,token_dev,token,char_cnn,ft_oov_tok,44_seed,0.843318,0.773784,0.807056
1,token_test,token,char_cnn,ft_oov_tok,44_seed,0.810934,0.772234,0.791111
2,token_dev,multitok,char_lstm,ft_oov_tok,44_seed,0.821826,0.780127,0.800434
3,token_test,multitok,char_lstm,ft_oov_tok,44_seed,0.790043,0.791757,0.790899
4,token_dev,token,char_cnn,ft_oov_tok,46_seed,0.844749,0.782241,0.812294


In [233]:
mev['pred_set'] = mev.gold_name.apply(lambda x: '_'.join(x.split('_')[1:]))

In [234]:
# Attach the per-run columns of `erdf` to every score row. The join keys are
# spelled out rather than inferred from whichever column names happen to overlap.
mev = mev.merge(erdf, how='left', on=['unit', 'arch', 'w_embed', 'seed_num'])

In [235]:
(mev[mev.pred_set.str.contains('dev')].groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
 .f_m.agg(['mean', 'std']).mul(100).round(2)
 .assign(mean = lambda x: x['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ x['std'].round(1).astype(str))[['mean']]
 .unstack([-2,-1]))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,embed_type,ft_oov,ft_oov
Unnamed: 0_level_2,Unnamed: 1_level_2,cm,Clash,Match
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3
morph,dev_gold,char_cnn,81.95 ± 0.7,83.30 ± 0.5
morph,dev_pruned,char_cnn,,81.52 ± 0.5
morph,dev_yap,char_cnn,74.40 ± 1.0,75.68 ± 0.9
multitok,dev,char_lstm,,80.23 ± 0.7
token,dev,char_cnn,,80.79 ± 0.4


In [236]:
# Mean / std of the mention F-score on the dev sets, in percent
x = (mev[mev.pred_set.str.contains('dev')].groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
 .f_m.agg([ 'mean', 'std']).mul(100).round(2)
 .assign(std = lambda x: x['std'].round(1))
 .unstack([-2,-1]))
# Move the statistic (mean/std) to the innermost column level
x.columns = x.columns.reorder_levels([1,2,0])
# Full option name: the abbreviated "max_columns" matches more than one option
# in recent pandas and raises OptionError.
pd.set_option("display.max_columns", 30)
x.sort_index(axis=1)


Unnamed: 0_level_0,Unnamed: 1_level_0,embed_type,ft_oov,ft_oov,ft_oov,ft_oov
Unnamed: 0_level_1,Unnamed: 1_level_1,cm,Clash,Clash,Match,Match
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,mean,std,mean,std
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
morph,dev_gold,char_cnn,81.95,0.7,83.3,0.5
morph,dev_pruned,char_cnn,,,81.52,0.5
morph,dev_yap,char_cnn,74.4,1.0,75.68,0.9
multitok,dev,char_lstm,,,80.23,0.7
token,dev,char_cnn,,,80.79,0.4


In [237]:
mev[(mev.unit=='morph') & (mev.pred_set.str.contains('pruned')) & (mev.embed_type=='ft_oov') & (mev.arch=='char_cnn')].groupby(['pred_set','cm']).f_m.mean().unstack()

cm,Match
pred_set,Unnamed: 1_level_1
dev_pruned,0.815201
test_pruned,0.777842


In [238]:
# Split pred_set ('dev_gold', 'test', ...) into its main part and optional sub part
pred_set_parts = mev.pred_set.str.split('_')
mev['pred_set_sub'] = pred_set_parts.str.get(1).fillna('')
mev['pred_set_main'] = pred_set_parts.str.get(0)

# ft_oov rows only; morph rows are further restricted to the char_cnn arch
is_ft_oov = mev.embed_type == 'ft_oov'
selected = is_ft_oov & ((mev.unit != 'morph') | (mev.arch == 'char_cnn'))

# Mean F per set in percent, plus the relative dev -> test change in percent
(mev[selected]
 .groupby(['unit', 'pred_set_sub', 'cm', 'pred_set_main',])
 .f_m.mean().unstack().mul(100).round(2)
 .assign(ratio = lambda x: (x.test/x.dev -1).mul(100).round(1)))

Unnamed: 0_level_0,Unnamed: 1_level_0,pred_set_main,dev,test,ratio
unit,pred_set_sub,cm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
morph,gold,Clash,81.95,79.74,-2.7
morph,gold,Match,83.3,79.94,-4.0
morph,pruned,Match,81.52,77.78,-4.6
morph,yap,Clash,74.4,69.69,-6.3
morph,yap,Match,75.68,69.65,-8.0
multitok,,Match,80.23,80.49,0.3
token,,Match,80.79,79.41,-1.7


In [291]:
# 'Match' rows only, formatted as `$mean ± std$` per unit / set for LaTeX export
x= (mev[ (mev.cm=='Match') & (((mev.unit!='morph') & (mev.embed_type=='ft_oov') 
    ) 
    |
     ((mev.unit=='morph') 
       & (mev.embed_type=='ft_oov') 
      & (mev.arch=='char_cnn')))].groupby(['unit', 'pred_set_sub', 'pred_set_main'])
         .f_m.agg(['mean', 'std']).mul(100).round(2)
         .assign(mean = lambda x: '$'+x['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ x['std'].round(1).astype(str)+'$')[['mean']].unstack()
 )
x.columns = x.columns.droplevel(0)
# Raw strings: '\p' and '\$' are not valid string escapes. The replacements turn
# '±' into the LaTeX command and undo to_latex's escaping of '$'.
print (x.to_latex(bold_rows=True).replace('±', r'\pm').replace(r'\$', '$'))

\begin{tabular}{llll}
\toprule
      & \textbf{pred\_set\_main} &            dev &           test \\
\textbf{unit} & \textbf{pred\_set\_sub} &                &                \\
\midrule
\textbf{morph} & \textbf{gold} &  $83.30 \pm 0.5$ &  $79.94 \pm 0.9$ \\
      & \textbf{pruned} &  $81.52 \pm 0.5$ &  $77.78 \pm 0.9$ \\
      & \textbf{yap} &  $75.68 \pm 0.9$ &  $69.65 \pm 0.9$ \\
\textbf{multitok} &     &  $80.23 \pm 0.7$ &  $80.49 \pm 0.9$ \\
\textbf{token} &     &  $80.79 \pm 0.4$ &  $79.41 \pm 0.6$ \\
\bottomrule
\end{tabular}



In [292]:
x

Unnamed: 0_level_0,pred_set_main,dev,test
unit,pred_set_sub,Unnamed: 2_level_1,Unnamed: 3_level_1
morph,gold,$83.30 ± 0.5$,$79.94 ± 0.9$
morph,pruned,$81.52 ± 0.5$,$77.78 ± 0.9$
morph,yap,$75.68 ± 0.9$,$69.65 ± 0.9$
multitok,,$80.23 ± 0.7$,$80.49 ± 0.9$
token,,$80.79 ± 0.4$,$79.41 ± 0.6$


In [213]:
mev.to_pickle('final_setup/plo_mev.pkl')

In [None]:
1+1

In [257]:
from collections import defaultdict

# Files whose gold mentions are counted per category
ne_sets = dict(
    morph=dict(
        train='../NER/data/for_ncrf/morph_gold_train_plo.bmes',
        dev='../NER/data/for_ncrf/morph_gold_dev_plo.bmes',
        test='../NER/data/for_ncrf/morph_gold_test_plo.bmes',
    ),
)

# One record per split: category -> number of mentions, plus the split name
all_cats = []
for unit in ne_sets:
    for ps, path in ne_sets[unit].items():
        gold_sents = nem.read_file_sents(path)
        gold_mentions = nem.sents_to_mentions(gold_sents, truncate=None)

        cats = defaultdict(int)
        for ment in gold_mentions:
            # element 2 of a mention is used as its category key
            cats[ment[2]] += 1
        cats['set'] = ps
        all_cats.append(cats)

cats = pd.DataFrame(all_cats).fillna(0).set_index('set').astype(int)
cats.T.sort_values('train', ascending=False)

set,train,dev,test
PER,2128,193,267
ORG,2043,119,408
LOC,1871,161,247


In [258]:
from collections import defaultdict

# Files whose gold mentions are counted per category
ne_sets = dict(
    token=dict(
        train='../NER/data/for_ncrf/token_gold_train_fix_plo.bmes',
        dev='../NER/data/for_ncrf/token_gold_dev_fix_plo.bmes',
        test='../NER/data/for_ncrf/token_gold_test_fix_plo.bmes',
    ),
)

# One record per split: category -> number of mentions, plus the split name
all_cats = []
for unit in ne_sets:
    for ps, path in ne_sets[unit].items():
        gold_sents = nem.read_file_sents(path)
        gold_mentions = nem.sents_to_mentions(gold_sents, truncate=None)

        cats = defaultdict(int)
        for ment in gold_mentions:
            # element 2 of a mention is used as its category key
            cats[ment[2]] += 1
        cats['set'] = ps
        all_cats.append(cats)

cats = pd.DataFrame(all_cats).fillna(0).set_index('set').astype(int)
cats.T.sort_values('train', ascending=False)

set,train,dev,test
PER,2128,193,267
ORG,2043,119,408
LOC,1871,161,247


In [155]:
def add_dol(x):
    """Wrap a value in `$...$`."""
    return '$'+str(x)+'$'


# One formatter per column, so the table still renders if the number of
# splits changes.
count_table = cats.T.sort_values('train', ascending=False)
# Raw string: '\$' is not a valid string escape. The replace undoes
# to_latex's escaping of the dollar signs added by add_dol.
print (count_table
       .to_latex(bold_rows=True, formatters = [add_dol] * len(count_table.columns))
       .replace(r'\$', '$'))

\begin{tabular}{lrrr}
\toprule
\textbf{set} &  train &   dev &  test \\
\midrule
\textbf{PER} & $2128$ & $193$ & $267$ \\
\textbf{ORG} & $2043$ & $119$ & $408$ \\
\textbf{GPE} & $1377$ & $121$ & $195$ \\
\textbf{LOC} &  $331$ &  $28$ &  $41$ \\
\textbf{FAC} &  $163$ &  $12$ &  $11$ \\
\textbf{WOA} &  $114$ &   $9$ &   $6$ \\
\textbf{EVE} &   $57$ &  $12$ &   $0$ \\
\textbf{DUC} &   $36$ &   $2$ &   $3$ \\
\textbf{ANG} &   $33$ &   $3$ &   $1$ \\
\bottomrule
\end{tabular}



In [157]:
cats.T.sum()

set
train    6282
dev       499
test      932
dtype: int64

## Eval with ignore category

In [239]:
scores = {}

In [240]:
# Resume from previously computed category-agnostic scores when the cache
# file exists; the context manager closes the file handle after loading.
if os.path.exists('final_setup/plo_scores_nocat.pkl'):
    with open('final_setup/plo_scores_nocat.pkl', 'rb') as scores_file:
        scores = pickle.load(scores_file)

In [241]:
# Score every decoded file that has no entry in `scores` yet, with ignore_cat=True.
# File name layout: <gold_name>.<inp>.<arch>.<w_embed>.<seed_num>.<extension>
for file in os.scandir(output_folder):
    if file.name == '.ipynb_checkpoints':
        continue
    gold_name, inp, arch, w_embed, seed_num = file.name.split('.')[:-1]
    key = (gold_name, inp, arch, w_embed, seed_num)
    if key in scores:
        continue
    # gold_name is '<unit>_<set>' or '<unit>_<set>_<variant>'; three-part names
    # are always scored against the '<unit>_<set>_gold' entry.
    # NOTE(review): the gold file is looked up by the unit in gold_name, so
    # multitok outputs named 'token_*' use decode_sets['token'] — confirm intended.
    name_parts = gold_name.split('_')
    if len(name_parts) > 2:
        unit, pred_set, _ = name_parts
        gold_key = unit+'_'+pred_set+'_gold'
    else:
        unit, pred_set = name_parts
        gold_key = unit+'_'+pred_set
    gold_path = decode_sets[unit][gold_key]
    p, r, f = nem.evaluate_files(gold_path, file, ignore_cat=True)
    scores[key] = (p, r, f)
    

In [242]:
import pickle
# Cache the category-agnostic scores; the context manager closes the file
with open('final_setup/plo_scores_nocat.pkl', 'wb') as scores_file:
    pickle.dump(scores, scores_file)

In [243]:
score_tups = [(*k, *v) for k,v in scores.items()]

In [244]:
mev_nocat = pd.DataFrame(score_tups, columns=('gold_name', 'unit', 'arch', 
                                        'w_embed', 'seed_num', 
                                        'p_m', 'r_m', 'f_m'))

In [245]:
(mev_nocat[mev_nocat.gold_name.str.contains('dev')].groupby(['gold_name', 'unit', 'arch']).size())

gold_name         unit      arch     
morph_dev_gold    morph     char_cnn     20
morph_dev_pruned  morph     char_cnn     10
morph_dev_yap     morph     char_cnn     20
token_dev         multitok  char_lstm    10
                  token     char_cnn     10
dtype: int64

In [246]:
mev_nocat['pred_set'] = mev_nocat.gold_name.apply(lambda x: '_'.join(x.split('_')[1:]))

In [247]:
# Attach the per-run columns of `erdf` to every score row. The join keys are
# spelled out rather than inferred from whichever column names happen to overlap.
mev_nocat = mev_nocat.merge(erdf, how='left', on=['unit', 'arch', 'w_embed', 'seed_num'])

In [248]:
(mev_nocat[mev_nocat.pred_set.str.contains('dev')].groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
 .f_m.agg(['mean', 'std']).mul(100).round(2)
 .assign(mean = lambda x: x['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ x['std'].round(1).astype(str))[['mean']]
 .unstack([-2,-1]))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,embed_type,ft_oov,ft_oov
Unnamed: 0_level_2,Unnamed: 1_level_2,cm,Clash,Match
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3
morph,dev_gold,char_cnn,85.90 ± 0.6,86.47 ± 0.4
morph,dev_pruned,char_cnn,,84.30 ± 0.5
morph,dev_yap,char_cnn,78.05 ± 1.0,78.69 ± 0.8
multitok,dev,char_lstm,,84.45 ± 0.7
token,dev,char_cnn,,84.30 ± 0.5


In [249]:
# Mean / std of the category-agnostic mention F-score on the dev sets, in percent
x = (mev_nocat[mev_nocat.pred_set.str.contains('dev')].groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
 .f_m.agg([ 'mean', 'std']).mul(100).round(2)
 .assign(std = lambda x: x['std'].round(1))
 .unstack([-2,-1]))
# Move the statistic (mean/std) to the innermost column level
x.columns = x.columns.reorder_levels([1,2,0])
# Full option name: the abbreviated "max_columns" matches more than one option
# in recent pandas and raises OptionError.
pd.set_option("display.max_columns", 30)
x.sort_index(axis=1)


Unnamed: 0_level_0,Unnamed: 1_level_0,embed_type,ft_oov,ft_oov,ft_oov,ft_oov
Unnamed: 0_level_1,Unnamed: 1_level_1,cm,Clash,Clash,Match,Match
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,mean,std,mean,std
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
morph,dev_gold,char_cnn,85.9,0.6,86.47,0.4
morph,dev_pruned,char_cnn,,,84.3,0.5
morph,dev_yap,char_cnn,78.05,1.0,78.69,0.8
multitok,dev,char_lstm,,,84.45,0.7
token,dev,char_cnn,,,84.3,0.5


In [250]:
mev_nocat[(mev_nocat.unit=='morph') 
          & (mev_nocat.pred_set.str.contains('pruned')) & (mev_nocat.embed_type=='ft_oov') 
          & (mev_nocat.arch=='char_cnn')].groupby(['pred_set','cm']).f_m.mean().unstack()

cm,Match
pred_set,Unnamed: 1_level_1
dev_pruned,0.84303
test_pruned,0.836174


In [252]:
# Split pred_set ('dev_gold', 'test', ...) into its main part and optional sub part
pred_set_parts = mev_nocat.pred_set.str.split('_')
mev_nocat['pred_set_sub'] = pred_set_parts.str.get(1).fillna('')
mev_nocat['pred_set_main'] = pred_set_parts.str.get(0)

# ft_oov rows only; morph rows are further restricted to the char_cnn arch
is_ft_oov = mev_nocat.embed_type == 'ft_oov'
selected = is_ft_oov & ((mev_nocat.unit != 'morph') | (mev_nocat.arch == 'char_cnn'))

# Mean F per set in percent, plus the relative dev -> test change in percent
(mev_nocat[selected]
 .groupby(['unit', 'pred_set_sub', 'cm', 'pred_set_main',])
 .f_m.mean().unstack().mul(100).round(2)
 .assign(ratio = lambda x: (x.test/x.dev -1).mul(100).round(1)))

Unnamed: 0_level_0,Unnamed: 1_level_0,pred_set_main,dev,test,ratio
unit,pred_set_sub,cm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
morph,gold,Clash,85.9,85.84,-0.1
morph,gold,Match,86.47,86.19,-0.3
morph,pruned,Match,84.3,83.62,-0.8
morph,yap,Clash,78.05,74.62,-4.4
morph,yap,Match,78.69,74.98,-4.7
multitok,,Match,84.45,86.18,2.0
token,,Match,84.3,84.82,0.6


In [253]:
# Save the category-agnostic table. The original pickled `mev` here, which
# wrote the with-category results into the "nocat" file.
mev_nocat.to_pickle('final_setup/plo_mev_nocat.pkl')