In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')

In [3]:
import pandas as pd
import configs
import numpy as np

In [4]:
import os
import json

## Create setup dicts
1. Word Unit+Labels: `morph, token, multitok`
1. Char Arch: `char_lstm, char_cnn, no_char`
1. Word Embedding: `ft_tok, ft_yap, ft_tok_oov, ft_yap_oov, w2v_tok, w2v_yap, no_word_embed`

3 x 3 x 7 = **63 dicts**

In [7]:
data_folder = '../NER/data/for_ncrf'

datasets = {
    'morph': {
        '_unit': 'morpheme',
        '_scheme': 'bioes',
        'train_dir': 'morph_gold_train.bmes',
        'dev_dir': 'morph_gold_dev.bmes',
        'test_dir': 'morph_gold_test.bmes', 
    },
    'token': {
        '_unit': 'token',
        '_scheme': 'bioes',
        'train_dir': 'token_gold_train_fix.bmes',
        'dev_dir': 'token_gold_dev_fix.bmes',
        'test_dir': 'token_gold_test_fix.bmes',
    },
    'multitok': {
        '_unit': 'token',
        '_scheme': 'concat_bioes',
        'seg': False,
        'train_dir': 'token_gold_train_concat.bmes',
        'dev_dir': 'token_gold_dev_concat.bmes',
        'test_dir': 'token_gold_test_concat.bmes',
    },
}

## Create PER-LOC-ORG only datasets

In [8]:
trans_map = {
    'ANG': None,
    'DUC': None,
    'EVE': None,
    'FAC': 'LOC',
    'GPE': 'LOC',
    'LOC': 'LOC',
    'ORG': 'ORG',
    'PER': 'PER',
    'WOA': None,
}


In [32]:
import re

cat_re = re.compile('.*\-([^\^]+)\^?')
for n, ds in datasets.items():
    for k in ds:
        if ('train' in k or 'dev' in k or 'test' in k):
            path = os.path.join(data_folder, ds[k])
            new_path = os.path.join(data_folder, ds[k].split('.')[0]+'_plo.bmes')
            print(path)
            print(new_path)
            with open(new_path, 'w') as of:
                for line in open(path, 'r'):
                    line = line.split(' ')
                    word = line[0].strip()
                    if word!='':
                        tag = line[-1].strip()
                        tags = tag.split('^')
                        #cat = cat_re.search(tag)
                        new_tags = []
                        for t in tags:
                            if t=='O':
                                new_tags.append('O')
                            else:
                                try:
                                    bio, cat = t.split('-')
                                except:
                                    print(line)
                                    raise ValueError
                                if trans_map[cat] is None:
                                    new_tags
                                else:
                                    new_tags.append(bio+'-'+trans_map[cat])
                        new_tag = '^'.join(new_tags)
                        of.write(word+' '+new_tag+'\n')
                        #print(word, tag, new_tag)
                    else:
                        of.write('\n')
                        #print('\n')

../NER/data/for_ncrf/morph_gold_train.bmes
../NER/data/for_ncrf/morph_gold_train_plo.bmes
../NER/data/for_ncrf/morph_gold_dev.bmes
../NER/data/for_ncrf/morph_gold_dev_plo.bmes
../NER/data/for_ncrf/morph_gold_test.bmes
../NER/data/for_ncrf/morph_gold_test_plo.bmes
../NER/data/for_ncrf/token_gold_train_fix.bmes
../NER/data/for_ncrf/token_gold_train_fix_plo.bmes
../NER/data/for_ncrf/token_gold_dev_fix.bmes
../NER/data/for_ncrf/token_gold_dev_fix_plo.bmes
../NER/data/for_ncrf/token_gold_test_fix.bmes
../NER/data/for_ncrf/token_gold_test_fix_plo.bmes
../NER/data/for_ncrf/token_gold_train_concat.bmes
../NER/data/for_ncrf/token_gold_train_concat_plo.bmes
../NER/data/for_ncrf/token_gold_dev_concat.bmes
../NER/data/for_ncrf/token_gold_dev_concat_plo.bmes
../NER/data/for_ncrf/token_gold_test_concat.bmes
../NER/data/for_ncrf/token_gold_test_concat_plo.bmes


In [40]:
data_folder = '../NER/data/for_ncrf'

new_datasets = {
    'morph': {
        '_unit': 'morpheme',
        '_scheme': 'bioes',
        'train_dir': 'morph_gold_train_plo.bmes',
        'dev_dir': 'morph_gold_dev_plo.bmes',
        'test_dir': 'morph_gold_test_plo.bmes', 
    },
    'token': {
        '_unit': 'token',
        '_scheme': 'bioes',
        'train_dir': 'token_gold_train_fix_plo.bmes',
        'dev_dir': 'token_gold_dev_fix_plo.bmes',
        'test_dir': 'token_gold_test_fix_plo.bmes',
    },
    'multitok': {
        '_unit': 'token',
        '_scheme': 'concat_bioes',
        'seg': False,
        'train_dir': 'token_gold_train_concat_plo.bmes',
        'dev_dir': 'token_gold_dev_concat_plo.bmes',
        'test_dir': 'token_gold_test_concat_plo.bmes',
    },
}

In [41]:
default_grid = { 
        # FIXED
        'word_seq_feature': 'LSTM',
        'word_emb_dim': 300,
        'char_emb_dim': 30,
        'iteration': 200,
        'bilstm': True,
        'norm_word_emb': False,
        'norm_char_emb': False,
        'ave_batch_loss': False,
        'use_crf': True,
        'l2': 1e-8,
        'lstm_layer': 2,
        'batch_size': 8,
        'number_normalized': True,
        'optimizer': 'SGD',
        'lr_decay': 0.05,
        'momentum': 0,
        'nbest': 1,
        'hidden_dim': 200,
        'dropout': 0.5,

    }
    
dataset_grids = {
    'multitok': {
        'learning_rate': 0.005,
    },
    'morph': {
        'learning_rate': 0.01,
    },
    'token': {
        'learning_rate': 0.01,
    },
}
arch_grids = {
    'char_lstm': {
        'char_seq_feature': 'LSTM',
        'use_char': True,
        'char_hidden_dim': 70, 
    },
    'char_cnn': {
        'char_seq_feature': 'CNN',
        'use_char': True,
        'char_hidden_dim': 70,
        'char_kernel_size': 7,
    },
    'no_char': {
        'use_char': False,
     },
}


In [42]:
word_embedding_files = {
    #'ft_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.fasttext_skipgram.model.vec.nofirstline',
    #'ft_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.fasttext_skipgram.model.vec.nofirstline',
    'ft_oov_yap': 'data/htb_all_words.wikipedia.alt_tok.yap_form.fasttext_skipgram.txt',
    'ft_oov_tok': 'data/htb_all_words.wikipedia.alt_tok.tokenized.fasttext_skipgram.txt',
    #'w2v_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.word2vec_skipgram.txt.nofirstline',
    #'w2v_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.word2vec_skipgram.txt.nofirstline',
    #'glv_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.glove.txt',
    #'glv_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.glove.txt',
    #'no_word': None,
}

In [43]:
models_folder = 'final_setup/plo_models'
conf_folder =   'final_setup/plo_conf'
json_folder =   'final_setup/plo_conf_json'
logs_folder =   'final_setup/plo_logs'

In [44]:
seed_num_options = np.arange(44, 54)
seed_num_options

array([44, 45, 46, 47, 48, 49, 50, 51, 52, 53])

In [45]:
def create_conf_dict(model_base_name, dataset, arch, emb_name, seed_num):
    full_conf_dict = {}
    full_conf_dict['status'] = 'train'
    full_conf_dict['model_dir'] = os.path.join(models_folder, model_base_name)
    
    for k, v in new_datasets[dataset].items():
        if not k.startswith('_'):
            if k in ['train_dir', 'dev_dir', 'test_dir']:
                full_conf_dict[k] = os.path.join(data_folder, v)
            else:
                full_conf_dict[k] = v
    
    if not(emb_name == 'no_word' or word_embedding_files[emb_name] is None):
        full_conf_dict['word_emb_dir'] = word_embedding_files[emb_name]
    
    
    full_conf_dict.update(default_grid)
    full_conf_dict.update(dataset_grids[dataset])
    full_conf_dict.update(arch_grids[arch])
            
    return full_conf_dict
    

In [46]:
ds_embeds = {'morph': ['ft_oov_yap', 'ft_oov_tok'], 
             'token': ['ft_oov_tok'],
             'multitok': ['ft_oov_tok']}
ds_archs = {'morph': ['char_cnn'],
            'token': ['char_cnn'],
            'multitok': ['char_lstm'],}

confs = {}
for dataset in datasets:
    for arch in ds_archs[dataset]:
        for emb_name in ds_embeds[dataset]:
            for seed_num in seed_num_options:
                model_base_name = '.'.join([dataset, arch, emb_name, str(seed_num)+'_seed'])
                confs[model_base_name] = create_conf_dict(model_base_name, dataset, 
                                                          arch, emb_name, seed_num)
            

In [47]:
len(confs)

40

In [49]:
import pickle
pickle.dump(confs, open('final_setup/plo_confs.pkl', 'wb'))

## Create conf files for setup dicts
1. Random Seed: 10 different `(44, 45, 46...)`
1. `morph.charlstm.ft_tok.44_seed.conf`
1. `multitok.nochar.no_word_embed.47_seed.conf`

63 * 10 = **630 conf files**

In [50]:
if not os.path.exists(models_folder):
    os.mkdir(models_folder)
if not os.path.exists(conf_folder):
    os.mkdir(conf_folder)
if not os.path.exists(json_folder):
    os.mkdir(json_folder)
if not os.path.exists(logs_folder):
    os.mkdir(logs_folder)

In [51]:
for name, conf in confs.items():
    conf_path = os.path.join(conf_folder, name+'.conf')
    with open(conf_path, 'w', encoding='utf8') as of:
        for k, v in conf.items():
            of.write(k+'='+str(v)+'\n')
    json_path = os.path.join(json_folder, name+'.json')
    with open(json_path, 'w') as of:
        of.write(json.dumps(conf))

## Create `main.X.py` files
Only difference is `seed_num`: 
- `main.44.py` will have `seed_num = 44`

## Create `final_setup_run.py`
1. seed_num match: runs `.conf` files with matching `main.X.py` file **only**.
1. Choose device. 
1. Choose conf prefix.
1. Skip confs that are running or ran already (using `.dset` file)

In [7]:
emb_options = list(word_embedding_files.keys())+[None]
emb_options

['alt_tok_yap_ft_sg',
 'alt_tok_tokenized_ft_sg',
 'htb_all_alt_tok_yap_ft_sg',
 'htb_all_alt_tok_tokenized_ft_sg',
 'alt_tok_yap_w2v_sg',
 'alt_tok_tokenized_w2v_sg',
 None]

## Read logs

In [50]:
import pickle
confs = pickle.load( open('final_setup/plo_confs.pkl', 'rb'))

In [98]:
import re
import os
DEV_RES_LINE = re.compile('Dev: .*; acc: (?P<acc>[^,]+)(?:, p: (?P<p>[^,]+), r: (?P<r>[^,]+), f: (?P<f>[-\d\.]+))?')
#Dev: time: 0.94s speed: 536.09st/s; acc: 0.9043
#Dev: time: 3.42s, speed: 146.59st/s; acc: 0.9546, p: 0.7577, r: 0.6393, f: 0.6935

mtimes = []
res = []
archs = []
for f in os.scandir(logs_folder):
    if f.name.startswith('.ipy'):
        continue
    mtimes.append(os.path.getmtime(f.path))
    model_base_name = '.'.join(f.name.split('.')[:-1])
    model_no_seed = '.'.join(f.name.split('.')[:-2])
    unit, arch, w_embed, seed_num = f.name.split('.')[:-1]
    archs.append(arch)
    matching_conf = confs[model_base_name]
    params = { 'model_base_name': model_base_name, 'arch': arch, 
              'unit': unit, 'w_embed': w_embed, 'seed_num': seed_num,
              'model_no_seed': model_no_seed,}
    params.update(matching_conf)
    with open(f.path, 'r') as fp:
        i= 0
        for line in fp:
            m = DEV_RES_LINE.match(line)
            if m:
                r = m.groupdict().copy()
                for k, v in r.items():
                    if v is not None:
                        r[k] = float(v)
                r.update(params)
                r['epoch'] = i
                i+=1
                res.append(r)

rdf = pd.DataFrame(res)

rdf['model_file_name'] = rdf.model_base_name + '.' + rdf.epoch.astype(str) + '.model'
rdf['dset_file_name'] =  rdf.model_base_name +'.dset'
rdf['char_seq_feature'] = rdf.char_seq_feature.fillna('NoChar')

rdf['relevant_score'] = rdf.f.fillna(rdf.acc)

def get_embed_unit(s):
    if 'yap' in s:
        return 'morph'
    elif 'tok' in s:
        return 'token'
    return 'na'

def get_clash_match(s):
    if s.embed_unit=='na':
        return 'na'
    elif s.embed_unit==s.input_unit:
        return 'Match'
    else:
        return 'Clash'
    
rdf['input_unit'] = rdf.unit.apply(lambda x: 'morph' if x=='morph' else 'token')
rdf['embed_unit'] = rdf.w_embed.apply(get_embed_unit)
rdf['embed_type'] = rdf.w_embed.str.replace('_tok|_yap', '')
rdf['cm'] = rdf.apply(get_clash_match, axis=1)

erdf = rdf[(rdf.groupby(['seed_num', 'arch', 'unit', 'w_embed']).relevant_score.transform(max)==rdf.relevant_score) ]
erdf = erdf[(erdf.groupby(['seed_num', 'arch', 'unit', 'w_embed']).epoch.transform(min)==erdf.epoch) ]

In [99]:
erdf.shape

(14, 49)

In [100]:
erdf.groupby(['unit', 'arch', 'w_embed']).seed_num.nunique().unstack()

Unnamed: 0_level_0,w_embed,ft_oov_tok,ft_oov_yap
unit,arch,Unnamed: 2_level_1,Unnamed: 3_level_1
morph,char_cnn,2.0,2.0
multitok,char_lstm,4.0,
token,char_cnn,6.0,


In [101]:
print ('Mean time per run:', round((max(mtimes) - min(mtimes) )/ len(mtimes) / 60, 2), 'minutes')

Mean time per run: 41.87 minutes


In [102]:
erdf.groupby(['unit', 'arch', 'embed_type', 'cm']).relevant_score.mean().unstack([-2,-1]).mul(100).round(2)

Unnamed: 0_level_0,embed_type,ft_oov,ft_oov
Unnamed: 0_level_1,cm,Clash,Match
unit,arch,Unnamed: 2_level_2,Unnamed: 3_level_2
morph,char_cnn,81.56,83.14
multitok,char_lstm,,94.39
token,char_cnn,,80.92


In [103]:
import numpy as np 
def perc(n):
    def perc_(x):
        return np.percentile(x, n)
    perc_.__name__ = 'perc_%s' % n
    return perc_

In [104]:
erdf.groupby(['unit', 'char_seq_feature']).relevant_score.agg(['max', 'min', 'mean', 'std', 'median', perc(95)]).mul(100).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min,mean,std,median,perc_95
unit,char_seq_feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph,CNN,83.3,80.61,82.35,1.2,82.74,83.25
multitok,LSTM,94.44,94.35,94.39,0.04,94.39,94.43
token,CNN,81.32,80.26,80.92,0.38,81.05,81.27


In [105]:
erdf.groupby(['unit', 'char_seq_feature']).relevant_score.agg(['max', 'min', 'mean', 'std', 'median', perc(95)]).mul(100).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min,mean,std,median,perc_95
unit,char_seq_feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph,CNN,83.3,80.61,82.35,1.2,82.74,83.25
multitok,LSTM,94.44,94.35,94.39,0.04,94.39,94.43
token,CNN,81.32,80.26,80.92,0.38,81.05,81.27


In [106]:
erdf.to_pickle('final_setup/plo_erdf.pkl')

## Decode

In [107]:
output_folder = 'final_setup/plo_decode_output'
decode_conf_folder = 'final_setup/plo_decode_conf'

In [108]:
if not os.path.exists(output_folder):
    os.mkdir(output_folder)

if not os.path.exists(decode_conf_folder):
    os.mkdir(decode_conf_folder)
    
decode_sets = {
    'morph': {
        'morph_dev_gold': '../NER/data/for_ncrf/morph_gold_dev_plo.bmes',
        'morph_dev_yap': '../NER/data/for_ncrf/morph_yap_dev_dummy_o.bmes',
        'morph_test_gold': '../NER/data/for_ncrf/morph_gold_test_plo.bmes',
        'morph_test_yap': '../NER/data/for_ncrf/morph_yap_test_dummy_o.bmes',
    },
    'token': {
        'token_dev': '../NER/data/for_ncrf/token_gold_dev_fix_plo.bmes',
        'token_test': '../NER/data/for_ncrf/token_gold_test_fix_plo.bmes',
    },
    'multitok': {
        'token_dev': '../NER/data/for_ncrf/token_gold_dev_concat_plo.bmes',
        'token_test': '../NER/data/for_ncrf/token_gold_test_concat_plo.bmes',
    }
}

In [109]:
params = { 'status': 'decode' }

for i, row in erdf.iterrows():
    unit = row['unit']
    for name, set_path in decode_sets[unit].items():
        row_par = params.copy()
        row_par['load_model_dir'] = os.path.join(models_folder, row['model_file_name'])
        row_par['dset_dir'] = os.path.join(models_folder, row['dset_file_name'])
        row_par['decode_dir'] = os.path.join(output_folder, name+'.'+row['model_base_name']+'.bmes')
        row_par['raw_dir'] = set_path
        
        conf_path = os.path.join(decode_conf_folder, name+'.'+row['model_base_name']+'.decode.conf')
        if not os.path.exists(conf_path):
            with open(conf_path, 'w', encoding='utf8') as of:
                for k, v in row_par.items():
                    of.write(k+'='+str(v)+'\n')        
 

In [2]:
import os, re

In [110]:
pred_line = re.compile('Predict raw 1-best result has been written into file.*')
bads = []
for f in os.scandir('final_setup/plo_decode_logs'):
    if f.name=='.ipynb_checkpoints' or f.name=='.log':
        continue
    with open(f.path, 'r') as fp:
        data = fp.read()
        if len(re.findall(pred_line, data))==0:
            bads.append (f.name)
            #os.remove(f.path)
sorted(bads)

[]

In [111]:
from collections import Counter

In [112]:
xxx = []
for f in os.scandir('final_setup/plo_decode_output'):
    if f.name=='.ipynb_checkpoints' or f.name=='.bmes':
        continue
    elif 'pruned' in f.name:
        xxx.append('.'.join(f.name.split('.')[:-2]))
Counter(xxx).most_common()

[]

## Evaluate decoded folder

In [14]:
erdf = pd.read_pickle('final_setup/plo_erdf.pkl')

In [113]:
import sys
sys.path.append('../NER')
import ne_evaluate_mentions as nem

In [114]:
scores = {}

In [115]:
if os.path.exists('final_setup/plo_scores.pkl'):
    scores = pickle.load(open('final_setup/plo_scores.pkl', 'rb'))

In [116]:
for file in os.scandir(output_folder):
    if file.name=='.ipynb_checkpoints':
        continue
    gold_name, inp, arch, w_embed, seed_num = file.name.split('.')[:-1]
    if (gold_name, inp, arch, w_embed, seed_num) not in scores:
        if len(gold_name.split('_'))>2:
            unit, pred_set, _ = gold_name.split('_')
            gold_path = decode_sets[unit][unit+'_'+pred_set+'_gold']
        else:
            unit, pred_set = gold_name.split('_')
            gold_path = decode_sets[unit][unit+'_'+pred_set]
        p, r, f = nem.evaluate_files(gold_path, file)
        scores[(gold_name, inp, arch, w_embed, seed_num)] = (p, r, f)
    

In [117]:
import pickle
pickle.dump(scores, open('final_setup/plo_scores.pkl', 'wb'))

In [118]:
score_tups = [(*k, *v) for k,v in scores.items()]

In [119]:
mev = pd.DataFrame(score_tups, columns=('gold_name', 'unit', 'arch', 
                                        'w_embed', 'seed_num', 
                                        'p_m', 'r_m', 'f_m'))
(mev[mev.gold_name.str.contains('dev')].groupby(['gold_name', 'unit', 'arch'])
 .f_m.agg(['max', 'min', 'mean', 'std', 'median', perc(95)]))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,max,min,mean,std,median,perc_95
gold_name,unit,arch,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
morph_dev_gold,morph,char_cnn,0.832967,0.808324,0.82401,0.010949,0.827374,0.832476
morph_dev_yap,morph,char_cnn,0.750838,0.730512,0.74472,0.009602,0.748765,0.750754
token_dev,multitok,char_lstm,0.805281,0.79299,0.79838,0.005585,0.797625,0.804554
token_dev,token,char_cnn,0.812775,0.802617,0.809305,0.003843,0.810544,0.812655


In [120]:
(mev[mev.gold_name.str.contains('dev')].groupby(['gold_name', 'unit', 'arch']).size())

gold_name       unit      arch     
morph_dev_gold  morph     char_cnn     4
morph_dev_yap   morph     char_cnn     4
token_dev       multitok  char_lstm    4
                token     char_cnn     6
dtype: int64

In [121]:
mev.head()

Unnamed: 0,gold_name,unit,arch,w_embed,seed_num,p_m,r_m,f_m
0,token_dev,token,char_cnn,ft_oov_tok,44_seed,0.843318,0.773784,0.807056
1,token_test,token,char_cnn,ft_oov_tok,44_seed,0.810934,0.772234,0.791111
2,token_dev,multitok,char_lstm,ft_oov_tok,44_seed,0.821826,0.780127,0.800434
3,token_test,multitok,char_lstm,ft_oov_tok,44_seed,0.790043,0.791757,0.790899
4,token_dev,token,char_cnn,ft_oov_tok,46_seed,0.844749,0.782241,0.812294


In [122]:
mev['pred_set'] = mev.gold_name.apply(lambda x: '_'.join(x.split('_')[1:]))

In [123]:
mev = mev.merge(erdf, how='left')

In [124]:
(mev[mev.pred_set.str.contains('dev')].groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
 .f_m.agg(['mean', 'std']).mul(100).round(2)
 .assign(mean = lambda x: x['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ x['std'].round(1).astype(str))[['mean']]
 .unstack([-2,-1]))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,embed_type,ft_oov,ft_oov
Unnamed: 0_level_2,Unnamed: 1_level_2,cm,Clash,Match
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3
morph,dev_gold,char_cnn,81.67 ± 1.2,83.13 ± 0.2
morph,dev_yap,char_cnn,73.89 ± 1.2,75.06 ± 0.0
multitok,dev,char_lstm,,79.84 ± 0.6
token,dev,char_cnn,,80.93 ± 0.4


In [92]:
x = (mev[mev.pred_set.str.contains('dev')].groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
 .f_m.agg([ 'mean', 'std']).mul(100).round(2)
 .assign(std = lambda x: x['std'].round(1))
 .unstack([-2,-1]))
x.columns = x.columns.reorder_levels([1,2,0])
pd.set_option("max_columns", 30)
x.sort_index(axis=1)


Unnamed: 0_level_0,Unnamed: 1_level_0,embed_type,ft_oov,ft_oov,ft_oov,ft_oov
Unnamed: 0_level_1,Unnamed: 1_level_1,cm,Clash,Clash,Match,Match
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,mean,std,mean,std
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
morph,dev_gold,char_cnn,81.67,1.2,83.13,0.2
morph,dev_yap,char_cnn,73.89,1.2,75.06,0.0
multitok,dev,char_lstm,,,80.02,0.5
token,dev,char_cnn,,,81.0,0.3


In [93]:
mev[(mev.unit=='morph') & (mev.pred_set.str.contains('pruned')) & (mev.embed_type=='ft_oov') & (mev.arch=='char_cnn')].groupby(['pred_set','cm']).f_m.mean().unstack()

cm
pred_set


In [97]:
mev['pred_set_sub'] = mev.pred_set.apply(lambda x: x.split('_')[1] if '_' in x else '')
mev['pred_set_main'] = mev.pred_set.apply(lambda x: x.split('_')[0] )
(mev[((mev.unit!='morph') & (mev.embed_type=='ft_oov') 
    ) 
    |
     ((mev.unit=='morph') 
       & (mev.embed_type=='ft_oov') 
      & (mev.arch=='char_cnn'))].groupby(['unit', 'pred_set_sub', 'cm', 'pred_set_main',])
 .f_m.mean().unstack().mul(100).round(2)
 .assign(ratio = lambda x: (x.test/x.dev -1).mul(100).round(1)))

Unnamed: 0_level_0,Unnamed: 1_level_0,pred_set_main,dev,test,ratio
unit,pred_set_sub,cm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
morph,gold,Clash,81.67,80.48,-1.5
morph,gold,Match,83.13,80.55,-3.1
morph,yap,Clash,73.89,70.28,-4.9
morph,yap,Match,75.06,70.36,-6.3
multitok,,Match,80.02,80.4,0.5
token,,Match,81.0,79.69,-1.6


In [95]:
mev.to_pickle('final_setup/plo_mev.pkl')

In [80]:
import os
from collections import defaultdict
for d in os.scandir('hp_search'):
    if d.name.startswith('models'):
        all_models_paths = defaultdict(list)
        all_models_epoch = defaultdict(lambda: -1)
        for f in os.scandir(d.path):
            if f.name!='.model' and f.name.endswith('.model'):
                a, c, _, e, _ = f.name.split('.')
                e = int(e)
                all_models_epoch[(a,c)] = max(alL_models_epoch[(a,c)], e)
                all_models_paths[(a,c)].append((e, f.path))
        for k, v in all_models_paths.items():
            for e, path in v:
                if e!=all_models_epoch[k]:
                    #os.remove(path)

        
