In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'paper' context and 'white' style to every figure drawn in this notebook.
sns.set_context('paper')
sns.set_style('white')

In [3]:
import pandas as pd
import configs
import numpy as np

In [4]:
import os
import json

## Create setup dicts
1. Word Unit+Labels: `morph, token, multitok`
1. Char Arch: `char_lstm, char_cnn, no_char`
1. Word Embedding: `ft_tok, ft_yap, ft_oov_tok, ft_oov_yap, w2v_tok, w2v_yap, no_word`

3 x 3 x 7 = **63 dicts**

In [7]:
# Folder holding the NCRF-formatted data files; file names below are relative to it.
data_folder = '../NER/data/for_ncrf'


def _gold_splits(prefix, suffix=''):
    """Return the train/dev/test file-name entries for one gold dataset variant."""
    return {
        split + '_dir': '{}_gold_{}{}.bmes'.format(prefix, split, suffix)
        for split in ('train', 'dev', 'test')
    }


# One entry per input unit / labelling scheme.
# Keys starting with '_' are descriptive metadata; create_conf_dict skips them.
datasets = {
    'morph': dict(_unit='morpheme', _scheme='bioes', **_gold_splits('morph')),
    'token': dict(_unit='token', _scheme='bioes', **_gold_splits('token', '_fix')),
    'multitok': dict(_unit='token', _scheme='concat_bioes', seg=False,
                     **_gold_splits('token', '_concat')),
}

## Create PER-LOC-ORG only datasets

In [8]:
# Category translation for the PER/LOC/ORG-only data:
# FAC and GPE are folded into LOC; categories mapped to None are not kept.
trans_map = dict(
    ANG=None,
    DUC=None,
    EVE=None,
    FAC='LOC',
    GPE='LOC',
    LOC='LOC',
    ORG='ORG',
    PER='PER',
    WOA=None,
)


In [32]:
import re

# Not used by the conversion below (tags are split explicitly); kept so the name
# stays defined. Written as a raw string so the backslashes are valid escapes.
cat_re = re.compile(r'.*\-([^\^]+)\^?')


def to_plo_tag(tag, mapping):
    """Translate one tag into the reduced PER/LOC/ORG tag set.

    `tag` may hold several '^'-separated sub-tags (the concatenated scheme).
    Each sub-tag is either 'O' or '<prefix>-<category>'. The category is looked
    up in `mapping`; a category mapped to None becomes 'O', so the number of
    sub-tags (and the word/label pairing in the output file) is preserved.

    Raises ValueError for a sub-tag without '-', and KeyError for a category
    missing from `mapping`.
    """
    converted = []
    for sub_tag in tag.split('^'):
        if sub_tag == 'O':
            converted.append('O')
            continue
        prefix, dash, category = sub_tag.partition('-')
        if not dash:
            raise ValueError('malformed tag: %r' % sub_tag)
        target = mapping[category]
        # Dropped categories are written as 'O' rather than silently omitted,
        # which previously produced lines with an empty label.
        converted.append('O' if target is None else prefix + '-' + target)
    return '^'.join(converted)


# Write a '<name>_plo.bmes' copy of every train/dev/test file of every dataset.
for n, ds in datasets.items():
    for k in ds:
        if 'train' in k or 'dev' in k or 'test' in k:
            path = os.path.join(data_folder, ds[k])
            new_path = os.path.join(data_folder, ds[k].split('.')[0]+'_plo.bmes')
            print(path)
            print(new_path)
            # Both handles are closed by the context manager; the encoding is
            # explicit, matching how the .conf files are written in this notebook.
            with open(path, 'r', encoding='utf8') as src, \
                    open(new_path, 'w', encoding='utf8') as of:
                for raw_line in src:
                    fields = raw_line.split(' ')
                    word = fields[0].strip()
                    if word == '':
                        # Blank line = sentence boundary; keep it.
                        of.write('\n')
                        continue
                    try:
                        new_tag = to_plo_tag(fields[-1].strip(), trans_map)
                    except (ValueError, KeyError):
                        # Show the offending line before propagating the error.
                        print(fields)
                        raise
                    of.write(word+' '+new_tag+'\n')

../NER/data/for_ncrf/morph_gold_train.bmes
../NER/data/for_ncrf/morph_gold_train_plo.bmes
../NER/data/for_ncrf/morph_gold_dev.bmes
../NER/data/for_ncrf/morph_gold_dev_plo.bmes
../NER/data/for_ncrf/morph_gold_test.bmes
../NER/data/for_ncrf/morph_gold_test_plo.bmes
../NER/data/for_ncrf/token_gold_train_fix.bmes
../NER/data/for_ncrf/token_gold_train_fix_plo.bmes
../NER/data/for_ncrf/token_gold_dev_fix.bmes
../NER/data/for_ncrf/token_gold_dev_fix_plo.bmes
../NER/data/for_ncrf/token_gold_test_fix.bmes
../NER/data/for_ncrf/token_gold_test_fix_plo.bmes
../NER/data/for_ncrf/token_gold_train_concat.bmes
../NER/data/for_ncrf/token_gold_train_concat_plo.bmes
../NER/data/for_ncrf/token_gold_dev_concat.bmes
../NER/data/for_ncrf/token_gold_dev_concat_plo.bmes
../NER/data/for_ncrf/token_gold_test_concat.bmes
../NER/data/for_ncrf/token_gold_test_concat_plo.bmes


In [7]:
data_folder = '../NER/data/for_ncrf'

# (dataset key, unit, scheme, extra conf entries, file-name pattern per split)
_PLO_SPECS = (
    ('morph', 'morpheme', 'bioes', {}, 'morph_gold_{}_plo.bmes'),
    ('token', 'token', 'bioes', {}, 'token_gold_{}_fix_plo.bmes'),
    ('multitok', 'token', 'concat_bioes', {'seg': False}, 'token_gold_{}_concat_plo.bmes'),
)

# Same layout as `datasets`, but pointing at the '_plo' (PER/LOC/ORG-only) files.
new_datasets = {
    key: {
        '_unit': unit,
        '_scheme': scheme,
        **extra,
        **{split + '_dir': pattern.format(split) for split in ('train', 'dev', 'test')},
    }
    for key, unit, scheme, extra, pattern in _PLO_SPECS
}

In [33]:
# Settings shared by every generated configuration ("FIXED" across the experiment).
# The dataset- and architecture-specific grids are applied on top of these.
default_grid = dict(
    word_seq_feature='LSTM',
    word_emb_dim=300,
    char_emb_dim=30,
    iteration=200,
    bilstm=True,
    norm_word_emb=False,
    norm_char_emb=False,
    ave_batch_loss=False,
    use_crf=True,
    l2=1e-8,
    lstm_layer=2,
    batch_size=8,
    number_normalized=True,
    optimizer='SGD',
    lr_decay=0.05,
    momentum=0,
    nbest=1,
    hidden_dim=200,
    dropout=0.5,
)
    
# Per-dataset overrides: only the learning rate differs between datasets.
dataset_grids = {
    name: {'learning_rate': rate}
    for name, rate in (('multitok', 0.005), ('morph', 0.01), ('token', 0.01))
}
# Per-architecture overrides selecting the character-level feature extractor
# (LSTM, CNN, or none at all).
arch_grids = dict(
    char_lstm=dict(char_seq_feature='LSTM', use_char=True, char_hidden_dim=70),
    char_cnn=dict(char_seq_feature='CNN', use_char=True, char_hidden_dim=70,
                  char_kernel_size=7),
    no_char=dict(use_char=False),
)


In [34]:
# Both active entries share one file-name pattern; only the tokenisation variant differs.
_OOV_FASTTEXT = 'data/htb_all_words.wikipedia.alt_tok.{}.fasttext_skipgram.txt'

# Embedding name -> vector file. Commented-out entries are alternatives that are
# switched off for this run of the cell.
word_embedding_files = {
    #'ft_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.fasttext_skipgram.model.vec.nofirstline',
    #'ft_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.fasttext_skipgram.model.vec.nofirstline',
    'ft_oov_yap': _OOV_FASTTEXT.format('yap_form'),
    'ft_oov_tok': _OOV_FASTTEXT.format('tokenized'),
    #'w2v_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.word2vec_skipgram.txt.nofirstline',
    #'w2v_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.word2vec_skipgram.txt.nofirstline',
    #'glv_yap': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.glove.txt',
    #'glv_tok': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.tokenized.glove.txt',
    #'no_word': None,
}

In [35]:
# Output locations for the PER/LOC/ORG ("plo") experiment: trained models,
# generated .conf files, their JSON copies, and training logs.
models_folder, conf_folder, json_folder, logs_folder = (
    'final_setup/plo_' + leaf for leaf in ('models', 'conf', 'conf_json', 'logs')
)

In [36]:
# Ten consecutive seeds, 44 through 53; one configuration per seed is generated below.
seed_num_options = 44 + np.arange(10)
seed_num_options

array([44, 45, 46, 47, 48, 49, 50, 51, 52, 53])

In [10]:
def create_conf_dict(model_base_name, dataset, arch, emb_name, seed_num):
    """Assemble the full training configuration for one model.

    Reads the notebook-level settings `models_folder`, `data_folder`,
    `new_datasets`, `word_embedding_files`, `default_grid`, `dataset_grids`
    and `arch_grids`.

    Args:
        model_base_name: name used for the model path under `models_folder`.
        dataset: key into `new_datasets` and `dataset_grids`.
        arch: key into `arch_grids`.
        emb_name: key into `word_embedding_files`, or 'no_word' for no
            pretrained word embeddings.
        seed_num: not used when building the dict (the seed only appears in
            the model name chosen by the caller).

    Returns:
        A new dict: status/model_dir first, then the dataset entries (keys
        starting with '_' are skipped, the three *_dir entries are prefixed
        with `data_folder`), then optionally 'word_emb_dir', then the default,
        dataset and architecture grids applied in that order.
    """
    path_keys = ('train_dir', 'dev_dir', 'test_dir')
    conf = {
        'status': 'train',
        'model_dir': os.path.join(models_folder, model_base_name),
    }

    for key, value in new_datasets[dataset].items():
        if key.startswith('_'):
            continue  # metadata only, not a configuration option
        conf[key] = os.path.join(data_folder, value) if key in path_keys else value

    # 'no_word' is checked first so it never has to exist in word_embedding_files.
    if emb_name != 'no_word' and word_embedding_files[emb_name] is not None:
        conf['word_emb_dir'] = word_embedding_files[emb_name]

    # Later grids override earlier ones.
    for overrides in (default_grid, dataset_grids[dataset], arch_grids[arch]):
        conf.update(overrides)

    return conf
    

In [11]:
# Build one configuration per (dataset, char architecture, word embedding, seed).
# The dataset keys come from `new_datasets`, the mapping create_conf_dict()
# actually indexes, so the loop cannot ask for a dataset that mapping lacks.
confs = {}
for dataset in new_datasets:
    for arch in arch_grids:
        for emb_name in word_embedding_files:
            for seed_num in seed_num_options:
                # e.g. '<dataset>.<arch>.<emb_name>.44_seed'
                model_base_name = '.'.join([dataset, arch, emb_name, str(seed_num)+'_seed'])
                confs[model_base_name] = create_conf_dict(model_base_name, dataset,
                                                          arch, emb_name, seed_num)
            

In [51]:
# Number of generated configurations (datasets x char architectures x embeddings x seeds).
len(confs)

630

In [13]:
# Inspect one generated configuration.
# NOTE(review): this key needs the 'ft_yap' embedding, which is commented out in
# word_embedding_files as shown in this notebook — confirm it is enabled before re-running.
confs['morph.char_lstm.ft_yap.44_seed']

{'status': 'train',
 'model_dir': 'final_setup/models/morph.char_lstm.ft_yap.44_seed',
 'train_dir': '../NER/data/for_ncrf/morph_gold_train.bmes',
 'dev_dir': '../NER/data/for_ncrf/morph_gold_dev.bmes',
 'test_dir': '../NER/data/for_ncrf/morph_gold_test.bmes',
 'word_emb_dir': '../wordembedding-hebrew/vectors_alt_tok/wikipedia.alt_tok.yap_form.fasttext_skipgram.model.vec.nofirstline',
 'word_seq_feature': 'LSTM',
 'word_emb_dim': 300,
 'char_emb_dim': 30,
 'iteration': 200,
 'bilstm': True,
 'norm_word_emb': False,
 'norm_char_emb': False,
 'ave_batch_loss': False,
 'use_crf': True,
 'l2': 1e-08,
 'lstm_layer': 2,
 'batch_size': 8,
 'number_normalized': True,
 'optimizer': 'SGD',
 'lr_decay': 0.05,
 'momentum': 0,
 'nbest': 1,
 'hidden_dim': 200,
 'dropout': 0.5,
 'learning_rate': 0.01,
 'char_seq_feature': 'LSTM',
 'use_char': True,
 'char_hidden_dim': 70}

In [15]:
import pickle

# Persist the configurations so the log-reading section can reload them.
# The folder is created first because this cell runs before the cell that
# creates the other output folders; the file is closed by the context manager.
os.makedirs('final_setup', exist_ok=True)
with open('final_setup/confs.pkl', 'wb') as confs_file:
    pickle.dump(confs, confs_file)

## Create conf files for setup dicts
1. Random Seed: 10 different `(44, 45, 46...)`
1. `morph.char_lstm.ft_tok.44_seed.conf`
1. `multitok.no_char.no_word.47_seed.conf`

63 * 10 = **630 conf files**

In [16]:
# Create the output folders. makedirs also creates the missing parent
# ('final_setup'), and exist_ok makes the cell safe to re-run.
for folder in (models_folder, conf_folder, json_folder, logs_folder):
    os.makedirs(folder, exist_ok=True)

In [17]:
# Write every configuration twice: as a 'key=value' .conf file and as a JSON
# copy under json_folder.
for name, conf in confs.items():
    conf_text = ''.join(k+'='+str(v)+'\n' for k, v in conf.items())
    with open(os.path.join(conf_folder, name+'.conf'), 'w', encoding='utf8') as conf_file:
        conf_file.write(conf_text)
    with open(os.path.join(json_folder, name+'.json'), 'w') as json_file:
        json_file.write(json.dumps(conf))

## Create `main.X.py` files
Only difference is `seed_num`: 
- `main.44.py` will have `seed_num = 44`

## Create `final_setup_run.py`
1. seed_num match: runs `.conf` files with matching `main.X.py` file **only**.
1. Choose device. 
1. Choose conf prefix.
1. Skip confs that are running or ran already (using `.dset` file)

In [7]:
# All configured embedding names, plus None for "no pretrained embedding".
emb_options = [*word_embedding_files, None]
emb_options

['alt_tok_yap_ft_sg',
 'alt_tok_tokenized_ft_sg',
 'htb_all_alt_tok_yap_ft_sg',
 'htb_all_alt_tok_tokenized_ft_sg',
 'alt_tok_yap_w2v_sg',
 'alt_tok_tokenized_w2v_sg',
 None]

## Read logs

In [50]:
import pickle

# Reload the configurations written by the setup section.
# pickle can execute arbitrary code on load: only use files this notebook produced.
with open('final_setup/confs.pkl', 'rb') as confs_file:
    confs = pickle.load(confs_file)

In [59]:
import re
import os

# Matches the dev-evaluation lines of a training log. p/r/f are optional:
#   Dev: time: 0.94s speed: 536.09st/s; acc: 0.9043
#   Dev: time: 3.42s, speed: 146.59st/s; acc: 0.9546, p: 0.7577, r: 0.6393, f: 0.6935
# Raw string: the pattern is unchanged, but '\d' and '\.' are no longer invalid
# string escapes.
DEV_RES_LINE = re.compile(
    r'Dev: .*; acc: (?P<acc>[^,]+)(?:, p: (?P<p>[^,]+), r: (?P<r>[^,]+), f: (?P<f>[-\d\.]+))?')

mtimes = []   # modification time of every log file read
res = []      # one record per matched dev line
archs = []    # architecture name of every log file read
for f in os.scandir(logs_folder):
    if f.name.startswith('.ipy'):
        continue
    mtimes.append(os.path.getmtime(f.path))

    # Log names are '<unit>.<arch>.<w_embed>.<seed_num>.<extension>'.
    name_parts = f.name.split('.')[:-1]
    unit, arch, w_embed, seed_num = name_parts
    model_base_name = '.'.join(name_parts)
    model_no_seed = '.'.join(name_parts[:-1])
    archs.append(arch)

    # Every record carries the run identifiers plus the run's full configuration.
    params = {'model_base_name': model_base_name, 'arch': arch,
              'unit': unit, 'w_embed': w_embed, 'seed_num': seed_num,
              'model_no_seed': model_no_seed}
    params.update(confs[model_base_name])

    with open(f.path, 'r') as fp:
        epoch = 0  # counts matched dev lines only, in file order
        for line in fp:
            m = DEV_RES_LINE.match(line)
            if not m:
                continue
            # Unmatched optional groups stay None; the rest become floats.
            r = {k: (float(v) if v is not None else None)
                 for k, v in m.groupdict().items()}
            r.update(params)
            r['epoch'] = epoch
            epoch += 1
            res.append(r)

rdf = pd.DataFrame(res)

# Derived columns:
#  - file names of the per-epoch model and of the run's .dset file
#  - 'NoChar' wherever the configuration has no char_seq_feature
#  - relevant_score: f when present, otherwise acc
rdf = rdf.assign(
    model_file_name=rdf.model_base_name + '.' + rdf.epoch.astype(str) + '.model',
    dset_file_name=rdf.model_base_name +'.dset',
    char_seq_feature=rdf.char_seq_feature.fillna('NoChar'),
    relevant_score=rdf.f.fillna(rdf.acc),
)

def get_embed_unit(s):
    """Return the unit an embedding name refers to.

    'morph' if the name contains 'yap', else 'token' if it contains 'tok',
    else 'na'. 'yap' takes precedence when both substrings occur.
    """
    for marker, unit in (('yap', 'morph'), ('tok', 'token')):
        if marker in s:
            return unit
    return 'na'

def get_clash_match(s):
    """Classify a row by whether its embedding unit agrees with its input unit.

    `s` must expose `embed_unit` and `input_unit` attributes.
    Returns 'na' when embed_unit is 'na', 'Match' when the two units are equal,
    and 'Clash' otherwise.
    """
    if s.embed_unit == 'na':
        return 'na'
    return 'Match' if s.embed_unit == s.input_unit else 'Clash'
    
# Unit of the model input: 'morph' for morph models, 'token' for everything else.
rdf['input_unit'] = rdf.unit.apply(lambda x: 'morph' if x=='morph' else 'token')
rdf['embed_unit'] = rdf.w_embed.apply(get_embed_unit)
# '_tok|_yap' is a regular expression. regex=True is required: where str.replace
# defaults to literal matching, the pattern would match nothing and embed_type
# would silently stay equal to w_embed.
rdf['embed_type'] = rdf.w_embed.str.replace('_tok|_yap', '', regex=True)
rdf['cm'] = rdf.apply(get_clash_match, axis=1)

# Keep one row per run: the best relevant_score, and among ties the earliest epoch.
run_keys = ['seed_num', 'arch', 'unit', 'w_embed']
best_score = rdf.groupby(run_keys).relevant_score.transform('max')
erdf = rdf[best_score == rdf.relevant_score]
first_epoch = erdf.groupby(run_keys).epoch.transform('min')
erdf = erdf[first_epoch == erdf.epoch]

In [60]:
# (rows, columns) of the best-epoch table.
erdf.shape

(630, 49)

In [61]:
# Coverage check: number of distinct seeds per (unit, arch), one column per embedding.
erdf.groupby(['unit', 'arch', 'w_embed']).seed_num.nunique().unstack()

Unnamed: 0_level_0,w_embed,ft_oov_tok,ft_oov_yap,ft_tok,ft_yap,glv_tok,glv_yap,no_word
unit,arch,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
morph,char_cnn,10,10,10,10,10,10,10
morph,char_lstm,10,10,10,10,10,10,10
morph,no_char,10,10,10,10,10,10,10
multitok,char_cnn,10,10,10,10,10,10,10
multitok,char_lstm,10,10,10,10,10,10,10
multitok,no_char,10,10,10,10,10,10,10
token,char_cnn,10,10,10,10,10,10,10
token,char_lstm,10,10,10,10,10,10,10
token,no_char,10,10,10,10,10,10,10


In [1214]:
# Span between the oldest and newest log modification time, divided by the number
# of logs and converted to minutes.
# NOTE(review): this equals time per run only if runs were executed back to back — confirm.
print ('Mean time per run:', round((max(mtimes) - min(mtimes) )/ len(mtimes) / 60, 2), 'minutes')

Mean time per run: 7.15 minutes


In [1215]:
# Mean best score per (unit, arch), in percent, with embed_type and Match/Clash as columns.
erdf.groupby(['unit', 'arch', 'embed_type', 'cm']).relevant_score.mean().unstack([-2,-1]).mul(100).round(2)

Unnamed: 0_level_0,embed_type,ft,ft,ft_oov,ft_oov,glv,glv,no_word
Unnamed: 0_level_1,cm,Clash,Match,Clash,Match,Clash,Match,na
unit,arch,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
morph,char_cnn,79.39,78.91,79.7,80.03,77.89,78.0,63.02
morph,char_lstm,78.79,78.72,79.42,79.76,77.87,78.62,62.85
morph,no_char,78.57,77.96,79.7,79.71,76.38,76.42,59.76
multitok,char_cnn,92.58,93.92,93.23,94.26,92.39,93.36,88.69
multitok,char_lstm,92.62,93.88,93.19,94.25,92.28,93.33,88.87
multitok,no_char,91.33,92.74,90.27,93.55,90.57,90.49,82.52
token,char_cnn,70.45,75.8,75.97,78.35,69.0,74.2,56.38
token,char_lstm,70.73,76.77,75.67,78.12,69.36,74.12,56.39
token,no_char,69.1,74.72,74.76,77.4,66.42,71.24,52.83


In [24]:
import numpy as np 
def perc(n):
    """Return a function computing the n-th percentile of its argument.

    The returned function is named 'perc_<n>', so an aggregation that labels
    its output by function name gets a readable column label.
    """
    def nth_percentile(values):
        return np.percentile(values, n)

    nth_percentile.__name__ = 'perc_%s' % n
    return nth_percentile

In [1216]:
# Distribution of the best scores per (unit, char feature), in percent.
erdf.groupby(['unit', 'char_seq_feature']).relevant_score.agg(['max', 'min', 'mean', 'std', 'median', perc(95)]).mul(100).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min,mean,std,median,perc_95
unit,char_seq_feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
morph,CNN,80.68,62.08,76.71,5.7,78.84,80.53
morph,LSTM,80.9,61.34,76.58,5.71,78.7,80.24
morph,NoChar,80.94,58.71,75.5,6.62,77.82,80.09
multitok,CNN,94.4,88.5,92.63,1.74,93.18,94.27
multitok,LSTM,94.37,88.7,92.63,1.68,93.16,94.29
multitok,NoChar,93.73,81.78,90.21,3.37,90.66,93.57
token,CNN,79.16,55.43,71.45,6.92,74.08,78.42
token,LSTM,78.65,55.33,71.6,6.94,74.28,78.4
token,NoChar,77.97,50.9,69.5,7.71,70.97,77.6


In [1217]:
# Save the best-epoch table; the evaluation section reloads it from this path.
erdf.to_pickle('final_setup/erdf.pkl')

## Decode

In [18]:
# Where decoded predictions and the decode .conf files are written.
output_folder, decode_conf_folder = (
    'final_setup/' + leaf for leaf in ('decode_output', 'decode_conf')
)

In [19]:
# Create the decode folders. makedirs also creates a missing parent folder,
# and exist_ok makes the cell safe to re-run.
for folder in (output_folder, decode_conf_folder):
    os.makedirs(folder, exist_ok=True)
    
_NCRF_DATA = '../NER/data/for_ncrf/'

# For every model unit: decode-set name -> input file.
# Morph models are decoded on both gold and YAP input; token and multitok models
# on their own dev/test files.
decode_sets = {
    'morph': {
        'morph_{}_{}'.format(split, source):
            _NCRF_DATA + 'morph_{}_{}{}.bmes'.format(source, split, suffix)
        for split in ('dev', 'test')
        for source, suffix in (('gold', ''), ('yap', '_dummy_o'))
    },
    'token': {
        'token_' + split: _NCRF_DATA + 'token_gold_{}_fix.bmes'.format(split)
        for split in ('dev', 'test')
    },
    'multitok': {
        'token_' + split: _NCRF_DATA + 'token_gold_{}_concat.bmes'.format(split)
        for split in ('dev', 'test')
    },
}

In [1218]:
params = { 'status': 'decode' }

# One decode .conf per (best model, decode set of that model's unit).
# Existing .conf files are left untouched.
for _, row in erdf.iterrows():
    for name, set_path in decode_sets[row['unit']].items():
        stem = name+'.'+row['model_base_name']
        conf_path = os.path.join(decode_conf_folder, stem+'.decode.conf')
        if os.path.exists(conf_path):
            continue
        row_par = dict(
            params,
            load_model_dir=os.path.join(models_folder, row['model_file_name']),
            dset_dir=os.path.join(models_folder, row['dset_file_name']),
            decode_dir=os.path.join(output_folder, stem+'.bmes'),
            raw_dir=set_path,
        )
        with open(conf_path, 'w', encoding='utf8') as of:
            of.writelines(k+'='+str(v)+'\n' for k, v in row_par.items())
 

In [2]:
import os, re

In [3]:
# A decode log counts as complete if it contains the line below; logs without
# it are listed and DELETED.
pred_line = re.compile('Predict raw 1-best result has been written into file.*')
bads = []
for f in os.scandir('final_setup/decode_logs2'):
    if f.name in ('.ipynb_checkpoints', '.log'):
        continue
    with open(f.path, 'r') as fp:
        if pred_line.search(fp.read()) is None:
            bads.append(f.name)
            os.remove(f.path)
sorted(bads)

[]

In [4]:
from collections import Counter

In [48]:
# Count 'pruned' decode outputs per model: the last two dot-separated fields of
# each file name are dropped before counting.
xxx = [
    '.'.join(entry.name.split('.')[:-2])
    for entry in os.scandir('final_setup/decode_output')
    if entry.name not in ('.ipynb_checkpoints', '.bmes') and 'pruned' in entry.name
]
Counter(xxx).most_common()

[('morph_dev_pruned.morph.char_cnn.ft_oov_tok', 10),
 ('morph_dev_pruned.morph.no_char.ft_oov_tok', 10),
 ('morph_dev_pruned.morph.no_char.ft_oov_yap', 10),
 ('morph_dev_pruned.morph.no_char.ft_tok', 10),
 ('morph_dev_pruned.morph.no_char.ft_yap', 10),
 ('morph_dev_pruned.morph.no_char.glv_tok', 10),
 ('morph_dev_pruned.morph.no_char.glv_yap', 10),
 ('morph_dev_pruned.morph.no_char.no_word', 10),
 ('morph_dev_pruned.morph.char_cnn.ft_oov_yap', 10),
 ('morph_test_pruned.morph.char_cnn.ft_oov_tok', 10),
 ('morph_test_pruned.morph.char_cnn.ft_oov_yap', 10),
 ('morph_test_pruned.morph.char_cnn.ft_tok', 10),
 ('morph_test_pruned.morph.char_cnn.ft_yap', 10),
 ('morph_test_pruned.morph.char_cnn.glv_tok', 10),
 ('morph_test_pruned.morph.char_cnn.glv_yap', 10),
 ('morph_test_pruned.morph.char_cnn.no_word', 10),
 ('morph_test_pruned.morph.char_lstm.ft_oov_tok', 10),
 ('morph_test_pruned.morph.char_lstm.ft_oov_yap', 10),
 ('morph_test_pruned.morph.char_lstm.ft_tok', 10),
 ('morph_test_pruned.morp

## Evaluate decoded folder

In [14]:
# Reload the best-epoch table saved by the log-reading section.
erdf = pd.read_pickle('final_setup/erdf.pkl')

In [15]:
import sys
sys.path.append('../NER')
import ne_evaluate_mentions as nem

In [17]:
# Score cache: (gold_name, inp, arch, w_embed, seed_num) -> (p, r, f).
scores = {}

In [1319]:
# Reuse previously computed scores when a cache file exists.
# pickle can execute arbitrary code on load: only use files this notebook produced.
if os.path.exists('final_setup/scores.pkl'):
    with open('final_setup/scores.pkl', 'rb') as scores_file:
        scores = pickle.load(scores_file)

In [35]:
# Score every decoded file that is not in the cache yet.
# File names are '<gold_name>.<inp>.<arch>.<w_embed>.<seed_num>.<extension>'.
for file in os.scandir(output_folder):
    if file.name=='.ipynb_checkpoints':
        continue
    key = tuple(file.name.split('.')[:-1])
    gold_name, inp, arch, w_embed, seed_num = key
    if key in scores:
        continue
    name_parts = gold_name.split('_')
    if len(name_parts) > 2:
        # Three-part names ('<unit>_<set>_<variant>') are scored against the
        # '<unit>_<set>_gold' file.
        unit, pred_set, _ = name_parts
        gold_key = unit+'_'+pred_set+'_gold'
    else:
        unit, pred_set = name_parts
        gold_key = unit+'_'+pred_set
    # NOTE(review): the gold file is chosen from gold_name's unit, not from `inp`,
    # so multitok outputs are scored against the 'token' gold files — confirm intended.
    gold_path = decode_sets[unit][gold_key]
    p, r, f = nem.evaluate_files(gold_path, file)
    scores[key] = (p, r, f)
    

In [36]:
import pickle

# Persist the score cache; the context manager guarantees the file is flushed
# and closed.
with open('final_setup/scores2.pkl', 'wb') as scores_file:
    pickle.dump(scores, scores_file)

In [62]:
# Flatten each cache entry into one record: the key fields followed by the scores.
score_tups = [key + tuple(values) for key, values in scores.items()]

In [63]:
# Mention-level evaluation table: the five key fields of each score entry,
# followed by precision / recall / F1 (suffix '_m').
mev = pd.DataFrame(score_tups, columns=('gold_name', 'unit', 'arch', 
                                        'w_embed', 'seed_num', 
                                        'p_m', 'r_m', 'f_m'))
# Distribution of f_m over all rows whose gold_name contains 'dev'.
(mev[mev.gold_name.str.contains('dev')].groupby(['gold_name', 'unit', 'arch'])
 .f_m.agg(['max', 'min', 'mean', 'std', 'median', perc(95)]))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,max,min,mean,std,median,perc_95
gold_name,unit,arch,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
morph_dev_gold,morph,char_cnn,0.80683,0.620773,0.767274,0.05693,0.788462,0.805329
morph_dev_gold,morph,char_lstm,0.809829,0.614118,0.766029,0.056991,0.787123,0.802357
morph_dev_gold,morph,no_char,0.809372,0.587796,0.755272,0.066165,0.77819,0.801265
morph_dev_pruned,morph,char_cnn,0.788584,0.582927,0.741672,0.062661,0.76452,0.785235
morph_dev_pruned,morph,char_lstm,0.796555,0.57783,0.739673,0.062029,0.762613,0.783816
morph_dev_pruned,morph,no_char,0.781876,0.544061,0.727769,0.072962,0.752815,0.778265
morph_dev_yap,morph,char_cnn,0.73774,0.558252,0.692242,0.052256,0.710569,0.73323
morph_dev_yap,morph,char_lstm,0.737527,0.555024,0.690014,0.050185,0.707463,0.726625
morph_dev_yap,morph,no_char,0.737527,0.526582,0.682518,0.060684,0.702264,0.728002
token_dev,multitok,char_cnn,0.779193,0.540201,0.707866,0.068994,0.724617,0.771131


In [64]:
# Number of scored dev rows per (gold_name, unit, arch).
(mev[mev.gold_name.str.contains('dev')].groupby(['gold_name', 'unit', 'arch']).size())

gold_name         unit      arch     
morph_dev_gold    morph     char_cnn     70
                            char_lstm    70
                            no_char      70
morph_dev_pruned  morph     char_cnn     70
                            char_lstm    70
                            no_char      70
morph_dev_yap     morph     char_cnn     70
                            char_lstm    70
                            no_char      70
token_dev         multitok  char_cnn     70
                            char_lstm    70
                            no_char      70
                  token     char_cnn     70
                            char_lstm    70
                            no_char      70
dtype: int64

In [65]:
# Peek at the first rows of the evaluation table.
mev.head()

Unnamed: 0,gold_name,unit,arch,w_embed,seed_num,p_m,r_m,f_m
0,morph_dev_gold,morph,char_cnn,ft_yap,44_seed,0.859155,0.733467,0.791351
1,morph_dev_yap,morph,char_cnn,ft_yap,44_seed,0.780193,0.647295,0.707558
2,morph_test_gold,morph,char_cnn,ft_yap,44_seed,0.80485,0.747854,0.775306
3,morph_dev_gold,morph,char_cnn,ft_oov_tok,44_seed,0.843612,0.767535,0.803778
4,morph_test_yap,morph,char_cnn,ft_yap,44_seed,0.721411,0.636266,0.676169


In [66]:
# pred_set is everything after the first underscore of gold_name
# (e.g. 'morph_dev_gold' -> 'dev_gold', 'token_dev' -> 'dev').
mev['pred_set'] = mev.gold_name.apply(lambda name: name.partition('_')[2])

In [67]:
# Attach the per-model columns of erdf. No `on=` is given, so the join uses every
# column name the two frames share (presumably unit/arch/w_embed/seed_num — confirm).
# NOTE(review): mev is overwritten, so re-running this cell merges an already merged frame.
mev = mev.merge(erdf, how='left')

In [68]:
# Dev sets: 'mean ± std' of f_m (in percent) per (unit, pred_set, arch), with
# embed_type and Match/Clash as columns. Only the formatted 'mean' column is kept.
(mev[mev.pred_set.str.contains('dev')].groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
 .f_m.agg(['mean', 'std']).mul(100).round(2)
 .assign(mean = lambda x: x['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ x['std'].round(1).astype(str))[['mean']]
 .unstack([-2,-1]))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,embed_type,ft,ft,ft_oov,ft_oov,glv,glv,no_word
Unnamed: 0_level_2,Unnamed: 1_level_2,cm,Clash,Match,Clash,Match,Clash,Match,na
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
morph,dev_gold,char_cnn,79.39 ± 0.5,78.92 ± 0.4,79.72 ± 0.6,80.03 ± 0.7,77.93 ± 0.4,78.04 ± 0.5,63.06 ± 0.6
morph,dev_gold,char_lstm,78.81 ± 0.6,78.73 ± 0.6,79.43 ± 0.6,79.81 ± 0.7,77.89 ± 0.6,78.65 ± 0.8,62.91 ± 0.9
morph,dev_gold,no_char,78.59 ± 0.7,77.96 ± 0.6,79.75 ± 0.5,79.75 ± 0.7,76.40 ± 0.5,76.43 ± 0.7,59.81 ± 0.5
morph,dev_pruned,char_cnn,76.23 ± 0.6,76.88 ± 0.4,77.55 ± 0.8,78.13 ± 0.7,75.18 ± 0.7,76.03 ± 0.5,59.16 ± 0.6
morph,dev_pruned,char_lstm,75.55 ± 0.7,76.66 ± 0.8,76.89 ± 0.6,78.18 ± 0.7,74.89 ± 1.0,76.39 ± 1.2,59.21 ± 1.1
morph,dev_pruned,no_char,75.48 ± 0.8,76.25 ± 0.6,77.53 ± 0.5,77.00 ± 0.6,73.91 ± 0.6,73.88 ± 1.0,55.40 ± 0.6
morph,dev_yap,char_cnn,71.59 ± 0.7,71.10 ± 0.3,72.34 ± 1.0,72.59 ± 0.8,70.14 ± 0.7,69.99 ± 0.6,56.82 ± 0.6
morph,dev_yap,char_lstm,70.84 ± 0.9,70.55 ± 0.8,71.74 ± 0.7,72.37 ± 0.7,69.93 ± 1.0,70.46 ± 0.9,57.12 ± 1.1
morph,dev_yap,no_char,70.93 ± 0.8,70.48 ± 0.7,72.01 ± 0.6,72.57 ± 0.9,69.03 ± 0.5,68.80 ± 1.0,53.95 ± 0.7
multitok,dev,char_cnn,70.94 ± 0.7,74.86 ± 0.4,75.09 ± 0.7,77.04 ± 0.6,69.80 ± 1.0,72.69 ± 0.8,55.09 ± 0.6


In [69]:
# Dev sets: mean and std of f_m (in percent) as separate numeric columns, with
# the column levels reordered to (embed_type, cm, statistic).
x = (
    mev[mev.pred_set.str.contains('dev')]
    .groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
    .f_m.agg(['mean', 'std']).mul(100).round(2)
    .assign(std=lambda stats: stats['std'].round(1))
    .unstack([-2, -1])
)
x.columns = x.columns.reorder_levels([1, 2, 0])
# Use the fully qualified option name: the bare "max_columns" pattern can match
# more than one option, in which case set_option raises an error.
pd.set_option("display.max_columns", 30)
x.sort_index(axis=1)


Unnamed: 0_level_0,Unnamed: 1_level_0,embed_type,ft,ft,ft,ft,ft_oov,ft_oov,ft_oov,ft_oov,glv,glv,glv,glv,no_word,no_word
Unnamed: 0_level_1,Unnamed: 1_level_1,cm,Clash,Clash,Match,Match,Clash,Clash,Match,Match,Clash,Clash,Match,Match,na,na
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
morph,dev_gold,char_cnn,79.39,0.5,78.92,0.4,79.72,0.6,80.03,0.7,77.93,0.4,78.04,0.5,63.06,0.6
morph,dev_gold,char_lstm,78.81,0.6,78.73,0.6,79.43,0.6,79.81,0.7,77.89,0.6,78.65,0.8,62.91,0.9
morph,dev_gold,no_char,78.59,0.7,77.96,0.6,79.75,0.5,79.75,0.7,76.4,0.5,76.43,0.7,59.81,0.5
morph,dev_pruned,char_cnn,76.23,0.6,76.88,0.4,77.55,0.8,78.13,0.7,75.18,0.7,76.03,0.5,59.16,0.6
morph,dev_pruned,char_lstm,75.55,0.7,76.66,0.8,76.89,0.6,78.18,0.7,74.89,1.0,76.39,1.2,59.21,1.1
morph,dev_pruned,no_char,75.48,0.8,76.25,0.6,77.53,0.5,77.0,0.6,73.91,0.6,73.88,1.0,55.4,0.6
morph,dev_yap,char_cnn,71.59,0.7,71.1,0.3,72.34,1.0,72.59,0.8,70.14,0.7,69.99,0.6,56.82,0.6
morph,dev_yap,char_lstm,70.84,0.9,70.55,0.8,71.74,0.7,72.37,0.7,69.93,1.0,70.46,0.9,57.12,1.1
morph,dev_yap,no_char,70.93,0.8,70.48,0.7,72.01,0.6,72.57,0.9,69.03,0.5,68.8,1.0,53.95,0.7
multitok,dev,char_cnn,70.94,0.7,74.86,0.4,75.09,0.7,77.04,0.6,69.8,1.0,72.69,0.8,55.09,0.6


In [70]:
# Mean f_m on the 'pruned' sets for morph / ft_oov / char_cnn models, Match vs Clash.
mev[(mev.unit=='morph') & (mev.pred_set.str.contains('pruned')) & (mev.embed_type=='ft_oov') & (mev.arch=='char_cnn')].groupby(['pred_set','cm']).f_m.mean().unstack()

cm,Clash,Match
pred_set,Unnamed: 1_level_1,Unnamed: 2_level_1
dev_pruned,0.7755,0.781282
test_pruned,0.760494,0.767517


In [71]:
# Split pred_set into its main part (text before the first '_', e.g. dev/test) and
# its sub part (second '_'-separated field, '' when pred_set has no '_').
mev['pred_set_sub'] = mev.pred_set.apply(lambda x: x.split('_')[1] if '_' in x else '')
mev['pred_set_main'] = mev.pred_set.apply(lambda x: x.split('_')[0] )
# Mean f_m (in percent) on dev vs test for ft_oov + char_cnn models, plus the
# relative change from dev to test in percent ('ratio').
# NOTE(review): the two OR-ed branches differ only in unit!='morph' vs unit=='morph',
# so together they select every ft_oov / char_cnn row with a non-null unit.
(mev[((mev.unit!='morph') & (mev.embed_type=='ft_oov') 
    & (mev.arch=='char_cnn')) 
    |
     ((mev.unit=='morph') 
       & (mev.embed_type=='ft_oov') 
      & (mev.arch=='char_cnn'))].groupby(['unit', 'pred_set_sub', 'cm', 'pred_set_main',])
 .f_m.mean().unstack().mul(100).round(2)
 .assign(ratio = lambda x: (x.test/x.dev -1).mul(100).round(1)))

Unnamed: 0_level_0,Unnamed: 1_level_0,pred_set_main,dev,test,ratio
unit,pred_set_sub,cm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
morph,gold,Clash,79.72,79.3,-0.5
morph,gold,Match,80.03,79.1,-1.2
morph,pruned,Clash,77.55,76.05,-1.9
morph,pruned,Match,78.13,76.75,-1.8
morph,yap,Clash,72.34,69.52,-3.9
morph,yap,Match,72.59,69.29,-4.5
multitok,,Clash,75.09,75.64,0.7
multitok,,Match,77.04,77.91,1.1
token,,Clash,76.01,75.22,-1.0
token,,Match,78.38,77.48,-1.1


In [72]:
# Save the merged evaluation table.
mev.to_pickle('final_setup/mev2.pkl')

In [80]:
import os
from collections import defaultdict

# Deletion is off by default (the original os.remove call was commented out);
# set to True to actually remove superseded checkpoints.
DELETE_STALE_MODELS = False


def find_stale_checkpoints(named_paths):
    """Return the paths of checkpoints that are not the latest epoch of their run.

    Args:
        named_paths: iterable of (file_name, path) pairs. Only names ending in
            '.model' (other than the bare '.model') are considered. Such a name
            must have exactly five dot-separated fields, the fourth being the
            integer epoch; the first two fields identify the run.

    Returns:
        List of paths whose epoch is lower than the highest epoch of their run.

    Raises:
        ValueError: if a '.model' name does not have five fields or its epoch
            is not an integer.
    """
    latest_epoch = defaultdict(lambda: -1)
    checkpoints = defaultdict(list)
    for file_name, path in named_paths:
        if file_name == '.model' or not file_name.endswith('.model'):
            continue
        a, c, _, e, _ = file_name.split('.')
        e = int(e)
        latest_epoch[(a, c)] = max(latest_epoch[(a, c)], e)
        checkpoints[(a, c)].append((e, path))
    return [path
            for run, entries in checkpoints.items()
            for e, path in entries
            if e != latest_epoch[run]]


# Report (and optionally delete) superseded checkpoints in every 'models*'
# folder under hp_search. Nothing happens when hp_search does not exist.
if os.path.isdir('hp_search'):
    for d in os.scandir('hp_search'):
        if d.name.startswith('models'):
            stale_paths = find_stale_checkpoints(
                (f.name, f.path) for f in os.scandir(d.path))
            for path in stale_paths:
                print('stale checkpoint:', path)
                if DELETE_STALE_MODELS:
                    os.remove(path)

        


In [1277]:
# Test sets: 'mean ± std' of f_m (in percent) per (unit, pred_set, arch), with
# embed_type and Match/Clash as columns. Only the formatted 'mean' column is kept.
(mev[mev.pred_set.str.contains('test')].groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
 .f_m.agg(['mean', 'std']).mul(100).round(2)
 .assign(mean = lambda x: x['mean'].apply('{:,.2f}'.format).astype(str)+' ± '+ x['std'].round(1).astype(str))[['mean']]
 .unstack([-2,-1]))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,embed_type,ft,ft,ft_oov,ft_oov,glv,glv,no_word
Unnamed: 0_level_2,Unnamed: 1_level_2,cm,Clash,Match,Clash,Match,Clash,Match,na
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
morph,test_gold,char_cnn,77.60 ± 0.9,77.64 ± 0.9,79.30 ± 0.6,79.09 ± 1.0,78.19 ± 1.0,77.86 ± 0.6,59.63 ± 2.3
morph,test_gold,char_lstm,77.74 ± 1.2,77.76 ± 0.8,79.03 ± 0.7,78.45 ± 0.4,78.03 ± 1.1,77.22 ± 1.4,62.20 ± 1.5
morph,test_gold,no_char,77.00 ± 0.7,77.01 ± 1.0,79.00 ± 0.5,78.03 ± 0.5,77.10 ± 0.8,76.91 ± 0.8,56.97 ± 1.2
morph,test_pruned,char_cnn,74.28 ± 0.9,73.06 ± 0.7,76.79 ± 0.7,76.08 ± 1.0,74.13 ± 1.0,73.10 ± 0.8,55.68 ± 1.9
morph,test_pruned,char_lstm,74.31 ± 1.1,72.98 ± 0.6,76.82 ± 0.8,76.08 ± 0.6,73.39 ± 1.2,72.22 ± 1.3,57.78 ± 1.4
morph,test_pruned,no_char,73.15 ± 0.9,72.09 ± 0.9,76.77 ± 0.6,74.31 ± 0.7,71.89 ± 0.6,71.43 ± 1.0,53.34 ± 0.7
morph,test_yap,char_cnn,67.51 ± 0.9,67.24 ± 0.9,69.51 ± 0.7,69.28 ± 1.0,67.57 ± 1.0,66.90 ± 0.9,51.06 ± 2.0
morph,test_yap,char_lstm,67.49 ± 1.1,67.41 ± 0.7,69.42 ± 0.6,68.63 ± 0.7,66.93 ± 1.1,66.35 ± 1.2,53.77 ± 1.7
morph,test_yap,no_char,67.13 ± 0.8,66.95 ± 0.8,69.35 ± 0.4,68.54 ± 0.9,66.98 ± 0.5,66.64 ± 0.9,48.76 ± 0.9
multitok,test,char_cnn,72.65 ± 0.8,76.68 ± 0.7,75.59 ± 0.7,77.86 ± 0.9,71.74 ± 1.0,73.98 ± 0.8,54.98 ± 1.6


In [1269]:
# Test sets: mean and std of f_m (in percent) as separate numeric columns, with
# the column levels reordered to (embed_type, cm, statistic).
x = (
    mev[mev.pred_set.str.contains('test')]
    .groupby(['unit', 'pred_set', 'arch', 'embed_type', 'cm'])
    .f_m.agg(['mean', 'std']).mul(100).round(2)
    .assign(std=lambda stats: stats['std'].round(1))
    .unstack([-2, -1])
)
x.columns = x.columns.reorder_levels([1, 2, 0])
# Use the fully qualified option name: the bare "max_columns" pattern can match
# more than one option, in which case set_option raises an error.
pd.set_option("display.max_columns", 30)
x.sort_index(axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,embed_type,ft,ft,ft,ft,ft_oov,ft_oov,ft_oov,ft_oov,glv,glv,glv,glv,no_word,no_word
Unnamed: 0_level_1,Unnamed: 1_level_1,cm,Clash,Clash,Match,Match,Clash,Clash,Match,Match,Clash,Clash,Match,Match,na,na
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
unit,pred_set,arch,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
morph,test_gold,char_cnn,77.6,0.9,77.64,0.9,79.3,0.6,79.09,1.0,78.19,1.0,77.86,0.6,59.63,2.3
morph,test_gold,char_lstm,77.74,1.2,77.76,0.8,79.03,0.7,78.45,0.4,78.03,1.1,77.22,1.4,62.2,1.5
morph,test_gold,no_char,77.0,0.7,77.01,1.0,79.0,0.5,78.03,0.5,77.1,0.8,76.91,0.8,56.97,1.2
morph,test_pruned,char_cnn,74.28,0.9,73.06,0.7,76.79,0.7,76.08,1.0,74.13,1.0,73.1,0.8,55.68,1.9
morph,test_pruned,char_lstm,74.31,1.1,72.98,0.6,76.82,0.8,76.08,0.6,73.39,1.2,72.22,1.3,57.78,1.4
morph,test_pruned,no_char,73.15,0.9,72.09,0.9,76.77,0.6,74.31,0.7,71.89,0.6,71.43,1.0,53.34,0.7
morph,test_yap,char_cnn,67.51,0.9,67.24,0.9,69.51,0.7,69.28,1.0,67.57,1.0,66.9,0.9,51.06,2.0
morph,test_yap,char_lstm,67.49,1.1,67.41,0.7,69.42,0.6,68.63,0.7,66.93,1.1,66.35,1.2,53.77,1.7
morph,test_yap,no_char,67.13,0.8,66.95,0.8,69.35,0.4,68.54,0.9,66.98,0.5,66.64,0.9,48.76,0.9
multitok,test,char_cnn,72.65,0.8,76.68,0.7,75.59,0.7,77.86,0.9,71.74,1.0,73.98,0.8,54.98,1.6
