In [200]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [201]:
import configs

In [203]:
# Obtain whatever configs.config_generator returns for the 'CharLSTM' architecture
# (presumably a generator of hyper-parameter configs — confirm in configs.py).
# NOTE(review): lstm_gen is not referenced by any later cell visible here.
lstm_gen = configs.config_generator('CharLSTM')

In [204]:
# Request 100 'CharLSTM' hyper-parameter configs from the project-local configs module.
# NOTE(review): the TODO list at the end calls these "random configs" and no seed is
# set in the visible cells, so a re-run presumably yields different configs. Later
# cells look configs up by position, so confirm before re-running against existing logs.
lstm_confs = configs.get_x_configs('CharLSTM', 100)

In [205]:
import pandas as pd
# Tabulate the sampled CharLSTM configs so their value distributions can be inspected.
df = pd.DataFrame(data=lstm_confs)

In [206]:
# How often each optimizer was drawn among the CharLSTM configs.
df['optimizer'].value_counts()

SGD        40
RMSProp    32
Adam       28
Name: optimizer, dtype: int64

In [207]:
# How often each lstm_layer value was drawn among the CharLSTM configs.
df['lstm_layer'].value_counts()

2    53
1    47
Name: lstm_layer, dtype: int64

In [208]:
# How often each batch_size value was drawn among the CharLSTM configs.
df['batch_size'].value_counts()

30    37
10    36
20    27
Name: batch_size, dtype: int64

In [212]:
# Request 100 'CharCNN' hyper-parameter configs from the project-local configs module.
# NOTE(review): no seed is set in the visible cells; if sampling is random, a re-run
# presumably yields different configs. Later cells look configs up by position, so
# confirm before re-running against existing logs.
cnn_confs = configs.get_x_configs('CharCNN', 100)

In [213]:
import pandas as pd
# Tabulate the sampled CharCNN configs so their value distributions can be inspected.
cdf = pd.DataFrame(data=cnn_confs)

In [214]:
# How often each optimizer was drawn among the CharCNN configs.
cdf['optimizer'].value_counts()

SGD        34
Adam       34
RMSProp    32
Name: optimizer, dtype: int64

In [215]:
# How often each lstm_layer value was drawn among the CharCNN configs.
cdf['lstm_layer'].value_counts()

1    52
2    48
Name: lstm_layer, dtype: int64

In [216]:
# How often each batch_size value was drawn among the CharCNN configs.
cdf['batch_size'].value_counts()

30    36
20    32
10    32
Name: batch_size, dtype: int64

In [217]:
import os

In [218]:
import json

# Build and persist the 'morph_charlstm' search configs (one per entry of lstm_confs).

# Pad width = number of digits in the config count, so file names carry
# fixed-width indices such as '000' (the format seen in the results table).
max_digits = len(str(len(lstm_confs)))

# Collect what configs.create_conf_file returns for every sampled config.
# The arguments are passed through unchanged; their exact meaning (conf path,
# model path, label set, hyper-parameters, embedding name — presumably) is
# defined in configs.py.
full_confs = []
for i, conf in enumerate(lstm_confs):
    idx = str(i).zfill(max_digits)
    fc = configs.create_conf_file(f'hp_search/conf/morph_charlstm.{idx}.conf',
                                  f'hp_search/models/morph_charlstm.{idx}.model',
                                  'gold_morpheme',
                                  conf,
                                  'alt_tok_yap_ft_sg')
    full_confs.append(fc)

# open(..., 'w') does not create missing parent directories, so make sure the
# JSON output directory exists before writing (no-op when it already does).
os.makedirs('hp_search/conf_json', exist_ok=True)
for i, fc in enumerate(full_confs):
    idx = str(i).zfill(max_digits)
    with open(f'hp_search/conf_json/morph_charlstm.{idx}.json', 'w') as of:
        json.dump(fc, of)

In [219]:
# Build and persist the 'morph_charcnn' search configs (one per entry of cnn_confs).

# Pad width = number of digits in the config count, so file names carry
# fixed-width indices such as '000' (the format seen in the results table).
max_digits = len(str(len(cnn_confs)))

# Collect what configs.create_conf_file returns for every sampled config.
# The arguments are passed through unchanged; their exact meaning (conf path,
# model path, label set, hyper-parameters, embedding name — presumably) is
# defined in configs.py.
full_confs = []
for i, conf in enumerate(cnn_confs):
    idx = str(i).zfill(max_digits)
    fc = configs.create_conf_file(f'hp_search/conf/morph_charcnn.{idx}.conf',
                                  f'hp_search/models/morph_charcnn.{idx}.model',
                                  'gold_morpheme',
                                  conf,
                                  'alt_tok_yap_ft_sg')
    full_confs.append(fc)

# open(..., 'w') does not create missing parent directories, so make sure the
# JSON output directory exists before writing (no-op when it already does).
os.makedirs('hp_search/conf_json', exist_ok=True)
for i, fc in enumerate(full_confs):
    idx = str(i).zfill(max_digits)
    with open(f'hp_search/conf_json/morph_charcnn.{idx}.json', 'w') as of:
        json.dump(fc, of)

## Token

In [220]:
# Build and persist the 'token_charlstm' search configs (one per entry of lstm_confs).

# Pad width = number of digits in the config count, so file names carry
# fixed-width indices such as '000' (the format seen in the results table).
max_digits = len(str(len(lstm_confs)))

# Collect what configs.create_conf_file returns for every sampled config.
# The arguments are passed through unchanged; their exact meaning (conf path,
# model path, label set, hyper-parameters, embedding name — presumably) is
# defined in configs.py.
full_confs = []
for i, conf in enumerate(lstm_confs):
    idx = str(i).zfill(max_digits)
    fc = configs.create_conf_file(f'hp_search/conf/token_charlstm.{idx}.conf',
                                  f'hp_search/models/token_charlstm.{idx}.model',
                                  'gold_token_bioes',
                                  conf,
                                  'alt_tok_tokenized_ft_sg')
    full_confs.append(fc)

# open(..., 'w') does not create missing parent directories, so make sure the
# JSON output directory exists before writing (no-op when it already does).
os.makedirs('hp_search/conf_json', exist_ok=True)
for i, fc in enumerate(full_confs):
    idx = str(i).zfill(max_digits)
    with open(f'hp_search/conf_json/token_charlstm.{idx}.json', 'w') as of:
        json.dump(fc, of)

In [221]:
# Build and persist the 'token_charcnn' search configs (one per entry of cnn_confs).

# Pad width = number of digits in the config count, so file names carry
# fixed-width indices such as '000' (the format seen in the results table).
max_digits = len(str(len(cnn_confs)))

# Collect what configs.create_conf_file returns for every sampled config.
# The arguments are passed through unchanged; their exact meaning (conf path,
# model path, label set, hyper-parameters, embedding name — presumably) is
# defined in configs.py.
full_confs = []
for i, conf in enumerate(cnn_confs):
    idx = str(i).zfill(max_digits)
    fc = configs.create_conf_file(f'hp_search/conf/token_charcnn.{idx}.conf',
                                  f'hp_search/models/token_charcnn.{idx}.model',
                                  'gold_token_bioes',
                                  conf,
                                  'alt_tok_tokenized_ft_sg')
    full_confs.append(fc)

# open(..., 'w') does not create missing parent directories, so make sure the
# JSON output directory exists before writing (no-op when it already does).
os.makedirs('hp_search/conf_json', exist_ok=True)
for i, fc in enumerate(full_confs):
    idx = str(i).zfill(max_digits)
    with open(f'hp_search/conf_json/token_charcnn.{idx}.json', 'w') as of:
        json.dump(fc, of)

## Check results

In [342]:
# Number of keys in the first CNN config and the first LSTM config, in that order.
tuple(map(len, (cnn_confs[0], lstm_confs[0])))

(24, 24)

In [352]:
# Show the first sampled CharLSTM config to see which hyper-parameter keys it carries.
lstm_confs[0]

{'word_seq_feature': 'LSTM',
 'word_emb_dim': 300,
 'char_emb_dim': 30,
 'iteration': 100,
 'bilstm': True,
 'norm_word_emb': False,
 'norm_char_emb': False,
 'ave_batch_loss': False,
 'l2': 1e-08,
 'lstm_layer': 2,
 'batch_size': 10,
 'number_normalized': False,
 'optimizer': 'SGD',
 'nbest': 1,
 'char_seq_feature': 'LSTM',
 'use_char': True,
 'use_crf': True,
 'char_hidden_dim': 50,
 'hidden_dim': 200,
 'dropout': 0.5,
 'cnn_layer': 8,
 'learning_rate': 0.015,
 'lr_decay': 0.05,
 'momentum': 0}

In [359]:
import re

# Pattern for one 'Dev:' evaluation line of a training log; the named groups
# capture the values labelled acc, p, r and f as text. The f group also accepts
# a leading '-', so negative values (e.g. -1) are parsed rather than skipped.
# Written as a raw string so that `\d` and `\.` reach the regex engine instead
# of being treated as (invalid) Python string escapes.
DEV_RES_LINE = re.compile(r'Dev: .*; acc: (?P<acc>[^,]+), p: (?P<p>[^,]+), r: (?P<r>[^,]+), f: (?P<f>[-\d\.]+)')


def _iter_dev_metrics(lines):
    """Yield ``(epoch, metrics)`` for every line that matches ``DEV_RES_LINE``.

    Args:
        lines: iterable of log lines (e.g. an open text file).

    Yields:
        A tuple of the 0-based count of matching lines seen so far and a dict
        mapping ``'acc'``, ``'p'``, ``'r'`` and ``'f'`` to floats.

    Raises:
        ValueError: if a captured value cannot be converted with ``float``.
    """
    # re.match anchors at the start of the line; non-matching lines give None
    # and are dropped, so they do not advance the epoch counter.
    matches = filter(None, map(DEV_RES_LINE.match, lines))
    for epoch, m in enumerate(matches):
        yield epoch, {k: float(v) for k, v in m.groupdict().items()}


# One record per matched dev line: metrics, then run identity and its
# hyper-parameters, then the epoch counter.
res = []
# The context manager closes the directory iterator once the listing is consumed.
with os.scandir('hp_search/logs') as log_entries:
    for f in log_entries:
        if f.name.startswith('.ipy'):
            continue
        # The first two dot-separated fields of the file name are taken as the
        # architecture tag and the config index.
        name_parts = f.name.split('.')
        arch, conf_num = name_parts[0], name_parts[1]
        # NOTE(review): configs are matched purely by position, so this is only
        # correct while cnn_confs / lstm_confs are the same lists the logged
        # runs were generated from — confirm after any kernel restart.
        confs = cnn_confs if 'cnn' in arch else lstm_confs
        params = {'arch': arch, 'conf_num': conf_num}
        params.update(confs[int(conf_num)])
        with open(f.path, 'r') as fp:
            for epoch, metrics in _iter_dev_metrics(fp):
                res.append({**metrics, **params, 'epoch': epoch})

In [360]:
# One row per parsed dev-evaluation record; columns are the union of record keys.
rdf = pd.DataFrame(data=res)

In [361]:
# Peek at the first five parsed records.
rdf.head(n=5)

Unnamed: 0,acc,p,r,f,arch,conf_num,word_seq_feature,word_emb_dim,char_emb_dim,iteration,...,use_char,use_crf,char_hidden_dim,hidden_dim,dropout,cnn_layer,learning_rate,epoch,lr_decay,momentum
0,0.9027,-1.0,0.0,-1.0,token_charlstm,19,LSTM,300,30,100,...,True,True,50,50,0.3,8,0.001,0,,
1,0.909,0.5775,0.0822,0.1439,token_charlstm,19,LSTM,300,30,100,...,True,True,50,50,0.3,8,0.001,0,,
2,0.9333,0.6075,0.3567,0.4495,token_charlstm,19,LSTM,300,30,100,...,True,True,50,50,0.3,8,0.001,0,,
3,0.94,0.5895,0.4489,0.5097,token_charlstm,19,LSTM,300,30,100,...,True,True,50,50,0.3,8,0.001,0,,
4,0.9397,0.5829,0.4509,0.5085,token_charlstm,19,LSTM,300,30,100,...,True,True,50,50,0.3,8,0.001,0,,


In [362]:
# Best dev f per (config index, architecture), pivoted to one column per architecture.
# NaN marks a config/architecture pair with no parsed records.
rdf.groupby(['conf_num', 'arch'])['f'].max().unstack(level='arch')

arch,morph_charcnn,morph_charlstm,token_charcnn,token_charlstm
conf_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000,0.7443,0.7904,0.7538,0.7697
001,0.0112,0.0480,-1.0000,0.0299
002,0.6211,0.5269,0.5855,0.4356
003,0.6352,0.7829,0.5522,0.7383
004,0.7736,0.5486,0.7313,0.6659
...,...,...,...,...
095,,,0.6344,0.5282
096,,,0.6301,0.5871
097,,,0.5876,0.0480
098,,,0.6276,0.7028


In [363]:
# (rows, columns) of the results frame; each row is one matched 'Dev:' line from the logs.
rdf.shape

(34500, 31)

In [364]:
# Best dev f reached by any config, per architecture.
rdf.groupby('arch')['f'].max()

arch
morph_charcnn     0.7736
morph_charlstm    0.7904
token_charcnn     0.7538
token_charlstm    0.7697
Name: f, dtype: float64

In [368]:
# Best dev f per dropout value, one column per architecture.
rdf.groupby(['dropout', 'arch'])['f'].max().unstack(level='arch')

arch,morph_charcnn,morph_charlstm,token_charcnn,token_charlstm
dropout,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.1,0.7569,0.7829,0.7433,0.7383
0.3,0.7736,0.7879,0.7428,0.7554
0.5,0.7629,0.7904,0.7538,0.7697


In [369]:
# Best dev f per hidden_dim value, one column per architecture.
rdf.groupby(['hidden_dim', 'arch'])['f'].max().unstack(level='arch')

arch,morph_charcnn,morph_charlstm,token_charcnn,token_charlstm
hidden_dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
50,0.7736,0.7879,0.7437,0.7546
100,0.7627,0.7861,0.7538,0.7401
200,0.7478,0.7904,0.6845,0.7697


In [370]:
# Best dev f per lstm_layer value, one column per architecture.
rdf.groupby(['lstm_layer', 'arch'])['f'].max().unstack(level='arch')

arch,morph_charcnn,morph_charlstm,token_charcnn,token_charlstm
lstm_layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.7736,0.79,0.7538,0.7554
2,0.7629,0.7904,0.6845,0.7697


In [371]:
# Best dev f per optimizer, one column per architecture.
rdf.groupby(['optimizer', 'arch'])['f'].max().unstack(level='arch')

arch,morph_charcnn,morph_charlstm,token_charcnn,token_charlstm
optimizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adam,0.6901,0.6951,0.6615,0.6659
RMSProp,0.6791,0.6883,0.6352,0.6591
SGD,0.7736,0.7904,0.7538,0.7697


In [375]:
# Best dev f per cnn_layer value, restricted to architectures whose name contains 'cnn'.
rdf[rdf['arch'].str.contains('cnn')].groupby(['cnn_layer', 'arch'])['f'].max().unstack(level='arch')

arch,morph_charcnn,token_charcnn
cnn_layer,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.7736,0.7433
4,0.7629,0.7437
8,0.7443,0.7538


In [376]:
# Best dev f per char_hidden_dim value, restricted to architectures whose name contains 'lstm'.
rdf[rdf['arch'].str.contains('lstm')].groupby(['char_hidden_dim', 'arch'])['f'].max().unstack(level='arch')

arch,morph_charlstm,token_charlstm
char_hidden_dim,Unnamed: 1_level_1,Unnamed: 2_level_1
20,0.79,0.7549
50,0.7904,0.7697
70,0.7829,0.7554


In [377]:
# Best dev f per (optimizer, learning_rate) pair, one column per architecture.
rdf.groupby(['optimizer', 'learning_rate', 'arch'])['f'].max().unstack(level='arch')

Unnamed: 0_level_0,arch,morph_charcnn,morph_charlstm,token_charcnn,token_charlstm
optimizer,learning_rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adam,0.0005,0.6477,0.6951,0.6452,0.66
Adam,0.001,0.6901,0.6369,0.6615,0.6495
Adam,0.005,0.6607,0.6835,0.6608,0.6659
RMSProp,0.005,0.6791,0.6883,0.6352,0.6591
RMSProp,0.01,0.653,0.6596,0.5974,0.6056
RMSProp,0.05,0.4304,0.5798,0.3515,0.534
SGD,0.01,0.7736,0.79,0.7437,0.7554
SGD,0.015,0.7629,0.7904,0.7538,0.7697
SGD,0.03,0.7478,0.7419,0.6845,0.7269


## TODO:
1. create more random configs (we will take them sequentially for order's sake)
1. run more on another server
1. match confs and results
1. initial analysis and maybe drop and add some HPs