In [8]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False
from incremental_learning.config import es_cloud_id, es_user, es_password
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.express as px

from elasticsearch import Elasticsearch


import eland as ed

# Compare an experiment to the baseline

This notebook compares train and test errors of a given experiment with the corresponding baseline.

In [67]:
# experiments_uid = ['gzkibu', 'oupbhg', 'igfhcg', 'nejauf', 'bmmlxe']
# experiments_uid = ['nejauf', 'zcsekd', 'hknmoa', 'djklwr', 'eocrtw', 
#                    'gewxzo', 'eysmxo', 'unmfmm', 'ydhfdv', 'bbnnot', 'lvowhz']
experiments_uid = ['nejauf','gewxzo', 'zyvgnb', 'swjgmb', 'jjguyi', 'ehelus']

In [68]:
es = Elasticsearch(cloud_id=es_cloud_id,
                       http_auth=(es_user, es_password))

df = ed.DataFrame(es_client=es, es_index_pattern='experiment-multi-step-sampling')

In [69]:
match = df.es_query({
          "bool": {
            "should": [
              {
                "match_phrase": {
                  "experiment_uid.keyword": experiments_uid[0]
                }
              }
            ],
            "minimum_should_match": 1
          }
        })

match_df = match.to_pandas().reset_index()

dataset_name = match_df.loc[0]['config.dataset_name']
seed = match_df.loc[0]['config.seed']
# update_fraction = match_df.loc[0]['config.update_fraction']

In [70]:
baseline = df.es_query({
            "bool": {
              "filter": [
                {
                  "bool": {
                    "should": [
                      {
                        "match_phrase": {
                          "config.dataset_name.keyword": dataset_name
                        }
                      }
                    ],
                    "minimum_should_match": 1
                  }
                },
                {
                  "bool": {
                    "should": [
                      {
                        "match": {
                          "config.seed": seed
                        }
                      }
                    ],
                    "minimum_should_match": 1
                  }
                },
                {
                  "bool": {
                    "should": [
                      {
                        "match_phrase": {
                          "run.meta.comment.keyword": "baseline estimation"
                        }
                      }
                    ],
                    "minimum_should_match": 1
                  }
                },
                {
                  "bool": {
                    "should": [
                      {
                        "exists": {
                          "field": "run.result.baseline.train_error.value"
                        }
                      }
                    ],
                    "minimum_should_match": 1
                  }
                }
              ]
            }
          })

baseline_df = baseline.to_pandas().reset_index()

baseline_df.sort_values('config.training_fraction', inplace=True)

In [71]:
data_all = ed.DataFrame(es_client=es, es_index_pattern='experiment-multi-step-sampling-metrics')
data_df_dict  = {}
for experiment_uid in experiments_uid:  
    data = data_all.es_query({
                "bool": {
                  "should": [
                    {
                      "match_phrase": {
                        "experiment_uid.keyword": experiment_uid
                      }
                    }
                  ],
                  "minimum_should_match": 1
                }
              })

    data_df = data.to_pandas()

    data_df['fraction_of_train'] = data_df['training_fraction'] + (data_df['step']+1)*data_df['run.config.update_fraction']
    data_df_dict[experiment_uid] = data_df

In [72]:
colors={key: px.colors.qualitative.Plotly[i+1] for i, key in enumerate(experiments_uid)}
fig = go.Figure()
# fig = make_subplots(rows=2, cols=2)
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=baseline_df['config.training_fraction'], y=baseline_df['run.result.baseline.test_error.value'], name='baseline', 
                         line_color=px.colors.qualitative.Plotly[0]))
for experiment_uid, data_df in data_df_dict.items():
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.test_error'], name='{}'.format(experiment_uid), 
                             legendgroup=experiment_uid, line_color=colors[experiment_uid]), 
                  secondary_y = False)
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.hyperparameters.retrained_tree_eta'], name='{}'.format(experiment_uid), 
                             legendgroup=experiment_uid, showlegend=False, line_color=colors[experiment_uid],
                            line_dash='dash'), 
                  secondary_y=True)
fig.update_layout(title='Test error for experiment with seed <b>{}</b> on the <b>{}</b> dataset'.format(seed, dataset_name), 
                  xaxis_title='Fraction of train data used', 
                  yaxis_title='MSE')
fig.update_yaxes(title_text='retrained_tree_eta', secondary_y=True)

fig.show()

fig = go.Figure()
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=baseline_df['config.training_fraction'], y=baseline_df['run.result.baseline.train_error.value'], name='baseline',
                        line_color=px.colors.qualitative.Plotly[0]))
for experiment_uid, data_df in data_df_dict.items():
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.train_error'], name='{}'.format(experiment_uid), 
                             legendgroup=experiment_uid, line_color=colors[experiment_uid]), 
                  secondary_y = False)
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.hyperparameters.retrained_tree_eta'], name='{}'.format(experiment_uid), 
                             legendgroup=experiment_uid, showlegend=False, line_color=colors[experiment_uid],
                            line_dash='dash'), 
                  secondary_y=True)
fig.update_layout(title='Train error for seed <b>{}</b> on the <b>{}</b> dataset'.format(seed, dataset_name), 
                  xaxis_title='Fraction of train data used', 
                  yaxis_title='MSE')
fig.update_yaxes(title_text='retrained_tree_eta', secondary_y=True)
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=baseline_df['config.training_fraction'], y=baseline_df['run.result.baseline.forest_statistics.tree_nodes_mean.value'], name='baseline'))
for experiment_uid, data_df in data_df_dict.items():
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.forest_statistics.tree_nodes_mean'], name='{}'.format(experiment_uid)))
fig.update_layout(title='Tree nodes mean for seed <b>{}</b> on the <b>{}</b> dataset'.format(seed, dataset_name), 
                  xaxis_title='Fraction of train data used', 
                  yaxis_title='Mean Tree nodes')
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=baseline_df['config.training_fraction'], y=baseline_df['run.result.baseline.hyperparameters.alpha'], name='baseline'))
for experiment_uid, data_df in data_df_dict.items():
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.hyperparameters.alpha'], name='{}'.format(experiment_uid)))
fig.update_layout(title='Alpha for seed <b>{}</b> on the <b>{}</b> dataset'.format(seed, dataset_name), 
                  xaxis_title='Fraction of train data used', 
                  yaxis_title='alpha')
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=baseline_df['config.training_fraction'], y=baseline_df['run.result.baseline.hyperparameters.gamma'], name='baseline'))
for experiment_uid, data_df in data_df_dict.items():
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.hyperparameters.gamma'], name='{}'.format(experiment_uid)))
fig.update_layout(title='gamma for seed <b>{}</b> on the <b>{}</b> dataset'.format(seed, dataset_name), 
                  xaxis_title='Fraction of train data used', 
                  yaxis_title='gamma')
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=baseline_df['config.training_fraction'], y=baseline_df['run.result.baseline.hyperparameters.soft_tree_depth_limit'], name='baseline'))
for experiment_uid, data_df in data_df_dict.items():
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.hyperparameters.soft_tree_depth_limit'], name='{}'.format(experiment_uid)))
fig.update_layout(title='soft_tree_depth_limit for seed <b>{}</b> on the <b>{}</b> dataset'.format(seed, dataset_name), 
                  xaxis_title='Fraction of train data used', 
                  yaxis_title='soft_tree_depth_limit')
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=baseline_df['config.training_fraction'], y=baseline_df['run.result.baseline.hyperparameters.retrained_tree_eta'], name='baseline'))
for experiment_uid, data_df in data_df_dict.items():
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.hyperparameters.retrained_tree_eta'], name='{}'.format(experiment_uid)))
fig.update_layout(title='retrained_tree_eta for seed <b>{}</b> on the <b>{}</b> dataset'.format(seed, dataset_name), 
                  xaxis_title='Fraction of train data used', 
                  yaxis_title='retrained_tree_eta')
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=baseline_df['config.training_fraction'], y=baseline_df['run.result.baseline.hyperparameters.lambda'], name='baseline'))
for experiment_uid, data_df in data_df_dict.items():
    fig.add_trace(go.Scatter(x=data_df['fraction_of_train'], y=data_df['updated_model.hyperparameters.lambda'], name='{}'.format(experiment_uid)))
fig.update_layout(title='lambda for seed <b>{}</b> on the <b>{}</b> dataset'.format(seed, dataset_name), 
                  xaxis_title='Fraction of train data used', 
                  yaxis_title='lambda')
fig.show()

In [73]:
varr = data_df_dict['gewxzo']['updated_model.hyperparameters.soft_tree_depth_limit'].to_numpy()

In [74]:
for i in range(1,varr.shape[0]):
    print(varr[i]/varr[i-1])

1.125
1.125
1.1250000000000002
1.125
1.125
1.1249999999999998
1.1250000000000002
1.125


In [75]:
varr

array([ 5.86300671,  6.59588254,  7.42036786,  8.34791384,  9.39140307,
       10.56532846, 11.88599452, 13.37174383, 15.04321181])

In [76]:
lambdadir = {}
for i in range(varr.shape[0]):
    lambdadir[0.1*i+0.2] = varr[i]

In [77]:
lambdadir

{0.2: 5.863006705530846,
 0.30000000000000004: 6.595882543722201,
 0.4: 7.420367861687476,
 0.5: 8.347913844398413,
 0.6000000000000001: 9.391403074948215,
 0.7: 10.565328459316742,
 0.8: 11.885994516731333,
 0.9000000000000001: 13.371743831322751,
 1.0: 15.043211810238095}

In [78]:
d = {'a': 1}

In [79]:
d.keys()

dict_keys(['a'])

In [80]:
for k in d.keys(): print(k)

a
