In [3]:
from os import listdir
from os.path import isdir, join
from urllib.parse import urlparse
from urllib.request import url2pathname

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot, row, column

from scripts_viz.visualization_utils import *
from scripts_viz.visualization_utils import TTQcolor

output_notebook()

In [6]:
# Collect the names of all experiments known to the tracking server.
# `list_experiments` was removed in MLflow 2.x in favour of `search_experiments`;
# use whichever one the installed client provides so the cell runs on both.
_exp_client = mlflow.tracking.MlflowClient()
_list_experiments = getattr(_exp_client, 'search_experiments', None) or _exp_client.list_experiments
expnames = {exp.name for exp in _list_experiments()}

In [7]:
# Display the available experiment names so one can be picked below.
expnames

{'benchmarks_shuffle_imp',
 'benchmarks_shuffle_opt_imp',
 'benchmarks_shuffle_opt_p180',
 'benchmarks_shuffle_opt_p90',
 'benchmarks_shuffle_p180',
 'benchmarks_shuffle_p90',
 'benchmarks_time_imp',
 'benchmarks_time_opt_imp',
 'benchmarks_time_opt_p180',
 'benchmarks_time_opt_p90',
 'benchmarks_time_p180',
 'benchmarks_time_p90',
 'enriched_shuffle_imp',
 'enriched_shuffle_p90',
 'enriched_time_imp',
 'enriched_time_p90',
 'enriched_time_seq_imp',
 'enriched_time_seq_p90'}

In [99]:
# Name of the MLflow experiment whose runs are visualised in this notebook.
experiment = 'benchmarks_shuffle_imp'

In [100]:
# Look up the experiment record by name; its artifact location is read in the next cell.
e = mlflow.tracking.MlflowClient().get_experiment_by_name(experiment)

In [101]:
# Artifact location of the selected experiment (a 'file:///...' URI in this setup).
filepath = e.artifact_location

In [102]:
# Turn the artifact location into a local filesystem path.
# Splitting on 'file:///' drops the leading '/' of POSIX paths and raises
# IndexError when the location has no 'file:' scheme, so parse the URI instead.
parsed_location = urlparse(filepath)
if parsed_location.scheme == 'file':
    mypath = url2pathname(parsed_location.path)
else:
    # Not a file URI (e.g. a plain local path): use it unchanged.
    mypath = filepath

# Every sub-directory of the artifact folder is treated as a run id.
runids = [f for f in listdir(mypath) if isdir(join(mypath, f))]
runids

['21afc69e483a4d8bbb9b1567f20dcf68', '6042e5c20bf74af49d0e2d6fae375747']

In [103]:
# Fetch the logged params and metrics of every run as one-column frames
# (index = param/metric name). A single client is reused for all runs
# instead of constructing a new one on each iteration.
run_client = mlflow.tracking.MlflowClient()

all_runs_params = []
all_runs_metrics = []
for run in runids:
    rundata = run_client.get_run(run).to_dictionary()['data']
    logged_params = rundata['params']
    logged_metrics = rundata['metrics']
    params = pd.DataFrame(logged_params.values(), index=logged_params.keys())
    metrics = pd.DataFrame(logged_metrics.values(), index=logged_metrics.keys())
    all_runs_params.append(params)
    all_runs_metrics.append(metrics)

In [104]:
# Union of every param name / metric name seen across all runs.
all_rows_params = set().union(*(frame.index for frame in all_runs_params))
all_rows_metrics = set().union(*(frame.index for frame in all_runs_metrics))

In [113]:
# Template frame: one NaN-filled row per known param and metric name.
# - `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# - The names are sorted so the row order is deterministic; iterating a raw
#   set gives an arbitrary order that can change between kernel sessions.
param_rows = sorted(all_rows_params)
metric_rows = sorted(all_rows_metrics)

df_base_params = pd.DataFrame([np.nan] * len(param_rows), index=param_rows)

df_base_metrics = pd.DataFrame([np.nan] * len(metric_rows), index=metric_rows)

# Params first, then metrics.
df_base = pd.concat([df_base_params, df_base_metrics])

In [115]:
# Build one column per run from the NaN template, then assemble `viz`.
#
# Fixes compared with the previous version of this cell:
# - The run info is now read per run inside the loop. Before, it was fetched
#   once before the loop from whatever `run` was left over by an earlier cell,
#   so every run received the same info values.
# - Info values are assigned as scalars. `df_base` is built with a single
#   column, and enlarging it with a 2-element list raises a ValueError.
#   A scalar also still works if the template has more than one column.
# - Each run is fetched from the tracking server once instead of twice.
info_client = mlflow.tracking.MlflowClient()

runs_df_list = []

for run in runids:
    run_dict = info_client.get_run(run).to_dictionary()
    rundata = run_dict['data']
    runinfo = run_dict['info']

    df_base_c = df_base.copy()
    for param in rundata['params']:
        df_base_c.loc[param] = rundata['params'][param]
    for metric in rundata['metrics']:
        df_base_c.loc[metric] = rundata['metrics'][metric]
    for info in runinfo:
        info_value = runinfo[info]
        if info in ('start_time', 'end_time'):
            # These two fields are converted from epoch milliseconds.
            info_value = pd.to_datetime(info_value, unit='ms')
        df_base_c.loc[info] = info_value
    runs_df_list.append(df_base_c)

# Side-by-side concat, transpose, and drop rows that are entirely NaN.
viz = pd.concat(runs_df_list, axis=1).transpose().dropna(how='all')
# Project helper; NOTE(review): presumably relabels/reorients the frame so
# that models are columns (later cells index viz.loc[<row>, <model>]) — confirm.
viz = set_clf_cols(viz)

In [116]:
# Let pandas render up to 101 rows before truncating the display.
pd.options.display.max_rows = 101

In [117]:
# Show the assembled frame of params, metrics and run info.
viz

Unnamed: 0,RandomForestClassifier_190817_83619,RandomForestClassifier_190817_83619.1,SGDClassifier_190817_83517,SGDClassifier_190817_83517.1
model_type,RandomForestClassifier,RandomForestClassifier,SGDClassifier,SGDClassifier
train_file_name,shuffle_imp__traindata_19072_750.pkl,shuffle_imp__traindata_19072_750.pkl,shuffle_imp__traindata_19072_750.pkl,shuffle_imp__traindata_19072_750.pkl
max_iter,,param,250,250
validation_fraction,,param,0.1,0.1
class_weight,balanced,balanced,,
model_filepath,../data/models/benchmarks_shuffle_imp/shuffle_...,../data/models/benchmarks_shuffle_imp/shuffle_...,../data/models/benchmarks_shuffle_imp/shuffle_...,../data/models/benchmarks_shuffle_imp/shuffle_...
min_impurity_decrease,0.0,0.0,,param
min_samples_leaf,1,1,,param
roc_test_tpr,"0.0,0.004405286343612335,0.013215859030837005,...","0.0,0.004405286343612335,0.013215859030837005,...","0.0,0.004405286343612335,0.01762114537444934,0...","0.0,0.004405286343612335,0.01762114537444934,0..."
loss,,param,log,log


In [47]:
#files = []
#for modelname in viz.columns:
#    check = viz.loc['model_filepath', modelname].replace('models', 'viz_data').replace('.pkl','_viz.pkl')
#    if type(check) is str:
#        files.append(check)
#    else:
#        string = check.values[0]
#        if string not in set(files):
#            files.append(string)
        

In [49]:
#retrieving dict viz and organizing it in a macrodictionary
#viz_dicts = dict(zip(list(viz.columns), [pd.read_pickle(viz_data) for viz_data in files]))

In [40]:
# Column labels of `viz`; the plotting cells below iterate over these as model names.
viz.columns

Index(['RandomForestClassifier_190817_83619', 'SGDClassifier_190817_83517'], dtype='object')

In [32]:
# List the colour names available in the project palette.
TTQcolor.keys()



In [33]:
# Plot palette, in the order the models are drawn.
palette_names = ('azureBlue', 'richOrange', 'algae', 'yell', 'redBrown', 'bloodRed')
colors = [TTQcolor[name] for name in palette_names]

In [44]:
# Sanity check: the last element after splitting the serialized ROC string is '',
# which is why the plotting cells drop it with [:-1].
viz.loc['roc_val_fpr','RandomForestClassifier_190817_83619'].split(',')[-1]

''

In [51]:
# Build one ROC payload per model from the comma-separated strings stored in `viz`.
# The last split element is dropped (it is the empty string left by a trailing comma).
val_curves = []
for model in viz.columns:
    fpr_text = viz.loc['roc_val_fpr', model]
    tpr_text = viz.loc['roc_val_tpr', model]
    val_curves.append({
        'fpr': list(map(float, fpr_text.split(',')[:-1])),
        'tpr': list(map(float, tpr_text.split(',')[:-1])),
        'auc': viz.loc['val_auc', model],
    })

val_roc = plot_rocs(
    val_curves,
    label=list(viz.columns),
    title_lab='Validation performance',
    p_width=600,
    p_height=600,
    line_width=2,
    colors=colors,
    legend_font_size='9pt',
    fpr_font_size='9pt',
    bestFprOnly=True,
    show_legend=False,
)

show(val_roc)

In [52]:
# Build one ROC payload per model from the comma-separated strings stored in `viz`.
# The last split element is dropped (it is the empty string left by a trailing comma).
test_curves = []
for model in viz.columns:
    fpr_text = viz.loc['roc_test_fpr', model]
    tpr_text = viz.loc['roc_test_tpr', model]
    test_curves.append({
        'fpr': list(map(float, fpr_text.split(',')[:-1])),
        'tpr': list(map(float, tpr_text.split(',')[:-1])),
        'auc': viz.loc['test_auc', model],
    })

test_roc = plot_rocs(
    test_curves,
    label=list(viz.columns),
    title_lab='Test performance',
    p_width=600,
    p_height=600,
    line_width=2,
    colors=colors,
    legend_font_size='9pt',
    fpr_font_size='9pt',
    bestFprOnly=True,
    show_legend=False,
)

show(test_roc)

In [53]:
# Derive the four confusion-matrix rates on the test set and store them as rows of `viz`.
# Each rate is normalised by the size of its *actual* class:
#   tp_rate = TP / (TP + FN)    fn_rate = FN / (TP + FN)
#   tn_rate = TN / (TN + FP)    fp_rate = FP / (TN + FP)
# The previous version divided fp_rate and fn_rate by (TP + TN), which is not
# a rate over any class and can exceed 1.
test_tp = viz.loc['test_tp']
test_tn = viz.loc['test_tn']
test_fp = viz.loc['test_fp']
test_fn = viz.loc['test_fn']

actual_positives = test_tp + test_fn
actual_negatives = test_tn + test_fp

viz.loc['tp_rate'] = test_tp / actual_positives
viz.loc['tn_rate'] = test_tn / actual_negatives
viz.loc['fp_rate'] = test_fp / actual_negatives
viz.loc['fn_rate'] = test_fn / actual_positives

In [54]:
# Spider (radar) chart comparing the models on the metrics listed in `params`.
# single_spider=True  -> all models overlaid on one chart
# single_spider=False -> one small chart per model, stacked in a grid
single_spider = True

params = ['tp_rate', 'tn_rate', 'fp_rate', 'fn_rate', 'test_auc', 'val_auc']
models = list(viz.columns)

if single_spider:

    spider = spiderWebChart(models, params,
                            [viz.loc[params, m] for m in models],
                            colors=colors, text_size='12pt',
                            title='Overall Test Comparison', p_height=600, p_width=600, margin_distance=0.25,
                            legend_location='top_right', show_legend=False, line_width=4.5, fill_alpha=0.1)

else:

    # One chart per grid row. The previous version never appended the charts
    # to the grid, overwrote the `single_spider` flag with a chart object and
    # rebuilt the gridplot on every iteration.
    cols = []

    for m, model_name in enumerate(models):
        model_spider = spiderWebChart([model_name], params,
                                      [viz.loc[params, model_name]],
                                      # cycle the palette so more models than colours does not raise IndexError
                                      colors=[colors[m % len(colors)]], text_size='6pt',
                                      title='', p_height=300, p_width=300, margin_distance=0.25,
                                      legend_location='top_right', show_legend=False, line_width=3.5, fill_alpha=0.1)
        cols.append([model_spider])

    spider = gridplot(cols)


show(spider)

In [55]:
# Per-fold validation AUC histograms.
# single_row_folds=True  -> one wide chart with every model
# single_row_folds=False -> one chart per group of `breakp` models, stacked vertically
single_row_folds = False

# Fold rows of `viz`, ordered by the integer suffix after the last underscore
# (plain string order would put fold_10 before fold_2).
fold_rows = [fold for fold in viz.index if 'val_auc_fold_' in fold]
folds_name = sorted(fold_rows, key=lambda name: int(name.split('_')[-1]))

# Defined here so the cell does not depend on `models` left over from another cell.
models = list(viz.columns)

if single_row_folds:

    folds = histfolds(models, folds_name, viz, plot_w=1200, title="Validation folds performance",
                      colors=colors, xlabelorientation=1.55, group_text_font_size='6pt')
else:
    cols = []

    # Number of models per chart.
    breakp = 1

    # Step through the models in chunks of `breakp`. Slicing from the chunk
    # start keeps a final partial chunk, which the previous range-based loop
    # dropped whenever len(models) was not a multiple of breakp.
    for start in range(0, len(models), breakp):
        stop = start + breakp
        row_folds = histfolds(models[start:stop], folds_name, viz, plot_w=600, title="Validation folds performance",
                              colors=colors[start:stop], xlabelorientation=1.55, group_text_font_size='6pt')

        cols.append([row_folds])

    folds = gridplot(cols)

show(folds)

In [56]:
# Spreadsheet-style summary table of selected hyper-parameters and AUC scores.
spreadsheet_rows = [
    'n_estimators', 'max_depth', 'max_features', 'max_leaf_nodes',
    'min_samples_leaf', 'min_samples_split', 'bootstrap',
    'criterion', 'val_auc', 'test_auc',
]
spreadsheet_models = ['RandomForestClassifier', 'SGDClassifier']

ss = modelSpreadsheet(
    viz,
    spreadsheet_rows,
    spreadsheet_models,
    color_cells=True,
    colors=colors[:2],
    index_header='RF',
    height=80,
    width=1200,
    index_width=25,
    row_height=25,
)

show(ss)

In [57]:
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure

# Grouped bar chart of per-feature importances, one bar per (feature, model).
# With normalize=True each model's values are min-max scaled independently.
normalize = True

# Feature rows are the `viz` index labels starting with 'f_'.
features = [name for name in viz.index if isinstance(name, str) and name.startswith('f_')]
models = [c for c in list(viz.columns) if ('RandomForest' in c or 'SGDClassifier' in c)]

# Per-model list of importance values, aligned with `features`.
data = {'features': features}
for m in models:
    data[m] = [viz.loc[feat, m] for feat in features]

# Bokeh nested categorical factors: (feature, model) pairs, feature-major.
x = [(feat, mod) for feat in features for mod in models]
# One colour per model, repeated for each feature; the palette is cycled so
# having more models than colours does not raise IndexError.
colors_list = [colors[i % len(colors)] for _ in features for i in range(len(models))]

if normalize:
    for model in models:
        values = pd.Series(data[model], dtype='float64')
        # Compute the bounds once per model. pandas min/max skip NaN, whereas
        # Python's built-in min/max give order-dependent results when NaN is present.
        low = values.min()
        span = values.max() - low
        if span != 0:
            scaled = (values - low) / span
        else:
            # All importances are equal: avoid dividing by zero.
            scaled = values * 0.0
        data[model] = scaled.tolist()

# Flatten to the same feature-major order as `x`.
counts = [data[model][i] for i in range(len(features)) for model in models]


source = ColumnDataSource(data=dict(x=x, counts=counts, color=colors_list))

# Only claim normalisation in the title when it was actually applied.
plot_title = 'Normalized feature importances' if normalize else 'Feature importances'

p = figure(x_range=FactorRange(*x), plot_width=1200, plot_height=500, title=plot_title,
           toolbar_location=None, tools="")

p.vbar(x='x', top='counts', width=0.9, source=source, color='color')

p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1.55
p.xaxis.major_tick_line_color = None
# Hide the per-bar (model) labels; only the feature group labels are shown.
p.xaxis.major_label_text_font_size = '0pt'
p.xaxis.group_text_font_size = '10pt'
p.xaxis.group_label_orientation = 1.57
p.xgrid.grid_line_color = None

show(p)

## Gridplot

In [58]:
# Final dashboard: summary table on top, then the ROC charts paired with the
# folds and spider charts, and the feature-importance chart at the bottom.
dashboard_rows = [
    [ss],
    [row(val_roc, folds)],
    [row(test_roc, spider)],
    [p],
]
l = gridplot(dashboard_rows)
show(l)

In [65]:
#from bokeh.io import export_png
#export_png(l, experiment+'.png')