In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from bokeh.io import show, output_notebook
# Route Bokeh output to inline notebook cells.
output_notebook()

# NOTE(review): the wildcard import hides the origin of the helpers used below
# (set_clf_cols, plot_rocs, spiderWebChart, histfolds, modelSpreadsheet) --
# presumably they all come from this module; prefer importing them by name.
from scripts_viz.visualization_utils import *
from scripts_viz.visualization_utils import TTQcolor

from bokeh.layouts import gridplot, row, column

In [2]:
# Experiment identity: the folder and the CSV file share the same name,
# built from the model-set prefix plus a postfix.
datafolder = '../data/viz_data/'
models = 'enriched_time_seq20000_5000_'
postfix = 'imp'

experiment = '{}{}'.format(models, postfix)

# Load the experiment export from <datafolder><experiment>/<experiment>.csv
viz = pd.read_csv('{folder}{name}/{name}.csv'.format(folder=datafolder, name=experiment))

In [3]:
# Let pandas print up to 101 rows before truncating the display.
pd.options.display.max_rows = 101

In [4]:
# Inspect the column labels of the frame just loaded from the experiment CSV.
viz.columns

Index(['Run ID', 'Name', 'Source Type', 'Source Name', 'User', 'Status',
       'alpha', 'average', 'bootstrap', 'class_weight', 'criterion',
       'early_stopping', 'epsilon', 'eta0', 'experiment_type', 'fit_intercept',
       'l1_ratio', 'learning_rate', 'loss', 'max_depth', 'max_features',
       'max_iter', 'max_leaf_nodes', 'min_impurity_decrease',
       'min_impurity_split', 'min_samples_leaf', 'min_samples_split',
       'min_weight_fraction_leaf', 'model_filename', 'model_filepath',
       'model_type', 'n_estimators', 'n_iter_no_change', 'n_jobs', 'oob_score',
       'penalty', 'power_t', 'random_state', 'shuffle', 'test_file_name',
       'test_file_path', 'test_size', 'tol', 'train_file_name',
       'train_file_path', 'train_size', 'validation_fraction', 'verbose',
       'warm_start', 'f_c_impaired1_c', 'f_c_lent_c', 'f_c_pastdue90_c',
       'f_c_repaid_c', 'f_c_trend_a', 'f_cd_impaired1_c', 'f_cd_lent_c',
       'f_cd_pastdue90_c', 'f_cd_repaid_c', 'f_cd_trend_a',
    

In [5]:
# NOTE(review): leftover scratch expression, disabled; this cell executes nothing
# and can be deleted.
#viz.transpose()['model_filename'].str.split('.pkl')

In [6]:
# Re-key the frame with set_clf_cols; judging by later cells the result has one
# column per model run and the former CSV columns as index labels -- TODO confirm
# against the helper's implementation.
# NOTE(review): `viz` is rebound to its own transform, so re-running this cell
# applies set_clf_cols to the already-transformed frame.
viz = set_clf_cols(viz)

In [7]:
# Inspect the column labels again after set_clf_cols.
viz.columns

Index(['RandomForestClassifier_190816_115520', 'SGDClassifier_190816_11559'], dtype='object')

In [8]:
# Derive, for every model column, the path of its visualization pickle from the
# stored model path: '<...>/models/<name>.pkl' -> '<...>/viz_data/<name>_viz.pkl'.
files = []
for modelname in viz.columns:
    raw_path = viz.loc['model_filepath', modelname]

    # A duplicated column label makes .loc return a Series instead of a string.
    # Extract the scalar BEFORE rewriting the path: Series.replace only matches
    # whole values, so calling it on the Series would leave the path unchanged.
    from_duplicated_label = not isinstance(raw_path, str)
    if from_duplicated_label:
        raw_path = raw_path.values[0]

    viz_path = raw_path.replace('models', 'viz_data').replace('.pkl', '_viz.pkl')

    # Paths coming from duplicated labels are only recorded once.
    if not from_duplicated_label or viz_path not in files:
        files.append(viz_path)
        

In [9]:
# Display the visualization pickle paths collected above.
files

['../data/viz_data/enriched_time_seq20000_5000_imp/time_2018-04-30_imp_bg__RandomForestClassifier_190816_115520_viz.pkl',
 '../data/viz_data/enriched_time_seq20000_5000_imp/time_2018-04-30_imp_bg__SGDClassifier_190816_11559_viz.pkl']

In [10]:
# Load every visualization pickle, then key the loaded objects by model column
# name (columns and files are paired positionally).
loaded_viz = [pd.read_pickle(viz_data) for viz_data in files]
viz_dicts = {name: payload for name, payload in zip(list(viz.columns), loaded_viz)}

In [11]:
# List the color names available in TTQcolor.
TTQcolor.keys()



In [12]:
# Palette shared by all charts below, in the order models are plotted.
palette_names = ('azureBlue', 'richOrange', 'algae', 'yell', 'redBrown', 'bloodRed')
colors = [TTQcolor[name] for name in palette_names]

In [13]:
# Plot the 'validation' entry of every model's visualization dict with plot_rocs.
validation_inputs = [viz_dicts[model]['validation'] for model in viz.columns]

val_roc = plot_rocs(
    validation_inputs,
    label=list(viz.columns),
    title_lab='Validation performance',
    p_width=600,
    p_height=600,
    line_width=2,
    colors=colors,
    legend_font_size='9pt',
    fpr_font_size='9pt',
    bestFprOnly=True,
    show_legend=False,
)

show(val_roc)

In [14]:
# Plot the 'testing' entry of every model's visualization dict with plot_rocs.
testing_inputs = [viz_dicts[model]['testing'] for model in viz.columns]

test_roc = plot_rocs(
    testing_inputs,
    label=list(viz.columns),
    title_lab='Test performance',
    p_width=600,
    p_height=600,
    line_width=2,
    colors=colors,
    legend_font_size='9pt',
    fpr_font_size='9pt',
    bestFprOnly=True,
    show_legend=False,
)

show(test_roc)

In [15]:
# Confusion-matrix rates on the test split, computed per model column.
# Every rate is normalized by the size of its own actual class:
#   actual positives = tp + fn, actual negatives = tn + fp
# so that tp_rate + fn_rate == 1 and tn_rate + fp_rate == 1.
# (The previous fp_rate / fn_rate divided by tp + tn, which is neither class.)
test_positives = viz.loc['test_tp'] + viz.loc['test_fn']
test_negatives = viz.loc['test_tn'] + viz.loc['test_fp']

viz.loc['tp_rate'] = viz.loc['test_tp'] / test_positives
viz.loc['tn_rate'] = viz.loc['test_tn'] / test_negatives
viz.loc['fp_rate'] = viz.loc['test_fp'] / test_negatives
viz.loc['fn_rate'] = viz.loc['test_fn'] / test_positives

In [16]:
# Spider (radar) comparison of the models over the metrics in `params`.
# single_spider=True  -> all models overlaid on one chart;
# single_spider=False -> one small chart per model, stacked in a grid.
single_spider = True

params = ['tp_rate', 'tn_rate', 'fp_rate', 'fn_rate', 'test_auc', 'val_auc']
models = list(viz.columns)

if single_spider:

    spider = spiderWebChart(models, params,
                            [viz.loc[params, m] for m in models],
                            colors=colors, text_size='12pt',
                            title='Overall Test Comparison', p_height=600, p_width=600, margin_distance=0.25,
                            legend_location='top_right', show_legend=False, line_width=4.5, fill_alpha=0.1)

else:

    cols = []

    for m, model in enumerate(models):
        # Keep the chart in its own name: assigning it to `single_spider`
        # would clobber the flag tested above.
        model_spider = spiderWebChart([model], params,
                                      [viz.loc[params, model]],
                                      # wrap around the palette if there are more models than colors
                                      colors=[colors[m % len(colors)]], text_size='6pt',
                                      title='', p_height=300, p_width=300, margin_distance=0.25,
                                      legend_location='top_right', show_legend=False, line_width=3.5, fill_alpha=0.1)

        # One chart per grid row; the chart must actually be added to the grid.
        cols.append([model_spider])

    # Build the grid once, after every chart has been created.
    spider = gridplot(cols)


show(spider)

In [17]:
# Histogram of the per-fold validation AUC of each model.
# single_row_folds=True  -> all models in a single wide chart;
# single_row_folds=False -> one chart per group of `breakp` models, stacked in a grid.
single_row_folds = False

# Fold rows are named 'val_auc_fold_<k>'; sort them by the numeric suffix so that
# fold 10 comes after fold 9 rather than after fold 1.
folds_name = sorted([fold for fold in viz.index if 'val_auc_fold_' in fold],
                    key=lambda fold: int(fold.split('_')[-1]))

if single_row_folds:

    folds = histfolds(list(viz.columns), folds_name, viz, plot_w=1200, title="Validation folds performance",
                      colors=colors, xlabelorientation=1.55, group_text_font_size='6pt')
else:
    cols = []

    breakp = 1  # number of models drawn together in each chart

    # Step through the models in chunks of `breakp`; slicing keeps the trailing
    # partial chunk when len(models) is not a multiple of `breakp`.
    for start in range(0, len(models), breakp):
        stop = start + breakp

        row_folds = histfolds(models[start:stop], folds_name, viz, plot_w=600, title="Validation folds performance",
                              colors=colors[start:stop], xlabelorientation=1.55, group_text_font_size='6pt')

        # Each chart sits on its own grid row.
        cols.append([row_folds])

    folds = gridplot(cols)

show(folds)

In [18]:
# Summary table built by modelSpreadsheet from the selected hyper-parameter and
# AUC rows of `viz`, for the two listed classifier families.
spreadsheet_fields = [
    'n_estimators', 'max_depth', 'max_features', 'max_leaf_nodes',
    'min_samples_leaf', 'min_samples_split', 'bootstrap',
    'criterion', 'val_auc', 'test_auc',
]
spreadsheet_models = ['RandomForestClassifier', 'SGDClassifier']

ss = modelSpreadsheet(
    viz,
    spreadsheet_fields,
    spreadsheet_models,
    color_cells=True,
    colors=colors[:2],
    index_header='RF',
    height=80,
    width=1200,
    index_width=25,
    row_height=25,
)

show(ss)

In [19]:
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure

# Grouped bar chart of the feature rows of `viz`: one group per feature,
# one bar per model inside each group.
normalize = True  # min-max rescale each model's values to [0, 1] before plotting

# Feature rows are the index labels starting with 'f_'.
features = list(viz.index[list(pd.Series(viz.index).str.contains('^f_', regex=True))])
models = [c for c in list(viz.columns) if ('RandomForest' in c or 'SGDClassifier' in c)]

# Per-model list of values, aligned with `features`.
mod_dict = {m: [viz.loc[f, m] for f in features] for m in models}

data = {**{'features': features}, **mod_dict}

if normalize:
    for model in models:
        values = data[model]
        if not values:
            continue  # nothing to rescale
        # Compute the bounds once per model instead of once per element.
        lowest = min(values)
        span = max(values) - lowest
        if span == 0:
            # All values equal: avoid dividing by zero, plot a flat zero bar.
            data[model] = [0.0 for _ in values]
        else:
            data[model] = list(pd.Series(values).apply(lambda v: (v - lowest) / span))

# Flatten to the (feature, model) factor layout expected by FactorRange;
# x, counts and colors_list are built in the same feature-major order.
x = []
counts = []
colors_list = []

for feat_idx, feat in enumerate(features):
    for mod_idx, mod in enumerate(models):
        x.append((feat, mod))
        counts.append(data[mod][feat_idx])
        # wrap around the palette if there are more models than colors
        colors_list.append(colors[mod_idx % len(colors)])

source = ColumnDataSource(data=dict(x=x, counts=counts, color=colors_list))

# Only claim normalization in the title when it was actually applied.
chart_title = 'Normalized feature importances' if normalize else 'Feature importances'

p = figure(x_range=FactorRange(*x), plot_width=1200, plot_height=500, title=chart_title,
           toolbar_location=None, tools="")

p.vbar(x='x', top='counts', width=0.9, source=source, color='color')

p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1.55
p.xaxis.major_tick_line_color = None
# Hide the per-bar (model) labels; only the feature group labels are shown.
p.xaxis.major_label_text_font_size = '0pt'
p.xaxis.group_text_font_size = '10pt'
p.xaxis.group_label_orientation = 1.57
p.xgrid.grid_line_color = None

show(p)

## Gridplot: combined dashboard of all charts

In [20]:
# Final dashboard: spreadsheet on top, then validation ROC + fold histograms,
# then test ROC + spider chart, and the feature chart at the bottom.
dashboard_rows = [
    [ss],
    [row(val_roc, folds)],
    [row(test_roc, spider)],
    [p],
]
l = gridplot(dashboard_rows)
show(l)

In [23]:
from bokeh.io import export_png

# Save the dashboard as '<experiment>.png' in the current working directory.
png_name = '{}.png'.format(experiment)
export_png(l, filename=png_name)

'C:\\Users\\DavideMariani\\Tradeteq Dropbox\\Davide Mariani\\thesis_project\\networkAnalysisForML\\enriched_time_seq20000_5000_imp.png'