In [1]:
import ast
import json

import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML

In [93]:
def format_dataset(results, is_highlight = True, color = "navy"):
    """Convert one raw result mapping into a per-encoder score table.

    Parameters
    ----------
    results : dict
        One entry of a loaded results file. Every item whose value is a dict
        is taken as the metrics of one encoder; its key must be the string
        form of a Python literal sequence whose first element is the encoder
        name. The ``'model_name'`` item, if present, supplies the table name.
        All other items are ignored.
    is_highlight : bool, default True
        When true, return a ``Styler`` with a background gradient instead of
        the plain DataFrame.
    color : str, default "navy"
        Base colour handed to ``seaborn.light_palette`` for the gradient.

    Returns
    -------
    pandas.DataFrame or pandas Styler
        Rows are encoder names, columns are ``train_score``, ``val_score``,
        ``test_score`` and ``time``. The plain DataFrame carries the model
        name in its ``name`` attribute (``None`` if no ``'model_name'`` item
        was found).

    Raises
    ------
    ValueError, SyntaxError
        If an encoder key is not a valid Python literal.
    KeyError
        If any of the four expected metric columns is missing.
    """
    formatted_result = {}
    # Default so that a mapping without 'model_name' does not crash below.
    name = None
    for key, item in results.items():
        if isinstance(item, dict):
            # literal_eval only parses literals, so a crafted key in the JSON
            # file cannot execute code the way eval() would.
            enc_name = ast.literal_eval(key)[0]
            formatted_result[enc_name] = item
        elif key == 'model_name':
            name = item

    table = pd.DataFrame.from_dict(formatted_result).T
    table = table[['train_score', 'val_score', 'test_score', 'time']]
    # Plain instance attribute (not a column); it is read back as `result.name`
    # by callers and is not propagated by pandas to derived frames.
    table.name = name

    if is_highlight:
        cm = sns.light_palette(color, as_cmap=True)
        return table.style.background_gradient(cmap=cm)

    return table

def open_dataset_validation(dataset_name, validation_type):
    """Load the stored results for one dataset / validation-type pair.

    Reads ``./results/<dataset_name>_<validation_type>.json`` (relative to the
    current working directory) and returns the parsed JSON content.
    """
    results_path = f'./results/{dataset_name}_{validation_type}.json'
    with open(results_path, "r") as json_file:
        return json.load(json_file)

def print_highlighted_data(data):
    """Display every result mapping in ``data`` as a gradient-highlighted table.

    Parameters
    ----------
    data : iterable of dict
        Result mappings, each in the form accepted by ``format_dataset``.
    """
    for result in data:
        # A Styler renders itself as HTML when displayed, so no explicit
        # ``.render()`` call is needed (that method was removed in pandas 2.0).
        display(format_dataset(result, is_highlight=True))
        
def get_list_of_datasets(data):
    """Return the plain (unstyled) score table for every result mapping in ``data``.

    Each element of ``data`` is passed to ``format_dataset`` with highlighting
    switched off; the resulting tables are returned in the same order.
    """
    return [format_dataset(entry, is_highlight=False) for entry in data]

def highlight_min_col(x):
    """Return per-cell CSS that marks the minimum of ``x`` in orange.

    Intended for ``Styler.apply``: ``x`` is one column or row as a Series.
    Every element equal to the minimum gets ``background-color: #FF8500``;
    all others get an empty style string.
    """
    # Compute the minimum once instead of once per element.
    smallest = x.min()
    return ['background-color: #FF8500' if v == smallest else '' for v in x]

def highlight_min_row(x):
    """Return per-cell CSS that marks the minimum of ``x`` in blue.

    Intended for ``Styler.apply``: ``x`` is one row or column as a Series.
    Every element equal to the minimum gets ``background-color: #3AA6D0``;
    all others get an empty style string.
    """
    # Compute the minimum once instead of once per element.
    smallest = x.min()
    return ['background-color: #3AA6D0' if v == smallest else '' for v in x]

def highlight_max_col(x):
    """Return per-cell CSS that marks the maximum of ``x`` in blue.

    Intended for ``Styler.apply``: ``x`` is one column or row as a Series.
    Every element equal to the maximum gets ``background-color: #3AA6D0``;
    all others get an empty style string.
    """
    # Compute the maximum once instead of once per element.
    largest = x.max()
    return ['background-color: #3AA6D0' if v == largest else '' for v in x]

def highlight_max_row(x):
    """Return per-cell CSS that marks the maximum of ``x`` in red.

    Intended for ``Styler.apply``: ``x`` is one row or column as a Series.
    Every element equal to the maximum gets ``background-color: red``;
    all others get an empty style string.
    """
    # Compute the maximum once instead of once per element.
    largest = x.max()
    return ['background-color: red' if v == largest else '' for v in x]

def get_df_results(results):
    """Regroup per-model score tables into one table per metric.

    Parameters
    ----------
    results : iterable of pandas.DataFrame
        One table per model, each with the columns ``train_score``,
        ``val_score``, ``test_score`` and ``time`` and a ``name`` attribute
        holding the model name.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df, time)``. In each frame the rows are the
        row labels of the input tables and there is one column per model,
        labelled with that model's ``name``. Each frame has its ``name``
        attribute set to ``'train'``, ``'val'``, ``'test'`` or ``'time'``.
        All four frames are empty when ``results`` is empty.
    """
    results = list(results)
    names = [result.name for result in results]

    def _collect(column, label):
        # Put the chosen metric column of every model side by side.
        if results:
            # A single concat replaces the repeated DataFrame.append calls,
            # which were quadratic and no longer exist in pandas >= 2.0.
            frame = pd.concat([result[column] for result in results], axis=1)
            frame.columns = names
        else:
            frame = pd.DataFrame()
        # Plain instance attribute, read by the display cells as `df.name`.
        frame.name = label
        return frame

    train_df = _collect('train_score', 'train')
    val_df = _collect('val_score', 'val')
    test_df = _collect('test_score', 'test')
    time = _collect('time', 'time')

    return train_df, val_df, test_df, time

In [26]:
# Dataset whose result files are analysed; used to build the results file name.
dataset_name = 'mimic'
# Validation-type labels; each one selects a separate results file and gets its own table below.
validation_types = ['None', 'Single', 'Double']

In [96]:
# Let's look at the training set.
for validation_type in validation_types:
    data = open_dataset_validation(dataset_name, validation_type)
    results = get_list_of_datasets(data)
    train_df, val_df, test_df, time = get_df_results(results)

    print(f'Dataset : {dataset_name}')
    print(f'\t Validation type: {validation_type}')
    print(f'\t Result name : {train_df.name}')
    # Mark the smallest value of every row and of every column.
    highlight_data = train_df.style.apply(highlight_min_row, axis=1).apply(highlight_min_col, axis=0)
    # Display the Styler itself: it renders as HTML, and Styler.render()
    # was removed in pandas 2.0.
    display(highlight_data)

Dataset : mimic
	 Validation type: None
	 Result name : train


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
BackwardDifferenceEncoder,138.895,206.741,0,14.4394,210.45,37.1765
FrequencyEncoder,143.703,205.105,0,14.4379,215.698,36.5035
HelmertEncoder,138.895,206.705,0,14.4056,189.556,36.0099
JamesSteinEncoder,146.034,206.714,0,14.4346,178.821,37.0423
LeaveOneOutEncoder,145.784,206.685,0,0.132327,161.184,0.0598766
MEstimateEncoder,145.701,206.682,0,14.435,170.101,37.0423
OneHotEncoder,138.895,206.743,0,14.3371,214.858,36.1626
OrdinalEncoder,146.643,206.737,0,14.4152,170.77,35.754
SumEncoder,138.895,206.736,0,14.3487,187.996,36.1626
TargetEncoder,145.702,206.681,0,14.435,157.942,37.0423


Dataset : mimic
	 Validation type: Single
	 Result name : train


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
BackwardDifferenceEncoder,138.895,206.742,0,14.4647,212.18,36.7368
CatBoostEncoder,145.7,206.671,0,15.3024,191.309,48.8308
FrequencyEncoder,143.704,205.562,0,14.4378,215.69,36.5035
HelmertEncoder,138.895,206.711,0,14.3964,205.409,35.72
JamesSteinEncoder,146.024,206.715,0,14.4212,178.403,37.0715
LeaveOneOutEncoder,145.792,206.686,0,0.158855,177.85,6.70418
MEstimateEncoder,145.69,206.682,0,14.4292,170.611,36.9559
OneHotEncoder,138.895,206.743,0,14.3351,202.243,36.1626
OrdinalEncoder,145.69,206.722,0,14.4348,171.897,35.754
SumEncoder,138.895,206.738,0,14.3427,214.632,36.1626


Dataset : mimic
	 Validation type: Double
	 Result name : train


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
CatBoostEncoder,145.77,206.679,0,16.172,161.516,31.0544
FrequencyEncoder,143.704,205.956,0,15.9968,215.693,92.071
JamesSteinEncoder,146.101,206.72,0,16.1306,176.481,33.23
LeaveOneOutEncoder,145.771,206.679,0,16.1897,167.103,33.8058
MEstimateEncoder,145.771,206.679,0,16.1756,154.605,31.1536
TargetEncoder,145.771,206.679,0,16.1897,158.954,33.8058


Данные результаты показывают, что алгоритмы, основанные на решающих деревьях, дают низкие значения ошибки на обучающей выборке; однако если мы посмотрим на валидационную выборку, то увидим, что такие алгоритмы подверглись нежелательному эффекту переобучения.

In [97]:
# Validation-set scores for every validation type.
for validation_type in validation_types:
    data = open_dataset_validation(dataset_name, validation_type)
    results = get_list_of_datasets(data)
    train_df, val_df, test_df, time = get_df_results(results)

    print(f'Dataset : {dataset_name}')
    print(f'\t Validation type: {validation_type}')
    print(f'\t Result name : {val_df.name}')
    # Mark the smallest value of every row and of every column.
    highlight_data = val_df.style.apply(highlight_min_row, axis=1).apply(highlight_min_col, axis=0)
    # Display the Styler itself: it renders as HTML, and Styler.render()
    # was removed in pandas 2.0.
    display(highlight_data)

Dataset : mimic
	 Validation type: None
	 Result name : val


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
BackwardDifferenceEncoder,139.314,215.865,193.721,99.8578,209.799,96.5202
FrequencyEncoder,143.984,213.953,196.688,99.9143,215.697,96.3937
HelmertEncoder,139.314,215.725,199.217,99.7212,190.579,96.3305
JamesSteinEncoder,146.292,215.836,191.424,99.2783,179.15,96.5156
LeaveOneOutEncoder,146.043,215.83,1.118,0.640181,161.546,1.19239
MEstimateEncoder,145.96,215.824,191.426,99.2799,169.482,96.5156
OneHotEncoder,139.314,215.868,197.14,99.5929,214.888,95.8703
OrdinalEncoder,146.911,215.831,200.436,99.9692,170.503,95.5177
SumEncoder,139.314,215.862,200.568,99.3816,188.411,95.8702
TargetEncoder,145.96,215.824,191.426,99.2799,159.079,96.5156


Dataset : mimic
	 Validation type: Single
	 Result name : val


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
BackwardDifferenceEncoder,139.314,215.861,200.62,99.7537,212.528,96.2951
CatBoostEncoder,146.181,215.792,225.303,117.532,190.065,114.936
FrequencyEncoder,143.984,214.379,196.687,99.9147,215.689,96.3937
HelmertEncoder,139.314,215.749,206.022,99.4305,206.07,96.2178
JamesSteinEncoder,146.312,215.837,189.692,99.413,180.83,96.3786
LeaveOneOutEncoder,145.986,215.827,216.391,212.438,177.35,205.581
MEstimateEncoder,145.976,215.826,188.093,99.4722,169.78,96.5874
OneHotEncoder,139.314,215.868,199.159,99.5243,202.749,95.8703
OrdinalEncoder,145.872,215.847,196.512,99.8974,171.218,95.5177
SumEncoder,139.314,215.868,197.954,99.418,215.466,95.8702


Dataset : mimic
	 Validation type: Double
	 Result name : val


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
CatBoostEncoder,145.982,215.823,204.444,109.148,162.253,103.89
FrequencyEncoder,150.421,222.863,379.59,158.24,216.216,127.899
JamesSteinEncoder,146.318,215.835,205.433,109.618,177.403,104.166
LeaveOneOutEncoder,145.983,215.823,206.832,109.058,167.847,104.027
MEstimateEncoder,145.982,215.823,203.571,109.196,155.761,103.933
TargetEncoder,145.983,215.823,206.832,109.058,158.889,104.027


In [98]:
# Test-set scores for every validation type. The tables are also collected in
# `test_list` (one per validation type, in `validation_types` order) because
# the summary cells further down aggregate them.
test_list = []
for validation_type in validation_types:
    data = open_dataset_validation(dataset_name, validation_type)
    results = get_list_of_datasets(data)
    train_df, val_df, test_df, time = get_df_results(results)

    test_list.append(test_df)
    print(f'Dataset : {dataset_name}')
    print(f'\t Validation type: {validation_type}')
    print(f'\t Result name : {test_df.name}')
    # Mark the smallest value of every row and of every column.
    highlight_data = test_df.style.apply(highlight_min_row, axis=1).apply(highlight_min_col, axis=0)
    # Display the Styler itself: it renders as HTML, and Styler.render()
    # was removed in pandas 2.0.
    display(highlight_data)

Dataset : mimic
	 Validation type: None
	 Result name : test


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
BackwardDifferenceEncoder,83.886,114.397,253.053,113.014,116.766,85.3485
FrequencyEncoder,79.5718,115.378,253.572,85.255,116.483,74.9897
HelmertEncoder,77.3658,114.397,221.148,86.3063,79.4962,76.0341
JamesSteinEncoder,83.7425,114.397,398.701,111.25,122.172,87.6164
LeaveOneOutEncoder,81.3309,114.397,118.17,117.894,83.5761,117.53
MEstimateEncoder,83.3314,114.397,398.701,111.25,85.9935,87.6164
OneHotEncoder,83.886,114.397,229.358,95.6498,116.733,84.8071
OrdinalEncoder,100.181,114.397,268.801,85.0247,85.2655,85.7505
SumEncoder,77.3658,114.397,245.434,86.7759,89.2989,74.8041
TargetEncoder,83.3319,114.397,398.701,111.25,101.237,87.6164


Dataset : mimic
	 Validation type: Single
	 Result name : test


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
BackwardDifferenceEncoder,77.3658,114.397,239.393,84.494,117.735,75.9018
CatBoostEncoder,80.7028,114.397,219.27,85.4648,104.588,73.4174
FrequencyEncoder,79.5648,115.323,253.588,85.2524,116.338,74.9897
HelmertEncoder,77.3658,114.397,228.985,86.3876,115.005,74.2031
JamesSteinEncoder,81.6168,114.397,222.607,84.8551,191.12,76.5161
LeaveOneOutEncoder,81.2863,114.397,118.238,118.009,111.164,117.643
MEstimateEncoder,81.2353,114.397,210.099,86.7404,75.8812,74.9903
OneHotEncoder,77.3658,114.397,247.186,86.0236,99.3873,74.8041
OrdinalEncoder,80.2765,114.397,229.016,84.8798,109.886,75.3688
SumEncoder,77.3658,114.397,244.34,86.3527,206.386,74.8041


Dataset : mimic
	 Validation type: Double
	 Result name : test


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
CatBoostEncoder,81.2803,114.397,227.465,79.5997,80.3438,71.7553
FrequencyEncoder,81.1614,114.976,419.356,116.292,118.662,88.2224
JamesSteinEncoder,81.6414,114.397,206.091,80.2682,78.4088,73.2844
LeaveOneOutEncoder,81.2807,114.397,244.362,79.6565,107.507,72.6443
MEstimateEncoder,81.2803,114.397,227.465,79.5997,80.3438,71.7553
TargetEncoder,81.2815,114.397,244.362,79.6565,92.6553,72.6443


In [94]:
# Timing table for every validation type.
for validation_type in validation_types:
    data = open_dataset_validation(dataset_name, validation_type)
    results = get_list_of_datasets(data)
    train_df, val_df, test_df, time = get_df_results(results)

    print(f'Dataset : {dataset_name}')
    print(f'\t Validation type: {validation_type}')
    print(f'\t Result name : {time.name}')
    # Mark the largest value of every row and of every column.
    highlight_data = time.style.apply(highlight_max_row, axis=1).apply(highlight_max_col, axis=0)
    # Display the Styler itself: it renders as HTML, and Styler.render()
    # was removed in pandas 2.0.
    display(highlight_data)

Dataset : mimic
	 Validation type: None
	 Result name : time


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
BackwardDifferenceEncoder,1.42031,13.1031,2.8175,29.5544,20.7538,88.4232
FrequencyEncoder,1.47396,22.1392,2.18576,22.4814,16.0681,58.1082
HelmertEncoder,1.29019,13.7128,3.09291,32.1732,25.0576,65.055
JamesSteinEncoder,0.906109,11.6156,2.03536,19.2994,19.3927,76.5507
LeaveOneOutEncoder,0.944601,12.6252,1.1588,11.5487,22.5216,87.5686
MEstimateEncoder,0.98679,11.5086,2.61247,22.9022,24.5837,80.2929
OneHotEncoder,2.05254,13.7011,3.0976,36.1307,23.7428,75.0894
OrdinalEncoder,1.30625,12.1711,2.67174,19.5394,25.8629,94.9158
SumEncoder,1.5871,14.0857,2.64591,29.2619,17.6508,64.6922
TargetEncoder,0.789698,11.2238,2.33943,20.6454,26.183,81.5447


Dataset : mimic
	 Validation type: Single
	 Result name : time


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
BackwardDifferenceEncoder,2.66837,15.3138,3.96398,30.2204,22.8458,67.585
CatBoostEncoder,2.90801,14.2007,5.96006,52.2576,23.9525,58.5544
FrequencyEncoder,2.84291,19.1313,4.33423,24.2463,19.1661,101.499
HelmertEncoder,3.0712,14.3387,4.3187,34.9682,22.772,96.1568
JamesSteinEncoder,1.47237,13.3474,2.62609,23.2903,21.8387,57.6292
LeaveOneOutEncoder,2.14425,13.3068,2.52665,13.5964,24.8521,20.167
MEstimateEncoder,1.40702,14.698,3.27175,23.3074,33.1961,72.2078
OneHotEncoder,3.74132,16.1672,5.4486,32.3455,27.7844,61.9731
OrdinalEncoder,1.45259,12.3249,2.69628,23.0087,30.2053,63.1075
SumEncoder,3.09671,14.8992,4.267,33.1063,16.7471,87.7841


Dataset : mimic
	 Validation type: Double
	 Result name : time


Unnamed: 0,LinearRegression,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor,MLPRegressor,LGBMRegressor
CatBoostEncoder,34.1491,47.594,44.7895,73.2035,63.8067,130.534
FrequencyEncoder,24.4148,42.7832,26.0223,58.913,36.3952,49.9602
JamesSteinEncoder,15.9087,28.3159,19.3145,54.5897,38.8091,107.171
LeaveOneOutEncoder,25.818,37.6017,29.5039,61.8925,49.9825,118.545
MEstimateEncoder,14.6764,28.1939,19.5099,54.0915,43.1481,137.721
TargetEncoder,18.2096,31.5208,22.873,58.4895,47.7154,100.232


In [95]:
# Mean test score of every encoder (averaged over the model columns),
# one column per validation type.
test_results_encoders = [df.mean(axis=1) for df in test_list]

test_df_results_encoders = pd.concat(test_results_encoders, axis=1, sort=True)
# `test_list` was filled in `validation_types` order, so reuse those labels
# instead of hard-coding them a second time.
test_df_results_encoders.columns = list(validation_types)
# Despite the "_row" names, both helpers run per column here (axis=0):
# the minimum of each column is marked blue, the maximum red.
test_df_results_encoders.style.apply(highlight_min_row, axis=0).apply(highlight_max_row, axis=0)

Unnamed: 0,None,Single,Double
BackwardDifferenceEncoder,127.744,118.214,
CatBoostEncoder,,112.973,109.14
FrequencyEncoder,120.875,120.842,156.445
HelmertEncoder,109.125,116.057,
JamesSteinEncoder,152.98,128.519,105.682
LeaveOneOutEncoder,105.483,110.123,116.641
MEstimateEncoder,146.882,107.224,109.14
OneHotEncoder,120.805,116.527,
OrdinalEncoder,123.237,115.637,
SumEncoder,114.679,133.941,


In [90]:
# Mean test score of every model (averaged over the encoder rows),
# one column per validation type.
test_results_methods = [df.mean(axis=0) for df in test_list]

test_df_results_methods = pd.concat(test_results_methods, axis=1, sort=True)
# `test_list` was filled in `validation_types` order, so reuse those labels
# instead of hard-coding them a second time.
test_df_results_methods.columns = list(validation_types)
# Despite the "_row" names, both helpers run per column here (axis=0):
# the minimum of each column is marked blue, the maximum red.
test_df_results_methods.style.apply(highlight_min_row, axis=0).apply(highlight_max_row, axis=0)

Unnamed: 0,None,Single,Double
DecisionTreeRegressor,278.564,220.256,261.517
KNeighborsRegressor,114.495,114.481,114.493
LGBMRegressor,86.2113,78.8754,75.051
LinearRegression,83.3993,79.5803,81.3209
MLPRegressor,99.7023,121.36,92.9869
RandomForestRegressor,100.367,88.6546,85.8454
