# Imports and initialization of general parameters
***

In [1]:
from helpers.pareto_fairness import compute_pareto_metrics
from config.info import AGES, RACES, GENDERS, COMBS_BASELINE
from visualization.subgroup_distribution import plot_dist
from dataprocess.dataloader import load_data
from dataprocess.dataclass import Data
from config.get_args import get_args
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from lightning import seed_everything
from plotly.subplots import make_subplots
import numpy as np
import argparse
import pandas as pd
import random
import torch.nn as nn
import pickle

# Loss object handed to compute_pareto_metrics by check_metrics and check_scenario below
loss_fct = nn.CrossEntropyLoss()


# Auto reload part
%load_ext autoreload
%autoreload 2

# Load the desired data set
***

In [6]:
# General parameters
# Keys read from the dict returned by compute_pareto_metrics inside check_metrics
metrics_to_test = ['MMPF', 'MMPF_5', 'MMPF_10', 'MMPF_size_2', 'MMPF_adapted_2']
# Number of random 20% test subsets drawn per (task, cancer) combination
n_seeds = 100

def check_metrics(task, cancer, df : pd.DataFrame = None,
                  preds_path : str = 'results/preds/run_100/add_protected_atts_1/Baseline'):
    """
    Evaluate the Pareto fairness metrics on `n_seeds` random 20% subsets of the
    predictions stored for one (task, cancer) combination.

    Parameters
    ----------
    task, cancer : path components of the results file
        `{preds_path}/{task}/{cancer}/results.pkl`; also written to the
        `task` and `cancer` columns of the output.
    df : optional frame of previously collected metrics; when given, the new
        rows are appended to it and the concatenation is returned.
    preds_path : root directory of the prediction files.

    Returns
    -------
    pd.DataFrame with one row per random subset, one column per entry of the
    module-level `metrics_to_test`, plus `task` and `cancer`.

    Notes
    -----
    The seeds are drawn from the current state of the global `random`
    generator, so a run is reproducible only if that generator was seeded
    beforehand.
    """
    # Extract the results pkl file and keep a single row per subject.
    # NOTE: read_pickle unpickles the file; only point it at trusted results.
    results = pd.read_pickle(f'{preds_path}/{task}/{cancer}/results.pkl')
    results = results.drop_duplicates(subset = ['subj'], keep = 'first')

    # Get multiple seeds
    seeds = [random.randint(0, 10000) for _ in range(n_seeds)]

    # Loop invariants: the subject list and the size of the 20% test set
    all_subj = results.subj.to_list()
    test_size = int(len(all_subj) * 0.2)

    # The collected columns follow metrics_to_test, so both always stay in sync
    metrics = {m : [] for m in metrics_to_test}

    # Loop on multiple seeds
    for seed in seeds:
        # Shuffle a fresh copy so every draw starts from the same subject order
        random.seed(seed)
        subj = list(all_subj)
        random.shuffle(subj)
        test_subj = subj[:test_size]

        # Get the results of the test subset only
        test_results = results[results.subj.isin(test_subj)]
        test_metrics = compute_pareto_metrics(test_results, loss_fct, ['age_', 'race_', 'gender_'], all_only = True)

        # Add the metrics
        for m in metrics_to_test:
            metrics[m].append(test_metrics[m])

    # Get the sub data frame
    metrics_df = pd.DataFrame(data = metrics)
    metrics_df['task'] = task
    metrics_df['cancer'] = cancer

    # Concatenate with the metrics collected so far, if any
    if df is None:
        return metrics_df
    return pd.concat([df, metrics_df])

In [7]:
# Collect the resampled metrics of every baseline combination in `df`
# and store, per combination, the variance of each metric in `variance`
df = None
variance = {key : [] for key in ['task', 'cancer',
                                 'MMPF',
                                 'MMPF_size_2',
                                 'MMPF_5', 'MMPF_10',
                                 'MMPF_adapted_2']}
for comb in COMBS_BASELINE:
    # Track progress
    print(comb)

    # A combination is a (task, cancer) pair
    task, cancer = comb[0], comb[1]

    # Append the metrics of this combination to the running frame
    df = check_metrics(task, cancer, df = df)

    # Variance over the rows that belong to this combination only
    variance['task'].append(task)
    variance['cancer'].append(cancer)
    for m in metrics_to_test:
        variance[m].append(df.loc[(df.task == task) & (df.cancer == cancer), m].var())

['cancer_classification', 'coad_read_FS']
['cancer_classification', 'coad_read_PM']
['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']
['tumor_detection', 'brca']
['tumor_detection', 'coad']
['tumor_detection', 'kich']
['tumor_detection', 'kirc']
['tumor_detection', 'kirp']
['tumor_detection', 'luad']
['tumor_detection', 'lusc']
['tumor_detection', 'read']


In [8]:
# Box plot of the resampled values of every metric in metrics_to_test, coloured by cancer combination
px.box(df, y = metrics_to_test, color = 'cancer')

In [9]:
# Variance of each metric, summed over all combinations
for name in metrics_to_test:
    print(name, sum(variance[name]), sep = '   ')

MMPF   0.45776563345962784
MMPF_5   0.0752951379655899
MMPF_10   0.044745638866580936
MMPF_size_2   0.05510661670681563
MMPF_adapted_2   0.06335235791789501


In [10]:
# Variance of each metric, averaged over all combinations
for name in metrics_to_test:
    print(name, np.mean(variance[name]), sep = '   ')

MMPF   0.025431424081090434
MMPF_5   0.004183063220310552
MMPF_10   0.002485868825921163
MMPF_size_2   0.003061478705934202
MMPF_adapted_2   0.003519575439883056


In [11]:
# Median over all combinations of the variance of each metric
for name in metrics_to_test:
    print(name, np.median(variance[name]), sep = '   ')

MMPF   0.005026157054315721
MMPF_5   0.0019595076438375467
MMPF_10   0.0012368455664014647
MMPF_size_2   0.0011368059905134988
MMPF_adapted_2   0.0014709759720880403


In [13]:
# Bar chart of the variance of each metric, aggregated over all combinations with `func`
cols = ['Metrics', 'Mean variance']
func = np.mean
# Key in `variance` -> label shown on the x axis, in display order
metric_labels = {'MMPF' : 'MMPF',
                 'MMPF_5' : 'MMPF_5',
                 'MMPF_10' : 'MMPF_10',
                 'MMPF_adapted_2' : 'MMPF_adapted',
                 'MMPF_size_2' : 'MMPF_size'}
# Built in one shot and kept under its own name, so that `df` (the per-seed
# metrics frame read by the box-plot cell) is not overwritten
mean_var_df = pd.DataFrame([[label, func(variance[key])] for key, label in metric_labels.items()],
                           columns = cols)
fig = px.bar(mean_var_df, x = 'Metrics', y = 'Mean variance', color = 'Mean variance',
             color_continuous_scale = 'RdYlBu_r', text_auto = '.1e')
fig.update_layout(height = 400, width = 650, template = 'none')
fig.update_layout(barmode = 'group', bargap = 0.3, bargroupgap = 0.2)
fig.write_image("images/metric_5.pdf")
fig.show()

In [381]:
# Tabulate the collected variances: one row per (task, cancer) combination, one column per metric
var_df = pd.DataFrame(variance)
var_df

Unnamed: 0,task,cancer,MMPF,MMPF_size_2,MMPF_5,MMPF_10,MMPF_adapted_2
0,cancer_classification,coad_read_FS,0.07914273,0.02241362,0.02322726,0.008375218,0.02004041
1,cancer_classification,coad_read_PM,0.05316986,0.02053336,0.02861602,0.01761415,0.02087126
2,cancer_classification,kich_kirc_FS,0.002084677,0.0003731179,0.0004447973,0.0003962178,0.000433146
3,cancer_classification,kich_kirc_PM,0.0440756,0.0004603398,0.002126053,0.0008180326,0.0009311117
4,cancer_classification,kich_kirp_FS,0.002916986,0.002031086,0.002916989,0.00186664,0.002180224
5,cancer_classification,kich_kirp_PM,0.003485097,0.003485097,0.003485097,0.002946098,0.003485097
6,cancer_classification,kirc_kirp_FS,0.0003697142,0.0003697142,0.0003697142,0.0003397931,0.0003697142
7,cancer_classification,kirc_kirp_PM,0.004885879,0.001038062,0.002493725,0.001466405,0.001466405
8,cancer_classification,luad_lusc_FS,0.1431994,0.001091704,0.008939993,0.002130084,0.003850535
9,cancer_classification,luad_lusc_PM,0.1384665,0.00349763,0.01092352,0.003761663,0.007866398


In [382]:
# Bar chart of the MMPF_10 variance, restricted to the cancer-classification combinations
red = var_df.loc[var_df['task'] == 'cancer_classification']
fig = go.Figure(data = go.Bar(x = red['cancer'], y = red['MMPF_10']))
fig.show()

In [383]:
# One line per metric in metrics_to_test: variance per combination, drawn on a logarithmic y axis
fig = px.line(var_df, x = 'cancer', y = metrics_to_test, log_y=True)
fig.show()

# Create manual scenarios
***

In [67]:
#### Build the distribution of the data among the different subgroups
# Initialization
metrics_to_test = ['MMPF', 'MMPF_5', 'MMPF_10', 'MMPF_size', 'MMPF_adapted']
attributes = ['att_1', 'att_2']
cols = attributes + ['pred_raw', 'label', 'pred']
range_atts = [[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]
# nb_patients_sg[att_1][att_2] is the number of synthetic patients in that subgroup
nb_patients_sg = [[1 , 16, 50 , 61 , 42, 23], 
                  [3 , 2 , 1  , 7  , 15, 14], 
                  [8 , 3 , 3  , 1  , 5 , 1], 
                  [37, 93, 136, 120, 67, 11], 
                  [2 , 12, 22 , 18 , 15, 1], 
                  [1 , 1 , 2  , 4  , 1 , 2]]

# Loop on attributes: one frame per subgroup, collected in a list and
# concatenated once (no repeated concat onto an initially empty frame).
# Labels are drawn from the global NumPy generator, so they are reproducible
# only if that generator was seeded beforehand.
sub_dfs = []
for att_1 in range_atts[0]:
    for att_2 in range_atts[1]:
        nb = nb_patients_sg[att_1][att_2]
        sub_dfs.append(pd.DataFrame({'att_1' : [att_1] * nb,
                                     'att_2' : [att_2] * nb,
                                     'pred_raw' : [None] * nb,
                                     'label' : list(np.random.binomial(1, 0.5, nb)),
                                     'pred' : [None] * nb}))

# The 0..nb-1 index of every subgroup is kept on purpose (no ignore_index):
# build_scenario assigns a default-indexed Series through .loc, which aligns on these labels
data_df = pd.concat(sub_dfs)[cols].astype({'att_1' : 'int32',
                                           'att_2' : 'int32',
                                           'label' : 'int32'})

In [68]:
#### Create a fair and unfair scenario
def _fill_predictions(df : pd.DataFrame, ratio : np.ndarray):
    """
    Fill the `pred` and `pred_raw` columns of `df` in place so that the share
    of correct predictions in subgroup (att_1, att_2) is `ratio[att_1][att_2]`
    (rounded to the nearest whole number of rows).

    The attribute values of the module-level `range_atts` are used directly as
    indices into `ratio`.
    """
    for att_1 in range_atts[0]:
        for att_2 in range_atts[1]:

            # Rows of the current subgroup
            cond = (df.att_1 == att_1) & (df.att_2 == att_2)
            n_sub = int(cond.sum())

            # Error flags: 0 for a correct prediction, 1 for a wrong one
            n_good = int(n_sub * ratio[att_1][att_2] + 0.5)
            errors = np.ones(n_sub, int)
            errors[:n_good] = 0
            random.shuffle(errors)

            # |error - label| keeps the binary label where error is 0 and flips it where error is 1
            sub_preds = abs(errors - np.array(df[cond].label))
            df.loc[cond, 'pred'] = sub_preds

            # Raw scores: the predicted class gets a value in [0.5, 1), the other one a value in [0, 0.5)
            poss = np.random.uniform(0.5, 1, n_sub)
            negs = np.random.uniform(0, 0.5, n_sub)
            sub_preds_raw = []
            for p, pos, neg in zip(sub_preds, poss, negs):
                raw = [0, 0]
                raw[int(p)] = pos
                raw[int(1-p)] = neg
                sub_preds_raw += [raw]

            # Explicit index: the assignment aligns on labels, so the Series must
            # carry the labels of the selected rows whatever the index of `df` is
            df.loc[cond, 'pred_raw'] = pd.Series(sub_preds_raw, index = df.index[cond])


def build_scenario(data_df : pd.DataFrame, seed : int,
                   acc_pos_f : float = 0.7, acc_pos_u : float = 0.9,
                   acc_neg : float = 0.5, sigma : float = 0.05):
    """
    Build a fair and an unfair set of synthetic predictions for `data_df`.

    In the fair scenario the target accuracy of every subgroup is drawn from
    N(acc_pos_f, sigma). In the unfair scenario half of the subgroups draw it
    from N(acc_pos_u, sigma) and the other half from N(acc_neg, sigma), the
    assignment to subgroups being shuffled. Drawn values >= 1 are set to 0.99.

    Parameters
    ----------
    data_df : frame with `att_1`, `att_2`, `label`, `pred` and `pred_raw`
        columns; it is copied, not modified.
    seed : value passed to `seed_everything` before any random draw.
    acc_pos_f, acc_pos_u, acc_neg, sigma : means and standard deviation of
        the accuracy distributions described above.

    Returns
    -------
    (df_f, df_u, fair_ratio, unfair_ratio): the fair and unfair frames, each
    with a fresh 0..n-1 index and a leading `subj` column holding that same
    numbering, and the two grids of target accuracies.
    """
    # Initialization
    seed_everything(seed)
    df_f = data_df.copy()
    df_u = data_df.copy()
    grid = (len(range_atts[0]), len(range_atts[1]))
    n_cells = grid[0] * grid[1]

    # Fair scenario: one accuracy distribution for all the subgroups
    fair_ratio = np.random.normal(acc_pos_f, sigma, n_cells).reshape(grid)
    fair_ratio[fair_ratio >= 1] = 0.99
    _fill_predictions(df_f, fair_ratio)

    # Unfair scenario: half of the subgroups get a high accuracy, the other half a low one
    n_high = n_cells // 2
    unfair_ratio1 = np.random.normal(acc_pos_u, sigma, n_high)
    unfair_ratio2 = np.random.normal(acc_neg, sigma, n_cells - n_high)
    unfair_ratio1[unfair_ratio1 >= 1] = 0.99
    unfair_ratio2[unfair_ratio2 >= 1] = 0.99
    unfair_ratio = list(np.concatenate((unfair_ratio1, unfair_ratio2)))
    random.shuffle(unfair_ratio)
    unfair_ratio = np.array(unfair_ratio).reshape(grid)
    # Each frame is filled from its own labels (both are copies of data_df)
    _fill_predictions(df_u, unfair_ratio)

    # Reset indices and expose the new row number as the subject identifier
    df_f = df_f.reset_index(drop = True).reset_index(drop = False, names = 'subj')
    df_u = df_u.reset_index(drop = True).reset_index(drop = False, names = 'subj')

    # Return fair and unfair
    return df_f, df_u, fair_ratio, unfair_ratio

In [69]:
def check_scenario(df_f, df_u, seed):
    """
    Compare the fairness metrics of the fair and unfair scenarios on one
    random test subset that is sampled subgroup by subgroup.

    Subgroups with one or two subjects contribute nothing to the test subset,
    those with three or four contribute a single subject, and larger ones
    contribute 20% of their subjects (rounded down). The same subjects are
    then selected in both frames.

    Returns a dict holding, for every metric of `metrics_to_test`, 1 when the
    fair value is strictly lower than the unfair one and 0 otherwise.
    """
    # Initialization
    seed_everything(seed)
    success = {'MMPF' : [],
               'MMPF_size' : [],
               'MMPF_5' : [], 'MMPF_10' : [],
               'MMPF_adapted' : []}

    # Subjects held out for the test subset
    held_out = []

    # Loop on the unique subgroups present in the data set
    for subgroup in np.unique(df_f[attributes].values, axis = 0):

        # Rows whose attributes all match this subgroup
        masks = [df_f[att] == value for att, value in zip(attributes, subgroup)]
        in_subgroup = masks[0]
        for extra in masks[1:]:
            in_subgroup = in_subgroup & extra
        subjects = list(df_f[in_subgroup].subj)

        # Shuffle, then pick the share that goes to the test subset
        random.shuffle(subjects)
        n_subj = len(subjects)
        if n_subj in (3, 4):
            held_out.append(subjects[-1])
        elif n_subj >= 5:
            held_out.extend(subjects[: int(0.2 * n_subj)])

    # Metrics of both scenarios on the held-out subjects
    in_test_f = df_f.subj.isin(held_out)
    in_test_u = df_u.subj.isin(held_out)
    test_metrics_f = compute_pareto_metrics(df_f[in_test_f], loss_fct, attributes, all_only = True)
    test_metrics_u = compute_pareto_metrics(df_u[in_test_u], loss_fct, attributes, all_only = True)

    # A metric succeeds when it ranks the fair scenario below the unfair one
    for m in metrics_to_test:
        success[m] = 1 if test_metrics_f[m] < test_metrics_u[m] else 0

    return success

In [None]:
# Initialization: one success counter per tested metric
nb_success = {m : 0 for m in metrics_to_test}
n_scenario = 10
n_splits = 10
n = 0
# Integer bound: random.randint rejects float arguments such as 1e7 on Python >= 3.12
max_seed = 10**7

# Get multiple seeds, one per scenario
seeds_scenario = [random.randint(0, max_seed) for _ in range(n_scenario)]

# Loop on the number of scenarios
for seed_sc in seeds_scenario:

    # Get scenario
    df_f, df_u, _, _ = build_scenario(data_df, seed_sc)

    # Get multiple seeds, one per split. build_scenario has just called
    # seed_everything(seed_sc), which presumably re-seeds `random`, so these
    # seeds would be determined by seed_sc - TODO confirm
    seeds_splits = [random.randint(0, max_seed) for _ in range(n_splits)]

    # Loop on the splits
    for seed_sp in seeds_splits:
        print(n)
        n += 1

        # Check scenario
        success = check_scenario(df_f, df_u, seed_sp)

        # Add success and report the running success rate of every metric
        nb_success['n'] = n
        for m in metrics_to_test:
            nb_success[m] += success[m]
            print(m, ' ', nb_success[m]/n)

    # Save dict after every scenario so that an interrupted run keeps its counts
    with open('success.pkl', 'wb') as fp:
        pickle.dump(nb_success, fp)
        print('dictionary saved successfully to file')

In [None]:
# Display the success counters (and the run counter stored under 'n') filled by the scenario loop
nb_success

In [None]:
# Print, for every (att_1, att_2) subgroup, 1 - MSE between label and pred in frames `a` and `b`.
# NOTE(review): `a` and `b` are not assigned in any earlier visible cell (hidden kernel state);
# presumably the fair and unfair frames returned by build_scenario - confirm.
# A later cell rebinds `b` to the dict loaded from success.pkl.
for att_1 in range_atts[0]:
    for att_2 in range_atts[1]:
        cond = (a.att_1 == att_1) & (a.att_2 == att_2)
        print(att_1, ' ', att_2, ' ', 1-mean_squared_error(a[cond].label, a[cond].pred), 1-mean_squared_error(b[cond].label, b[cond].pred))

In [351]:
# Reload the counters written to success.pkl by the scenario loop and display them.
# NOTE: pickle.load can execute arbitrary code; only open files produced by this notebook.
# NOTE(review): this rebinds `b`, which the subgroup-accuracy cell above indexes as a DataFrame.
with open('success.pkl', 'rb') as fp:
    b = pickle.load(fp)
b

{'MMPF': 115742,
 'MMPF_5': 140993,
 'MMPF_10': 143646,
 'MMPF_size': 136240,
 'MMPF_size_2': 141351,
 'MMPF_adapted': 134103,
 'MMPF_adapted_2': 140006,
 'n': 144400}

# Plots
***

In [70]:
#### Build the distribution of the data among the different subgroups
# Initialization
metrics_to_test = ['MMPF', 'MMPF_5', 'MMPF_10', 'MMPF_size', 'MMPF_adapted']
attributes = ['att_1', 'att_2']
cols = attributes + ['pred_raw', 'label', 'pred']
range_atts = [[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]
# nb_patients_sg[att_1][att_2] is the number of synthetic patients in that subgroup
nb_patients_sg = [[1 , 16, 50 , 61 , 42, 23], 
                  [3 , 2 , 1  , 7  , 15, 14], 
                  [8 , 3 , 3  , 1  , 5 , 1], 
                  [37, 93, 136, 120, 67, 11], 
                  [2 , 12, 22 , 18 , 15, 1], 
                  [1 , 1 , 2  , 4  , 1 , 2]]

# Loop on attributes: one frame per subgroup, collected in a list and
# concatenated once (no repeated concat onto an initially empty frame).
# Labels are drawn from the global NumPy generator before build_scenario
# seeds it, so they are reproducible only if it was seeded beforehand.
sub_dfs = []
for att_1 in range_atts[0]:
    for att_2 in range_atts[1]:
        nb = nb_patients_sg[att_1][att_2]
        sub_dfs.append(pd.DataFrame({'att_1' : [att_1] * nb,
                                     'att_2' : [att_2] * nb,
                                     'pred_raw' : [None] * nb,
                                     'label' : list(np.random.binomial(1, 0.5, nb)),
                                     'pred' : [None] * nb}))

# The 0..nb-1 index of every subgroup is kept on purpose (no ignore_index):
# build_scenario assigns a default-indexed Series through .loc, which aligns on these labels
data_df = pd.concat(sub_dfs)[cols].astype({'att_1' : 'int32',
                                           'att_2' : 'int32',
                                           'label' : 'int32'})

# Scenario pair used by all the figures of this section
fair, unfair, _, _ = build_scenario(data_df, 1200)

Seed set to 1200


In [71]:
# Preview of the fair scenario frame returned by build_scenario
fair

Unnamed: 0,subj,att_1,att_2,pred_raw,label,pred
0,0,0,0,"[0.19801931800394595, 0.885074334725312]",1,1
1,1,0,1,"[0.3449263807634206, 0.6277834696475809]",1,1
2,2,0,1,"[0.46446621213581657, 0.6821174658701089]",1,1
3,3,0,1,"[0.03082195202146587, 0.9954021422942199]",1,1
4,4,0,1,"[0.5550770996127448, 0.016046562898626993]",1,0
...,...,...,...,...,...,...
796,796,5,3,"[0.13182926218457564, 0.5376363205060805]",1,1
797,797,5,3,"[0.305522582038011, 0.6350283472305764]",1,1
798,798,5,4,"[0.0041329075416220995, 0.6897971441357291]",1,1
799,799,5,5,"[0.9546980941754606, 0.041473930010521076]",0,0


In [72]:
# Preview of the unfair scenario frame returned by build_scenario
unfair

Unnamed: 0,subj,att_1,att_2,pred_raw,label,pred
0,0,0,0,"[0.6831665002930872, 0.4712336368928336]",1,0
1,1,0,1,"[0.37417620678308744, 0.7154597900328423]",1,1
2,2,0,1,"[0.5547978965237172, 0.3903418115727909]",1,0
3,3,0,1,"[0.38623361936554895, 0.911354126486787]",1,1
4,4,0,1,"[0.30458747133966735, 0.566403152962637]",1,1
...,...,...,...,...,...,...
796,796,5,3,"[0.2740815452971961, 0.6544998979127403]",1,1
797,797,5,3,"[0.23345041193050559, 0.9453448854710214]",1,1
798,798,5,4,"[0.5176439088044117, 0.24797860635059366]",1,0
799,799,5,5,"[0.7760896811512823, 0.09443736343118692]",0,0


In [85]:
def _subgroup_table(frame, cell_value):
    """
    Return a table with att_2 values as index and att_1 values as columns whose
    cell (att_2, att_1) holds `cell_value` applied to the rows of that subgroup.
    """
    table = pd.DataFrame(columns = range_atts[0], index = range_atts[1])
    for att_1 in range_atts[0]:
        for att_2 in range_atts[1]:
            cond = (frame.att_1 == att_1) & (frame.att_2 == att_2)
            # Row = att_2, column = att_1, as declared by the constructor above and
            # as the heatmaps label their axes (x: att_1, y: att_2)
            table.loc[att_2, att_1] = cell_value(frame[cond])
    return table


def _truncated_accuracy(sub):
    """1 - MSE between label and pred of a subgroup, truncated to two decimals."""
    return int((1 - mean_squared_error(sub.label, sub.pred))*100)/100


# Dist: number of rows per subgroup
df_dist = _subgroup_table(fair, len)

# Fair case
df_fair = _subgroup_table(fair, _truncated_accuracy)

# Unfair case
df_unfair = _subgroup_table(unfair, _truncated_accuracy)

In [364]:
# Build the subplot: one row, three panels (subgroup sizes, unfair accuracy, fair accuracy).
# The 'r' / 'l' entries of `specs` add a little padding on the right / left of the cells.
fig = make_subplots(1, 3, specs=[[{'r':0.02}, {'l':0.02}, {'l':0.02}]], #horizontal_spacing = 0.1,
                    subplot_titles=('Distribution of the data', 'Accuracy (Unfair scenario)', 'Accuracy (Fair scenario)'))

# Add the heatmaps, in the same order as the subplot titles.
# The size panel uses 'coloraxis'; both accuracy panels share 'coloraxis2' and hence one colour scale.
fig.add_trace(go.Heatmap(x = df_dist.columns, y = df_dist.index, z = df_dist, coloraxis = 'coloraxis', texttemplate = '%{z:.2f}'), 1, 1)
fig.add_trace(go.Heatmap(x = df_unfair.columns, y = df_unfair.index, z = df_unfair, coloraxis = 'coloraxis2', texttemplate = '%{z:.2f}'), 1, 2)
fig.add_trace(go.Heatmap(x = df_fair.columns, y = df_fair.index, z = df_fair, coloraxis = 'coloraxis2', texttemplate = '%{z:.2f}'), 1, 3)

# Update the figures: size, axis titles and one colour bar per colour axis
# (the first placed at x = 0.27, the second at x = 1)
fig.update_layout(height = 550, width = 1500,
                  template = 'none',
                  yaxis = {"title": 'att_2'},
                  yaxis2 = {"title": 'att_2'},
                  yaxis3 = {"title": 'att_2'},
                  xaxis = {"title": 'att_1'},
                  xaxis2 = {"title": 'att_1'},
                  xaxis3 = {"title": 'att_1'},
                  coloraxis = dict(colorscale='ice', colorbar_x=0.27, colorbar_thickness=23, colorbar_title = 'Nb of data ', colorbar_title_side = 'right'),
                  coloraxis2 = dict(colorscale='RdYlGn', colorbar_x=1, colorbar_thickness=23, colorbar_title = 'Accuracy', colorbar_title_side = 'right'))
# Put a tick on every attribute value of every axis
fig.update_layout(xaxis1 = dict(tickvals=range_atts[0]),
                  yaxis1 = dict(tickvals=range_atts[1]),
                  xaxis2 = dict(tickvals=range_atts[0]),
                  yaxis2 = dict(tickvals=range_atts[1]),
                  xaxis3 = dict(tickvals=range_atts[0]),
                  yaxis3 = dict(tickvals=range_atts[1]))
fig.write_image("images/metric_1.eps")
fig.show()

In [219]:
# Means and common standard deviation of the accuracy distributions to illustrate,
# and number of values sampled from each of them
acc_pos_f, acc_pos_u, acc_neg = 0.7, 0.82, 0.58
sigma, N = 0.05, 1000

# Fair case: a single distribution
df_prob_fair = pd.Series(np.random.normal(acc_pos_f, sigma, N), name = 'Even accuracy').to_frame()

# Unfair case: a high-accuracy and a low-accuracy distribution, side by side
df_prob = pd.concat([pd.Series(np.random.normal(mean, sigma, N), name = label)
                     for label, mean in (('High accuracy', acc_pos_u), ('Low accuracy', acc_neg))],
                    axis = 1)

In [222]:
# Distribution of the sampled accuracies of the fair case
fig = ff.create_distplot(hist_data = [df_prob_fair[name] for name in df_prob_fair.columns],
                         group_labels = df_prob_fair.columns,
                         bin_size = 0.0025, show_rug = False, histnorm = 'probability')
fig.update_layout(
    height = 500, width = 700,
    template = 'none',
    xaxis = dict(title = 'Accuracy values',
                 tickvals = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                 range = [0.4,1]),
    yaxis = dict(title = 'Density'),
)
fig.write_image("images/metric_2.eps")
fig.show()

In [None]:
# Distributions of the sampled accuracies of the unfair case (high in green, low in red)
fig = ff.create_distplot(hist_data = [df_prob[name] for name in df_prob.columns],
                         group_labels = df_prob.columns,
                         bin_size = 0.0025, show_rug = False, histnorm='probability',
                         colors = ['rgb(61, 170, 40)', 'rgb(180, 40, 40)'])
fig.update_layout(
    height = 500, width = 700,
    template = 'none',
    xaxis = dict(title = 'Accuracy values',
                 tickvals = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                 range = [0.4,1]),
    yaxis = dict(title = 'Density'),
)
fig.write_image("images/metric_3.eps")
fig.show()

In [16]:
# Load the stored counters.
# NOTE: pickle.load can execute arbitrary code; only open files produced by this notebook.
with open('success_1e3_1e3.pkl', 'rb') as fp:
    metrics = pickle.load(fp)

# One row per stored metric: its count divided by the value stored under 'n'.
# Built in one shot, which avoids concatenating onto an empty frame in a loop.
metrics_df = pd.DataFrame([[m, metrics[m] / metrics['n']] for m in metrics if m != 'n'],
                          columns = ['Metrics', 'Accuracy'])

# Positional selection: rows 3 and 5 are dropped, rows 4 and 6 are relabelled.
# NOTE(review): this presumably relies on the key order MMPF, MMPF_5, MMPF_10,
# MMPF_size, MMPF_size_2, MMPF_adapted, MMPF_adapted_2 (the order displayed for
# success.pkl) - confirm that success_1e3_1e3.pkl has the same layout.
metrics_df = metrics_df.drop(labels = [3, 5])
metrics_df.loc[4, 'Metrics'] = 'MMPF_size'
metrics_df.loc[6, 'Metrics'] = 'MMPF_adapted'
# Swap the two relabelled rows so that MMPF_adapted comes before MMPF_size
metrics_df.loc[4], metrics_df.loc[6] =  metrics_df.loc[6], metrics_df.loc[4].copy()


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



In [17]:
# Bar chart of the value stored in the 'Accuracy' column for every metric
fig = px.bar(
    metrics_df,
    x = 'Metrics',
    y = 'Accuracy',
    color = 'Accuracy',
    color_continuous_scale = 'RdYlGn',
    range_y = [0.4, 0.7],
    text_auto = '.1%',
)
fig.update_layout(height = 400, width = 650, template = 'none',
                  barmode = 'group', bargap = 0.3, bargroupgap = 0.2)
fig.write_image("images/metric_4.pdf")
fig.show()