# Imports and initialization of general parameters
***

In [1]:
from helpers.pareto_fairness import compute_pareto_metrics
from config.info import AGES, RACES, GENDERS, COMBS_BASELINE, COMBS_FOULDS, COMBS_MARTINEZ
from visualization.subgroup_distribution import plot_dist
from dataprocess.dataloader import load_data
from dataprocess.dataclass import Data
from config.get_args import get_args
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from lightning import seed_everything
from plotly.subplots import make_subplots
import numpy as np
import argparse
import pandas as pd
import random
import torch.nn as nn
import torch
import torchmetrics as tm
import pickle

loss_fct = nn.CrossEntropyLoss()
protected_atts = ['age_', 'race_', 'gender_']


# Auto reload part
%load_ext autoreload
%autoreload 2

# Baseline
***

In [2]:
def plot_results(df : pd.DataFrame, filename : str):
    # Initialization of the plot
    task_cc = 'cancer_classification'
    task_td = 'tumor_detection'
    fig = make_subplots(rows = 2, cols = 1, vertical_spacing = 0.2,
                        subplot_titles = ('Tumor detection tasks', 'Cancer classification tasks'))

    # Traces
    cc = px.bar(df[df.task == task_cc], x = 'cancer', y = ['Accuracy', 'F1-score', 'MMPF_size'],
                color_discrete_sequence = ['rgb(101, 171, 200)', 'rgb(50, 100, 170)', 'rgb(21, 21, 45)'], text_auto = '.3f')
    td = px.bar(df[df.task == task_td], x = 'cancer', y = ['Accuracy', 'F1-score', 'MMPF_size'],
                color_discrete_sequence = ['rgb(101, 171, 200)', 'rgb(50, 100, 170)', 'rgb(21, 21, 45)'], text_auto = '.3f').update_traces(showlegend = False)

    # Update the figure
    for t in range(len(cc['data'])): fig.add_trace(cc['data'][t], row = 2, col = 1)
    for t in range(len(td['data'])): fig.add_trace(td['data'][t], row = 1, col = 1)
    fig.update_layout(height = 700, width = 1200, 
                    template = 'none',
                    xaxis = {'title': 'Cancer'},
                    yaxis = {'title': 'Values [no unit]'},
                    xaxis2 = {'title': 'Cancer'},
                    yaxis2 = {'title': 'Values [no unit]'})
    fig.update_layout(barmode = 'group', bargap = 0.3, bargroupgap = 0.1)
    fig.write_image(filename)
    fig.show()

In [23]:
def check_results_baseline(task : str, 
                           cancer : str, 
                           df : pd.DataFrame,
                           set_ : str):
    # Initialization
    sub_dict = {'task' : [task], 'cancer' : [cancer]}
    
    # Extract the results pkl files
    preds_path = 'results/preds/run_100/add_protected_atts_1/Baseline'
    results = pd.read_pickle(preds_path + f'/{task}/{cancer}/best_results.pkl')
    
    # Compute the metrics
    # Accuracy 
    sub_dict['Accuracy'] = tm.classification.BinaryAccuracy()(torch.Tensor(results.pred), torch.Tensor(results.label)).item()
    
    # F1-score
    sub_dict['F1-score'] = tm.classification.BinaryF1Score()(torch.Tensor(results.pred), torch.Tensor(results.label)).item()
    
    # MMPF_size
    mmpf_metrics = compute_pareto_metrics(results, loss_fct, protected_atts)
    sub_dict['MMPF_size'] = [mmpf_metrics[f'{set_}MMPF_size_2']]
    
    # Return the updated data frame
    if df.empty: df = pd.DataFrame(data = sub_dict)
    else: df = pd.concat([df, pd.DataFrame(data = sub_dict)])
    return df

In [24]:
# Initialization
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_baseline = pd.DataFrame(columns = cols)

# Loop on the combinations for baseline
for comb in COMBS_BASELINE:
    # Track
    print(comb)
    
    # Extract combinations
    task = comb[0]
    cancer = comb[1]
    
    # Get the metrics
    df_baseline = check_results_baseline(task, cancer, df_baseline, 'test_')
df_baseline

['cancer_classification', 'coad_read_FS']
['cancer_classification', 'coad_read_PM']
['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']
['tumor_detection', 'brca']
['tumor_detection', 'coad']
['tumor_detection', 'kich']
['tumor_detection', 'kirc']
['tumor_detection', 'kirp']
['tumor_detection', 'luad']
['tumor_detection', 'lusc']
['tumor_detection', 'read']


Unnamed: 0,task,cancer,Accuracy,F1-score,MMPF_size
0,cancer_classification,coad_read_FS,0.812044,0.52968,0.668881
0,cancer_classification,coad_read_PM,0.750799,0.0,0.662391
0,cancer_classification,kich_kirc_FS,0.982929,0.990281,0.338233
0,cancer_classification,kich_kirc_PM,0.978456,0.986928,0.418525
0,cancer_classification,kich_kirp_FS,0.985222,0.98913,0.379836
0,cancer_classification,kich_kirp_PM,0.993671,0.995434,0.313311
0,cancer_classification,kirc_kirp_FS,0.991235,0.979667,0.335001
0,cancer_classification,kirc_kirp_PM,0.985316,0.977376,0.315457
0,cancer_classification,luad_lusc_FS,0.921099,0.917516,0.416868
0,cancer_classification,luad_lusc_PM,0.94875,0.943135,0.387147


In [25]:
plot_results(df_baseline, 'images/results_baseline.eps')

# Foulds
***

In [20]:
def check_results_foulds(task : str, 
                           cancer : str, 
                           lambda_ : float,
                           pt_method : str,
                           df : pd.DataFrame,
                           set_ : str):
    # Initialization
    sub_dict = {'task' : [task], 'cancer' : [cancer], 'lambda_' : [lambda_], 'pt_method' : [pt_method]}
    
    # Extract the results pkl files
    preds_path = 'results/preds/run_100/add_protected_atts_1/Foulds'
    results = pd.read_pickle(preds_path + f'/{task}/{cancer}/lambda_{lambda_}/{pt_method}/best_results.pkl')
    
    # Compute the metrics
    # Accuracy 
    sub_dict['Accuracy'] = tm.classification.BinaryAccuracy()(torch.Tensor(results.pred), torch.Tensor(results.label)).item()
    
    # F1-score
    sub_dict['F1-score'] = tm.classification.BinaryF1Score()(torch.Tensor(results.pred), torch.Tensor(results.label)).item()
    
    # MMPF_size
    mmpf_metrics = compute_pareto_metrics(results, loss_fct, protected_atts)
    sub_dict['MMPF_size'] = [mmpf_metrics[f'{set_}MMPF_size_2']]
    
    # Return the updated data frame
    if df.empty: df = pd.DataFrame(data = sub_dict)
    else: df = pd.concat([df, pd.DataFrame(data = sub_dict)])
    return df

In [21]:
# Initialization
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_foulds = pd.DataFrame(columns = cols)

# Loop on the combinations for baseline
for comb in COMBS_FOULDS:
    # Track
    print(comb)
    
    # Extract combinations
    task = comb[0]
    cancer = comb[1]
    l = comb[2]
    pt_method = comb[3]
    
    # Get the metrics
    df_foulds = check_results_foulds(task, cancer, l, pt_method, df_foulds, 'test_')
df_foulds.reset_index(inplace = True, drop = True)

['cancer_classification', 'coad_read_FS', 0.001, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.001, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.001, 'DF_max']
['cancer_classification', 'coad_read_FS', 0.005, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.005, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.005, 'DF_max']
['cancer_classification', 'coad_read_FS', 0.01, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.01, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.01, 'DF_max']
['cancer_classification', 'coad_read_FS', 0.05, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.05, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.05, 'DF_max']
['cancer_classification', 'coad_read_FS', 0.1, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.1, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.1, 'DF_max']
['cancer_classification', 'coad_read_PM', 0.001, 'DF_pos']
['cancer_classification', 'coad_read_PM', 0.001, 'DF_sum']
['cancer_

In [22]:
# Initialization
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_foulds_best = pd.DataFrame(columns = cols)

# Loop on the combinations for baseline
for comb in COMBS_BASELINE:
    # Track
    print(comb)
    
    # Extract combinations
    task = comb[0]
    cancer = comb[1]
    cond = (df_foulds.task == task) & (df_foulds.cancer == cancer)
    
    # Get the metrics from the models with the best hyperparameters
    # Get hyperparameters
    best_idx = df_foulds[cond].MMPF_size.argmin()
    sub_dict = {'task' : [task], 'cancer' : [cancer]}
    sub_dict['Accuracy'] = df_foulds[cond].iloc[best_idx]['Accuracy']
    sub_dict['F1-score'] = df_foulds[cond].iloc[best_idx]['F1-score']
    sub_dict['MMPF_size'] = df_foulds[cond].iloc[best_idx]['MMPF_size']
    
    # Update the dataframe
    if df_foulds_best.empty: df_foulds_best = pd.DataFrame(data = sub_dict)
    else: df_foulds_best = pd.concat([df_foulds_best, pd.DataFrame(data = sub_dict)])
plot_results(df_foulds_best, 'images/results_foulds.eps')

['cancer_classification', 'coad_read_FS']
['cancer_classification', 'coad_read_PM']
['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']
['tumor_detection', 'brca']
['tumor_detection', 'coad']
['tumor_detection', 'kich']
['tumor_detection', 'kirc']
['tumor_detection', 'kirp']
['tumor_detection', 'luad']
['tumor_detection', 'lusc']
['tumor_detection', 'read']


In [51]:
t = 'cancer_classification'
c = 'coad_read_FS'
reduced_df = df_foulds[(df_foulds.task == t) & (df_foulds.cancer == c)]
table = pd.pivot_table(reduced_df, values = 'MMPF_size', index = 'pt_method', columns = ['lambda_'])
print(table.to_latex())

\begin{tabular}{lrrrrr}
\toprule
lambda_ & 0.001000 & 0.005000 & 0.010000 & 0.050000 & 0.100000 \\
pt_method &  &  &  &  &  \\
\midrule
DF_max & 0.749837 & 0.731817 & 0.779527 & 0.795815 & 0.756377 \\
DF_pos & 0.785690 & 0.750942 & 0.777638 & 0.859162 & 0.695458 \\
DF_sum & 0.750136 & 0.748594 & 0.732998 & 0.783693 & 0.787716 \\
\bottomrule
\end{tabular}



# Diana 
***

In [8]:
def check_results_diana(task : str, 
                           cancer : str, 
                           df : pd.DataFrame,
                           set_ : str):
    # Initialization
    sub_dict = {'task' : [task], 'cancer' : [cancer]}
    
    # Extract the results pkl files
    preds_path = 'results/preds/run_100/add_protected_atts_1/Diana'
    results = pd.read_pickle(preds_path + f'/{task}/{cancer}/results.pkl')
    
    # Compute the metrics
    # Accuracy 
    sub_dict['Accuracy'] = tm.classification.BinaryAccuracy()(torch.Tensor(results.pred), torch.Tensor(results.label)).item()
    
    # F1-score
    sub_dict['F1-score'] = tm.classification.BinaryF1Score()(torch.Tensor(results.pred), torch.Tensor(results.label)).item()
    
    # MMPF_size
    mmpf_metrics = compute_pareto_metrics(results, loss_fct, protected_atts)
    sub_dict['MMPF_size'] = [mmpf_metrics[f'{set_}MMPF_size_2']]
    
    # Return the updated data frame
    if df.empty: df = pd.DataFrame(data = sub_dict)
    else: df = pd.concat([df, pd.DataFrame(data = sub_dict)])
    return df

In [9]:
# Initialization
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_diana = pd.DataFrame(columns = cols)

# Loop on the combinations for baseline
for comb in COMBS_BASELINE:
    # Track
    print(comb)
    
    # Extract combinations
    task = comb[0]
    cancer = comb[1]
    
    # Get the metrics
    df_diana = check_results_diana(task, cancer, df_diana, 'test_')
df_diana

['cancer_classification', 'coad_read_FS']
['cancer_classification', 'coad_read_PM']
['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']
['tumor_detection', 'brca']
['tumor_detection', 'coad']
['tumor_detection', 'kich']
['tumor_detection', 'kirc']
['tumor_detection', 'kirp']
['tumor_detection', 'luad']
['tumor_detection', 'lusc']
['tumor_detection', 'read']


Unnamed: 0,task,cancer,Accuracy,F1-score,MMPF_size
0,cancer_classification,coad_read_FS,0.806569,0.615942,0.686846
0,cancer_classification,coad_read_PM,0.84345,0.647482,0.747876
0,cancer_classification,kich_kirc_FS,0.960467,0.977296,0.363247
0,cancer_classification,kich_kirc_PM,0.965889,0.979189,0.456579
0,cancer_classification,kich_kirp_FS,0.977832,0.983725,0.379908
0,cancer_classification,kich_kirp_PM,0.987342,0.990826,0.313284
0,cancer_classification,kirc_kirp_FS,0.988048,0.972274,0.335003
0,cancer_classification,kirc_kirp_PM,0.98091,0.970787,0.313269
0,cancer_classification,luad_lusc_FS,0.923759,0.92037,0.42446
0,cancer_classification,luad_lusc_PM,0.94625,0.940526,0.401339


In [10]:
plot_results(df_diana, 'images/results_diana.eps')

# Martinez
***

In [11]:
def check_results_martinez(task : str, 
                           cancer : str, 
                           df : pd.DataFrame,
                           set_ : str):
    # Initialization
    sub_dict = {'task' : [task], 'cancer' : [cancer]}
    
    # Extract the results pkl files
    preds_path = 'results/preds/run_100/add_protected_atts_1/Martinez'
    results = pd.read_pickle(preds_path + f'/{task}/{cancer}/alpha_0.5/results.pkl')
    
    # Compute the metrics
    # Accuracy 
    sub_dict['Accuracy'] = tm.classification.BinaryAccuracy()(torch.Tensor(results.pred), torch.Tensor(results.label)).item()
    
    # F1-score
    sub_dict['F1-score'] = tm.classification.BinaryF1Score()(torch.Tensor(results.pred), torch.Tensor(results.label)).item()
    
    # MMPF_size
    mmpf_metrics = compute_pareto_metrics(results, loss_fct, protected_atts)
    sub_dict['MMPF_size'] = [mmpf_metrics[f'{set_}MMPF_size_2']]
    
    # Return the updated data frame
    if df.empty: df = pd.DataFrame(data = sub_dict)
    else: df = pd.concat([df, pd.DataFrame(data = sub_dict)])
    return df

In [16]:
# Initialization
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_martinez = pd.DataFrame(columns = cols)

# Loop on the combinations for baseline
for comb in COMBS_BASELINE:
    # Track
    print(comb)
    
    # Extract combinations
    task = comb[0]
    cancer = comb[1]
    
    # Get the metrics
    df_martinez = check_results_martinez(task, cancer, df_martinez, 'test_')
df_martinez

['cancer_classification', 'coad_read_FS']
['cancer_classification', 'coad_read_PM']
['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']
['tumor_detection', 'brca']
['tumor_detection', 'coad']
['tumor_detection', 'kich']
['tumor_detection', 'kirc']
['tumor_detection', 'kirp']
['tumor_detection', 'luad']
['tumor_detection', 'lusc']
['tumor_detection', 'read']


Unnamed: 0,task,cancer,Accuracy,F1-score,MMPF_size
0,cancer_classification,coad_read_FS,0.722628,0.0,0.64128
0,cancer_classification,coad_read_PM,0.750799,0.0,0.668209
0,cancer_classification,kich_kirc_FS,0.964061,0.979466,0.363271
0,cancer_classification,kich_kirc_PM,0.965889,0.979098,0.415733
0,cancer_classification,kich_kirp_FS,0.980296,0.985455,0.313262
0,cancer_classification,kich_kirp_PM,0.984177,0.988506,0.313262
0,cancer_classification,kirc_kirp_FS,0.984861,0.965642,0.335003
0,cancer_classification,kirc_kirp_PM,0.988253,0.9819,0.313269
0,cancer_classification,luad_lusc_FS,0.890071,0.887273,0.438467
0,cancer_classification,luad_lusc_PM,0.91625,0.908345,0.446377


In [18]:
df_martinez

Unnamed: 0,task,cancer,Accuracy,F1-score,MMPF_size
0,cancer_classification,coad_read_FS,0.722628,0.0,0.64128
0,cancer_classification,coad_read_PM,0.750799,0.0,0.668209
0,cancer_classification,kich_kirc_FS,0.964061,0.979466,0.363271
0,cancer_classification,kich_kirc_PM,0.965889,0.979098,0.415733
0,cancer_classification,kich_kirp_FS,0.980296,0.985455,0.313262
0,cancer_classification,kich_kirp_PM,0.984177,0.988506,0.313262
0,cancer_classification,kirc_kirp_FS,0.984861,0.965642,0.335003
0,cancer_classification,kirc_kirp_PM,0.988253,0.9819,0.313269
0,cancer_classification,luad_lusc_FS,0.890071,0.887273,0.438467
0,cancer_classification,luad_lusc_PM,0.91625,0.908345,0.446377


In [17]:
plot_results(df_martinez, 'images/results_martinez.eps')