# Imports and initialization of general parameters
***

In [1]:
from helpers.pareto_fairness import compute_pareto_metrics
from config.info import AGES, RACES, GENDERS, COMBS_BASELINE, COMBS_FOULDS, COMBS_MARTINEZ
from visualization.subgroup_distribution import plot_dist
from dataprocess.dataloader import load_data
from dataprocess.dataclass import Data
from config.get_args import get_args
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from lightning import seed_everything
from plotly.subplots import make_subplots
import numpy as np
import argparse
import pandas as pd
import random
import torch.nn as nn
import torch
import torchmetrics as tm
import pickle

# Loss function handed to compute_pareto_metrics by the check_results_* functions
loss_fct = nn.CrossEntropyLoss()
# Protected-attribute identifiers handed to compute_pareto_metrics
# (presumably column-name prefixes in the results frame -- TODO confirm against the helper)
protected_atts = ['age_', 'race_', 'gender_']


# Auto reload part: re-import edited project modules before each cell execution
%load_ext autoreload
%autoreload 2

# Baseline
***

In [2]:
def check_results_baseline(task : str,
                           cancer : str,
                           df : pd.DataFrame,
                           set_ : str,
                           preds_path : str = 'results/preds/run_100/add_protected_atts_1/Baseline') -> pd.DataFrame:
    """Append the Baseline metrics of one (task, cancer) run to a results frame.

    Parameters
    ----------
    task : str
        Task name; used as a sub-directory of `preds_path`.
    cancer : str
        Cancer (or cancer pair) name; used as a sub-directory of the task folder.
    df : pd.DataFrame
        Frame accumulated so far. When it is empty (or None) the new row is
        returned on its own.
    set_ : str
        Prefix selecting which entry of the Pareto metrics to report, e.g.
        'test_' or 'val_' (the key read is f'{set_}MMPF_size_2').
    preds_path : str, optional
        Root folder holding the pickled predictions. Defaults to the run that
        was previously hard-coded in this function.

    Returns
    -------
    pd.DataFrame
        `df` with one extra row holding 'task', 'cancer', 'Accuracy',
        'F1-score' and 'MMPF_size'. Rows are re-indexed 0..n-1 so that the
        index stays unique across successive calls.
    """
    # Extract the results pkl file.
    # NOTE: unpickling executes arbitrary code; only point this at trusted files.
    results = pd.read_pickle(f'{preds_path}/{task}/{cancer}/results.pkl')

    # Predictions and labels are converted once and shared by both metrics
    preds = torch.Tensor(results.pred)
    labels = torch.Tensor(results.label)

    # Compute the metrics
    accuracy = tm.classification.BinaryAccuracy()(preds, labels).item()
    f1_score = tm.classification.BinaryF1Score()(preds, labels).item()
    mmpf_metrics = compute_pareto_metrics(results, loss_fct, protected_atts)

    # Single-row frame for this run (every value wrapped in a list for consistency)
    row = pd.DataFrame(data = {'task' : [task],
                               'cancer' : [cancer],
                               'Accuracy' : [accuracy],
                               'F1-score' : [f1_score],
                               'MMPF_size' : [mmpf_metrics[f'{set_}MMPF_size_2']]})

    # Return the updated data frame; the empty case is handled separately so the
    # result takes its dtypes from the first real row rather than an empty frame
    if df is None or df.empty:
        return row
    # ignore_index avoids every row carrying the same index label 0
    return pd.concat([df, row], ignore_index = True)

In [3]:
# Start from an empty frame carrying the expected metric columns
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_baseline = pd.DataFrame(columns = cols)

# Accumulate one row of test-set metrics per baseline (task, cancer) combination
for comb in COMBS_BASELINE:
    print(comb)  # progress trace
    task, cancer = comb[0], comb[1]
    df_baseline = check_results_baseline(task = task,
                                         cancer = cancer,
                                         df = df_baseline,
                                         set_ = 'test_')
df_baseline

['cancer_classification', 'coad_read_FS']
['cancer_classification', 'coad_read_PM']
['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']
['tumor_detection', 'brca']
['tumor_detection', 'coad']
['tumor_detection', 'kich']
['tumor_detection', 'kirc']
['tumor_detection', 'kirp']
['tumor_detection', 'luad']
['tumor_detection', 'lusc']
['tumor_detection', 'read']


Unnamed: 0,task,cancer,Accuracy,F1-score,MMPF_size
0,cancer_classification,coad_read_FS,0.812044,0.542222,0.617882
0,cancer_classification,coad_read_PM,0.766773,0.120482,0.646595
0,cancer_classification,kich_kirc_FS,0.982929,0.990271,0.338254
0,cancer_classification,kich_kirc_PM,0.980251,0.988004,0.418655
0,cancer_classification,kich_kirp_FS,0.982759,0.987296,0.379921
0,cancer_classification,kich_kirp_PM,0.990506,0.993135,0.313268
0,cancer_classification,kirc_kirp_FS,0.992829,0.983486,0.335001
0,cancer_classification,kirc_kirp_PM,0.986784,0.979684,0.31387
0,cancer_classification,luad_lusc_FS,0.945035,0.942056,0.398515
0,cancer_classification,luad_lusc_PM,0.955,0.950549,0.404332


In [32]:
# Task identifiers used to split the baseline results
task_cc = 'cancer_classification'
task_td = 'tumor_detection'


def _baseline_bars(task_name):
    """Bar figure of Accuracy, F1-score and MMPF_size per cancer for one task."""
    return px.bar(df_baseline[df_baseline.task == task_name],
                  x = 'cancer',
                  y = ['Accuracy', 'F1-score', 'MMPF_size'],
                  color_discrete_sequence = ['rgb(20, 130, 60)', 'rgb(200, 70, 50)', 'rgb(60, 120, 180)'],
                  text_auto = '.3f')


# Two stacked panels: tumor detection on top, cancer classification below
fig = make_subplots(rows = 2, cols = 1, vertical_spacing = 0.2,
                    subplot_titles = ('Tumor detection tasks', 'Cancer classification tasks'))

# One bar figure per task; only the cancer-classification traces keep a legend
# entry so each metric appears once in the shared legend
cc = _baseline_bars(task_cc)
td = _baseline_bars(task_td)
td.update_traces(showlegend = False)

# Copy the traces into the subplot grid (same insertion order: cc first, then td)
for trace in cc.data:
    fig.add_trace(trace, row = 2, col = 1)
for trace in td.data:
    fig.add_trace(trace, row = 1, col = 1)

# Layout: size, axis titles and grouped-bar settings in a single call
fig.update_layout(height = 700, width = 1200,
                  template = 'none',
                  xaxis = {'title': 'Cancer'},
                  yaxis = {'title': 'Values [no unit]'},
                  xaxis2 = {'title': 'Cancer'},
                  yaxis2 = {'title': 'Values [no unit]'},
                  barmode = 'group', bargap = 0.3, bargroupgap = 0.1)

# Export the figure, then render it inline
fig.write_image("images/results_baseline.eps")
fig.show()

# Foulds
***

In [5]:
def check_results_foulds(task : str,
                         cancer : str,
                         lambda_ : float,
                         pt_method : str,
                         df : pd.DataFrame,
                         set_ : str,
                         preds_path : str = 'results/preds/run_100/add_protected_atts_1/Foulds') -> pd.DataFrame:
    """Append the Foulds metrics of one (task, cancer, lambda_, pt_method) run to a results frame.

    Parameters
    ----------
    task : str
        Task name; used as a sub-directory of `preds_path`.
    cancer : str
        Cancer (or cancer pair) name; used as a sub-directory of the task folder.
    lambda_ : float
        Lambda value of the run; interpolated as-is into the folder name
        f'lambda_{lambda_}'.
    pt_method : str
        Method name of the run (e.g. 'DF_pos', 'DF_sum', 'DF_max'); used as
        the innermost sub-directory.
    df : pd.DataFrame
        Frame accumulated so far. When it is empty (or None) the new row is
        returned on its own.
    set_ : str
        Prefix selecting which entry of the Pareto metrics to report, e.g.
        'test_' or 'val_' (the key read is f'{set_}MMPF_size_2').
    preds_path : str, optional
        Root folder holding the pickled predictions. Defaults to the run that
        was previously hard-coded in this function.

    Returns
    -------
    pd.DataFrame
        `df` with one extra row holding 'task', 'cancer', 'lambda_',
        'pt_method', 'Accuracy', 'F1-score' and 'MMPF_size'. Rows are
        re-indexed 0..n-1 so that the index stays unique across successive calls.
    """
    # Extract the results pkl file.
    # NOTE: unpickling executes arbitrary code; only point this at trusted files.
    results = pd.read_pickle(f'{preds_path}/{task}/{cancer}/lambda_{lambda_}/{pt_method}/results.pkl')

    # Predictions and labels are converted once and shared by both metrics
    preds = torch.Tensor(results.pred)
    labels = torch.Tensor(results.label)

    # Compute the metrics
    accuracy = tm.classification.BinaryAccuracy()(preds, labels).item()
    f1_score = tm.classification.BinaryF1Score()(preds, labels).item()
    mmpf_metrics = compute_pareto_metrics(results, loss_fct, protected_atts)

    # Single-row frame for this run (every value wrapped in a list for consistency)
    row = pd.DataFrame(data = {'task' : [task],
                               'cancer' : [cancer],
                               'lambda_' : [lambda_],
                               'pt_method' : [pt_method],
                               'Accuracy' : [accuracy],
                               'F1-score' : [f1_score],
                               'MMPF_size' : [mmpf_metrics[f'{set_}MMPF_size_2']]})

    # Return the updated data frame; the empty case is handled separately so the
    # result takes its dtypes from the first real row rather than an empty frame
    if df is None or df.empty:
        return row
    # ignore_index avoids every row carrying the same index label 0
    return pd.concat([df, row], ignore_index = True)

In [6]:
# Initialization
# The empty frame lists every column that check_results_foulds produces, so the
# displayed schema is right even when COMBS_FOULDS is empty
cols = ['task', 'cancer', 'lambda_', 'pt_method', 'Accuracy', 'F1-score', 'MMPF_size']
df_foulds = pd.DataFrame(columns = cols)

# Loop on the combinations for Foulds
for comb in COMBS_FOULDS:
    # Track
    print(comb)

    # Extract combinations: (task, cancer, lambda, method)
    task, cancer, lambda_, pt_method = comb[0], comb[1], comb[2], comb[3]

    # Get the metrics
    # NOTE(review): this reads the 'val_' MMPF entry whereas the baseline loop
    # reads 'test_' -- confirm the two are meant to differ before comparing them
    df_foulds = check_results_foulds(task, cancer, lambda_, pt_method, df_foulds, 'val_')
df_foulds

['cancer_classification', 'coad_read_FS', 0.001, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.001, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.001, 'DF_max']
['cancer_classification', 'coad_read_FS', 0.005, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.005, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.005, 'DF_max']
['cancer_classification', 'coad_read_FS', 0.01, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.01, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.01, 'DF_max']
['cancer_classification', 'coad_read_FS', 0.05, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.05, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.05, 'DF_max']
['cancer_classification', 'coad_read_FS', 0.1, 'DF_pos']
['cancer_classification', 'coad_read_FS', 0.1, 'DF_sum']
['cancer_classification', 'coad_read_FS', 0.1, 'DF_max']
['cancer_classification', 'coad_read_PM', 0.001, 'DF_pos']
['cancer_classification', 'coad_read_PM', 0.001, 'DF_sum']
['cancer_

Unnamed: 0,task,cancer,lambda_,pt_method,Accuracy,F1-score,MMPF_size
0,cancer_classification,coad_read_FS,0.001,DF_pos,0.813869,0.564103,0.785690
0,cancer_classification,coad_read_FS,0.001,DF_sum,0.817518,0.553571,0.750136
0,cancer_classification,coad_read_FS,0.001,DF_max,0.817518,0.568965,0.749837
0,cancer_classification,coad_read_FS,0.005,DF_pos,0.810219,0.566667,0.750942
0,cancer_classification,coad_read_FS,0.005,DF_sum,0.817518,0.557522,0.748594
...,...,...,...,...,...,...,...
0,tumor_detection,read,0.050,DF_sum,0.987578,0.993421,0.344093
0,tumor_detection,read,0.050,DF_max,0.987578,0.993464,0.347429
0,tumor_detection,read,0.100,DF_pos,0.987578,0.993421,0.375255
0,tumor_detection,read,0.100,DF_sum,0.987578,0.993421,0.327538


In [10]:
# Select the Foulds runs of a single (task, cancer) pair
t, c = 'cancer_classification', 'coad_read_FS'
reduced_df = df_foulds.loc[(df_foulds.task == t) & (df_foulds.cancer == c)]

# MMPF_size laid out with one row per pt_method and one column per lambda_
table = reduced_df.pivot_table(values = 'MMPF_size',
                               index = 'pt_method',
                               columns = ['lambda_'])

In [11]:
# Display the MMPF_size pivot (pt_method rows, lambda_ columns) built in the previous cell
table

lambda_,0.001,0.005,0.010,0.050,0.100
pt_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DF_max,0.749837,0.731817,0.779527,0.795815,0.756377
DF_pos,0.78569,0.750942,0.777638,0.859162,0.695458
DF_sum,0.750136,0.748594,0.732998,0.783693,0.787716


In [12]:
# Select the Foulds runs of a single (task, cancer) pair
t, c = 'cancer_classification', 'coad_read_PM'
reduced_df = df_foulds.loc[(df_foulds.task == t) & (df_foulds.cancer == c)]

# MMPF_size laid out with one row per pt_method and one column per lambda_
table = reduced_df.pivot_table(values = 'MMPF_size',
                               index = 'pt_method',
                               columns = ['lambda_'])

In [26]:
# Inspect the row labels of the pivot table (the pt_method values)
table.index

Index(['DF_max', 'DF_pos', 'DF_sum'], dtype='object', name='pt_method')

In [27]:
# Print the pivot table as LaTeX source (a tabular environment) so it can be copied out
print(table.to_latex())

\begin{tabular}{lrrrrr}
\toprule
lambda_ & 0.001000 & 0.005000 & 0.010000 & 0.050000 & 0.100000 \\
pt_method &  &  &  &  &  \\
\midrule
DF_max & 0.757706 & 0.757068 & 0.770168 & 0.715982 & 0.904218 \\
DF_pos & 0.757706 & 0.757696 & 0.754570 & 0.673909 & 0.773930 \\
DF_sum & 0.757706 & 0.865449 & 0.757980 & 0.700615 & 0.798344 \\
\bottomrule
\end{tabular}

