# Imports and initialization of general parameters
***

In [1]:
from helpers.pareto_fairness import compute_pareto_metrics
from config.info import AGES, RACES, GENDERS, COMBS_BASELINE, COMBS_FOULDS, COMBS_MARTINEZ
from visualization.subgroup_distribution import plot_dist
from dataprocess.dataloader import load_data
from dataprocess.dataclass import Data
from config.get_args import get_args
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from lightning import seed_everything
from plotly.subplots import make_subplots
import numpy as np
import argparse
import pandas as pd
import random
import torch.nn as nn
import torch
import torchmetrics as tm
import pickle

# Cross-entropy loss instance; none of the analysis cells visible in this notebook reference it
loss_fct = nn.CrossEntropyLoss()
# Handed as the second argument to compute_pareto_metrics in every check_results_* function;
# presumably the column-name prefixes of the protected attributes — TODO confirm against the helper
protected_atts = ['age_', 'race_', 'gender_']


# Auto reload part
%load_ext autoreload
%autoreload 2

# Baseline
***

In [122]:
def keep_best(df100: pd.DataFrame, df1000: pd.DataFrame):
    """Keep, for every cancer present in both frames, the run with the smaller MMPF_size.

    Args:
        df100: Metrics of run 100. Must contain the columns 'cancer',
            'Accuracy', 'F1-score' and 'MMPF_size'.
        df1000: Metrics of run 1000, with the same columns.

    Returns:
        A new frame indexed by 'cancer' (the column is kept as well) holding
        the rows of ``df100`` for the cancers found in both inputs, plus an
        'ID' column. When run 1000 has an MMPF_size lower than or equal to
        run 100, the three metric columns are overwritten with the run-1000
        values and 'ID' is set to 1000; otherwise 'ID' stays 100.

    The input frames are left untouched: the lookups run on re-indexed copies
    instead of re-indexing the caller's frames in place.
    """
    metric_cols = ['Accuracy', 'F1-score', 'MMPF_size']

    # Work on re-indexed copies so the caller's frames keep their own index
    by100 = df100.set_index('cancer', drop=False)
    by1000 = df1000.set_index('cancer', drop=False)

    # Cancers available in both runs, in the order of df100
    in_1000 = set(by1000.cancer)
    cs = [c for c in by100.cancer.to_list() if c in in_1000]

    df = by100[by100.cancer.isin(cs)].copy()
    df['ID'] = 100

    for c in cs:
        rows100 = by100[by100.cancer == c]
        rows1000 = by1000[by1000.cancer == c]

        # .iloc[0] is an explicit positional lookup; a bare [0] on a
        # string-labelled Series is deprecated and ambiguous
        if rows1000['MMPF_size'].iloc[0] <= rows100['MMPF_size'].iloc[0]:
            for col in metric_cols:
                df.loc[c, col] = rows1000[col].iloc[0]
            df.loc[c, 'ID'] = 1000
    return df

def select_best_test(df_val, df100, df1000):
    """Report the test metrics of the run that was selected on validation.

    Args:
        df_val: Frame indexed by cancer with an 'ID' column (100 or 1000)
            naming the run to use for each cancer, as returned by keep_best.
        df100: Test metrics of run 100, with the columns 'cancer',
            'Accuracy', 'F1-score' and 'MMPF_size'.
        df1000: Test metrics of run 1000, with the same columns.

    Returns:
        A copy of ``df_val`` whose three metric columns are replaced, for
        every cancer present in both test frames, by the values of the run
        named in 'ID' (run 100 when 'ID' equals 100, run 1000 otherwise).

    The input frames are left untouched: the lookups run on re-indexed copies
    instead of re-indexing the caller's frames in place.
    """
    metric_cols = ['Accuracy', 'F1-score', 'MMPF_size']

    # Work on re-indexed copies so the caller's frames keep their own index
    by100 = df100.set_index('cancer', drop=False)
    by1000 = df1000.set_index('cancer', drop=False)

    # Cancers available in both runs, in the order of df100
    in_1000 = set(by1000.cancer)
    cs = [c for c in by100.cancer.to_list() if c in in_1000]

    df = df_val.copy()

    for c in cs:
        # The run was chosen on validation; only its test metrics are read here
        source = by100 if df.loc[c, 'ID'] == 100 else by1000
        rows = source[source.cancer == c]
        for col in metric_cols:
            # .iloc[0] is an explicit positional lookup; a bare [0] on a
            # string-labelled Series is deprecated and ambiguous
            df.loc[c, col] = rows[col].iloc[0]

    return df

In [44]:
def plot_results_baseline(df : pd.DataFrame, filename : str):
    """Draw the baseline metrics as two stacked grouped-bar charts.

    The upper chart shows the rows whose task is 'tumor_detection', the lower
    one those whose task is 'cancer_classification'. Each cancer gets one bar
    per metric (Accuracy, F1-score, MMPF_size). The figure is written to
    ``filename`` and then displayed.
    """
    metrics = ['Accuracy', 'F1-score', 'MMPF_size']
    palette = ['rgb(101, 171, 200)', 'rgb(50, 100, 170)', 'rgb(21, 21, 45)']
    axis_x = {'title': 'Cancer'}
    axis_y = {'title': 'Values [no unit]'}

    def grouped_bars(task_name):
        # One bar chart for the rows of a single task
        return px.bar(df[df.task == task_name], x = 'cancer', y = metrics,
                      color_discrete_sequence = palette, text_auto = '.3f')

    fig = make_subplots(rows = 2, cols = 1, vertical_spacing = 0.2,
                        subplot_titles = ('Tumor detection tasks', 'Cancer classification tasks'))

    classification_bars = grouped_bars('cancer_classification')
    # Only the classification traces carry legend entries
    detection_bars = grouped_bars('tumor_detection').update_traces(showlegend = False)

    # Classification traces go to the lower row, detection traces to the upper row
    for trace in classification_bars['data']:
        fig.add_trace(trace, row = 2, col = 1)
    for trace in detection_bars['data']:
        fig.add_trace(trace, row = 1, col = 1)

    fig.update_layout(height = 700, width = 1200,
                      template = 'none',
                      xaxis = axis_x, yaxis = axis_y,
                      xaxis2 = axis_x, yaxis2 = axis_y,
                      barmode = 'group', bargap = 0.3, bargroupgap = 0.1)
    fig.write_image(filename)
    fig.show()
    
def plot_results_others(df : pd.DataFrame, filename : str):
    """Draw the cancer-classification metrics as a single grouped-bar chart.

    Only the rows whose task is 'cancer_classification' are plotted, with one
    bar per metric (Accuracy, F1-score, MMPF_size) for each cancer. The figure
    is written to ``filename`` and then displayed.
    """
    classification_rows = df[df.task == 'cancer_classification']
    palette = ['rgb(101, 171, 200)', 'rgb(50, 100, 170)', 'rgb(21, 21, 45)']

    fig = px.bar(classification_rows, x = 'cancer',
                 y = ['Accuracy', 'F1-score', 'MMPF_size'],
                 color_discrete_sequence = palette, text_auto = '.3f')

    # Size, titles and bar grouping in one layout pass
    fig.update_layout(height = 400, width = 1200,
                      title = 'Cancer classification tasks',
                      template = 'none',
                      xaxis = {'title': 'Cancer'},
                      yaxis = {'title': 'Values [no unit]'},
                      barmode = 'group', bargap = 0.3, bargroupgap = 0.1)
    fig.write_image(filename)
    fig.show()

In [144]:
def check_results_baseline(task : str,
                           cancer : str,
                           df : pd.DataFrame,
                           set_ : str, ID : int):
    """Append the Baseline metrics of one (task, cancer) pair to ``df``.

    Args:
        task: Task name, used as a folder name in the results path.
        cancer: Cancer name, used as a folder name in the results path.
        df: Frame accumulating one row per call; it may be empty.
        set_: Prefix of the MMPF key to read (the callers pass 'val_' or 'test_').
        ID: Run identifier, used in the results path.

    Returns:
        The single-row frame when ``df`` is empty, otherwise ``df`` with the
        new row concatenated at the end.
    """
    # Load the pickled predictions of the requested run
    preds_path = f'results/preds/run_{ID}/add_protected_atts_1/Baseline'
    results = pd.read_pickle(preds_path + f'/{task}/{cancer}/results.pkl')

    # NOTE(review): Accuracy and F1-score use every row of `results`, whatever
    # `set_` is; only the MMPF value below depends on `set_` — confirm intended
    preds = torch.Tensor(results.pred)
    labels = torch.Tensor(results.label)
    accuracy = tm.classification.BinaryAccuracy()(preds, labels).item()
    f1_score = tm.classification.BinaryF1Score()(preds, labels).item()

    # MMPF_size for the requested set
    mmpf_metrics = compute_pareto_metrics(results, protected_atts)

    row = pd.DataFrame(data = {
        'task' : [task],
        'cancer' : [cancer],
        'Accuracy' : accuracy,
        'F1-score' : f1_score,
        'MMPF_size' : [mmpf_metrics[f'{set_}MMPF_size_2']],
    })
    return row if df.empty else pd.concat([df, row])

# Empty accumulators: validation and test metrics for runs 100 and 1000
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_baseline_100, df_baseline_1000 = pd.DataFrame(columns = cols), pd.DataFrame(columns = cols)
df_baseline_100_test, df_baseline_1000_test = pd.DataFrame(columns = cols), pd.DataFrame(columns = cols)

# Collect the Baseline metrics, skipping the first two combinations
for comb in COMBS_BASELINE[2:]:
    # Progress trace
    print(comb)

    task, cancer = comb[0], comb[1]

    # Validation metrics of both runs
    df_baseline_100 = check_results_baseline(task, cancer, df_baseline_100, set_ = 'val_', ID = 100)
    df_baseline_1000 = check_results_baseline(task, cancer, df_baseline_1000, set_ = 'val_', ID = 1000)

    # Test metrics of both runs
    df_baseline_100_test = check_results_baseline(task, cancer, df_baseline_100_test, set_ = 'test_', ID = 100)
    df_baseline_1000_test = check_results_baseline(task, cancer, df_baseline_1000_test, set_ = 'test_', ID = 1000)

# Choose the run per cancer on validation, then plot the test metrics of that run
df_b = keep_best(df_baseline_100, df_baseline_1000)
df_b_test = select_best_test(df_b, df_baseline_100_test, df_baseline_1000_test)
plot_results_baseline(df_b_test, 'images/results_baseline.eps')

['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']
['tumor_detection', 'brca']
['tumor_detection', 'coad']
['tumor_detection', 'kich']
['tumor_detection', 'kirc']
['tumor_detection', 'kirp']
['tumor_detection', 'luad']
['tumor_detection', 'lusc']
['tumor_detection', 'read']



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as la

# Foulds
***

In [145]:
def check_results_foulds(task : str,
                           cancer : str,
                           lambda_ : float,
                           pt_method : str,
                           df : pd.DataFrame,
                           set_ : str, ID : int):
    """Append the Foulds metrics of one hyperparameter setting to ``df``.

    Args:
        task: Task name, used as a folder name in the results path.
        cancer: Cancer name, used as a folder name in the results path.
        lambda_: Value used in the 'lambda_<value>' folder name and stored in the row.
        pt_method: Folder name below the lambda folder, also stored in the row.
        df: Frame accumulating one row per call; it may be empty.
        set_: Prefix of the MMPF key to read (the callers pass 'val_' or 'test_').
        ID: Run identifier, used in the results path.

    Returns:
        The single-row frame when ``df`` is empty, otherwise ``df`` with the
        new row concatenated at the end.
    """
    # Load the pickled predictions of the requested run and setting
    preds_path = f'results/preds/run_{ID}/add_protected_atts_1/Foulds'
    results = pd.read_pickle(preds_path + f'/{task}/{cancer}/lambda_{lambda_}/{pt_method}/results.pkl')

    # NOTE(review): Accuracy and F1-score use every row of `results`, whatever
    # `set_` is; only the MMPF value below depends on `set_` — confirm intended
    preds = torch.Tensor(results.pred)
    labels = torch.Tensor(results.label)
    accuracy = tm.classification.BinaryAccuracy()(preds, labels).item()
    f1_score = tm.classification.BinaryF1Score()(preds, labels).item()

    # MMPF_size for the requested set
    mmpf_metrics = compute_pareto_metrics(results, protected_atts)

    row = pd.DataFrame(data = {
        'task' : [task],
        'cancer' : [cancer],
        'lambda_' : [lambda_],
        'pt_method' : [pt_method],
        'Accuracy' : accuracy,
        'F1-score' : f1_score,
        'MMPF_size' : [mmpf_metrics[f'{set_}MMPF_size_2']],
    })
    return row if df.empty else pd.concat([df, row])


def select_hyper(df_foulds):
    """Keep, per (task, cancer) pair, the row with the smallest MMPF_size.

    The pairs come from COMBS_BASELINE[2:10], not from the content of
    ``df_foulds``. The index of ``df_foulds`` is reset in place, so the
    caller's frame is renumbered as a side effect.

    Returns:
        A frame with the columns task, cancer, Accuracy, F1-score and
        MMPF_size holding one row per pair.
    """
    # Renumber the rows of the caller's frame itself
    df_foulds.reset_index(inplace = True, drop = True)
    cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
    df_foulds_best = pd.DataFrame(columns = cols)

    for comb in COMBS_BASELINE[2:10]:
        # Progress trace
        print(comb)

        task, cancer = comb[0], comb[1]

        # All hyperparameter settings recorded for this pair
        candidates = df_foulds[(df_foulds.task == task) & (df_foulds.cancer == cancer)]

        # argmin is positional, hence the iloc lookup
        winner = candidates.iloc[candidates.MMPF_size.argmin()]
        row = pd.DataFrame(data = {
            'task' : [task],
            'cancer' : [cancer],
            'Accuracy' : winner['Accuracy'],
            'F1-score' : winner['F1-score'],
            'MMPF_size' : winner['MMPF_size'],
        })

        df_foulds_best = row if df_foulds_best.empty else pd.concat([df_foulds_best, row])

    return df_foulds_best


# Empty accumulators: validation and test metrics for runs 100 and 1000
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_foulds_100, df_foulds_1000 = pd.DataFrame(columns = cols), pd.DataFrame(columns = cols)
df_foulds_100_test, df_foulds_1000_test = pd.DataFrame(columns = cols), pd.DataFrame(columns = cols)

# Collect the Foulds metrics for the selected slice of hyperparameter combinations
for comb in COMBS_FOULDS[6:30]:
    # Progress trace
    print(comb)

    task, cancer = comb[0], comb[1]
    l, pt_method = comb[2], comb[3]

    # Validation metrics of both runs
    df_foulds_100 = check_results_foulds(task, cancer, l, pt_method, df_foulds_100, set_ = 'val_', ID = 100)
    df_foulds_1000 = check_results_foulds(task, cancer, l, pt_method, df_foulds_1000, set_ = 'val_', ID = 1000)

    # Test metrics of both runs
    df_foulds_100_test = check_results_foulds(task, cancer, l, pt_method, df_foulds_100_test, set_ = 'test_', ID = 100)
    df_foulds_1000_test = check_results_foulds(task, cancer, l, pt_method, df_foulds_1000_test, set_ = 'test_', ID = 1000)

# Reduce every frame to one row per (task, cancer): the setting with the lowest MMPF_size
# NOTE(review): the two test frames are reduced on their own test MMPF_size, i.e. the
# hyperparameters reported on test are picked on test — confirm this is intended
df_foulds_100 = select_hyper(df_foulds_100)
df_foulds_1000 = select_hyper(df_foulds_1000)
df_foulds_100_test = select_hyper(df_foulds_100_test)
df_foulds_1000_test = select_hyper(df_foulds_1000_test)

# Choose the run per cancer on validation, then plot the test metrics of that run
df_f = keep_best(df_foulds_100, df_foulds_1000)
df_f_test = select_best_test(df_f, df_foulds_100_test, df_foulds_1000_test)
plot_results_others(df_f_test, 'images/results_foulds.eps')

['cancer_classification', 'kich_kirc_FS', 0.01, 'DF_pos']
['cancer_classification', 'kich_kirc_FS', 0.01, 'DF_sum']
['cancer_classification', 'kich_kirc_FS', 0.01, 'DF_max']
['cancer_classification', 'kich_kirc_PM', 0.01, 'DF_pos']
['cancer_classification', 'kich_kirc_PM', 0.01, 'DF_sum']
['cancer_classification', 'kich_kirc_PM', 0.01, 'DF_max']
['cancer_classification', 'kich_kirp_FS', 0.01, 'DF_pos']
['cancer_classification', 'kich_kirp_FS', 0.01, 'DF_sum']
['cancer_classification', 'kich_kirp_FS', 0.01, 'DF_max']
['cancer_classification', 'kich_kirp_PM', 0.01, 'DF_pos']
['cancer_classification', 'kich_kirp_PM', 0.01, 'DF_sum']
['cancer_classification', 'kich_kirp_PM', 0.01, 'DF_max']
['cancer_classification', 'kirc_kirp_FS', 0.01, 'DF_pos']
['cancer_classification', 'kirc_kirp_FS', 0.01, 'DF_sum']
['cancer_classification', 'kirc_kirp_FS', 0.01, 'DF_max']
['cancer_classification', 'kirc_kirp_PM', 0.01, 'DF_pos']
['cancer_classification', 'kirc_kirp_PM', 0.01, 'DF_sum']
['cancer_class


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as la

In [51]:
# FOR THE TABLES IN THE REPORT
# Prints a LaTeX table of MMPF_size with one row per pt_method and one column per lambda_
# NOTE(review): `df_foulds` is not assigned in any cell visible here; this cell presumably
# relies on state left by an earlier notebook session — confirm before re-running
t = 'cancer_classification'
c = 'coad_read_FS'
reduced_df = df_foulds.loc[(df_foulds.task == t) & (df_foulds.cancer == c)]
table = reduced_df.pivot_table(values = 'MMPF_size', index = 'pt_method', columns = ['lambda_'])
print(table.to_latex())

\begin{tabular}{lrrrrr}
\toprule
lambda_ & 0.001000 & 0.005000 & 0.010000 & 0.050000 & 0.100000 \\
pt_method &  &  &  &  &  \\
\midrule
DF_max & 0.749837 & 0.731817 & 0.779527 & 0.795815 & 0.756377 \\
DF_pos & 0.785690 & 0.750942 & 0.777638 & 0.859162 & 0.695458 \\
DF_sum & 0.750136 & 0.748594 & 0.732998 & 0.783693 & 0.787716 \\
\bottomrule
\end{tabular}



# Diana 
***

In [138]:
def check_results_diana(task : str,
                           cancer : str,
                           df : pd.DataFrame,
                           set_ : str, ID : int):
    """Append the Diana metrics of one (task, cancer) pair to ``df``.

    Args:
        task: Task name, used as a folder name in the results path.
        cancer: Cancer name, used as a folder name in the results path.
        df: Frame accumulating one row per call; it may be empty.
        set_: Prefix of the MMPF key to read (the callers pass 'val_' or 'test_').
        ID: Run identifier, used in the results path.

    Returns:
        The single-row frame when ``df`` is empty, otherwise ``df`` with the
        new row concatenated at the end.
    """
    # Load the pickled predictions of the requested run
    preds_path = f'results/preds/run_{ID}/add_protected_atts_1/Diana'
    results = pd.read_pickle(preds_path + f'/{task}/{cancer}/results.pkl')

    # NOTE(review): Accuracy and F1-score use every row of `results`, whatever
    # `set_` is; only the MMPF value below depends on `set_` — confirm intended
    preds = torch.Tensor(results.pred)
    labels = torch.Tensor(results.label)
    accuracy = tm.classification.BinaryAccuracy()(preds, labels).item()
    f1_score = tm.classification.BinaryF1Score()(preds, labels).item()

    # MMPF_size for the requested set
    mmpf_metrics = compute_pareto_metrics(results, protected_atts)

    row = pd.DataFrame(data = {
        'task' : [task],
        'cancer' : [cancer],
        'Accuracy' : accuracy,
        'F1-score' : f1_score,
        'MMPF_size' : [mmpf_metrics[f'{set_}MMPF_size_2']],
    })
    return row if df.empty else pd.concat([df, row])

# Empty accumulators: validation and test metrics for runs 100 and 1000
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_diana_100, df_diana_1000 = pd.DataFrame(columns = cols), pd.DataFrame(columns = cols)
df_diana_100_test, df_diana_1000_test = pd.DataFrame(columns = cols), pd.DataFrame(columns = cols)

# Collect the Diana metrics for entries 2 to 9 of the baseline combination list
for comb in COMBS_BASELINE[2:10]:
    # Progress trace
    print(comb)

    task, cancer = comb[0], comb[1]

    # Validation metrics of both runs
    df_diana_100 = check_results_diana(task, cancer, df_diana_100, set_ = 'val_', ID = 100)
    df_diana_1000 = check_results_diana(task, cancer, df_diana_1000, set_ = 'val_', ID = 1000)

    # Test metrics of both runs
    df_diana_100_test = check_results_diana(task, cancer, df_diana_100_test, set_ = 'test_', ID = 100)
    df_diana_1000_test = check_results_diana(task, cancer, df_diana_1000_test, set_ = 'test_', ID = 1000)

# Choose the run per cancer on validation, then plot the test metrics of that run
df_d = keep_best(df_diana_100, df_diana_1000)
df_d_test = select_best_test(df_d, df_diana_100_test, df_diana_1000_test)
plot_results_others(df_d_test, 'images/results_diana.eps')

['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as la

# Martinez
***

In [139]:
def check_results_martinez(task : str,
                           cancer : str,
                           df : pd.DataFrame,
                           set_ : str, ID : int):
    """Append the Martinez metrics of one (task, cancer) pair to ``df``.

    Args:
        task: Task name, used as a folder name in the results path.
        cancer: Cancer name, used as a folder name in the results path.
        df: Frame accumulating one row per call; it may be empty.
        set_: Prefix of the MMPF key to read (the callers pass 'val_' or 'test_').
        ID: Run identifier, used in the results path. Run 100 is read from an
            extra 'alpha_0.5' sub-folder; any other run is read directly
            from the cancer folder.

    Returns:
        The single-row frame when ``df`` is empty, otherwise ``df`` with the
        new row concatenated at the end.
    """
    # Load the pickled predictions; the folder layout differs for run 100
    preds_path = f'results/preds/run_{ID}/add_protected_atts_1/Martinez'
    if ID == 100:
        results_file = f'/{task}/{cancer}/alpha_0.5/results.pkl'
    else:
        results_file = f'/{task}/{cancer}/results.pkl'
    results = pd.read_pickle(preds_path + results_file)

    # NOTE(review): Accuracy and F1-score use every row of `results`, whatever
    # `set_` is; only the MMPF value below depends on `set_` — confirm intended
    preds = torch.Tensor(results.pred)
    labels = torch.Tensor(results.label)
    accuracy = tm.classification.BinaryAccuracy()(preds, labels).item()
    f1_score = tm.classification.BinaryF1Score()(preds, labels).item()

    # MMPF_size for the requested set
    mmpf_metrics = compute_pareto_metrics(results, protected_atts)

    row = pd.DataFrame(data = {
        'task' : [task],
        'cancer' : [cancer],
        'Accuracy' : accuracy,
        'F1-score' : f1_score,
        'MMPF_size' : [mmpf_metrics[f'{set_}MMPF_size_2']],
    })
    return row if df.empty else pd.concat([df, row])

# Empty accumulators: validation and test metrics for runs 100 and 1000
cols = ['task', 'cancer', 'Accuracy', 'F1-score', 'MMPF_size']
df_martinez_100, df_martinez_1000 = pd.DataFrame(columns = cols), pd.DataFrame(columns = cols)
df_martinez_100_test, df_martinez_1000_test = pd.DataFrame(columns = cols), pd.DataFrame(columns = cols)

# Collect the Martinez metrics for entries 2 to 9 of the baseline combination list
for comb in COMBS_BASELINE[2:10]:
    # Progress trace
    print(comb)

    task, cancer = comb[0], comb[1]

    # Validation metrics of both runs
    df_martinez_100 = check_results_martinez(task, cancer, df_martinez_100, set_ = 'val_', ID = 100)
    df_martinez_1000 = check_results_martinez(task, cancer, df_martinez_1000, set_ = 'val_', ID = 1000)

    # Test metrics of both runs
    df_martinez_100_test = check_results_martinez(task, cancer, df_martinez_100_test, set_ = 'test_', ID = 100)
    df_martinez_1000_test = check_results_martinez(task, cancer, df_martinez_1000_test, set_ = 'test_', ID = 1000)

# Choose the run per cancer on validation, then plot the test metrics of that run
df_m = keep_best(df_martinez_100, df_martinez_1000)
df_m_test = select_best_test(df_m, df_martinez_100_test, df_martinez_1000_test)
plot_results_others(df_m_test, 'images/results_martinez.eps')

['cancer_classification', 'kich_kirc_FS']
['cancer_classification', 'kich_kirc_PM']
['cancer_classification', 'kich_kirp_FS']
['cancer_classification', 'kich_kirp_PM']
['cancer_classification', 'kirc_kirp_FS']
['cancer_classification', 'kirc_kirp_PM']
['cancer_classification', 'luad_lusc_FS']
['cancer_classification', 'luad_lusc_PM']



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as la