In [1]:
# Pick up edits to the imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

# Do not truncate cell text when rendering DataFrames (None removes the width cap).
pd.options.display.max_colwidth = None

In [3]:
# Results collected by the loops below: keyed first by minimum-support threshold,
# then by approach name ("default", "base", "generalized"). Each value is the
# `.head(1)` row holding the most divergent pattern for that combination.
out_support = {}

In [4]:
# Columns reported for every pattern in the results table.
COL_info = ["itemsets", "support"]

# Same columns followed by the per-pattern counts
# (presumably the confusion-matrix cells; confirm against the library output).
COL_info_detail = COL_info + ["tn", "fp", "fn", "tp"]

# Parameters

In [5]:
K = 3  # NOTE(review): not referenced by any cell visible in this notebook

# Divergence metric under analysis and the t-value column reported next to it.
metric = "d_fpr"
t_value_col = "t_value_fp"

# Default minimum support. The loops over `min_sup_divergences` reuse this name
# as their loop variable, so its value changes once those cells have run.
min_sup_divergence = 0.05

In [6]:
# NOTE(review): not referenced by any cell visible here; the tree figures are
# written under `tree_outputdir` instead.
name_output_dir = "output_compas"

In [7]:
# Figure switches: `show_fig` gates the tree-visualisation cells, `saveFig`
# controls whether the figure directory is created and figures are written.
saveFig = show_fig = False

# Dataset

In [8]:
# Short labels for attribute names, attribute values and comparison operators.
# The mapping is handed to `printable` and `visualizeTreeDiGraph` further down
# to shorten what they render.
# NOTE(review): entry order is kept as-is in case the consumers apply the
# replacements sequentially — confirm before reordering.
abbreviations = {
    "age_cat": "age",
    "priors_count": "#prior",
    "Greater than 45": ">45",
    "25 - 45": "25-45",
    "African-American": "Afr-Am",
    "c_charge_degree": "charge",
    "Less than 25": "<25",
    "=>": ">",
    "=<": "<",
    "length_of_stay": "stay",
    "Caucasian": "Cauc",
}

from import_datasets import import_process_compas

dataset_name = "compas"
risk_class_type = True

# Load the COMPAS data through the project helper (`continuous_col=True`
# presumably keeps the numeric attributes undiscretized — confirm in the helper).
dfI, class_map = import_process_compas(
    risk_class=risk_class_type, continuous_col=True
)
dfI = dfI.reset_index(drop=True)

# Encode the predicted risk label as 0/1.
dfI["predicted"] = dfI["predicted"].replace({"Medium-Low": 0, "High": 1})

true_class_name = "class"
pred_class_name = "predicted"
class_and_pred_names = [true_class_name, pred_class_name]

# Every remaining column is an attribute; place ground truth and prediction last.
attributes = dfI.columns.drop(class_and_pred_names).tolist()
dfI = dfI[attributes + class_and_pred_names]
dfI.head()

Unnamed: 0,c_charge_degree,race,sex,age,priors_count,length_of_stay,class,predicted
0,F,Other,Male,69,0,1.0,0,0
1,F,African-American,Male,34,0,10.0,1,0
2,F,African-American,Male,24,4,1.0,1,0
3,M,Other,Male,44,0,1.0,0,0
4,F,Caucasian,Male,41,14,6.0,1,0


In [9]:
# Minimum-support thresholds at which every exploration approach is evaluated.
min_sup_divergences = [0.05, 0.025, 0.01]

# Domain expert discretization

The discretization follows the one used in the NIPS 2017 fair pre-processing code:
https://github.com/fair-preprocessing/nips2017/blob/master/compas/code/Generate_Compas_Data.ipynb

## Divergence

In [10]:
from divexplorer_generalized.FP_Divergence import FP_Divergence

In [11]:
from import_datasets import discretize
from divexplorer_generalized.FP_DivergenceExplorer import FP_DivergenceExplorer

# Discretize the data with the project helper for this dataset (per the section
# heading, this is the domain-expert discretization).
dfI_discr = discretize(dfI, dataset_name=dataset_name)

# For every support threshold, extract the frequent-pattern divergence on the
# discretized data and store the single top-ranked pattern under "default".
for min_sup_divergence in min_sup_divergences:
    fp_diver = FP_DivergenceExplorer(
        dfI_discr,
        true_class_name=true_class_name,
        predicted_class_name=pred_class_name,
    )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup_divergence, metrics=[metric]
    )
    fp_divergence_i = FP_Divergence(FP_fm, metric=metric)

    # Sort by the divergence metric, then by its t-value, both descending.
    ranked = fp_divergence_i.getDivergence(th_redundancy=0).sort_values(
        [fp_divergence_i.metric, fp_divergence_i.t_value_col], ascending=False
    )
    most_divergent = ranked.head(1)
    out_support.setdefault(min_sup_divergence, {})["default"] = most_divergent

# Tree divergence - FPR

In [12]:
# Minimum support handed to the tree discretization (`min_support=` in the
# `get_tree_discretization` call) and embedded in the figure file suffix.
min_support_tree = 0.1

In [13]:
import os
from pathlib import Path

# Destination folder of the rendered tree figures.
tree_outputdir = os.path.join(".", "output", "figures", "compas", "tree")

# The folder is only created when figures are going to be saved.
if saveFig:
    Path(tree_outputdir).mkdir(parents=True, exist_ok=True)

In [14]:
# Ground-truth and prediction column names passed to the tree discretization.
# NOTE(review): same values as `class_and_pred_names` built when the data is
# loaded; kept here as a separate literal.
cols_c = ["class", "predicted"]
# Attributes passed as `continuous_attributes` to the tree discretization and to
# the generalized divergence extraction.
continuous_attributes = ["priors_count", "length_of_stay", "age"]

## Tree divergence - divergence_criterion

In [15]:
# Settings forwarded to `get_tree_discretization` (as `type_splitting` and
# `type_criterion`) and embedded in the figure file suffix.
type_experiment = "one_at_time"
type_criterion = "divergence_criterion"

In [16]:
# Independent copy used by the tree-based analysis, so changes made to
# `df_analyze` do not propagate back to `dfI`.
df_analyze = dfI.copy()

In [17]:
from tree_discretization import TreeDiscretization

tree_discr = TreeDiscretization()

# Fit the tree-based discretization of the continuous attributes. The call
# returns the generalization hierarchy and the discretizations consumed by the
# divergence-extraction cells further down.
generalization_dict, discretizations = tree_discr.get_tree_discretization(
    df_analyze,
    # data description
    continuous_attributes=[*continuous_attributes],
    class_and_pred_names=cols_c,
    class_map=class_map,
    # splitting configuration
    metric=metric,
    min_support=min_support_tree,
    type_splitting=type_experiment,
    type_criterion=type_criterion,
    # presumably keeps the fitted trees on `tree_discr.trees`, which the
    # visualisation cell reads — confirm in TreeDiscretization
    storeTree=True,
    # minimal_gain=0.0015,  # optional; currently not passed
)

In [18]:
if show_fig:
    # Show the discretization tree of `priors_count` and, only when the
    # notebook-level `saveFig` flag is set, render its abbreviated version to disk.
    #
    # Review fixes relative to the previous version of this cell:
    # - `saveFig` is no longer forced to True here; the configuration flag decides
    #   (the override also leaked into every later cell reading `saveFig`).
    # - Graphs are shown through `display(...)`: a bare expression nested inside
    #   an `if` block is never rendered by the notebook.
    # - The non-dict case (a single tree object) no longer touches `dot`, which
    #   was undefined on that path and raised NameError.
    # - The graphs are built once instead of three times.
    trees = tree_discr.trees
    attribute = "priors_count"
    suffix = f"{type_experiment}_{type_criterion}_sd_{min_support_tree}_{metric}"

    if isinstance(trees, dict):
        if saveFig:
            # File version: abbreviated labels plus the split conditions.
            tree_path = os.path.join(tree_outputdir, f"tree_{attribute}_{suffix}")
            dot_abbreviated = trees[attribute].visualizeTreeDiGraph(
                abbreviations=abbreviations, all_info=False, show_condition=True
            )
            dot_abbreviated.render(tree_path)
            print(tree_path)

        # Inline version: one graph per attribute, keyed by attribute name.
        dot = {
            name: tree.visualizeTreeDiGraph(all_info=False)
            for name, tree in trees.items()
        }
        display(dot[attribute])
    else:
        # A single tree covering all attributes.
        dot_show = trees.visualizeTreeDiGraph()
        display(dot_show)

In [19]:
if show_fig:
    # NOTE(review): `viz_tree` is neither defined nor imported in any cell visible
    # here, so this call raises NameError whenever `show_fig` is True unless the
    # name is provided elsewhere — TODO confirm its source module and import it.
    viz_tree(
        tree_discr,
        continuous_attributes,
        tree_outputdir,
        suffix=f"{type_experiment}_{type_criterion}_sd_{min_support_tree}_{metric}",
        saveFig=saveFig,
    )

In [20]:
considerOnlyContinuos = True
if considerOnlyContinuos:
    # Remove, in place, every generalization entry whose attribute is not one of
    # the continuous attributes.
    non_continuous = [
        name for name in generalization_dict if name not in continuous_attributes
    ]
    for name in non_continuous:
        del generalization_dict[name]

### Base

In [21]:
from utils_extract_divergence_generalized import (
    extract_divergence_generalized,
)

# Base exploration: tree-derived discretization with `apply_generalization=False`.
# For every support threshold, keep the single top-ranked pattern under "base".
for min_sup_divergence in min_sup_divergences:
    FP_fm = extract_divergence_generalized(
        df_analyze,
        discretizations,
        generalization_dict,
        continuous_attributes,
        min_sup_divergence=min_sup_divergence,
        apply_generalization=False,
        true_class_name=true_class_name,
        predicted_class_name=pred_class_name,
        class_map=class_map,
        # Use the notebook-level `metric` rather than a hard-coded "d_fpr": the
        # ranking below reads that same metric, so the two must stay in step.
        metrics_divergence=[metric],
        FPM_type="fpgrowth",
    )

    fp_divergence_i = FP_Divergence(FP_fm, metric=metric)

    # Sort by the divergence metric, then by its t-value, both descending.
    most_divergent = (
        fp_divergence_i.getDivergence(th_redundancy=0)
        .sort_values(
            [fp_divergence_i.metric, fp_divergence_i.t_value_col], ascending=False
        )
        .head(1)
    )
    out_support.setdefault(min_sup_divergence, {})["base"] = most_divergent

### Generalized

In [22]:
from utils_extract_divergence_generalized import (
    extract_divergence_generalized,
)

# Generalized exploration: same call as the base one but with
# `apply_generalization=True`. For every support threshold, keep the single
# top-ranked pattern under "generalized".
for min_sup_divergence in min_sup_divergences:
    FP_fm = extract_divergence_generalized(
        df_analyze,
        discretizations,
        generalization_dict,
        continuous_attributes,
        min_sup_divergence=min_sup_divergence,
        apply_generalization=True,
        true_class_name=true_class_name,
        predicted_class_name=pred_class_name,
        class_map=class_map,
        # Use the notebook-level `metric` rather than a hard-coded "d_fpr": the
        # ranking below reads that same metric, so the two must stay in step.
        metrics_divergence=[metric],
        FPM_type="fpgrowth",
    )

    fp_divergence_i = FP_Divergence(FP_fm, metric=metric)

    # Sort by the divergence metric, then by its t-value, both descending.
    most_divergent = (
        fp_divergence_i.getDivergence(th_redundancy=0)
        .sort_values(
            [fp_divergence_i.metric, fp_divergence_i.t_value_col], ascending=False
        )
        .head(1)
    )

    out_support.setdefault(min_sup_divergence, {})["generalized"] = most_divergent

# Print results

In [23]:
from utils_printable import printable

method_name = "Exploration approach"

# Columns of the printed table; identical for every threshold, so built once.
COLS = [method_name] + COL_info + [metric, t_value_col]

# One table per support threshold, with one row per exploration approach.
for min_sup_divergence in min_sup_divergences:
    by_approach = out_support[min_sup_divergence]

    # Stack the stored top rows in insertion order ...
    res = pd.concat(list(by_approach.values()))
    # ... and label them with the approach names in that same order. The keys are
    # materialized into a list so the column assignment does not depend on how
    # pandas treats a dict view.
    res[method_name] = list(by_approach)

    res_pr = printable(res[COLS], abbreviations=abbreviations, resort_cols=False)

    print(f"Minimum support divergence: {min_sup_divergence}")
    display(res_pr)

Minimum support divergence: 0.05


Unnamed: 0,Exploration approach,itemsets,sup,Δ_fpr,t_fp
113,default,"age=[25-45], #prior>3, race=Afr-Am, sex=Male",0.13,0.22,7.1
106,base,"#prior>=9, race=Afr-Am",0.09,0.363,8.2
1052,generalized,"age<=32, stay>=3.0, #prior>=4, sex=Male",0.06,0.378,6.7


Minimum support divergence: 0.025


Unnamed: 0,Exploration approach,itemsets,sup,Δ_fpr,t_fp
473,default,"age=[25-45], stay=1w-3M, #prior>3, race=Afr-Am, sex=Male",0.03,0.292,4.4
453,base,"age=[28-32], #prior>=9, sex=Male",0.03,0.59,6.8
2416,generalized,"age=[25-32], charge=F, #prior>=9, sex=Male",0.03,0.621,7.7


Minimum support divergence: 0.01


Unnamed: 0,Exploration approach,itemsets,sup,Δ_fpr,t_fp
710,default,"age<25, charge=F, #prior>3",0.02,0.618,5.7
848,base,"age<=24, charge=F, #prior=[4-8]",0.02,0.662,6.2
3697,generalized,"age=[25-32], stay>=3.0, #prior>=9",0.02,0.745,8.1
