In [167]:
from mdlp.discretization import MDLP

In [168]:
# https://github.com/hlin117/mdlp-discretization
# https://pypi.org/project/mdlp-discretization/0.3/

In [169]:
from import_datasets import import_process_compas

dataset_name = "compas"
risk_class_type = True

# Names of the ground-truth and prediction columns used throughout the notebook.
true_class_name = "class"
pred_class_name = "predicted"
class_and_pred_names = [true_class_name, pred_class_name]

# Load the dataset (continuous_col=True is presumably what keeps age,
# priors_count and length_of_stay numeric -- verify in import_datasets).
dfI, class_map = import_process_compas(
    risk_class=risk_class_type,
    continuous_col=True,
)
dfI = dfI.reset_index(drop=True)

# Encode the predicted risk label as 0/1.
dfI[pred_class_name] = dfI[pred_class_name].replace({"Medium-Low": 0, "High": 1})

# Every column except the two label columns is treated as an attribute;
# the frame is reordered so that the labels come last.
attributes = dfI.columns.drop(class_and_pred_names).tolist()
dfI = dfI[attributes + class_and_pred_names]
dfI.head()

Unnamed: 0,c_charge_degree,race,sex,age,priors_count,length_of_stay,class,predicted
0,F,Other,Male,69,0,1.0,0,0
1,F,African-American,Male,34,0,10.0,1,0
2,F,African-American,Male,24,4,1.0,1,0
3,M,Other,Male,44,0,1.0,0,0
4,F,Caucasian,Male,41,14,6.0,1,0


In [188]:
# Numeric columns that will be discretized (passed to transform_MDLP below).
continuous_attributes = [
    "priors_count",
    "length_of_stay",
    "age",
]

In [198]:
# Name of the ground-truth column.
Y_col = "class"
# Work on a copy so that helper columns added later never touch dfI.
df_input = dfI.copy()

def transform_intervals(x):
    """Render an interval as a short text label.

    Parameters
    ----------
    x : indexable pair
        ``x[0]`` is the lower bound and ``x[1]`` the upper bound.

    Returns
    -------
    str
        ``"<=upper"`` when the lower bound is ``-inf``, ``">lower"`` when the
        upper bound is ``+inf``, otherwise ``"(lower-upper]"``.
    """
    lower, upper = x[0], x[1]
    # The unbounded-below check comes first, so (-inf, inf) renders as "<=inf".
    if lower == -np.inf:
        return f"<={upper}"
    if upper == np.inf:
        return f">{lower}"
    return f"({lower}-{upper}]"
    
def transform_MDLP(df_input, Y_col, continuous_attributes):
    """Discretize the continuous columns of ``df_input`` with MDLP.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Source frame; it is not modified.
    Y_col : str
        Column of ``df_input`` used as the supervision target for MDLP.
    continuous_attributes : list of str
        Columns of ``df_input`` to discretize.

    Returns
    -------
    pandas.DataFrame
        A frame with the same columns, in the same order, as ``df_input``.
        Each continuous column is replaced by the text label produced by
        ``transform_intervals``; every other column is copied over unchanged.
    """
    discretizer = MDLP()
    bin_codes = discretizer.fit_transform(
        df_input[continuous_attributes], df_input[Y_col]
    )

    discretized = pd.DataFrame(df_input, columns=continuous_attributes)
    for position, column in enumerate(continuous_attributes):
        # cat2intervals is indexed by the column's position in the fitted
        # matrix; its items are consumed as (lower, upper) pairs.
        discretized[column] = discretizer.cat2intervals(bin_codes, position)
        discretized[column] = discretized[column].apply(transform_intervals)

    # Bring back every column that was not discretized.
    for column in df_input.columns:
        if column in discretized:
            continue
        discretized[column] = df_input[column].copy()

    return discretized[df_input.columns]

In [233]:
# Label that supervises the MDLP cut-point search. Supported values:
#   "d_error", "d_fpr", "d_fnr"      -> a temporary 0/1 column is derived from
#                                       the true class and the prediction
#   "true_class", "predicted_class"  -> an existing column is used directly
target_metric_mdlp = "true_class"

if target_metric_mdlp == "d_error":
    # 1 where the prediction differs from the true class.
    target_col = "error"
    df_input[target_col] = (
        df_input[true_class_name] != df_input[pred_class_name]
    ).astype(int)
elif target_metric_mdlp == "d_fpr":
    # False positive: misclassified AND predicted 1.
    # The comparison must be parenthesised because `&` binds tighter than `==`.
    target_col = "fp"
    df_input[target_col] = (
        (df_input[true_class_name] != df_input[pred_class_name])
        & (df_input[pred_class_name] == 1)
    ).astype(int)
elif target_metric_mdlp == "d_fnr":
    # False negative: misclassified AND predicted 0.
    # Without the inner parentheses this evaluated `(mismatch & pred) == 0`,
    # which flags every row that is not a false positive.
    target_col = "fn"
    df_input[target_col] = (
        (df_input[true_class_name] != df_input[pred_class_name])
        & (df_input[pred_class_name] == 0)
    ).astype(int)
elif target_metric_mdlp == "true_class":
    target_col = true_class_name
elif target_metric_mdlp == "predicted_class":
    target_col = pred_class_name
else:
    raise ValueError(target_metric_mdlp)

df_discr_mdlp = transform_MDLP(df_input, target_col, continuous_attributes)

# A derived target column is only a helper for the discretization: remove it
# again so both frames keep the original dataset schema.
if target_col not in dfI.columns:
    df_input.drop(columns=[target_col], inplace=True)
    df_discr_mdlp.drop(columns=[target_col], inplace=True)
df_discr_mdlp

Unnamed: 0,c_charge_degree,race,sex,age,priors_count,length_of_stay,class,predicted,fp
0,F,Other,Male,>52.0,<=1.0,<=2.0,0,0,0
1,F,African-American,Male,(20.0-34.0],<=1.0,(5.0-360.0],1,0,0
2,F,African-American,Male,(20.0-34.0],(2.0-6.0],<=2.0,1,0,0
3,M,Other,Male,(34.0-52.0],<=1.0,<=2.0,0,0,0
4,F,Caucasian,Male,(34.0-52.0],>6.0,(5.0-360.0],1,0,0
...,...,...,...,...,...,...,...,...,...
6167,F,African-American,Male,(20.0-34.0],<=1.0,<=2.0,0,0,0
6168,F,African-American,Male,(20.0-34.0],<=1.0,<=2.0,0,0,0
6169,F,Other,Male,>52.0,<=1.0,<=2.0,0,0,0
6170,M,African-American,Female,(20.0-34.0],(2.0-6.0],<=2.0,0,0,0


In [234]:
# bins = transformer.cut_points_[0]

In [235]:
def printable_top_k(FP_fm, metric, COL_info, K=3, abbreviations={}, th_redundancy=0):
    """Return a print-ready table of the K most divergent patterns.

    Parameters
    ----------
    FP_fm : pandas.DataFrame
        Frequent-pattern table handed to ``FP_Divergence``.
    metric : str
        Divergence metric handed to ``FP_Divergence``.
    COL_info : list of str
        Descriptive columns to keep; the metric column and its t-value column
        (as reported by the ``FP_Divergence`` instance) are appended.
    K : int, default 3
        Number of rows to keep after sorting.
    abbreviations : dict, default {}
        Forwarded unchanged to ``printable``.
    th_redundancy : default 0
        Forwarded unchanged to ``FP_Divergence.getDivergence``.

    Returns
    -------
    Whatever ``utils_printable.printable`` returns for the selected rows.
    """
    from divexplorer_generalized.FP_Divergence import FP_Divergence
    from utils_printable import printable

    divergence = FP_Divergence(FP_fm, metric=metric)

    patterns = divergence.getDivergence(th_redundancy=th_redundancy)
    # Rank by the metric first and break ties with its t-value, both descending.
    ranking_cols = [divergence.metric, divergence.t_value_col]
    top_patterns = patterns.sort_values(ranking_cols, ascending=False).head(K)

    selected_cols = COL_info + ranking_cols
    return printable(top_patterns[selected_cols], abbreviations=abbreviations)

In [236]:
# Columns that identify a pattern in the printed tables.
COL_info = ["itemsets", "support"]

# Same, followed by the confusion-matrix counts of the pattern.
COL_info_detail = COL_info + ["tn", "fp", "fn", "tp"]

In [237]:
# Number of top patterns shown by printable_top_k.
K = 3

# Divergence metric used to rank the patterns.
metric = "d_fpr"



In [238]:
# Short labels handed to printable_top_k (and from there to `printable`) to
# keep the printed itemsets compact. Keys are raw column names, category
# values, or operator fragments; values are their replacements.
# NOTE(review): entry order is kept as-is because the replacement logic in
# `printable` may depend on it -- confirm before reordering.
abbreviations = {
    # The "age_cat" entry and the three age-range labels below presumably
    # target the categorical-age variant of the dataset; they do not occur
    # in the continuous columns discretized in this notebook.
    "age_cat": "age",
    "priors_count": "#prior",
    "Greater than 45": ">45",
    "25 - 45": "25-45",
    "African-American": "Afr-Am",
    "c_charge_degree": "charge",
    "Less than 25": "<25",
    # Also collapses "attr=>x" items (attribute "=" plus a ">x" interval
    # label) into "attr>x", as seen in the printed top-K table.
    "=>": ">",
    "=<": "<",
    "length_of_stay": "stay",
    "Caucasian": "Cauc",
}

In [239]:
# ### Extract divergence
pd.set_option("display.max_colwidth", None)

from divexplorer_generalized.FP_DivergenceExplorer import FP_DivergenceExplorer
metric = "d_fpr"


min_sup_divergence=0.05

fp_diver = FP_DivergenceExplorer(
    df_discr_mdlp,
    true_class_name=true_class_name,
    predicted_class_name=pred_class_name,
)

FP_fm = fp_diver.getFrequentPatternDivergence(
                min_support=min_sup_divergence,
                metrics=["d_fpr", "d_fnr", "d_accuracy", "d_error"],
            )

display(printable_top_k(
    FP_fm, metric, COL_info_detail, K=K, abbreviations=abbreviations))

FP_fm.sort_values(metric, ascending = False).head()

  df_print.columns = df_print.columns.str.replace("d_*", f"{div_name}_")


Unnamed: 0,itemsets,sup,tn,fp,fn,tp,Δ_fpr,t_fp
435,"age=(20.0-34.0], #prior>6.0, race=Afr-Am",0.06,35,33,108,225,0.397,6.7
411,"age=(20.0-34.0], charge=F, #prior>6.0",0.07,40,37,114,225,0.392,7.0
477,"age=(20.0-34.0], #prior>6.0, race=Afr-Am, sex=Male",0.06,32,28,101,211,0.378,6.0


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,fnr,accuracy,error,d_fnr,d_fpr,d_accuracy,d_error,t_value_fp,t_value_fn,t_value_tp_tn,t_value_fp_fn
435,0.064971,"(priors_count=>6.0, race=African-American, age=(20.0-34.0])",35,33,108,225,3,401.0,0.485294,0.324324,0.648379,0.351621,-0.374145,0.39698,0.014387,-0.014387,6.673016,13.820664,0.557918,0.557918
411,0.067401,"(priors_count=>6.0, c_charge_degree=F, age=(20.0-34.0])",40,37,114,225,3,416.0,0.480519,0.336283,0.637019,0.362981,-0.362186,0.392205,0.003027,-0.003027,6.998658,13.378499,0.099428,0.099428
477,0.060272,"(sex=Male, priors_count=>6.0, race=African-American, age=(20.0-34.0])",32,28,101,211,4,372.0,0.466667,0.323718,0.653226,0.346774,-0.374751,0.378353,0.019234,-0.019234,6.013695,13.449346,0.72829,0.72829
567,0.051847,"(priors_count=>6.0, c_charge_degree=F, race=African-American, age=(20.0-34.0])",30,26,85,179,4,320.0,0.464286,0.32197,0.653125,0.346875,-0.3765,0.375972,0.019133,-0.019133,5.788334,12.540308,0.670022,0.670022
470,0.061568,"(sex=Male, priors_count=>6.0, c_charge_degree=F, age=(20.0-34.0])",36,31,104,209,4,380.0,0.462687,0.332268,0.644737,0.355263,-0.366201,0.374373,0.010745,-0.010745,6.273866,13.084417,0.397663,0.397663


In [240]:
# Shortest patterns first: the empty itemset, then the single-item patterns.
FP_fm.sort_values(by="length").head(n=19)

Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,fnr,accuracy,error,d_fnr,d_fpr,d_accuracy,d_error,t_value_fp,t_value_fn,t_value_tp_tn,t_value_fp_fn
0,1.0,(),3066,297,1962,847,0,6172.0,0.088314,0.698469,0.633992,0.366008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.340732,(race=Caucasian),1220,61,660,162,1,2103.0,0.047619,0.80292,0.657156,0.342844,0.104451,-0.040695,0.023164,-0.023164,5.203064,6.3528,1.917645,1.917645
266,0.10013,(age=>52.0),439,11,154,14,1,618.0,0.024444,0.916667,0.73301,0.26699,0.218197,-0.06387,0.099017,-0.099017,6.888717,9.139408,5.23032,5.23032
31,0.356773,(c_charge_degree=M),1294,83,655,170,1,2202.0,0.060276,0.793939,0.66485,0.33515,0.09547,-0.028038,0.030858,-0.030858,3.417615,5.74359,2.611554,2.611554
108,0.190376,(sex=Female),709,53,315,98,1,1175.0,0.069554,0.762712,0.686809,0.313191,0.064243,-0.01876,0.052816,-0.052816,1.705829,2.790621,3.540142,3.540142
526,0.055574,(race=Other),216,3,105,19,1,343.0,0.013699,0.846774,0.685131,0.314869,0.148305,-0.074615,0.051139,-0.051139,6.90797,4.259105,1.947236,1.947236
45,0.300713,(age=(34.0-52.0]),1108,55,541,152,1,1856.0,0.047291,0.780664,0.678879,0.321121,0.082195,-0.041023,0.044887,-0.044887,5.092241,4.54632,3.594763,3.594763
95,0.20269,(priors_count=(2.0-6.0]),468,66,519,198,1,1251.0,0.123596,0.723849,0.532374,0.467626,0.02538,0.035282,-0.101618,0.101618,2.415198,1.325355,6.613786,6.613786
66,0.253402,(length_of_stay=(5.0-360.0]),544,105,534,381,1,1564.0,0.161787,0.583607,0.591432,0.408568,-0.114863,0.073473,-0.04256,0.04256,4.864915,6.23459,3.078407,3.078407
228,0.110337,(priors_count=(1.0-2.0]),338,27,240,76,1,681.0,0.073973,0.759494,0.60793,0.39207,0.061024,-0.014341,-0.026063,0.026063,0.835487,2.334787,1.34018,1.34018
