In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
import os
import numpy as np
import pandas as pd
from divexplorer_generalized.FP_Divergence import FP_Divergence

# Import data

In [8]:
DATASET_DIR = os.path.join(os.path.curdir, "datasets")

def import_process_adult(discretize=False, bins=3, inputDir=DATASET_DIR):
    education_map = {
        "10th": "Dropout",
        "11th": "Dropout",
        "12th": "Dropout",
        "1st-4th": "Dropout",
        "5th-6th": "Dropout",
        "7th-8th": "Dropout",
        "9th": "Dropout",
        "Preschool": "Dropout",
        "HS-grad": "High School grad",
        "Some-college": "High School grad",
        "Masters": "Masters",
        "Prof-school": "Prof-School",
        "Assoc-acdm": "Associates",
        "Assoc-voc": "Associates",
    }
    occupation_map = {
        "Adm-clerical": "Admin",
        "Armed-Forces": "Military",
        "Craft-repair": "Blue-Collar",
        "Exec-managerial": "White-Collar",
        "Farming-fishing": "Blue-Collar",
        "Handlers-cleaners": "Blue-Collar",
        "Machine-op-inspct": "Blue-Collar",
        "Other-service": "Service",
        "Priv-house-serv": "Service",
        "Prof-specialty": "Professional",
        "Protective-serv": "Other",
        "Sales": "Sales",
        "Tech-support": "Other",
        "Transport-moving": "Blue-Collar",
    }
    married_map = {
        "Never-married": "Never-Married",
        "Married-AF-spouse": "Married",
        "Married-civ-spouse": "Married",
        "Married-spouse-absent": "Separated",
        "Separated": "Separated",
        "Divorced": "Separated",
        "Widowed": "Widowed",
    }

    country_map = {
        "Cambodia": "SE-Asia",
        "Canada": "British-Commonwealth",
        "China": "China",
        "Columbia": "South-America",
        "Cuba": "Other",
        "Dominican-Republic": "Latin-America",
        "Ecuador": "South-America",
        "El-Salvador": "South-America",
        "England": "British-Commonwealth",
        "France": "Euro_1",
        "Germany": "Euro_1",
        "Greece": "Euro_2",
        "Guatemala": "Latin-America",
        "Haiti": "Latin-America",
        "Holand-Netherlands": "Euro_1",
        "Honduras": "Latin-America",
        "Hong": "China",
        "Hungary": "Euro_2",
        "India": "British-Commonwealth",
        "Iran": "Other",
        "Ireland": "British-Commonwealth",
        "Italy": "Euro_1",
        "Jamaica": "Latin-America",
        "Japan": "Other",
        "Laos": "SE-Asia",
        "Mexico": "Latin-America",
        "Nicaragua": "Latin-America",
        "Outlying-US(Guam-USVI-etc)": "Latin-America",
        "Peru": "South-America",
        "Philippines": "SE-Asia",
        "Poland": "Euro_2",
        "Portugal": "Euro_2",
        "Puerto-Rico": "Latin-America",
        "Scotland": "British-Commonwealth",
        "South": "Euro_2",
        "Taiwan": "China",
        "Thailand": "SE-Asia",
        "Trinadad&Tobago": "Latin-America",
        "United-States": "United-States",
        "Vietnam": "SE-Asia",
    }
    # as given by adult.names
    column_names = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "income-per-year",
    ]

    #check_dataset_availability("credit-g.csv", inputDir=inputDir)
    train = pd.read_csv(
        os.path.join(inputDir, "adult.data"),
        header=None,
        names=column_names,
        skipinitialspace=True,
        na_values="?",
    )

    #check_dataset_availability("adult.test", inputDir=inputDir)

    test = pd.read_csv(
        os.path.join(inputDir, "adult.test"),
        header=0,
        names=column_names,
        skipinitialspace=True,
        na_values="?",
    )
    dt = pd.concat([test, train], ignore_index=True)
    dt["education"] = dt["education"].replace(education_map)
    dt.drop(columns=["education-num", "fnlwgt"], inplace=True)
    dt["occupation"] = dt["occupation"].replace(occupation_map)
    dt["marital-status"] = dt["marital-status"].replace(married_map)
    dt["native-country"] = dt["native-country"].replace(country_map)

    dt.rename(columns={"income-per-year": "class"}, inplace=True)
    dt["class"] = (
        dt["class"].astype("str").replace({">50K.": ">50K", "<=50K.": "<=50K"})
    )
    dt.dropna(inplace=True)
    dt.reset_index(drop=True, inplace=True)
    if discretize:
        dt = KBinsDiscretizer_continuos(dt, bins=bins)
    dt.drop(columns=["native-country"], inplace=True)
    return dt, {"N": "<=50K", "P": ">50K"}


In [10]:
df, class_map = import_process_adult()

In [12]:
df.describe()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
count,45222.0,45222.0,45222.0,45222.0
mean,38.547941,1101.430344,88.595418,40.938017
std,13.21787,7506.430084,404.956092,12.007508
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,47.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [22]:
from sklearn.preprocessing import LabelEncoder


attributes = df.columns.drop("class")
X = df[attributes].copy()
y = df["class"].copy()

encoders = {}
for column in attributes:
    if df.dtypes[column] == np.object:
        print(column)
        le = LabelEncoder()
        X[column] = le.fit_transform(df[column])
        encoders[column] = le

workclass
education
marital-status
occupation
relationship
race
sex


# Train and predict with RF classifier

In [23]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier




clf = RandomForestClassifier(random_state=42)


k_cv = 10


cv = StratifiedKFold(n_splits=k_cv, random_state=42, shuffle=True
            )  # Added to fix the random state  #Added shuffle=True for new version sklearn, Value Error
       
y_predicted = cross_val_predict(clf, X, y.values, cv=cv)

In [24]:
df["predicted"] = y_predicted

In [33]:
df.shape[0]*0.03

1356.6599999999999

# Tree divergence

In [34]:
true_class_name = "class"
pred_class_name = "predicted"
cols_c = [true_class_name, pred_class_name]

In [35]:
continuous_attributes = list(df.describe().columns)

In [36]:
df_analyze = df.copy()

In [37]:
min_support_tree = 0.1

In [38]:
metric = "d_fpr"

In [39]:
type_criterion="divergence_criterion"

type_experiment = "one_at_time"

In [40]:
from tree_discretization import TreeDiscretization

tree_discr = TreeDiscretization()

# ## Extract tree
generalization_dict, discretizations = tree_discr.get_tree_discretization(
    df_analyze,
    type_splitting=type_experiment,
    min_support=min_support_tree,
    metric=metric,
    class_map=class_map,
    continuous_attributes=list(continuous_attributes),
    class_and_pred_names=cols_c,
    storeTree=True,
    type_criterion='entropy', #type_criterion,
    #minimal_gain = 0.0015
)

In [41]:
tree_discr.printDiscretizationTrees(round_v =2, show_condition = True)

age
 root s=1.00 --> d_fpr=0.00
         age<=27 s=0.24 --> d_fpr=-0.07
                 age<=22 s=0.11 --> d_fpr=-0.08
                 age>22 s=0.13 --> d_fpr=-0.07
         age>27 s=0.76 --> d_fpr=0.03
                 age<=35 s=0.22 --> d_fpr=-0.01
                         age<=31 s=0.11 --> d_fpr=-0.02
                         age>31 s=0.11 --> d_fpr=0.01
                 age>35 s=0.55 --> d_fpr=0.05
                         age<=56 s=0.44 --> d_fpr=0.06
                                 age<=46 s=0.28 --> d_fpr=0.05
                                         age<=39 s=0.11 --> d_fpr=0.04
                                         age>39 s=0.17 --> d_fpr=0.05
                                 age>46 s=0.16 --> d_fpr=0.08
                         age>56 s=0.11 --> d_fpr=0.02

hours-per-week
 root s=1.00 --> d_fpr=0.00
         hours-per-week<=40 s=0.70 --> d_fpr=-0.03
                 hours-per-week<=36 s=0.20 --> d_fpr=-0.06
                 hours-per-week>36 s=0.49 --> d_fpr=-0.01
    

# Extract patterns

In [72]:
min_sup_divergences = [0.01, 0.025, 0.03, 0.05, 0.1]

In [73]:
import time

In [74]:
out_support = {}
out_time = {}

## Base

In [75]:
from utils_extract_divergence_generalized import (
    extract_divergence_generalized,
)

for min_sup_divergence in min_sup_divergences:
    s_time = time.time()
    FP_fm = extract_divergence_generalized(
        df_analyze,
        discretizations,
        generalization_dict,
        continuous_attributes,
        min_sup_divergence=min_sup_divergence,
        apply_generalization=False,
        true_class_name=true_class_name,
        predicted_class_name=pred_class_name,
        class_map=class_map,
        metrics_divergence = ["d_fpr"],
        FPM_type="fpgrowth",
    )
    
    out_time.setdefault(min_sup_divergence, {})["base"] = time.time()-s_time
    
    
    fp_divergence_i = FP_Divergence(FP_fm, metric=metric)

    most_divergent = (
        fp_divergence_i.getDivergence(th_redundancy=0)
        .sort_values(
            [fp_divergence_i.metric, fp_divergence_i.t_value_col], ascending=False
        )
        .head(1)
    )
    out_support.setdefault(min_sup_divergence, {})["base"] = most_divergent
    
    

## Generalized

In [81]:
from utils_extract_divergence_generalized import (
    extract_divergence_generalized,
)
import time



for min_sup_divergence in min_sup_divergences:
    s = time.time()
    FP_fm = extract_divergence_generalized(
        df_analyze,
        discretizations,
        generalization_dict,
        continuous_attributes,
        min_sup_divergence=min_sup_divergence,
        apply_generalization=True,
        true_class_name=true_class_name,
        predicted_class_name=pred_class_name,
        class_map=class_map,
        metrics_divergence = ["d_fpr"],
        FPM_type="fpgrowth",
        save_in_progress = False
    )
    
    e = time.time()
    out_time.setdefault(min_sup_divergence, {})["generalized"] = time.time()-s_time
    
    rt = e-s
    fp_divergence_i = FP_Divergence(FP_fm, metric=metric)

    most_divergent = (
        fp_divergence_i.getDivergence(th_redundancy=0)
        .sort_values(
            [fp_divergence_i.metric, fp_divergence_i.t_value_col], ascending=False
        )
        .head(1)
    )
    out_support.setdefault(min_sup_divergence, {})["generalized"] = most_divergent

1 10000
2 20000


In [82]:
out_time

{0.01: {'base': 4.17091178894043, 'generalized': 68.76981902122498},
 0.025: {'base': 2.2312047481536865, 'generalized': 74.73068594932556},
 0.03: {'base': 2.1193981170654297, 'generalized': 78.23400402069092},
 0.05: {'base': 2.37870717048645, 'generalized': 81.05516481399536},
 0.1: {'base': 1.9696078300476074, 'generalized': 83.20603585243225}}

In [83]:
out_time

{0.01: {'base': 4.17091178894043, 'generalized': 68.76981902122498},
 0.025: {'base': 2.2312047481536865, 'generalized': 74.73068594932556},
 0.03: {'base': 2.1193981170654297, 'generalized': 78.23400402069092},
 0.05: {'base': 2.37870717048645, 'generalized': 81.05516481399536},
 0.1: {'base': 1.9696078300476074, 'generalized': 83.20603585243225}}

In [85]:
for k in out_support:
    print(k)
    for t_gen in out_support[k]:
        print(t_gen)
        display(out_support[k][t_gen])

0.01
base


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
7144,0.010946,"(workclass=Private, marital-status=Married, oc...",18,72,50,355,4,495.0,0.8,0.721885,17.031069


generalized


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
26105,0.010614,"(race=White, education=Bachelors, marital-stat...",20,80,42,338,5,480.0,0.8,0.721885,17.958778


0.025
base


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
2246,0.027155,"(marital-status=Married, hours-per-week=[41-53...",102,195,122,809,3,1228.0,0.656566,0.578451,21.015314


generalized


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
8730,0.02585,"(race=White, education=Bachelors, age=>=36, ho...",79,188,115,787,5,1169.0,0.70412,0.626005,22.41668


0.03
base


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
1786,0.031799,"(marital-status=Married, education=Masters)",132,194,126,986,2,1438.0,0.595092,0.516977,19.04869


generalized


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
6321,0.032727,"(race=White, occupation=Professional, sex=Male...",107,232,152,989,5,1480.0,0.684366,0.606251,24.016593


0.05
base


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
659,0.061674,"(race=White, occupation=Professional, marital-...",357,435,317,1680,3,2789.0,0.549242,0.471128,26.598005


generalized


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
3361,0.051059,"(marital-status=Married, age=[36-56], educatio...",229,391,243,1446,3,2309.0,0.630645,0.55253,28.464784


0.1
base


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
297,0.10245,"(race=White, hours-per-week=[41-53], relations...",1396,633,697,1907,3,4633.0,0.311976,0.233861,22.54118


generalized


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
1081,0.102096,"(race=White, age=[36-56], hours-per-week=>=41,...",1060,700,677,2180,4,4617.0,0.397727,0.319612,27.214797
