In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries
# used below; narrow the filter if those matter.
warnings.simplefilter("ignore")

In [3]:
import os
import pandas as pd
from divexplorer_generalized.FP_Divergence import FP_Divergence

# Import data

In [4]:
# Labels used for the positive ('P') and negative ('N') outcome.
class_map = {'P': 'good', 'N': 'bad'}

# Load the semicolon-separated red-wine table, binarise the quality score
# (scores above 5 become "good", everything else "bad") and expose the
# resulting label column under the name "class".
df = (
    pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";")
    .assign(quality=lambda frame: frame["quality"].gt(5).map({True: "good", False: "bad"}))
    .rename(columns={"quality": "class"})
)

# Train and predict with RF classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Number of stratified folds used for the out-of-fold predictions.
k_cv = 10

# Feature matrix = every column except the label; label vector = "class".
attributes = df.columns.drop("class")
X = df[attributes].copy()
y = df["class"].copy()

clf = RandomForestClassifier(random_state=42)

# random_state pins the fold assignment; shuffle=True accompanies it because
# newer scikit-learn versions raise a ValueError otherwise.
cv = StratifiedKFold(n_splits=k_cv, shuffle=True, random_state=42)

# One prediction per row, each produced by a model fitted on the other folds.
y_predicted = cross_val_predict(clf, X, y.values, cv=cv)

In [6]:
# Attach the cross-validated predictions as a new "predicted" column
# (this mutates df in place).
df["predicted"] = y_predicted

In [7]:
# 3% of the number of rows in df.
# NOTE(review): presumably a scratch check of how many rows a 0.03 support
# threshold corresponds to — confirm or remove.
df.shape[0]*0.03

47.97

# Tree divergence

In [8]:
# Names of the ground-truth and predicted label columns, plus the pair in
# [true, predicted] order as passed to the tree discretization.
true_class_name, pred_class_name = "class", "predicted"
cols_c = [true_class_name, pred_class_name]

In [9]:
# Column labels of the default describe() summary, as a plain list.
# NOTE(review): relies on describe() summarising only the numeric columns of
# this mixed-type frame, so "class" and "predicted" are left out — confirm.
continuous_attributes = df.describe().columns.tolist()

In [10]:
# Independent copy of df that the discretization and extraction steps receive.
df_analyze = df.copy()

In [11]:
# Value passed as min_support to get_tree_discretization below.
min_support_tree = 0.1

In [12]:
# Metric name shared by the tree discretization, the pattern extraction and
# FP_Divergence. NOTE(review): "d_fpr" presumably denotes false-positive-rate
# divergence — confirm against the divexplorer documentation.
metric = "d_fpr"

In [13]:
# NOTE(review): type_criterion is defined here but the get_tree_discretization
# call below passes the literal 'entropy' instead — confirm which is intended.
type_criterion="divergence_criterion"

# Passed as type_splitting to get_tree_discretization.
type_experiment = "one_at_time"

In [14]:
from tree_discretization import TreeDiscretization

tree_discr = TreeDiscretization()

# Extract the discretization trees for the continuous attributes of df_analyze.
# The two returned objects are later handed to extract_divergence_generalized.
generalization_dict, discretizations = tree_discr.get_tree_discretization(
    df_analyze,
    type_splitting=type_experiment,
    min_support=min_support_tree,
    metric=metric,
    class_map=class_map,
    continuous_attributes=list(continuous_attributes),
    class_and_pred_names=cols_c,
    storeTree=True,
    type_criterion='entropy', # NOTE(review): hardcoded; the notebook-level type_criterion variable is not used here — confirm intent
    #minimal_gain = 0.0015
)

In [15]:
# Print the discretization trees held by tree_discr.
# NOTE(review): round_v=2 presumably sets the number of printed decimals and
# show_condition=True prints each node's split condition — confirm against
# TreeDiscretization.
tree_discr.printDiscretizationTrees(round_v =2, show_condition = True)

fixed acidity
 root s=1.00 --> d_fpr=0.00
         fixed acidity<=10.1 s=0.85 --> d_fpr=-0.03
                 fixed acidity<=6.5 s=0.10 --> d_fpr=0.08
                 fixed acidity>6.5 s=0.75 --> d_fpr=-0.04
                         fixed acidity<=7.0 s=0.12 --> d_fpr=-0.01
                         fixed acidity>7.0 s=0.62 --> d_fpr=-0.05
                                 fixed acidity<=7.7 s=0.22 --> d_fpr=-0.08
                                         fixed acidity<=7.3 s=0.11 --> d_fpr=-0.09
                                         fixed acidity>7.3 s=0.12 --> d_fpr=-0.08
                                 fixed acidity>7.7 s=0.40 --> d_fpr=-0.03
                                         fixed acidity<=8.5 s=0.18 --> d_fpr=0.00
                                         fixed acidity>8.5 s=0.21 --> d_fpr=-0.06
                                                 fixed acidity<=9.1 s=0.11 --> d_fpr=-0.11
                                                 fixed acidity>9.1 s=0.11 --> d_fpr=-0.0

# Extract patterns

In [17]:
# Result accumulators filled by the extraction loops below. Both are nested
# dicts keyed first by min_sup_divergence and then by run type
# ('base' or 'generalized').
out_support = {}  # top-ranked pattern per threshold and run type
out_time = {}  # elapsed extraction time per threshold and run type

In [26]:
# Minimum-support thresholds swept by the base and generalized extraction
# loops. All three values are listed because the results recorded in this
# notebook are keyed by 0.075, 0.1 and 0.05, and a later cell reads
# out_support[0.075]; with only [0.05] a fresh Restart & Run All raises a
# KeyError there and cannot reproduce the recorded tables.
min_sup_divergences = [0.075, 0.1, 0.05]

## Base

In [27]:
from utils_extract_divergence_generalized import (
    extract_divergence_generalized,
)
import time

# Base run: patterns are extracted without applying the generalization.
apply_generalization = False
type_gen = 'generalized' if apply_generalization else 'base'

for min_sup_divergence in min_sup_divergences:
    # perf_counter is a monotonic clock intended for measuring intervals, so
    # the elapsed time is not affected by system clock adjustments.
    s_time = time.perf_counter()
    FP_fm_base = extract_divergence_generalized(
        df_analyze,
        discretizations,
        generalization_dict,
        continuous_attributes,
        min_sup_divergence=min_sup_divergence,
        # Driven by the flag above so the flag, the type_gen label and the
        # call cannot drift apart.
        apply_generalization=apply_generalization,
        true_class_name=true_class_name,
        predicted_class_name=pred_class_name,
        class_map=class_map,
        metrics_divergence = [metric],
        FPM_type="fpgrowth",
    )

    # Elapsed seconds for this threshold, stored under the run type.
    out_time.setdefault(min_sup_divergence, {})[type_gen] = time.perf_counter() - s_time
    fp_divergence_i_base = FP_Divergence(FP_fm_base, metric=metric)

    # Rank by the divergence metric, then by the t-value column (both
    # descending), and keep only the top-ranked pattern.
    most_divergent_base = (
        fp_divergence_i_base.getDivergence(th_redundancy=0)
        .sort_values(
            [fp_divergence_i_base.metric, fp_divergence_i_base.t_value_col], ascending=False
        )
        .head(1)
    )
    out_support.setdefault(min_sup_divergence, {})[type_gen] = most_divergent_base

In [20]:
# Show the top base pattern for the first configured threshold. Indexing
# through min_sup_divergences avoids a KeyError on a fresh run when 0.075 is
# not part of the sweep.
out_support[min_sup_divergences[0]]['base']

Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
5,0.195122,(alcohol=>=11.4),10,27,7,268,1,312.0,0.72973,0.540214,7.268591


## Generalized

In [29]:
from utils_extract_divergence_generalized import (
    extract_divergence_generalized,
)
import time

# Generalized run: same extraction as the base run, with the generalization
# applied.
apply_generalization = True
type_gen = 'generalized' if apply_generalization else 'base'

for min_sup_divergence in min_sup_divergences:
    # perf_counter is a monotonic clock intended for measuring intervals, so
    # the elapsed time is not affected by system clock adjustments.
    s_time = time.perf_counter()
    FP_fm = extract_divergence_generalized(
        df_analyze,
        discretizations,
        generalization_dict,
        continuous_attributes,
        min_sup_divergence=min_sup_divergence,
        apply_generalization=apply_generalization,
        true_class_name=true_class_name,
        predicted_class_name=pred_class_name,
        class_map=class_map,
        metrics_divergence = [metric],
        FPM_type="fpgrowth",
        save_in_progress = False
    )

    # Elapsed seconds for this threshold, stored under the run type.
    out_time.setdefault(min_sup_divergence, {})[type_gen] = time.perf_counter() - s_time
    fp_divergence_i = FP_Divergence(FP_fm, metric=metric)

    # Rank by the divergence metric, then by the t-value column (both
    # descending), and keep only the top-ranked pattern.
    most_divergent = (
        fp_divergence_i.getDivergence(th_redundancy=0)
        .sort_values(
            [fp_divergence_i.metric, fp_divergence_i.t_value_col], ascending=False
        )
        .head(1)
    )
    out_support.setdefault(min_sup_divergence, {})[type_gen] = most_divergent

1 10000
2 20000
3 30000
4 40000
5 50000
6 60000
7 70000
8 80000
9 90000
10 100000
11 110000
12 120000
13 130000
14 140000
15 150000
16 160000
17 170000
18 180000
19 190000
20 200000
21 210000
22 220000
23 230000
24 240000
25 250000
26 260000
27 270000
28 280000
29 290000
30 300000
31 310000
32 320000
33 330000
34 340000
35 350000
36 360000
37 370000
38 380000
39 390000
40 400000
41 410000
42 420000
43 430000
44 440000
45 450000
46 460000
47 470000
48 480000
49 490000
50 500000
51 510000
52 520000
53 530000
54 540000
55 550000
56 560000
57 570000
58 580000
59 590000
60 600000
61 610000
62 620000
63 630000
64 640000
65 650000
66 660000
67 670000
68 680000
69 690000
70 700000
71 710000
72 720000
73 730000
74 740000
75 750000
76 760000
77 770000
78 780000
79 790000
80 800000
81 810000
82 820000
83 830000


# Results

In [30]:
# Elapsed extraction time in seconds, keyed by min_sup_divergence and then by
# run type ('base' / 'generalized').
out_time

{0.075: {'base': 0.3269538879394531, 'generalized': 70.16418313980103},
 0.1: {'base': 0.25703001022338867, 'generalized': 33.832151889801025},
 0.05: {'base': 1.0451099872589111, 'generalized': 266.29328632354736}}

In [31]:
# Walk the results threshold by threshold and, within each, run type by run
# type, printing both labels before rendering the stored top pattern.
for min_sup, patterns_by_type in out_support.items():
    print(min_sup)
    for gen_label, top_pattern in patterns_by_type.items():
        print(gen_label)
        display(top_pattern)

0.075
base


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
5,0.195122,(alcohol=>=11.4),10,27,7,268,1,312.0,0.72973,0.540214,7.268591


generalized


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
70484,0.118199,"(alcohol=>=10.2, total sulfur dioxide=<=85.0, ...",0,22,6,161,4,189.0,1.0,0.810484,18.083825


0.1
base


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
5,0.195122,(alcohol=>=11.4),10,27,7,268,1,312.0,0.72973,0.540214,7.268591


generalized


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
70094,0.118199,"(alcohol=>=10.2, total sulfur dioxide=<=85.0, ...",0,22,6,161,4,189.0,1.0,0.810484,18.083825


0.05
base


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
119,0.051907,"(citric acid=>=0.47, alcohol=>=11.4)",0,6,1,76,2,83.0,1.0,0.810484,6.158519


generalized


Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,fpr,d_fpr,t_value_fp
70528,0.118199,"(alcohol=>=10.2, total sulfur dioxide=<=85.0, ...",0,22,6,161,4,189.0,1.0,0.810484,18.083825


In [None]:
# NOTE(review): leftover scratch arithmetic; presumably converts 462 seconds
# to minutes — confirm or remove.
462/60