In [1]:
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
import graphviz
import itertools
#from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from typing import List, Union, Any, Tuple, Dict
import time

import warnings
warnings.filterwarnings('ignore')

# Project path
ppath = sys.path[0] + '/../'
sys.path.append(os.path.join(ppath, 'code'))
from discretizers import *

In [2]:
def add_strategies_to_dict(binning_strategies:Dict, strategies:Dict):
    """
    """
    for key in strategies.keys():
        if key in binning_strategies:
            binning_strategies[key].append(strategies[key])
        else: binning_strategies[key] = [strategies[key]]
    return binning_strategies

In [35]:
df = pd.read_csv(os.path.join(ppath, 'data', 'uciml_pima-indians-diabetes-database', 'diabetes.csv'))
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
# Make binning strategies 
#attrs = ['Glucose', 'BMI', 'Age', 'Pregnancies', 'BloodPressure', 'Insulin', 'DiabetesPedigreeFunction', 'SkinThickness']
start_time = time.time()
attrs = ['Glucose', 'BMI', 'Age']
target = 'Outcome'
binning_strategies = {
    'Glucose': [
        np.array([-1, 140, 200]),
        ],
    'BMI': [
        np.array([-1, 18.5, 25, 30, 68]),
        ],
    'Age': [
        np.array([-1, 18, 35, 50, 65, 100]),
        np.array([-1, 25, 40, 55, 75, 100]),
        ],
}
gpt_bins = [[-1, 140, 200],[-1, 18.5, 25, 30, 68],[-1, 18, 35, 50, 65, 100],[-1, 25, 40, 55, 75, 100],]

for n_bins in range(2, 10):
    df = pd.read_csv(os.path.join(ppath, 'data', 'uciml_pima-indians-diabetes-database', 'diabetes.csv'))
    # Equal width
    intervals = equal_width(df, n_bins, attrs)
    binning_strategies = add_strategies_to_dict(binning_strategies, intervals)
    # Equal frequency
    intervals = equal_frequency(df, n_bins, attrs)
    binning_strategies = add_strategies_to_dict(binning_strategies, intervals)
    # ChiMerge
    intervals = chimerge_wrap(df, attrs, target, n_bins)
    binning_strategies = add_strategies_to_dict(binning_strategies, intervals)
    # KMeans
    intervals = KBinsDiscretizer_wrap(df, attrs, n_bins)
    binning_strategies = add_strategies_to_dict(binning_strategies, intervals)

print("--- %s seconds ---" % (time.time() - start_time))
#print(binning_strategies)

--- 245.2841248512268 seconds ---


In [36]:
val_df = df[df["Age"] > 40]
val_df = val_df.sample(frac=0.4, random_state=1)
df = df.drop(val_df.index)
X_val = val_df.drop(columns=[target])
y_val = val_df[target]

In [27]:
strategy_combos = list(itertools.product(*binning_strategies.values()))
print("number of strategy combinations:", len(strategy_combos))

number of strategy combinations: 16250


In [37]:
start_time = time.time()
tree_depth = 5
cols = attrs
x0 = []
x1 = []
x2 = []
x3 = []
results = []
for strategy in strategy_combos:
    # load data
    df = pd.read_csv(os.path.join(ppath, 'data', 'uciml_pima-indians-diabetes-database', 'diabetes.csv'))
    # bin fixed columns
    df['DiabetesPedigreeFunction.binned'] = pd.cut(df['DiabetesPedigreeFunction'], bins=[-1, 0.5, 1, 100], labels=[0.5, 1, 100])
    df['DiabetesPedigreeFunction.binned'] = df['DiabetesPedigreeFunction.binned'].astype('float64')
    df['Pregnancies.binned'] = pd.cut(df['Pregnancies'], bins=[-1, 2, 100], labels=[0, 1])
    df['Pregnancies.binned'] = df['Pregnancies.binned'].astype('int64')
    df['Insulin.binned'] = pd.cut(df['Insulin'], bins=[-1, 30, 120, 1000], labels=[30, 120, 1000])
    df['Insulin.binned'] = df['Insulin.binned'].astype('int64')
    df['BloodPressure.binned'] = pd.cut(df['BloodPressure'], bins=[-1, 60, 90, 100, 1000], labels=[60, 90, 100, 1000])
    df['BloodPressure.binned'] = df['BloodPressure.binned'].astype('int64')
    df['SkinThickness.binned'] = pd.cut(df['SkinThickness'], bins=[-1, 20, 40, 60, 80, 100], labels=[20, 40, 60, 80, 100])
    df['SkinThickness.binned'] = df['SkinThickness.binned'].astype('int64')
    # bin variable columns
    gpt_bin_count = 0
    small_bin_count = 0
    for i in range(len(cols)):
        col = cols[i]
        bins = strategy[i]
        df[col + '.binned'] = pd.cut(df[col], bins=bins, labels=bins[1:])
        df[col + '.binned'] = df[col + '.binned'].astype('float64')
        try: 
            if list(bins) in gpt_bins: gpt_bin_count += 1
        except: gpt_bin_count += 0
        if len(bins) <= 6: small_bin_count += 1
    # split data
    df = df.dropna()
    X = df[['Glucose.binned', 'Pregnancies.binned', 'Insulin.binned', 'BMI.binned', 'Age.binned',
            'BloodPressure.binned', 'DiabetesPedigreeFunction.binned', 'SkinThickness.binned']]
    y = df['Outcome']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2
    #print(X_train[X_train.isna().any(axis=1)])
    #print(y_train[y_train.isna().any()])
    try: 
        clf = DecisionTreeClassifier(random_state=0,max_depth=tree_depth).fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_val_pred = clf.predict(X_val)
    except: 
        #print("Error:", sys.exc_info()[0])
        #print(X_train[X_train.isna().any(axis=1)])
        print("Strategy:", strategy)
        #break
    accuracy = accuracy_score(y_test, y_pred)
    valid_accuracy = accuracy_score(y_val, y_val_pred)
    #print("Strategy:", strategy)
    #print("Accuracy:", accuracy, "Strategy:", strategy)
    #print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    #print("Classification report:\n", classification_report(y_test, y_pred))
    row = [accuracy, valid_accuracy, strategy[0], strategy[1], strategy[2], gpt_bin_count, small_bin_count]
    results.append(row)

print("Number of results:", len(results))
results = pd.DataFrame(results, columns=['Accuracy', 'Valid','Glucose', 'BMI', 'Age', 'gpt_bin_count', 'small_bin_count'])
print("--- %s seconds ---" % (time.time() - start_time))

Number of results: 16250
--- 284.9900050163269 seconds ---


In [21]:
df = results
df[(df['Accuracy'] > df['Accuracy'].max()-0.02) & df['gpt_bin_count'] > 0]

Unnamed: 0,Accuracy,Valid,Glucose,BMI,Age,gpt_bin_count,small_bin_count
4186,0.801325,0.788079,"[0.0, 99.0, 154.0, 199.0]","[0.0, 25.9, 30.1, 33.7, 37.8, 67.1]","[-1, 18, 35, 50, 65, 100]",1,3


In [22]:
df = results
df[(df['Accuracy'] > df['Accuracy'].max()-0.02)].sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Valid,Glucose,BMI,Age,gpt_bin_count,small_bin_count
12224,0.821192,0.821192,"[0.0, 99.0, 114.0, 127.0, 154.0, 166.0, 199.0]","[0.0, 24.288, 27.3, 29.762, 32.0, 34.138, 36.6...","[0.0, 30.0, 81.0]",0,1
16124,0.821192,0.821192,"[0.0, 99.0, 114.0, 115.0, 125.0, 127.0, 154.0,...","[0.0, 24.288, 27.3, 29.762, 32.0, 34.138, 36.6...","[0.0, 30.0, 81.0]",0,1
14174,0.821192,0.821192,"[0.0, 99.0, 114.0, 115.0, 127.0, 154.0, 166.0,...","[0.0, 24.288, 27.3, 29.762, 32.0, 34.138, 36.6...","[0.0, 30.0, 81.0]",0,1
10274,0.821192,0.821192,"[0.0, 99.0, 127.0, 154.0, 166.0, 199.0]","[0.0, 24.288, 27.3, 29.762, 32.0, 34.138, 36.6...","[0.0, 30.0, 81.0]",0,2
6374,0.821192,0.821192,"[0.0, 99.0, 127.0, 154.0, 199.0]","[0.0, 24.288, 27.3, 29.762, 32.0, 34.138, 36.6...","[0.0, 30.0, 81.0]",0,2
...,...,...,...,...,...,...,...
4490,0.801325,0.774834,"[0.0, 99.0, 154.0, 199.0]","[0.0, 7.456, 22.367, 29.822, 37.278, 44.733, 5...","[0.0, 22.0, 25.0, 27.0, 31.0, 38.0, 46.0, 81.0]",0,1
4492,0.801325,0.774834,"[0.0, 99.0, 154.0, 199.0]","[0.0, 7.456, 22.367, 29.822, 37.278, 44.733, 5...","[0.0, 28.5, 36.0, 43.5, 51.0, 58.5, 66.0, 73.5...",0,1
10014,0.801325,0.801325,"[0.0, 99.0, 127.0, 154.0, 166.0, 199.0]","[0.0, 13.42, 26.84, 40.26, 53.68, 67.1]","[0.0, 30.0, 81.0]",0,3
4518,0.801325,0.807947,"[0.0, 99.0, 154.0, 199.0]","[0.0, 24.0, 26.5, 28.9, 30.9, 32.9, 34.733, 37...","[0.0, 28.5, 36.0, 43.5, 51.0, 58.5, 66.0, 73.5...",0,1


## breaking IID assumption

In [38]:
df = results
df[(df['Accuracy'] > df['Accuracy'].max()-0.02) & df['gpt_bin_count'] > 0].sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Valid,Glucose,BMI,Age,gpt_bin_count,small_bin_count
15623,0.830065,0.512821,"[0.0, 99.0, 114.0, 115.0, 125.0, 127.0, 154.0,...","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 27.667, 34.333, 41.0, 47.667, 54.333, 61...",1,1
14323,0.830065,0.512821,"[0.0, 22.111, 44.222, 66.333, 88.444, 110.556,...","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 27.667, 34.333, 41.0, 47.667, 54.333, 61...",1,1
13673,0.830065,0.512821,"[0.0, 99.0, 114.0, 115.0, 127.0, 154.0, 166.0,...","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 27.667, 34.333, 41.0, 47.667, 54.333, 61...",1,1
11723,0.830065,0.512821,"[0.0, 99.0, 114.0, 127.0, 154.0, 166.0, 199.0]","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 27.667, 34.333, 41.0, 47.667, 54.333, 61...",1,1
9773,0.830065,0.512821,"[0.0, 99.0, 127.0, 154.0, 166.0, 199.0]","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 27.667, 34.333, 41.0, 47.667, 54.333, 61...",1,2
8470,0.830065,0.512821,"[0.0, 33.167, 66.333, 99.5, 132.667, 165.833, ...","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 28.5, 36.0, 43.5, 51.0, 58.5, 66.0, 73.5...",1,1
5862,0.830065,0.512821,"[0.0, 99.0, 127.0, 154.0, 199.0]","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 23.0, 27.0, 33.0, 42.6, 81.0]",1,3
5873,0.830065,0.512821,"[0.0, 99.0, 127.0, 154.0, 199.0]","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 27.667, 34.333, 41.0, 47.667, 54.333, 61...",1,2
7812,0.830065,0.512821,"[0.0, 99.0, 127.0, 154.0, 199.0]","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 23.0, 27.0, 33.0, 42.6, 81.0]",1,3
7823,0.830065,0.512821,"[0.0, 99.0, 127.0, 154.0, 199.0]","[-1.0, 18.5, 25.0, 30.0, 68.0]","[0.0, 27.667, 34.333, 41.0, 47.667, 54.333, 61...",1,2


In [39]:
df = results
df[(df['Accuracy'] > df['Accuracy'].max()-0.02)& df['gpt_bin_count'] == 0].sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Valid,Glucose,BMI,Age,gpt_bin_count,small_bin_count
7351,0.827815,0.512821,"[0.0, 95.0, 109.0, 125.0, 147.0, 199.0]","[0.0, 16.775, 33.55, 50.325, 67.1]","[0.0, 24.0, 30.0, 42.0, 54.0, 62.0, 65.0, 81.0]",0,2
7339,0.827815,0.512821,"[0.0, 95.0, 109.0, 125.0, 147.0, 199.0]","[0.0, 16.775, 33.55, 50.325, 67.1]","[0.0, 24.0, 30.0, 81.0]",0,3
7348,0.827815,0.512821,"[0.0, 95.0, 109.0, 125.0, 147.0, 199.0]","[0.0, 16.775, 33.55, 50.325, 67.1]","[0.0, 24.0, 30.0, 42.0, 54.0, 62.0, 81.0]",0,2
15536,0.827815,0.512821,"[0.0, 87.0, 97.0, 105.0, 112.0, 122.0, 130.0, ...","[0.0, 7.456, 22.367, 29.822, 37.278, 44.733, 5...","[0.0, 31.0, 41.0, 51.0, 61.0, 71.0, 81.0]",0,0
9036,0.827815,0.512821,"[0.0, 33.167, 66.333, 99.5, 132.667, 165.833, ...","[0.0, 7.456, 22.367, 29.822, 37.278, 44.733, 5...","[0.0, 31.0, 41.0, 51.0, 61.0, 71.0, 81.0]",0,0
...,...,...,...,...,...,...,...
1386,0.635762,0.512821,"[0.0, 117.0, 199.0]","[0.0, 26.3, 67.1]","[0.0, 36.0, 51.0, 66.0, 81.0]",0,3
1383,0.635762,0.512821,"[0.0, 117.0, 199.0]","[0.0, 26.3, 67.1]","[0.0, 41.0, 61.0, 81.0]",0,3
1461,0.629139,0.512821,"[0.0, 117.0, 199.0]","[0.0, 26.3, 47.9, 67.1]","[0.0, 41.0, 61.0, 81.0]",0,3
1799,0.629139,0.551282,"[0.0, 117.0, 199.0]","[0.0, 8.388, 25.162, 33.55, 41.938, 50.325, 58...","[0.0, 41.0, 61.0, 81.0]",0,2
