# Extracting the statistics for the number of conditions with thresholds on splits

In [33]:
import os

import datetime
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold

from flipping_random_forest import count_lattice_splits

from datasets import regr_datasets, binclas_datasets
from config import data_dir, random_seed, n_repeats_ms, n_splits_ms

In [34]:
labels = ['dtc', 'dtr', 'rfc', 'rfr']
params = {}

In [35]:
for label in labels:
    params[label] = {}
    best_params = pd.read_csv(os.path.join(data_dir, f'params_{label}.csv'))
    for idx, row in best_params.iterrows():
        params[label][row['name']] = eval(row['params'])

In [36]:
counts = {}

for idx, row in binclas_datasets.iterrows():
    dataset = row['data_loader_function']()

    param = params['dtc'][dataset['name']]

    X = dataset['data']
    y = dataset['target']

    estimator = DecisionTreeClassifier(**(param | {'random_state': random_seed})).fit(X, y)
    n_lattice_splits, n_splits = count_lattice_splits(X, estimator)
    counts[dataset['name']] = {'n_lattice_splits': n_lattice_splits, 'n_splits': n_splits, 'n_lattice_splits_kfold': 0, 'n_splits_kfold': 0}

    for idx, (train, test) in enumerate(RepeatedStratifiedKFold(n_splits=n_splits_ms, n_repeats=n_repeats_ms).split(X, y, y)):
        X_train = X[train]
        y_train = y[train]

        estimator = DecisionTreeClassifier(**(param | {'random_state': random_seed})).fit(X_train, y_train)

        for tree in [estimator]:
            n_lattice_splits, n_splits = count_lattice_splits(X_train, tree)
            counts[dataset['name']]['n_lattice_splits_kfold'] += n_lattice_splits
            counts[dataset['name']]['n_splits_kfold'] += n_splits

pd.DataFrame.from_dict(counts).T.to_csv(os.path.join(data_dir, 'splits_dtc.csv'))

In [37]:
counts = {}

for idx, row in regr_datasets.iterrows():
    dataset = row['data_loader_function']()

    param = params['dtr'][dataset['name']]

    X = dataset['data']
    y = dataset['target']

    estimator = DecisionTreeRegressor(**(param | {'random_state': random_seed})).fit(X, y)
    n_lattice_splits, n_splits = count_lattice_splits(X, estimator)
    counts[dataset['name']] = {'n_lattice_splits': n_lattice_splits, 'n_splits': n_splits, 'n_lattice_splits_kfold': 0, 'n_splits_kfold': 0}

    for idx, (train, test) in enumerate(RepeatedKFold(n_splits=n_splits_ms, n_repeats=n_repeats_ms).split(X, y)):
        X_train = X[train]
        y_train = y[train]

        estimator = DecisionTreeRegressor(**(param | {'random_state': random_seed})).fit(X_train, y_train)

        for tree in [estimator]:
            n_lattice_splits, n_splits = count_lattice_splits(X_train, tree)
            counts[dataset['name']]['n_lattice_splits_kfold'] += n_lattice_splits
            counts[dataset['name']]['n_splits_kfold'] += n_splits

pd.DataFrame.from_dict(counts).T.to_csv(os.path.join(data_dir, 'splits_dtr.csv'))

In [38]:
counts = {}

for idx, row in binclas_datasets.iterrows():
    dataset = row['data_loader_function']()

    print(datetime.datetime.now(), dataset['name'])

    param = params['rfc'][dataset['name']]

    X = dataset['data']
    y = dataset['target']

    estimator = RandomForestClassifier(**(param | {'random_state': random_seed})).fit(X, y)
    counts[dataset['name']] = {'n_lattice_splits': 0, 'n_splits': 0, 'n_lattice_splits_kfold': 0, 'n_splits_kfold': 0}

    for tree in estimator.estimators_:
        n_lattice_splits, n_splits = count_lattice_splits(X, tree)
        counts[dataset['name']]['n_lattice_splits'] += n_lattice_splits
        counts[dataset['name']]['n_splits'] += n_splits

    for idx, (train, test) in enumerate(RepeatedStratifiedKFold(n_splits=n_splits_ms, n_repeats=n_repeats_ms).split(X, y, y)):
        X_train = X[train]
        y_train = y[train]

        estimator = RandomForestClassifier(**(param | {'random_state': random_seed})).fit(X_train, y_train)

        for tree in estimator.estimators_:
            n_lattice_splits, n_splits = count_lattice_splits(X_train, tree)
            counts[dataset['name']]['n_lattice_splits_kfold'] += n_lattice_splits
            counts[dataset['name']]['n_splits_kfold'] += n_splits

pd.DataFrame.from_dict(counts).T.to_csv(os.path.join(data_dir, 'splits_rfc.csv'))

2023-11-26 14:12:04.456177 appendicitis
2023-11-26 14:12:23.580414 haberman
2023-11-26 14:12:38.360011 new_thyroid1
2023-11-26 14:12:54.194640 glass0
2023-11-26 14:13:25.403137 shuttle-6_vs_2-3
2023-11-26 14:13:44.336407 bupa
2023-11-26 14:14:15.489204 cleveland-0_vs_4
2023-11-26 14:14:35.866355 ecoli1
2023-11-26 14:14:56.758234 poker-9_vs_7
2023-11-26 14:15:15.662181 monk-2
2023-11-26 14:15:35.496493 hepatitis
2023-11-26 14:16:01.054524 yeast-0-3-5-9_vs_7-8
2023-11-26 14:16:38.488780 mammographic
2023-11-26 14:16:56.698137 saheart
2023-11-26 14:17:20.004246 page-blocks-1-3_vs_4
2023-11-26 14:17:45.503898 lymphography-normal-fibrosis
2023-11-26 14:18:14.038528 pima
2023-11-26 14:18:45.411852 wisconsin
2023-11-26 14:19:10.633983 abalone9_18
2023-11-26 14:19:41.918514 winequality-red-3_vs_5


In [39]:
counts = {}

for idx, row in regr_datasets.iterrows():
    dataset = row['data_loader_function']()

    print(datetime.datetime.now(), dataset['name'])

    param = params['rfr'][dataset['name']]

    X = dataset['data']
    y = dataset['target']

    estimator = RandomForestRegressor(**(param | {'random_state': random_seed})).fit(X, y)
    counts[dataset['name']] = {'n_lattice_splits': 0, 'n_splits': 0, 'n_lattice_splits_kfold': 0, 'n_splits_kfold': 0}

    for tree in estimator.estimators_:
        n_lattice_splits, n_splits = count_lattice_splits(X, tree)
        counts[dataset['name']]['n_lattice_splits'] += n_lattice_splits
        counts[dataset['name']]['n_splits'] += n_splits

    for idx, (train, test) in enumerate(RepeatedKFold(n_splits=n_splits_ms, n_repeats=n_repeats_ms).split(X, y)):
        X_train = X[train]
        y_train = y[train]

        estimator = RandomForestRegressor(**(param | {'random_state': random_seed})).fit(X_train, y_train)

        for tree in estimator.estimators_:
            n_lattice_splits, n_splits = count_lattice_splits(X_train, tree)
            counts[dataset['name']]['n_lattice_splits_kfold'] += n_lattice_splits
            counts[dataset['name']]['n_splits_kfold'] += n_splits

pd.DataFrame.from_dict(counts).T.to_csv(os.path.join(data_dir, 'splits_rfr.csv'))

2023-11-26 14:20:04.626244 diabetes
2023-11-26 14:20:16.561708 o-ring
2023-11-26 14:20:28.705680 stock_portfolio_performance
2023-11-26 14:20:48.079229 wsn-ale
2023-11-26 14:21:24.323832 daily-demand
2023-11-26 14:23:14.050039 slump_test
2023-11-26 14:23:52.598030 servo
2023-11-26 14:24:10.088462 yacht_hydrodynamics
2023-11-26 14:25:18.901111 autoMPG6
2023-11-26 14:26:44.870620 excitation_current
2023-11-26 14:28:35.347290 real_estate_valuation
2023-11-26 14:30:08.252712 wankara
2023-11-26 14:31:31.643037 plastic
2023-11-26 14:32:15.914531 laser
2023-11-26 14:36:40.947093 qsar-aquatic-toxicity
2023-11-26 14:39:07.633942 baseball
2023-11-26 14:39:45.284310 maternal_health_risk
2023-11-26 14:41:05.388827 cpu_performance
2023-11-26 14:42:11.340478 airfoil
2023-11-26 14:48:18.558259 medical_cost


In [43]:
data = pd.read_csv('splits_rfc.csv')
data['perc'] = data['n_lattice_splits'] / data['n_splits']
data['perc_kfold'] = data['n_lattice_splits_kfold'] / data['n_splits_kfold']
data

Unnamed: 0.1,Unnamed: 0,n_lattice_splits,n_splits,n_lattice_splits_kfold,n_splits_kfold,perc,perc_kfold
0,appendicitis,3,133,470,15026,0.022556,0.031279
1,haberman,28,711,3564,68539,0.039381,0.052
2,new_thyroid1,41,570,4733,52046,0.07193,0.090939
3,glass0,118,2401,9193,202552,0.049146,0.045386
4,shuttle-6_vs_2-3,46,339,4144,31453,0.135693,0.131752
5,bupa,1095,4397,86851,357207,0.249033,0.243139
6,cleveland-0_vs_4,41,432,4282,40029,0.094907,0.106972
7,ecoli1,150,1273,14846,120900,0.117832,0.122796
8,poker-9_vs_7,101,709,9738,59477,0.142454,0.163727
9,monk-2,1,1814,496,190179,0.000551,0.002608
