In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.width', 200)

from sklearn.tree import DecisionTreeRegressor

from flipping_random_forest import (MirroredDecisionTreeRegressor, evaluate, grid_point_splits)

from datasets import regr_datasets

2023-02-20 19:42:01 INFO     querying the filtered classification datasets
2023-02-20 19:42:01 INFO     ranking the datasets
2023-02-20 19:42:01 INFO     binary classification datasets prepared
2023-02-20 19:42:01 INFO     querying the filtered regression datasets
2023-02-20 19:42:01 INFO     ranking the datasets
2023-02-20 19:42:01 INFO     regression datasets prepared


In [2]:
results = []
for idx, row in regr_datasets.iterrows():
    dataset = row['data_loader_function']()
    print('processing', dataset['name'])
    X = dataset['data']
    y = dataset['target']
    dtr = DecisionTreeRegressor().fit(X, y)
    result = grid_point_splits(dtr, X, dataset['grid'])
    result['name'] = dataset['name']
    results.append(result)
results = pd.concat(results)

processing o-ring
processing stock_portfolio_performance
processing wsn-ale
processing daily-demand
processing servo
processing yacht_hydrodynamics
processing autoMPG6
processing excitation_current
processing real_estate_valuation
processing wankara
processing plastic
processing laser
processing qsar-aquatic-toxicity
processing baseball
processing maternal_health_risk
processing medical_cost
processing boom_bikes
processing wizmir
processing forestfires
processing winequality_red


In [3]:
results.groupby('name').apply(lambda pdf: pd.Series({'name': pdf.iloc[0]['name'],
                                                     'all_nodes': pdf.iloc[0]['all_nodes'],
                                                     'grid': np.sum(pdf['grid_split'])}))

Unnamed: 0_level_0,name,all_nodes,grid
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
autoMPG6,autoMPG6,322,45
baseball,baseball,318,94
boom_bikes,boom_bikes,720,30
daily-demand,daily-demand,59,3
excitation_current,excitation_current,346,1
forestfires,forestfires,332,30
laser,laser,799,300
maternal_health_risk,maternal_health_risk,166,20
medical_cost,medical_cost,1154,13
o-ring,o-ring,5,0


In [4]:
regr_datasets

Unnamed: 0,name,citation_key,n_col,n,n_grid,grid,data_loader_function
0,o-ring,uci,6,23,3,"[True, False, True, True, True, True]",<function load_o_ring at 0x7f52752876d0>
1,stock_portfolio_performance,uci,6,63,1,"[False, False, False, False, False, True]",<function load_stock_portfolio_performance at ...
2,wsn-ale,uci,5,107,1,"[False, False, True, False, False]",<function load_wsn_ale at 0x7f52752877f0>
3,daily-demand,uci,12,60,2,"[True, True, False, False, False, False, False...",<function load_daily_demand at 0x7f5275287760>
4,servo,uci,10,167,2,"[True, True, True, True, True, True, True, Tru...",<function load_servo at 0x7f5275287880>
5,yacht_hydrodynamics,krnn,6,307,1,"[False, False, False, False, False, True]",<function load_yacht_hydrodynamics at 0x7f5275...
6,autoMPG6,keel,5,392,2,"[False, True, False, False, True]",<function load_autoMPG6 at 0x7f5275286d40>
7,excitation_current,uci,4,557,2,"[True, True, False, False]",<function load_excitation_current at 0x7f52752...
8,real_estate_valuation,uci,6,414,2,"[True, False, False, True, False, False]",<function load_real_estate_valuation at 0x7f52...
9,wankara,keel,9,321,2,"[False, False, False, False, True, True, False...",<function load_wankara at 0x7f5275286e60>


In [5]:
data_loaders = regr_datasets['data_loader_function'].values.tolist()

In [6]:
scenarios=[{'name': 'original',
            'estimator': DecisionTreeRegressor,
            'estimator_params': {'min_samples_leaf': 2, 'max_features': 'sqrt'},
            'multiplier': 1},
            
            {'name': 'mirrored',
            'estimator': MirroredDecisionTreeRegressor,
            'estimator_params': {'min_samples_leaf': 2, 'max_features': 'sqrt'},
            'multiplier': 1},
            
            {'name': 'inverted',
            'estimator': DecisionTreeRegressor,
            'estimator_params': {'min_samples_leaf': 2, 'max_features': 'sqrt'},
            'multiplier': -1}]
            
compare=[('original', 'mirrored'), 
            ('original', 'inverted')]

total = 10_000

validator_params = {'n_repeats': total/5,
                    'n_splits': 5,
                    'random_state': 5}

validator_params_5 = {'n_repeats': total/5,
                    'n_splits': 5,
                    'random_state': 5}

validator_params_4 = {'n_repeats': total/4,
                    'n_splits': 4,
                    'random_state': 5}

validator_params_3 = {'n_repeats': int(total/3),
                    'n_splits': 3,
                    'random_state': 5}

validator_params_2 = {'n_repeats': total/2,
                    'n_splits': 2,
                    'random_state': 5}

In [7]:
results_pdf = evaluate(scenarios=scenarios, 
                        compare=compare, 
                        data_loaders=data_loaders,
                        validator_params=validator_params,
                        score='r2',
                        random_state=5)

ValueError: Number of repetitions must be of Integral type.

In [None]:
results_pdf[['name', 'r2_mean_original', 'r2_mean_mirrored', 'r2_mean_inverted', 'p_original_mirrored', 'p_original_inverted']]

In [None]:
results_pdf.to_csv('existance-regression-dt.csv')