In [5]:
import numpy as np
import pandas as pd

from scipy.stats import wilcoxon

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score

from flipping_random_forest import FlippingRandomForestRegressor

import common_datasets.regression as regr

In [6]:
# Ask common_datasets for the regression data loaders matching the filter
# arguments below, then call each loader once and keep the 'name' entry of
# the dataset it returns.
datasets = regr.get_filtered_data_loaders(
    n_bounds=(1, 2000),
    n_col_bounds=(1, 200),
    n_from_phenotypes=1,
)
names = [load_dataset()['name'] for load_dataset in datasets]

In [7]:
# Show the collected dataset names as the cell output.
names

['airfoil',
 'autoMPG6',
 'baseball',
 'cpu_performance',
 'daily-demand',
 'diabetes',
 'excitation_current',
 'forestfires',
 'laser',
 'mortgage',
 'o-ring',
 'qsar-aquatic-toxicity',
 'real_estate_valuation',
 'residential_building',
 'servo',
 'slump_test',
 'stock_portfolio_performance',
 'treasury',
 'wankara',
 'winequality_red',
 'wizmir',
 'wsn-ale',
 'yacht_hydrodynamics']

In [8]:
# Fetch the summary table and keep only the rows whose 'name' appears in `names`.
summary = regr.get_summary_pdf().pipe(
    lambda pdf: pdf[pdf['name'].isin(names)]
)

In [9]:
def _grid_feature_exceeds_four_uniques(row):
    """Row predicate for the summary table.

    True when the elementwise combination of the row's 'grid' entries with
    "'n_feature_uniques' > 4" has a positive sum, i.e. at least one position
    satisfies both.
    NOTE(review): presumably 'grid' is a per-feature boolean mask -- confirm
    against common_datasets.
    """
    grid_mask = np.array(row['grid'])
    above_four = np.array(row['n_feature_uniques']) > 4
    return np.sum(grid_mask & above_four) > 0


# Keep the summary rows accepted by the predicate and take their loader functions.
row_selector = summary[['grid', 'n_feature_uniques']].apply(_grid_feature_exceeds_four_uniques, axis=1)
tmp = summary[row_selector]
data_loaders = tmp['data_loader_function']

In [None]:
# Remove the forestfires loader from the list of loaders to evaluate.
data_loaders = [
    data_loader
    for data_loader in data_loaders
    if data_loader not in (regr.load_forestfires,)
]

In [10]:
# Repeated k-fold comparison of the plain random forest against the flipping
# variants. For every dataset the fold-level R^2 scores of each model are
# collected, averaged, and the baseline flipping forest is compared with the
# 'full' and 'coordinate' flipping modes by a one-sided Wilcoxon signed-rank
# test (alternative='less': baseline scores lower than the flipping scores).
N_SPLITS = 5
N_REPEATS = 400
SEED = 5

# The forests below are created without a random_state, so scikit-learn draws
# from numpy's global RNG; seeding it makes a fresh "Restart & Run All"
# reproducible.
# NOTE(review): assumes FlippingRandomForestRegressor also falls back to the
# global numpy RNG -- confirm against its implementation.
np.random.seed(SEED)

validator = RepeatedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=SEED)

# Hyper-parameters shared by every model; loop-invariant, so built only once.
forest_params = {'n_jobs': 1,
                 'min_samples_leaf': 1,
                 'max_depth': None}

# Label -> factory producing a fresh, unfitted model. The insertion order is
# the order in which the models are fitted on each fold.
model_factories = {
    'orig': lambda: RandomForestRegressor(**forest_params),
    'baseline': lambda: FlippingRandomForestRegressor(**forest_params),
    'flipping_full': lambda: FlippingRandomForestRegressor(flipping='full', **forest_params),
    'flipping_coord': lambda: FlippingRandomForestRegressor(flipping='coordinate', **forest_params),
}

result_columns = ['name', 'r2_orig', 'r2_flipped', 'r2_baseline', 'r2_baseline_flipped',
                  'r2_flipping_full', 'r2_flipping_coord', 'r2s_orig', 'r2s_flipped', 'r2s_baseline',
                  'r2s_baseline_flipped', 'r2s_flipping_full', 'r2s_flipping_coord', 'p_full', 'p_coord']

printed_columns = ['name', 'r2_orig', 'r2_baseline', 'r2_flipping_full',
                   'r2_flipping_coord', 'p_full', 'p_coord']

results = []

for data_loader in data_loaders:
    dataset = data_loader()
    X = dataset['data']
    y = dataset['target']

    fold_r2s = {label: [] for label in model_factories}

    # Only X is passed: RepeatedKFold does not use `groups`, and the target was
    # previously handed over as `groups` by mistake (some scikit-learn releases
    # may warn when an unused `groups` argument is supplied).
    for train, test in validator.split(X):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        for label, make_model in model_factories.items():
            pred = make_model().fit(X_train, y_train).predict(X_test)
            fold_r2s[label].append(r2_score(y_test, pred))

    # The sign-flipped-input experiments are disabled; their columns are kept
    # as zero placeholders so the layout of results_pdf stays unchanged.
    n_folds = len(fold_r2s['orig'])
    r2s_flipped = [0] * n_folds
    r2s_baseline_flipped = [0] * n_folds

    row = [dataset['name'],
           np.mean(fold_r2s['orig']), np.mean(r2s_flipped), np.mean(fold_r2s['baseline']),
           np.mean(r2s_baseline_flipped), np.mean(fold_r2s['flipping_full']),
           np.mean(fold_r2s['flipping_coord']),
           fold_r2s['orig'], r2s_flipped, fold_r2s['baseline'], r2s_baseline_flipped,
           fold_r2s['flipping_full'], fold_r2s['flipping_coord'],
           wilcoxon(fold_r2s['baseline'], fold_r2s['flipping_full'],
                    alternative='less', zero_method='zsplit').pvalue,
           wilcoxon(fold_r2s['baseline'], fold_r2s['flipping_coord'],
                    alternative='less', zero_method='zsplit').pvalue]

    results.append(row)

    # The frame is rebuilt and printed after every dataset on purpose: the run
    # is long, and this keeps partial results available if it is interrupted.
    results_pdf = pd.DataFrame(results, columns=result_columns)
    results_pdf['r2_baseline_min'] = results_pdf[['r2_baseline', 'r2_baseline_flipped']].min(axis=1)
    print(results_pdf[printed_columns])
        


       name   r2_orig  r2_baseline  r2_flipping_full  r2_flipping_coord  \
0  autoMPG6  0.872045      0.87211          0.871949           0.872022   

     p_full   p_coord  
0  0.921993  0.826741  
       name   r2_orig  r2_baseline  r2_flipping_full  r2_flipping_coord  \
0  autoMPG6  0.872045     0.872110          0.871949           0.872022   
1  baseball  0.667569     0.667316          0.667154           0.667014   

     p_full   p_coord  
0  0.921993  0.826741  
1  0.519006  0.671922  
           name   r2_orig  r2_baseline  r2_flipping_full  r2_flipping_coord  \
0      autoMPG6  0.872045     0.872110          0.871949           0.872022   
1      baseball  0.667569     0.667316          0.667154           0.667014   
2  daily-demand  0.821413     0.819677          0.821828           0.824833   

     p_full       p_coord  
0  0.921993  8.267405e-01  
1  0.519006  6.719218e-01  
2  0.000222  5.824419e-19  
                 name   r2_orig  r2_baseline  r2_flipping_full  \
0       

In [11]:
# Recompute, per dataset, the one-sided Wilcoxon signed-rank p-value of the
# baseline flipping forest against the 'full' flipping mode from the stored
# fold-level R^2 scores. zero_method='zsplit' is passed so that zero
# differences are handled the same way as in the 'p_full' column of results_pdf.
for dataset_name, baseline_scores, full_scores in zip(results_pdf['name'],
                                                      results_pdf['r2s_baseline'],
                                                      results_pdf['r2s_flipping_full']):
    p_less = wilcoxon(baseline_scores, full_scores,
                      alternative='less', zero_method='zsplit').pvalue
    print(dataset_name, p_less)

autoMPG6 0.9219931284209805
baseball 0.5190058169329531
daily-demand 0.00022238871227872112
excitation_current 5.175336400448549e-94
forestfires 1.0
laser 0.001158326496748403
o-ring 1.6403553876303103e-10
qsar-aquatic-toxicity 0.048220485597124316
real_estate_valuation 0.2221659969502986
residential_building 0.7348588915951577
servo 0.35423470169471427
stock_portfolio_performance 0.7823164104440493
wankara 0.013186599975756321
winequality_red 0.7149404994781641
wizmir 0.016919438226062095
yacht_hydrodynamics 0.0318364305895066
