In [7]:
import numpy as np
import pandas as pd

from scipy.stats import wilcoxon

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score

from flipping_random_forest import FlippingRandomForestRegressor

import common_datasets.regression as regr

In [8]:
datasets = regr.get_filtered_data_loaders(n_bounds=(1, 2000), n_from_phenotypes=1)
names = [dataset()['name'] for dataset in datasets]

In [None]:
names

['airfoil',
 'autoMPG6',
 'baseball',
 'communities',
 'cpu_performance',
 'daily-demand',
 'diabetes',
 'excitation_current',
 'forestfires',
 'laser',
 'mortgage',
 'o-ring',
 'qsar-aquatic-toxicity',
 'real_estate_valuation',
 'residential_building',
 'servo',
 'slump_test',
 'stock_portfolio_performance',
 'treasury',
 'wankara',
 'winequality_red',
 'wizmir',
 'wsn-ale',
 'yacht_hydrodynamics']

In [None]:
summary = regr.get_summary_pdf()
summary = summary[summary['name'].isin(names)]

In [None]:
tmp = summary[summary[['grid', 'n_feature_uniques']].apply(lambda x: np.sum(np.array(x['grid']) & (np.array(x['n_feature_uniques']) > 4)) > 0, axis=1)]
data_loaders = tmp['data_loader_function']

In [None]:
results = []

validator = RepeatedKFold(n_splits=5, n_repeats=400, random_state=5)

for data_loader in data_loaders:
    dataset = data_loader()
    X = dataset['data']
    y = dataset['target']
    
    r2s_orig = []
    r2s_flipped = []
    r2s_baseline = []
    r2s_baseline_flipped = []
    r2s_flipping_full = []
    r2s_flipping_coord = []
    
    for train, test in validator.split(X, y, y):
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        
        min_samples_leaf = 1#np.random.randint(1, 21)
        max_depth = None#np.random.choice([None, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        
        params = {'n_jobs': 1,
                    'min_samples_leaf': min_samples_leaf,
                    'max_depth': max_depth}
        
        try:
            pred = RandomForestRegressor().fit(X_train, y_train).predict(X_test)
            r2s_orig.append(r2_score(y_test, pred))
            
            #pred = RandomForestRegressor().fit(-X_train, y_train).predict(-X_test)
            #r2s_flipped.append(r2_score(y_test, pred))
            r2s_flipped.append(0)
            
            pred = FlippingRandomForestRegressor().fit(X_train, y_train).predict(X_test)
            r2s_baseline.append(r2_score(y_test, pred))
            
            #pred = FlippingRandomForestRegressor().fit(-X_train, y_train).predict(-X_test)
            #r2s_baseline_flipped.append(r2_score(y_test, pred))
            r2s_baseline_flipped.append(0)
            
            pred = FlippingRandomForestRegressor(flipping='full').fit(X_train, y_train).predict(X_test)
            r2s_flipping_full.append(r2_score(y_test, pred))
            
            pred = FlippingRandomForestRegressor(flipping='coordinate').fit(X_train, y_train).predict(X_test)
            r2s_flipping_coord.append(r2_score(y_test, pred))
        except:
            pass
    
    tmp = [dataset['name'], np.mean(r2s_orig), np.mean(r2s_flipped), np.mean(r2s_baseline), 
                np.mean(r2s_baseline_flipped), np.mean(r2s_flipping_full), np.mean(r2s_flipping_coord), 
                r2s_orig, r2s_flipped, r2s_baseline, r2s_baseline_flipped, 
                r2s_flipping_full, r2s_flipping_coord]
    
    tmp = tmp + [wilcoxon(r2s_baseline, r2s_flipping_full, alternative='less', zero_method='zsplit').pvalue,
                wilcoxon(r2s_baseline, r2s_flipping_coord, alternative='less', zero_method='zsplit').pvalue]
    
    results.append(tmp)
    
    results_pdf = pd.DataFrame(results, columns=['name', 'r2_orig', 'r2_flipped', 'r2_baseline', 'r2_baseline_flipped', 
                                                'r2_flipping_full', 'r2_flipping_coord', 'r2s_orig', 'r2s_flipped', 'r2s_baseline', 
                                                'r2s_baseline_flipped', 'r2s_flipping_full', 'r2s_flipping_coord', 'p_full', 'p_coord'])
    results_pdf['r2_baseline_min'] = results_pdf[['r2_baseline', 'r2_baseline_flipped']].apply(lambda x: min(x), axis=1)
    print(results_pdf[['name', 
                        'r2_orig', 
                        #'r2_flipped', 
                        'r2_baseline', 
                        #'r2_baseline_flipped', 
                        #'r2_baseline_min', 
                        'r2_flipping_full', 
                        'r2_flipping_coord', 'p_full', 'p_coord']])
        


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: 13 columns passed, passed data had 15 columns

In [None]:
for idx, row in results_pdf.iterrows():
    #w0 = wilcoxon(row['r2s_baseline'], row['r2s_flipping_full'], alternative='greater').pvalue
    w1 = wilcoxon(row['r2s_baseline'], row['r2s_flipping_full'], alternative='less').pvalue
    print(row['name'], w1)

autoMPG6 0.36573577105115274
baseball 0.9846360945926227
communities 0.8084461211911906
daily-demand 0.5423364789775422
excitation_current 0.7254244057727449
forestfires 0.9911522021352104
laser 0.015049251726346182
o-ring 0.9584287471670327
qsar-aquatic-toxicity 0.08702948300325603
real_estate_valuation 0.6034729814662072
residential_building 0.05465414685507099
servo 0.3460903579450517
stock_portfolio_performance 0.5239525012421037
wankara 0.8114957698811465
winequality_red 0.747578261841179
wizmir 0.10448324479476356
yacht_hydrodynamics 0.5462777268096652
