In [9]:
import numpy as np
import pandas as pd

import tqdm

from scipy.stats import wilcoxon

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score

from flipping_random_forest import FlippingRandomForestRegressor

import common_datasets.regression as regr

In [10]:
# Fetch the regression data loaders that satisfy the given filter bounds, then
# call each loader once to read the 'name' entry of the dataset it returns.
datasets = regr.get_filtered_data_loaders(
    n_bounds=(1, 3000),
    n_col_bounds=(1, 200),
    n_from_phenotypes=1,
)
names = [load()['name'] for load in datasets]

In [11]:
# Display the names of the datasets returned by the filtered loaders.
names

['airfoil',
 'autoMPG6',
 'baseball',
 'boom_bikes',
 'concrete',
 'cpu_performance',
 'daily-demand',
 'diabetes',
 'excitation_current',
 'forestfires',
 'laser',
 'maternal_health_risk',
 'medical_cost',
 'mortgage',
 'o-ring',
 'plastic',
 'qsar-aquatic-toxicity',
 'real_estate_valuation',
 'residential_building',
 'servo',
 'slump_test',
 'stock_portfolio_performance',
 'treasury',
 'wankara',
 'winequality_red',
 'wizmir',
 'wsn-ale',
 'yacht_hydrodynamics']

In [12]:
# Load the dataset summary table and keep only the rows whose 'name' is one of
# the dataset names collected in `names`.
summary = regr.get_summary_pdf()
name_is_selected = summary['name'].isin(names)
summary = summary[name_is_selected]

In [13]:
def _has_flagged_multivalued_feature(row):
    """Return True when at least one feature has its 'grid' flag set and its
    'n_feature_uniques' entry greater than 2."""
    grid_flags = np.array(row['grid'])
    more_than_two_uniques = np.array(row['n_feature_uniques']) > 2
    return np.sum(grid_flags & more_than_two_uniques) > 0


# Keep the summary rows that pass the per-feature check above and take their
# loader functions for the experiment.
row_mask = summary[['grid', 'n_feature_uniques']].apply(_has_flagged_multivalued_feature, axis=1)
tmp = summary[row_mask]
data_loaders = tmp['data_loader_function']

In [14]:
# Drop the forestfires loader from the experiment; the result is a plain list.
excluded_loaders = [regr.load_forestfires]
data_loaders = [loader for loader in data_loaders if loader not in excluded_loaders]

In [15]:
# Number of data loaders remaining after the forestfires loader was removed.
len(data_loaders)

20

In [16]:
# Display the loader functions that the benchmark loop will iterate over.
data_loaders

[<function common_datasets.regression._regression.load_autoMPG6()>,
 <function common_datasets.regression._regression.load_baseball()>,
 <function common_datasets.regression._regression.load_boom_bikes()>,
 <function common_datasets.regression._regression.load_daily_demand()>,
 <function common_datasets.regression._regression.load_excitation_current()>,
 <function common_datasets.regression._regression.load_laser()>,
 <function common_datasets.regression._regression.load_maternal_health_risk()>,
 <function common_datasets.regression._regression.load_medical_cost()>,
 <function common_datasets.regression._regression.load_o_ring()>,
 <function common_datasets.regression._regression.load_plastic()>,
 <function common_datasets.regression._regression.load_qsar_aquatic_toxicity()>,
 <function common_datasets.regression._regression.load_real_estate_valuation()>,
 <function common_datasets.regression._regression.load_residential_building()>,
 <function common_datasets.regression._regression.lo

In [17]:
# Benchmark: for every selected dataset, run repeated k-fold cross-validation
# and compare the R2 score of scikit-learn's RandomForestRegressor with
# FlippingRandomForestRegressor in its default, 'full' and 'coordinate'
# flipping modes. One row per dataset is collected in `results`, and the
# cumulative table is printed after each dataset as a progress report.
results = []

validator = RepeatedKFold(n_splits=5, n_repeats=400, random_state=5)

# validator.split() is a generator without a length, so the fold count is given
# to tqdm explicitly; otherwise the bar cannot show a percentage or an ETA.
n_folds = validator.get_n_splits()

# Hyper-parameters shared by every model. They are fixed (not randomised per
# fold), so they are built once instead of once per fold.
# NOTE(review): no random_state is passed to the forests, so only the fold
# assignment is seeded; the scores themselves vary between runs.
min_samples_leaf = 1
max_depth = None
params = {'n_jobs': 1,
          'min_samples_leaf': min_samples_leaf,
          'max_depth': max_depth}

result_columns = ['name', 'r2_orig', 'r2_flipped', 'r2_baseline', 'r2_baseline_flipped',
                  'r2_flipping_full', 'r2_flipping_coord', 'r2s_orig', 'r2s_flipped', 'r2s_baseline',
                  'r2s_baseline_flipped', 'r2s_flipping_full', 'r2s_flipping_coord', 'p_full', 'p_coord']

report_columns = ['name', 'r2_orig', 'r2_baseline',
                  'r2_flipping_full', 'r2_flipping_coord', 'p_full', 'p_coord']

for data_loader in data_loaders:
    dataset = data_loader()
    X = dataset['data']
    y = dataset['target']

    # Per-fold R2 scores for each evaluated configuration.
    r2s_orig = []
    r2s_flipped = []
    r2s_baseline = []
    r2s_baseline_flipped = []
    r2s_flipping_full = []
    r2s_flipping_coord = []

    for train, test in tqdm.tqdm(validator.split(X, y), total=n_folds):
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]

        pred = RandomForestRegressor(**params).fit(X_train, y_train).predict(X_test)
        r2s_orig.append(r2_score(y_test, pred))

        # The evaluation on sign-flipped features (-X) is disabled; a 0 is
        # stored as a placeholder so the result columns keep their layout.
        r2s_flipped.append(0)

        pred = FlippingRandomForestRegressor(**params).fit(X_train, y_train).predict(X_test)
        r2s_baseline.append(r2_score(y_test, pred))

        # Same placeholder for the disabled flipped-feature baseline run.
        r2s_baseline_flipped.append(0)

        pred = FlippingRandomForestRegressor(flipping='full', **params).fit(X_train, y_train).predict(X_test)
        r2s_flipping_full.append(r2_score(y_test, pred))

        pred = FlippingRandomForestRegressor(flipping='coordinate', **params).fit(X_train, y_train).predict(X_test)
        r2s_flipping_coord.append(r2_score(y_test, pred))

    # One-sided paired Wilcoxon tests on the per-fold scores: alternative='less'
    # tests whether the baseline scores are lower than the flipping variant's,
    # so a small p-value favours the flipping variant.
    p_full = wilcoxon(r2s_baseline, r2s_flipping_full, alternative='less', zero_method='zsplit').pvalue
    p_coord = wilcoxon(r2s_baseline, r2s_flipping_coord, alternative='less', zero_method='zsplit').pvalue

    record = [dataset['name'], np.mean(r2s_orig), np.mean(r2s_flipped), np.mean(r2s_baseline),
              np.mean(r2s_baseline_flipped), np.mean(r2s_flipping_full), np.mean(r2s_flipping_coord),
              r2s_orig, r2s_flipped, r2s_baseline, r2s_baseline_flipped,
              r2s_flipping_full, r2s_flipping_coord,
              p_full, p_coord]

    results.append(record)

    # Rebuild the table after every dataset so partial results are available
    # (and printed) while the long run is still in progress.
    results_pdf = pd.DataFrame(results, columns=result_columns)
    # While 'r2_baseline_flipped' holds the 0 placeholder, this column is
    # min(r2_baseline, 0) and is not meaningful; it is kept for compatibility.
    results_pdf['r2_baseline_min'] = results_pdf[['r2_baseline', 'r2_baseline_flipped']].apply(lambda x: min(x), axis=1)
    print(results_pdf[report_columns])
        


       name   r2_orig  r2_baseline  r2_flipping_full  r2_flipping_coord  \
0  autoMPG6  0.871983     0.872023          0.871988           0.871926   

     p_full   p_coord  
0  0.437016  0.602573  
       name   r2_orig  r2_baseline  r2_flipping_full  r2_flipping_coord  \
0  autoMPG6  0.871983     0.872023          0.871988           0.871926   
1  baseball  0.667693     0.668028          0.667533           0.667378   

     p_full   p_coord  
0  0.437016  0.602573  
1  0.851640  0.875596  
         name   r2_orig  r2_baseline  r2_flipping_full  r2_flipping_coord  \
0    autoMPG6  0.871983     0.872023          0.871988           0.871926   
1    baseball  0.667693     0.668028          0.667533           0.667378   
2  boom_bikes  0.996079     0.996083          0.996094           0.996088   

     p_full   p_coord  
0  0.437016  0.602573  
1  0.851640  0.875596  
2  0.296349  0.339869  
           name   r2_orig  r2_baseline  r2_flipping_full  r2_flipping_coord  \
0      autoMPG6  0.

In [19]:
# Persist the full results table (including the per-fold score lists) as CSV.
results_csv_path = "regression.csv"
results_pdf.to_csv(results_csv_path)

In [18]:
# Recompute, per dataset, the one-sided paired Wilcoxon p-value comparing the
# baseline per-fold R2 scores against the 'full' flipping scores.
# alternative='less' means a small p-value favours the 'full' flipping variant.
# zero_method='zsplit' matches the setting used when 'p_full' was computed in
# the benchmark loop, so both reported p-values treat tied folds the same way.
for _, row in results_pdf.iterrows():
    p_less = wilcoxon(row['r2s_baseline'], row['r2s_flipping_full'],
                      alternative='less', zero_method='zsplit').pvalue
    print(row['name'], p_less)

autoMPG6 0.4370159603303962
baseball 0.8516401975571873
boom_bikes 0.2963492991586663
daily-demand 3.1008291500015614e-07
excitation_current 7.421519860869094e-74
laser 4.843652515775188e-06
maternal_health_risk 1.3871539003992711e-05
medical_cost 0.17062460357266285
o-ring 3.7894482814760103e-06
plastic 0.9999999999999997
qsar-aquatic-toxicity 0.24432357900535534
real_estate_valuation 0.9842337533827089
residential_building 0.6772196197117168
servo 0.23279670996462598
stock_portfolio_performance 0.8930726046348134
wankara 0.00154799123770635
winequality_red 0.774976240056588
wizmir 0.002873880603261689
wsn-ale 0.0006193296246886752
yacht_hydrodynamics 0.004222205502664591
