In [1]:
import numpy as np
import pandas as pd

from scipy.stats import wilcoxon

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score

from flipping_random_forest import FlippingRandomForestClassifier

import common_datasets.binary_classification as binclas

In [2]:
# Fetch the filtered data loaders, then call each loader once to read the
# 'name' entry of the dataset it returns.
datasets = binclas.get_filtered_data_loaders(
    n_bounds=(1, 1000),
    n_minority_bounds=(5, 1000),
    n_from_phenotypes=1,
)
names = [
    loader()['name']
    for loader in datasets
]

In [3]:
# Restrict the dataset summary table to the rows whose 'name' was selected above.
summary = (
    binclas.get_summary_pdf()
    .loc[lambda pdf: pdf['name'].isin(names)]
)

In [4]:
def _has_grid_feature_with_many_uniques(row):
    """Tell whether a summary row has a position where 'grid' is set and
    'n_feature_uniques' exceeds 4.

    Both entries are converted to arrays and combined element-wise with `&`;
    the row qualifies when the combined values sum to more than zero.
    NOTE(review): 'grid' is presumably a per-feature boolean mask - confirm.
    """
    grid_flags = np.array(row['grid'])
    more_than_four = np.array(row['n_feature_uniques']) > 4
    return np.sum(grid_flags & more_than_four) > 0


# Keep the qualifying summary rows and take their loader functions.
tmp = summary[summary[['grid', 'n_feature_uniques']].apply(_has_grid_feature_with_many_uniques, axis=1)]
data_loaders = tmp['data_loader_function']

In [5]:
# Drop the two loaders that are excluded from the benchmark.
excluded_loaders = [binclas.load_iris0, binclas.load_dermatology_6]
data_loaders = [loader for loader in data_loaders if loader not in excluded_loaders]

In [6]:
# Benchmark: repeated stratified cross-validation of a plain random forest
# against the FlippingRandomForestClassifier variants on every selected dataset.
# `results` collects one row per dataset; `results_pdf` is rebuilt after each
# dataset so partial results can be inspected while the loop is still running.
results = []

validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=400, random_state=5)

# Column layout of one `results` row: name, six mean AUCs, six per-fold AUC
# lists, and the two Wilcoxon p-values.
RESULT_COLUMNS = ['name', 'auc_orig', 'auc_flipped', 'auc_baseline', 'auc_baseline_flipped', 
                  'auc_flipping_full', 'auc_flipping_coord', 'aucs_orig', 'aucs_flipped', 'aucs_baseline', 
                  'aucs_baseline_flipped', 'aucs_flipping_full', 'aucs_flipping_coord', 'p_full', 'p_coord']

# Columns shown in the per-dataset progress report.
REPORT_COLUMNS = ['name', 'auc_orig', 'auc_baseline', 'auc_flipping_full', 
                  'auc_flipping_coord', 'p_full', 'p_coord']

for data_loader in data_loaders:
    dataset = data_loader()
    X = dataset['data']
    y = dataset['target']
    
    aucs_orig = []
    aucs_flipped = []
    aucs_baseline = []
    aucs_baseline_flipped = []
    aucs_flipping_full = []
    aucs_flipping_coord = []
    
    n_failed_folds = 0
    last_error = None
    
    for train, test in validator.split(X, y):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        
        # Score every model on this fold BEFORE recording anything, so that a
        # failure part-way through cannot leave the per-model lists with
        # different lengths (the Wilcoxon test below needs paired samples).
        try:
            pred = RandomForestClassifier(n_jobs=4, min_samples_leaf=3).fit(X_train, y_train).predict_proba(X_test)[:, 1]
            auc_orig = roc_auc_score(y_test, pred)
            
            pred = FlippingRandomForestClassifier(n_jobs=4, min_samples_leaf=3).fit(X_train, y_train).predict_proba(X_test)[:, 1]
            auc_baseline = roc_auc_score(y_test, pred)
            
            pred = FlippingRandomForestClassifier(flipping='full', n_jobs=4, min_samples_leaf=3).fit(X_train, y_train).predict_proba(X_test)[:, 1]
            auc_flipping_full = roc_auc_score(y_test, pred)
            
            pred = FlippingRandomForestClassifier(flipping='coordinate', n_jobs=4, min_samples_leaf=3).fit(X_train, y_train).predict_proba(X_test)[:, 1]
            auc_flipping_coord = roc_auc_score(y_test, pred)
        except Exception as exc:
            # Best effort: skip a fold that cannot be fitted or scored, but
            # keep count of it. Catching Exception (not a bare except) lets
            # KeyboardInterrupt stop this long-running loop.
            n_failed_folds += 1
            last_error = exc
            continue
        
        aucs_orig.append(auc_orig)
        aucs_baseline.append(auc_baseline)
        aucs_flipping_full.append(auc_flipping_full)
        aucs_flipping_coord.append(auc_flipping_coord)
        # The runs on negated features (fit on -X_train, predict on -X_test)
        # are disabled; 0 is stored as a placeholder to keep the columns.
        aucs_flipped.append(0)
        aucs_baseline_flipped.append(0)
    
    if n_failed_folds:
        print(f"{dataset['name']}: skipped {n_failed_folds} fold(s); last error: {last_error!r}")
    
    fold_aucs = [aucs_orig, aucs_flipped, aucs_baseline, 
                 aucs_baseline_flipped, aucs_flipping_full, aucs_flipping_coord]
    
    # np.mean([]) warns and wilcoxon rejects empty input, so a dataset
    # without a single successful fold is recorded with NaNs instead.
    mean_aucs = [np.mean(aucs) if aucs else np.nan for aucs in fold_aucs]
    
    if aucs_baseline:
        # One-sided paired tests: is the baseline AUC lower than the flipping variant's?
        p_full = wilcoxon(aucs_baseline, aucs_flipping_full, alternative='less', zero_method='zsplit').pvalue
        p_coord = wilcoxon(aucs_baseline, aucs_flipping_coord, alternative='less', zero_method='zsplit').pvalue
    else:
        p_full = p_coord = np.nan
    
    row = [dataset['name']] + mean_aucs + fold_aucs + [p_full, p_coord]
    results.append(row)
    
    results_pdf = pd.DataFrame(results, columns=RESULT_COLUMNS)
    results_pdf['auc_baseline_min'] = results_pdf[['auc_baseline', 'auc_baseline_flipped']].min(axis=1)
    results_pdf['auc_baseline_max'] = results_pdf[['auc_baseline', 'auc_baseline_flipped']].max(axis=1)
    
    # Progress report: all datasets finished so far.
    print(results_pdf[REPORT_COLUMNS])
        


          name  auc_orig  auc_baseline  auc_flipping_full  auc_flipping_coord  \
0  abalone9_18  0.848094      0.845158           0.845037             0.84487   

     p_full   p_coord  
0  0.459549  0.619868  
          name  auc_orig  auc_baseline  auc_flipping_full  auc_flipping_coord  \
0  abalone9_18  0.848094      0.845158           0.845037            0.844870   
1   australian  0.930104      0.929635           0.929667            0.929722   

     p_full   p_coord  
0  0.459549  0.619868  
1  0.450165  0.221037  




          name  auc_orig  auc_baseline  auc_flipping_full  auc_flipping_coord  \
0  abalone9_18  0.848094      0.845158           0.845037            0.844870   
1   australian  0.930104      0.929635           0.929667            0.929722   
2         bupa  0.770977      0.770142           0.768960            0.769385   

     p_full   p_coord  
0  0.459549  0.619868  
1  0.450165  0.221037  
2  0.997476  0.985387  
               name  auc_orig  auc_baseline  auc_flipping_full  \
0       abalone9_18  0.848094      0.845158           0.845037   
1        australian  0.930104      0.929635           0.929667   
2              bupa  0.770977      0.770142           0.768960   
3  cleveland-0_vs_4  0.974947      0.974863           0.975280   

   auc_flipping_coord    p_full   p_coord  
0            0.844870  0.459549  0.619868  
1            0.929722  0.450165  0.221037  
2            0.769385  0.997476  0.985387  
3            0.975083  0.018174  0.031904  
               name  auc_ori