In [1]:
import numpy as np
import pandas as pd

import tqdm

from scipy.stats import wilcoxon

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score

from flipping_random_forest import FlippingRandomForestClassifier

import common_datasets.binary_classification as binclas

In [2]:
# Ask the dataset package for loader callables, restricted by the size bounds below.
datasets = binclas.get_filtered_data_loaders(
    n_bounds=(1, 1000),
    n_minority_bounds=(5, 1000),
    n_from_phenotypes=1,
)

# Each loader is called once here just to read the dataset's 'name' entry.
names = []
for loader in datasets:
    names.append(loader()['name'])

In [3]:
# Summary table of all datasets, reduced to the rows whose 'name' was selected
# in `names`.
full_summary = binclas.get_summary_pdf()
is_selected = full_summary['name'].isin(names)
summary = full_summary[is_selected]

In [4]:
def _has_grid_feature_with_many_values(row):
    """Return True when at least one feature is flagged in `grid` and has more than 4 unique values.

    `row` is one row of the summary table restricted to the 'grid' and
    'n_feature_uniques' columns; both entries are treated as per-feature
    sequences of equal length (assumption — confirm against the summary schema).
    """
    grid_flags = np.array(row['grid'])
    more_than_four_uniques = np.array(row['n_feature_uniques']) > 4
    return np.sum(grid_flags & more_than_four_uniques) > 0


# Keep only the datasets that pass the per-row feature check above.
grid_mask = summary[['grid', 'n_feature_uniques']].apply(_has_grid_feature_with_many_values, axis=1)
tmp = summary[grid_mask]
data_loaders = tmp['data_loader_function']

In [5]:
# Loaders explicitly left out of the benchmark (reason not recorded here — TODO document).
excluded_loaders = [binclas.load_iris0, binclas.load_dermatology_6]
data_loaders = [loader for loader in data_loaders if loader not in excluded_loaders]

In [6]:
# Benchmark cell: for every dataset in `data_loaders`, compare a plain
# RandomForestClassifier with FlippingRandomForestClassifier (default, 'full'
# and 'coordinate' flipping) by AUC over repeated stratified k-fold CV, then
# test the default flipping forest against the two flipping modes with a
# paired one-sided Wilcoxon signed-rank test.
#
# NOTE(review): the forests are created without a random_state, so the AUCs
# depend on the global NumPy RNG state and are not reproducible run-to-run.
results = []

validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=400, random_state=5)

# Hyperparameters shared by all models; they do not vary per fold, so they are
# built once instead of inside the inner loop.
min_samples_leaf = 1
max_depth = None
params = {'n_jobs': 1,
          'min_samples_leaf': min_samples_leaf,
          'max_depth': max_depth}

mean_columns = ['auc_orig', 'auc_flipped', 'auc_baseline', 'auc_baseline_flipped',
                'auc_flipping_full', 'auc_flipping_coord']
fold_columns = ['aucs_orig', 'aucs_flipped', 'aucs_baseline', 'aucs_baseline_flipped',
                'aucs_flipping_full', 'aucs_flipping_coord']
result_columns = ['name'] + mean_columns + fold_columns + ['p_full', 'p_coord']
report_columns = ['name', 'auc_orig', 'auc_baseline', 'auc_flipping_full',
                  'auc_flipping_coord', 'p_full', 'p_coord']

for data_loader in data_loaders:
    dataset = data_loader()
    X = dataset['data']
    y = dataset['target']

    aucs_orig = []
    aucs_flipped = []
    aucs_baseline = []
    aucs_baseline_flipped = []
    aucs_flipping_full = []
    aucs_flipping_coord = []

    n_failed = 0
    last_error = None

    # `groups` is not passed: the stratified splitter ignores it.
    folds = tqdm.tqdm(validator.split(X, y),
                      total=validator.get_n_splits(),
                      desc=dataset['name'])

    for train, test in folds:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        try:
            pred = RandomForestClassifier(**params).fit(X_train, y_train).predict_proba(X_test)[:, 1]
            fold_auc_orig = roc_auc_score(y_test, pred)

            pred = FlippingRandomForestClassifier(**params).fit(X_train, y_train).predict_proba(X_test)[:, 1]
            fold_auc_baseline = roc_auc_score(y_test, pred)

            pred = FlippingRandomForestClassifier(flipping='full', **params).fit(X_train, y_train).predict_proba(X_test)[:, 1]
            fold_auc_full = roc_auc_score(y_test, pred)

            pred = FlippingRandomForestClassifier(flipping='coordinate', **params).fit(X_train, y_train).predict_proba(X_test)[:, 1]
            fold_auc_coord = roc_auc_score(y_test, pred)
        except Exception as exc:
            # Best effort: a fold that cannot be scored is skipped, but it is
            # counted and reported instead of being silently dropped.
            # KeyboardInterrupt is no longer swallowed.
            n_failed += 1
            last_error = exc
            continue

        # Record the fold only once every model has been scored, so the lists
        # stay index-aligned for the paired Wilcoxon test below.
        aucs_orig.append(fold_auc_orig)
        aucs_baseline.append(fold_auc_baseline)
        aucs_flipping_full.append(fold_auc_full)
        aucs_flipping_coord.append(fold_auc_coord)
        # The negated-feature (-X) variants are currently disabled; zeros are
        # stored as placeholders so the result columns keep their layout.
        aucs_flipped.append(0)
        aucs_baseline_flipped.append(0)

    if n_failed:
        print('{}: skipped {} fold(s); last error: {!r}'.format(dataset['name'], n_failed, last_error))

    fold_aucs = [aucs_orig, aucs_flipped, aucs_baseline, aucs_baseline_flipped,
                 aucs_flipping_full, aucs_flipping_coord]
    fold_means = [np.mean(aucs) if aucs else np.nan for aucs in fold_aucs]

    if aucs_baseline:
        # alternative='less': a small p-value indicates the baseline AUCs tend
        # to be lower than those of the flipping variant.
        p_full = wilcoxon(aucs_baseline, aucs_flipping_full,
                          alternative='less', zero_method='zsplit').pvalue
        p_coord = wilcoxon(aucs_baseline, aucs_flipping_coord,
                           alternative='less', zero_method='zsplit').pvalue
    else:
        # Every fold failed: there is nothing to test.
        p_full = p_coord = np.nan

    row = [dataset['name']] + fold_means + fold_aucs + [p_full, p_coord]
    results.append(row)

    # The frame is rebuilt after every dataset so `results_pdf` always holds
    # the results gathered so far, and progress is printed as the run advances.
    results_pdf = pd.DataFrame(results, columns=result_columns)
    baseline_pair = results_pdf[['auc_baseline', 'auc_baseline_flipped']]
    results_pdf['auc_baseline_min'] = baseline_pair.min(axis=1)
    results_pdf['auc_baseline_max'] = baseline_pair.max(axis=1)
    print(results_pdf[report_columns])
        


2000it [19:41,  1.69it/s]


          name  auc_orig  auc_baseline  auc_flipping_full  auc_flipping_coord  \
0  abalone9_18  0.830577      0.831543           0.829827            0.830714   

     p_full   p_coord  
0  0.995937  0.858251  


2000it [16:53,  1.97it/s]


          name  auc_orig  auc_baseline  auc_flipping_full  auc_flipping_coord  \
0  abalone9_18  0.830577      0.831543           0.829827            0.830714   
1   australian  0.926534      0.926573           0.926603            0.926654   

     p_full   p_coord  
0  0.995937  0.858251  
1  0.465738  0.325069  


2000it [13:17,  2.51it/s]


          name  auc_orig  auc_baseline  auc_flipping_full  auc_flipping_coord  \
0  abalone9_18  0.830577      0.831543           0.829827            0.830714   
1   australian  0.926534      0.926573           0.926603            0.926654   
2         bupa  0.764493      0.764138           0.762466            0.763798   

     p_full   p_coord  
0  0.995937  0.858251  
1  0.465738  0.325069  
2  0.999396  0.817624  


2000it [10:35,  3.15it/s]


               name  auc_orig  auc_baseline  auc_flipping_full  \
0       abalone9_18  0.830577      0.831543           0.829827   
1        australian  0.926534      0.926573           0.926603   
2              bupa  0.764493      0.764138           0.762466   
3  cleveland-0_vs_4  0.972105      0.972175           0.973268   

   auc_flipping_coord    p_full   p_coord  
0            0.830714  0.995937  0.858251  
1            0.926654  0.465738  0.325069  
2            0.763798  0.999396  0.817624  
3            0.973514  0.000171  0.000033  


2000it [11:42,  2.85it/s]


               name  auc_orig  auc_baseline  auc_flipping_full  \
0       abalone9_18  0.830577      0.831543           0.829827   
1        australian  0.926534      0.926573           0.926603   
2              bupa  0.764493      0.764138           0.762466   
3  cleveland-0_vs_4  0.972105      0.972175           0.973268   
4            ecoli1  0.954407      0.954147           0.954405   

   auc_flipping_coord    p_full   p_coord  
0            0.830714  0.995937  0.858251  
1            0.926654  0.465738  0.325069  
2            0.763798  0.999396  0.817624  
3            0.973514  0.000171  0.000033  
4            0.954435  0.240008  0.164955  


2000it [11:32,  2.89it/s]


               name  auc_orig  auc_baseline  auc_flipping_full  \
0       abalone9_18  0.830577      0.831543           0.829827   
1        australian  0.926534      0.926573           0.926603   
2              bupa  0.764493      0.764138           0.762466   
3  cleveland-0_vs_4  0.972105      0.972175           0.973268   
4            ecoli1  0.954407      0.954147           0.954405   
5          haberman  0.668203      0.667635           0.670004   

   auc_flipping_coord    p_full       p_coord  
0            0.830714  0.995937  8.582506e-01  
1            0.926654  0.465738  3.250693e-01  
2            0.763798  0.999396  8.176241e-01  
3            0.973514  0.000171  3.337387e-05  
4            0.954435  0.240008  1.649553e-01  
5            0.670738  0.000002  7.777188e-10  


2000it [10:49,  3.08it/s]


               name  auc_orig  auc_baseline  auc_flipping_full  \
0       abalone9_18  0.830577      0.831543           0.829827   
1        australian  0.926534      0.926573           0.926603   
2              bupa  0.764493      0.764138           0.762466   
3  cleveland-0_vs_4  0.972105      0.972175           0.973268   
4            ecoli1  0.954407      0.954147           0.954405   
5          haberman  0.668203      0.667635           0.670004   
6         hepatitis  0.876441      0.876280           0.876141   

   auc_flipping_coord    p_full       p_coord  
0            0.830714  0.995937  8.582506e-01  
1            0.926654  0.465738  3.250693e-01  
2            0.763798  0.999396  8.176241e-01  
3            0.973514  0.000171  3.337387e-05  
4            0.954435  0.240008  1.649553e-01  
5            0.670738  0.000002  7.777188e-10  
6            0.876608  0.465752  3.550414e-01  


2000it [10:05,  3.30it/s]


                           name  auc_orig  auc_baseline  auc_flipping_full  \
0                   abalone9_18  0.830577      0.831543           0.829827   
1                    australian  0.926534      0.926573           0.926603   
2                          bupa  0.764493      0.764138           0.762466   
3              cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                        ecoli1  0.954407      0.954147           0.954405   
5                      haberman  0.668203      0.667635           0.670004   
6                     hepatitis  0.876441      0.876280           0.876141   
7  lymphography-normal-fibrosis  0.993195      0.993807           0.995094   

   auc_flipping_coord    p_full       p_coord  
0            0.830714  0.995937  8.582506e-01  
1            0.926654  0.465738  3.250693e-01  
2            0.763798  0.999396  8.176241e-01  
3            0.973514  0.000171  3.337387e-05  
4            0.954435  0.240008  1.649553e-01  
5        

2000it [14:27,  2.30it/s]


                           name  auc_orig  auc_baseline  auc_flipping_full  \
0                   abalone9_18  0.830577      0.831543           0.829827   
1                    australian  0.926534      0.926573           0.926603   
2                          bupa  0.764493      0.764138           0.762466   
3              cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                        ecoli1  0.954407      0.954147           0.954405   
5                      haberman  0.668203      0.667635           0.670004   
6                     hepatitis  0.876441      0.876280           0.876141   
7  lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                  mammographic  0.867551      0.867652           0.868304   

   auc_flipping_coord        p_full       p_coord  
0            0.830714  9.959371e-01  8.582506e-01  
1            0.926654  4.657385e-01  3.250693e-01  
2            0.763798  9.993959e-01  8.176241e-01  
3          

2000it [10:38,  3.13it/s]


                           name  auc_orig  auc_baseline  auc_flipping_full  \
0                   abalone9_18  0.830577      0.831543           0.829827   
1                    australian  0.926534      0.926573           0.926603   
2                          bupa  0.764493      0.764138           0.762466   
3              cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                        ecoli1  0.954407      0.954147           0.954405   
5                      haberman  0.668203      0.667635           0.670004   
6                     hepatitis  0.876441      0.876280           0.876141   
7  lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                  mammographic  0.867551      0.867652           0.868304   
9                  new_thyroid1  0.999228      0.999199           0.999132   

   auc_flipping_coord        p_full       p_coord  
0            0.830714  9.959371e-01  8.582506e-01  
1            0.926654  4.657385e-01  

2000it [16:55,  1.97it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   

    auc_flipping_coord        p_full       p_coord 

2000it [10:37,  3.14it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [15:40,  2.13it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [10:38,  3.13it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [13:44,  2.43it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [16:54,  1.97it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [19:42,  1.69it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [17:38,  1.89it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [14:00,  2.38it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [12:29,  2.67it/s]


                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98

2000it [12:46,  2.61it/s]

                            name  auc_orig  auc_baseline  auc_flipping_full  \
0                    abalone9_18  0.830577      0.831543           0.829827   
1                     australian  0.926534      0.926573           0.926603   
2                           bupa  0.764493      0.764138           0.762466   
3               cleveland-0_vs_4  0.972105      0.972175           0.973268   
4                         ecoli1  0.954407      0.954147           0.954405   
5                       haberman  0.668203      0.667635           0.670004   
6                      hepatitis  0.876441      0.876280           0.876141   
7   lymphography-normal-fibrosis  0.993195      0.993807           0.995094   
8                   mammographic  0.867551      0.867652           0.868304   
9                   new_thyroid1  0.999228      0.999199           0.999132   
10                          pima  0.823438      0.823378           0.823358   
11                  poker-9_vs_7  0.984687      0.98




In [7]:
# Persist the benchmark table (per-dataset means, per-fold AUC lists and
# p-values) next to the notebook.
results_csv_path = "classification.csv"
results_pdf.to_csv(results_csv_path)