In [1]:
import numpy as np
import pandas as pd

from scipy.stats import wilcoxon

In [2]:
# Experiment selector: the three parts below are concatenated into the token
# that appears in the name of every result file loaded by this cell.
estimator = 'dt'
method = 'c'
postfix = '_mid'

token = ''.join((estimator, method, postfix))

# Stack the three result tables row-wise into a single frame
# (the original row indices of each file are kept as-is).
data = pd.concat([
    pd.read_csv(csv_name)
    for csv_name in (
        f'effect_presence_{token}.csv',
        f'proposed_a{token}.csv',
        f'proposed_f{token}.csv',
    )
])

In [6]:
# Name of the metric column used by all later cells: 'auc' when the loaded
# results contain such a column, otherwise 'r2'.
if 'auc' in data.columns:
    score = 'auc'
else:
    score = 'r2'

In [7]:
def labeling(row):
    """Map one result row to a short label.

    The label is derived from the prefix of ``row['estimator']`` and from the
    parameter dictionary stored as text in ``row['params']``:

    * ``Operator*``  -> ``'orig_l'`` when ``params['operator'] == '<'``,
      otherwise ``'orig_leq'``
    * ``Averaged*``  -> ``'avg_<mode>'``  (``mode`` defaults to ``'def'``)
    * ``Flipping*``  -> ``'flip_<mode>'`` (``mode`` defaults to ``'def'``)

    Returns ``None`` when the estimator name matches none of the prefixes.
    """
    # NOTE(review): eval() executes the cell text as Python, so this is only
    # safe for result files you produced yourself. If the params are always
    # plain literals, ast.literal_eval would be a safer parser -- confirm.
    settings = eval(row['params'])
    estimator_name = row['estimator']

    if estimator_name.startswith('Operator'):
        return 'orig_l' if settings['operator'] == '<' else 'orig_leq'

    if estimator_name.startswith('Averaged'):
        return f"avg_{settings.get('mode', 'def')}"

    if estimator_name.startswith('Flipping'):
        return f"flip_{settings.get('mode', 'def')}"

    return None

def regularization(row):
    """Return the ``'min_impurity_decrease'`` entry of the row's parameters.

    ``row['params']`` holds the parameter dictionary as text; ``None`` is
    returned when the dictionary has no such entry.

    NOTE(review): the notebook stores this value in a column called
    'min_samples_leaf' although the key read here is 'min_impurity_decrease'
    -- confirm which hyper-parameter is intended.
    """
    # NOTE(review): eval() runs the text as Python; only safe on trusted files.
    settings = eval(row['params'])
    strength = settings.get('min_impurity_decrease', None)
    return strength

In [8]:
# Add two derived columns, each computed row by row from the 'estimator' and
# 'params' columns: the short label and the regularization value.
# NOTE(review): the second column is named 'min_samples_leaf', but
# `regularization` reads the 'min_impurity_decrease' parameter -- confirm.
data = data.assign(label=data.apply(labeling, axis=1))
data = data.assign(min_samples_leaf=data.apply(regularization, axis=1))

In [9]:
# Mean of the score column for every (name, label, min_samples_leaf)
# combination. dropna=False keeps the groups in which one of the keys is
# missing (e.g. rows without a regularization value).
perf = (
    data
    .groupby(['name', 'label', 'min_samples_leaf'], dropna=False)
    .agg({score: 'mean'})
    .reset_index()
)

In [10]:
# For every name, select the 'min_samples_leaf' value at which the 'orig_l'
# rows of `perf` reach their highest mean score.
#
# Written as sort + groupby.tail(1) instead of groupby.apply: the earlier
# lambda read the grouping column 'name' inside apply, which pandas 2.2
# deprecates and later versions no longer pass to the applied function.
# The stable sort makes the choice among tied scores deterministic (the row
# that comes last in `perf` wins); rows with a missing score sort last, which
# matches the default ordering the earlier sort_values call relied on.
#
# NOTE: `filter` shadows the Python builtin of the same name; the name is kept
# because later cells refer to it.
filter = (
    perf[perf['label'] == 'orig_l']
    .sort_values(score, kind='stable')
    .groupby('name')
    .tail(1)
    .sort_values('name')[['name', 'min_samples_leaf']]
    .reset_index(drop=True)
)

In [11]:
# Display the selection computed in the previous cell: one row per name with
# the chosen 'min_samples_leaf' value (may be missing, see dropna=False above).
filter

Unnamed: 0,name,min_samples_leaf
0,abalone9_18,0.2
1,appendicitis,0.2
2,bupa,0.05
3,cleveland-0_vs_4,0.05
4,ecoli1,0.1
5,glass0,0.1
6,haberman,0.2
7,hepatitis,0.05
8,lymphography,
9,mammographic,0.05


In [12]:
# Inner join: keep only the result rows whose (name, min_samples_leaf) pair
# was selected in `filter`, i.e. one regularization value per name for all
# labels.
data = data.merge(filter, on=['name', 'min_samples_leaf'], how='inner')

In [13]:
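# Disabled alternative: keep only the rows with one fixed value (0.1) in
# 'min_samples_leaf' instead of the per-name selection made by the merge above.
#data = data[data['min_samples_leaf'] == 0.1]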

In [14]:
def grouping(pdf):
    """Return the score values of ``pdf`` as a plain list in a fixed order.

    Rows are ordered by ('min_samples_leaf', 'fold') before the column named
    by the module-level ``score`` is extracted, so lists built from different
    groups follow the same ordering rule.
    """
    ordered = pdf.sort_values(by=['min_samples_leaf', 'fold'])
    score_column = ordered[score]
    return score_column.values.tolist()

In [15]:
# One row per (name, label); the 'scores' column holds the ordered list of
# score values produced by `grouping` for that group.
grouped = (
    data
    .groupby(['name', 'label'])
    .apply(grouping)
    .reset_index()
    .rename(columns={0: 'scores'})
)

In [16]:
def evaluate(pdf):
    """Compare every label of one group against the 'orig_leq' reference.

    ``pdf`` needs a 'label' column and a 'scores' column holding one list of
    scores per row. The returned Series contains, in this order:

    * ``<score>_orig_leq``: median of the reference scores;
    * for every other row, ``<score>_<label>``: median of that row's scores,
      followed by a Wilcoxon signed-rank p-value (``zero_method='zsplit'``)
      of reference vs. that row:
        - ``p_<label>_t``: two-sided, for labels ending in 'orig_l';
        - ``p_<label>_l``: ``alternative='less'``, for all other labels.

    ``<score>`` is the module-level ``score`` name. Raises ``IndexError`` when
    ``pdf`` has no 'orig_leq' row.
    """
    pivot = 'orig_leq'
    reference = pdf[pdf['label'] == pivot].iloc[0]['scores']

    results = {f'{score}_orig_leq': np.median(reference)}

    for _, entry in pdf.iterrows():
        label = entry['label']
        if label == pivot:
            continue

        results[f'{score}_{label}'] = np.median(entry['scores'])

        # Only the sidedness of the test and the key suffix differ per label.
        if label.endswith('orig_l'):
            key, alternative = f'p_{label}_t', 'two-sided'
        else:
            key, alternative = f'p_{label}_l', 'less'

        results[key] = wilcoxon(
            reference,
            entry['scores'],
            zero_method='zsplit',
            alternative=alternative,
        ).pvalue

    return pd.Series(results)

In [17]:
# Final table: one row per name, with the medians and Wilcoxon p-values
# returned by `evaluate` as columns.
grouped.groupby('name').apply(evaluate)

Unnamed: 0_level_0,auc_orig_leq,auc_avg_def,p_avg_def_l,auc_avg_random,p_avg_random_l,auc_flip_def,p_flip_def_l,auc_orig_l,p_orig_l_t
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
abalone9_18,0.764441,0.764441,0.5,0.764441,0.5,0.764441,0.001410948,0.764441,1.0
appendicitis,0.772059,0.772059,0.5,0.772059,0.5,0.772059,0.5,0.772059,1.0
bupa,0.691379,0.692672,0.574377,0.692241,0.679962,0.693319,0.1888333,0.690948,0.370077
cleveland-0_vs_4,0.939394,0.939394,0.148217,0.939394,0.148217,0.939394,0.08377655,0.941051,0.259088
ecoli1,0.9574,0.9574,0.5,0.9574,0.5,0.95896,1.118837e-08,0.9574,1.0
glass0,0.843596,0.843596,0.5,0.843596,0.5,0.845861,0.0003311841,0.843596,1.0
haberman,0.666319,0.666319,0.000199,0.666319,0.000199,0.666319,0.0001112115,0.666667,0.000222
hepatitis,0.760952,0.760952,0.567823,0.760952,0.568308,0.763333,0.2993802,0.761905,0.995362
lymphography,1.0,1.0,0.298129,1.0,0.298129,1.0,0.09762029,1.0,0.596259
mammographic,0.898474,0.898474,0.997337,0.898474,0.997337,0.898474,0.9199365,0.898474,0.003297
