# «Friedman Test» from Demšar (2006)
Adapted from cgosorio's work.

In [209]:
import os.path
import scipy.stats
from itertools import product
import pandas as pd
import numpy as np

In [210]:
# Name of the results column that is averaged and ranked in the cells below.
metric = 'f1-score'
# Relative path to the CSV file holding the experiment results.
results = os.path.join('ranks', 'results.csv')

In [211]:
# Load the raw results table from the path stored in `results`.
results_df = pd.read_csv(results)

In [212]:
# Distinct levels of every experimental factor found in the results table.
base_estimators = results_df['base'].unique()
filters = results_df['filter'].unique()
datasets = results_df['dataset'].unique()
# Keep the labeled percentages in ascending order for the loops further down.
percents_labeled = np.sort(results_df['percent labeled'].unique())

# Quick summary of what the results file contains.
print(f"Base estimators used: {base_estimators}")
print(f"Filters used: {filters}")
print(f"Percents labeled used: {percents_labeled}")
print(f"# Datasets used: {len(datasets)}")


Base estimators used: ['KNeighborsClassifier' 'DecisionTreeClassifier' 'GaussianNB']
Filters used: ['ENN' 'LSSm' 'ENANE' 'base']
Percents labeled used: [0.05 0.1  0.15 0.2  0.25 0.3  0.35]
# Datasets used: 18


In [213]:
# Average the chosen metric over all rows that share the same
# (dataset, percent labeled, base, filter) combination.
# The metric column is selected *before* aggregating: calling .mean() on the
# whole frame would also try to average every other column, which raises a
# TypeError on pandas >= 2.0 if any of them is non-numeric.
grouped_df = (
    results_df
    .groupby(['dataset', 'percent labeled', 'base', 'filter'])[metric]
    .mean()
    .reset_index()  # turn the group keys back into ordinary columns
)

# Every (base estimator, filter) combination, as a 2-column array of strings.
bases_filters = np.array(list(product(base_estimators, filters)))
print("Pairs of base estimator with filters:\n", bases_filters)

Pairs of base estimator with filters:
 [['KNeighborsClassifier' 'ENN']
 ['KNeighborsClassifier' 'LSSm']
 ['KNeighborsClassifier' 'ENANE']
 ['KNeighborsClassifier' 'base']
 ['DecisionTreeClassifier' 'ENN']
 ['DecisionTreeClassifier' 'LSSm']
 ['DecisionTreeClassifier' 'ENANE']
 ['DecisionTreeClassifier' 'base']
 ['GaussianNB' 'ENN']
 ['GaussianNB' 'LSSm']
 ['GaussianNB' 'ENANE']
 ['GaussianNB' 'base']]


In [214]:
# Plain-text dump of the aggregated table (one row per
# dataset / percent labeled / base / filter combination) as a sanity check.
print(grouped_df)

                    dataset  percent labeled                    base filter  \
0          BreastTissue.csv             0.05  DecisionTreeClassifier  ENANE   
1          BreastTissue.csv             0.05  DecisionTreeClassifier    ENN   
2          BreastTissue.csv             0.05  DecisionTreeClassifier   LSSm   
3          BreastTissue.csv             0.05  DecisionTreeClassifier   base   
4          BreastTissue.csv             0.05              GaussianNB  ENANE   
...                     ...              ...                     ...    ...   
1507  wifi-localization.csv             0.35              GaussianNB   base   
1508  wifi-localization.csv             0.35    KNeighborsClassifier  ENANE   
1509  wifi-localization.csv             0.35    KNeighborsClassifier    ENN   
1510  wifi-localization.csv             0.35    KNeighborsClassifier   LSSm   
1511  wifi-localization.csv             0.35    KNeighborsClassifier   base   

      f1-score  
0     0.183905  
1     0.196561  


In [215]:
def base_filter_values(percent_labeled=None):
    """Collect the metric values of every (base, filter) pair for one percentage.

    For each pair in the notebook-level ``bases_filters``, the rows of
    ``grouped_df`` matching that pair and the requested labeled percentage are
    selected and their ``metric`` values become one column named
    ``"<base>:<filter>"``.

    Parameters
    ----------
    percent_labeled : float, optional
        Value of the ``'percent labeled'`` column to select. When omitted, the
        notebook-level variable ``percent`` is used, so existing no-argument
        calls made from inside a ``for percent in ...`` loop keep working.

    Returns
    -------
    pandas.DataFrame
        One column per pair, re-indexed positionally from 0. Columns appear in
        the reverse order of ``bases_filters``. Rows only line up across
        columns if every pair has one row per dataset in ``grouped_df`` in the
        same order -- NOTE(review): this is assumed, not checked here.
        An empty frame is returned when ``bases_filters`` is empty.
    """
    if percent_labeled is None:
        # Backward-compatible fallback: read the caller's loop variable.
        percent_labeled = percent

    # The percentage mask does not depend on the pair, so compute it once.
    at_percent = grouped_df['percent labeled'] == percent_labeled

    columns = []
    for base_, filter_ in bases_filters:
        selected = grouped_df.loc[
            at_percent
            & (grouped_df['base'] == base_)
            & (grouped_df['filter'] == filter_),
            metric,
        ]
        # Drop the original row labels so the columns align by position.
        columns.append(
            selected.reset_index(drop=True).rename(f'{base_}:{filter_}'))

    if not columns:
        return pd.DataFrame()

    # Concatenate once instead of rebuilding the frame on every iteration;
    # reversing keeps the column order of the previous prepend-in-a-loop code.
    return pd.concat(columns[::-1], axis=1)


def split_onto_base_estimators():
    """Append one sub-table per base estimator to the global ``base_dfs``.

    Reads the notebook-level ``curr_vals`` (columns named ``"<base>:<filter>"``)
    and, for every name in ``base_estimators``, appends to ``base_dfs`` the
    frame made of the columns belonging to that estimator, in their existing
    order. Nothing is returned; ``base_dfs`` must already exist and should be
    emptied by the caller before each call.

    Columns are matched on the literal ``"<base>:"`` prefix rather than by
    using the estimator name as a regular expression, so a name that is a
    substring of another (e.g. ``SVC`` / ``LinearSVC``) or that contains regex
    metacharacters cannot pull in another estimator's columns.
    """
    for base_ in base_estimators:
        prefix = f'{base_}:'
        belongs = [str(column).startswith(prefix)
                   for column in curr_vals.columns]
        base_dfs.append(curr_vals.loc[:, belongs])

In [216]:
# For every labeled percentage and every base estimator, rank the filters on
# each dataset and print the average rank of each "<base>:<filter>" method.
for percent in percents_labeled:
    # Called without arguments: the helper picks up the loop variable
    # `percent` from the notebook namespace.
    curr_vals = base_filter_values()

    # split_onto_base_estimators() appends to this list, so start it empty
    # for every percentage.
    base_dfs = list()
    split_onto_base_estimators()
    print('\n\n------------------------------------\n\n> Percent labeled: ',
          percent)
    for base, df in zip(base_estimators, base_dfs):
        metric_vals = df.to_numpy()

        # N rows (one per dataset expected) compared over k methods (columns).
        N, k = len(datasets), len(df.columns)
        assert N==metric_vals.shape[0] and k==metric_vals.shape[1], \
            f"expected a {N}x{k} score table, got {metric_vals.shape}"

        # Negate the scores so the highest metric value receives rank 1;
        # rankdata's default method gives tied values their average rank.
        rankings = scipy.stats.rankdata(-metric_vals, axis=1)

        # Mean rank of each method over the rows (R_j in Demšar's notation).
        Rj = average_ranks = np.mean(rankings, axis=0)

        # Methods ordered from best (lowest) to worst average rank. Not used
        # in this cell; kept in case later cells read it.
        average_rank_tuples = sorted(zip(df.columns, average_ranks),
                                     key=lambda a: a[1])

        average_rank_for = dict(zip(df.columns, average_ranks))
        print()
        # Distinct loop names: the previous `for k, v in ...` overwrote `k`
        # (the number of methods) with a column label.
        for method, avg_rank in average_rank_for.items():
            print(method, f'{avg_rank:.2f}')




------------------------------------

> Percent labeled:  0.05

KNeighborsClassifier:base 2.47
KNeighborsClassifier:ENANE 2.25
KNeighborsClassifier:LSSm 2.67
KNeighborsClassifier:ENN 2.61

DecisionTreeClassifier:base 2.39
DecisionTreeClassifier:ENANE 2.94
DecisionTreeClassifier:LSSm 2.00
DecisionTreeClassifier:ENN 2.67

GaussianNB:base 2.22
GaussianNB:ENANE 2.14
GaussianNB:LSSm 2.39
GaussianNB:ENN 3.25


------------------------------------

> Percent labeled:  0.1

KNeighborsClassifier:base 2.78
KNeighborsClassifier:ENANE 2.78
KNeighborsClassifier:LSSm 2.22
KNeighborsClassifier:ENN 2.22

DecisionTreeClassifier:base 2.72
DecisionTreeClassifier:ENANE 2.50
DecisionTreeClassifier:LSSm 2.44
DecisionTreeClassifier:ENN 2.33

GaussianNB:base 3.17
GaussianNB:ENANE 2.00
GaussianNB:LSSm 2.17
GaussianNB:ENN 2.67


------------------------------------

> Percent labeled:  0.15

KNeighborsClassifier:base 2.50
KNeighborsClassifier:ENANE 2.44
KNeighborsClassifier:LSSm 2.78
KNeighborsClassifier:ENN 