# Statistical comparison of multiple classifiers

Here, multiple classifiers are compared using the results of a 30 × 10-fold cross-validation.

The comparison is performed twice: once without finetuning and once with finetuning.

In [41]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath('../modules'))
from statistical_methods import StatisticalMethods

## No finetuning

In [42]:
# Flag interpolated into the results file names read in the next cell
# (False -> ../results/metrics_<model>_tuning_False.csv).
finetune = False

In [43]:
# Model labels. Each label is also the tag embedded in its results file name:
# ../results/metrics_<label>_tuning_<finetune>.csv
list_of_model_names = ['KNN', 'QDA', 'LR', 'KDE', 'Voting']
list_of_metrics_names = ['error_rate', 'test_accuracy', 'coverage', 'f1_score']

# Load one metrics table per model, in the same order as list_of_model_names,
# so that labels and tables cannot drift out of sync.
list_of_model_results = [
    pd.read_csv(f'../results/metrics_{model_name}_tuning_{finetune}.csv')
    for model_name in list_of_model_names
]
# Keep the individual per-model names available for ad-hoc inspection.
knn_results, qda_results, lr_results, parzen_results, voting_results = list_of_model_results

comp = StatisticalMethods(list_of_model_results, list_of_model_names, list_of_metrics_names)

# friedman_test() returns the per-metric results table plus a second value that is
# handed straight to nemenyi_test() (presumably the metrics flagged as different --
# TODO confirm in statistical_methods).
friedman_results, metrics_with_difference = comp.friedman_test()

nemenyi_results = comp.nemenyi_test(metrics_with_difference)

In [44]:
# Summary table with one row per (model, metric); the rendered output has
# the columns model, metric, mean, std and ci.
comp.get_estimate_and_ci()

Unnamed: 0,model,metric,mean,std,ci
0,KNN,error_rate,0.255076,0.069403,"0.2472, 0.2629"
1,KNN,test_accuracy,0.744924,0.069403,"0.7371, 0.7528"
2,KNN,coverage,0.852273,0.072687,"0.8440, 0.8605"
3,KNN,f1_score,0.84036,0.046899,"0.8351, 0.8457"
4,QDA,error_rate,0.206087,0.016807,"0.2042, 0.2080"
5,QDA,test_accuracy,0.793913,0.016807,"0.7920, 0.7958"
6,QDA,coverage,0.99969,0.003788,"0.9993, 1.0001"
7,QDA,f1_score,0.885021,0.010447,"0.8838, 0.8862"
8,LR,error_rate,0.244131,0.074624,"0.2357, 0.2526"
9,LR,test_accuracy,0.755869,0.074624,"0.7474, 0.7643"


In [45]:
# Presentation version of the summary table: the rendered output uses Portuguese
# column labels and expresses the mean and CI in percent.
formatted_stats = comp.get_formatted_estimate_and_ci()
formatted_stats

Unnamed: 0,Modelo,Métrica,Média (%),IC (%)
0,KNN,Taxa de erro,25.51 ± 6.94,"[24.72 , 26.29]"
1,KNN,Acurácia de teste,74.49 ± 6.94,"[73.71 , 75.28]"
2,KNN,Cobertura,85.23 ± 7.27,"[84.40 , 86.05]"
3,KNN,F1-score,84.04 ± 4.69,"[83.51 , 84.57]"
4,QDA,Taxa de erro,20.61 ± 1.68,"[20.42 , 20.80]"
5,QDA,Acurácia de teste,79.39 ± 1.68,"[79.20 , 79.58]"
6,QDA,Cobertura,99.97 ± 0.38,"[99.93 , 100.01]"
7,QDA,F1-score,88.50 ± 1.04,"[88.38 , 88.62]"
8,LR,Taxa de erro,24.41 ± 7.46,"[23.57 , 25.26]"
9,LR,Acurácia de teste,75.59 ± 7.46,"[74.74 , 76.43]"


In [46]:
# Table returned by comp.friedman_test(): one row per metric with the statistic,
# p-value and an are_different flag.
friedman_results

Unnamed: 0,metric,statistic,p-value,are_different
0,error_rate,132.685406,1.037578e-27,True
1,test_accuracy,132.685406,1.037578e-27,True
2,coverage,680.172369,6.844013e-146,True
3,f1_score,253.79807,9.892739e-54,True


In [47]:
# Table returned by comp.nemenyi_test(): one row per (metric, classifier pair)
# with the p-value and an are_different flag.
nemenyi_results

Unnamed: 0,metric,classifier_pair,p-value,are_different
0,error_rate,KNN vs QDA,1.110223e-16,True
1,error_rate,KNN vs LR,0.05420539,False
2,error_rate,KNN vs KDE,0.1767582,False
3,error_rate,KNN vs Voting,2.242431e-06,True
4,error_rate,QDA vs LR,4.845124e-12,True
5,error_rate,QDA vs KDE,1.163514e-13,True
6,error_rate,QDA vs Voting,1.999012e-05,True
7,error_rate,LR vs KDE,0.9882599,False
8,error_rate,LR vs Voting,0.09544077,False
9,error_rate,KDE vs Voting,0.02488351,True


In [48]:
# Show only the classifier pairs whose are_different flag is False.
nemenyi_results[nemenyi_results['are_different'].eq(False)]

Unnamed: 0,metric,classifier_pair,p-value,are_different
1,error_rate,KNN vs LR,0.054205,False
2,error_rate,KNN vs KDE,0.176758,False
7,error_rate,LR vs KDE,0.98826,False
8,error_rate,LR vs Voting,0.095441,False
11,test_accuracy,KNN vs LR,0.054205,False
12,test_accuracy,KNN vs KDE,0.176758,False
17,test_accuracy,LR vs KDE,0.98826,False
18,test_accuracy,LR vs Voting,0.095441,False
21,coverage,KNN vs LR,0.606232,False
37,f1_score,LR vs KDE,0.962583,False


## With finetuning

In [49]:
# Flag interpolated into the results file names read in the next cell
# (True -> ../results/metrics_<model>_tuning_True.csv).
finetune = True

In [50]:
# Model labels. Each label is also the tag embedded in its results file name:
# ../results/metrics_<label>_tuning_<finetune>.csv
list_of_model_names = ['KNN', 'QDA', 'LR', 'KDE', 'Voting']
list_of_metrics_names = ['error_rate', 'test_accuracy', 'coverage', 'f1_score']

# Load one metrics table per model, in the same order as list_of_model_names,
# so that labels and tables cannot drift out of sync.
list_of_model_results = [
    pd.read_csv(f'../results/metrics_{model_name}_tuning_{finetune}.csv')
    for model_name in list_of_model_names
]
# Keep the individual per-model names available for ad-hoc inspection.
knn_results, qda_results, lr_results, parzen_results, voting_results = list_of_model_results

# NOTE: this rebinds comp, friedman_results and nemenyi_results, so display cells
# that run after this point show the finetuned results.
comp = StatisticalMethods(list_of_model_results, list_of_model_names, list_of_metrics_names)

# friedman_test() returns the per-metric results table plus a second value that is
# handed straight to nemenyi_test() (presumably the metrics flagged as different --
# TODO confirm in statistical_methods).
friedman_results, metrics_with_difference = comp.friedman_test()

nemenyi_results = comp.nemenyi_test(metrics_with_difference)

In [51]:
# Summary table with one row per (model, metric); the rendered output has
# the columns model, metric, mean, std and ci.
comp.get_estimate_and_ci()

Unnamed: 0,model,metric,mean,std,ci
0,KNN,error_rate,0.234473,0.072668,"0.2262, 0.2427"
1,KNN,test_accuracy,0.765527,0.072668,"0.7573, 0.7738"
2,KNN,coverage,0.86368,0.074616,"0.8552, 0.8721"
3,KNN,f1_score,0.852917,0.048964,"0.8474, 0.8585"
4,QDA,error_rate,0.219767,0.046709,"0.2145, 0.2251"
5,QDA,test_accuracy,0.780233,0.046709,"0.7749, 0.7855"
6,QDA,coverage,0.95381,0.114889,"0.9408, 0.9668"
7,QDA,f1_score,0.870012,0.044593,"0.8650, 0.8751"
8,LR,error_rate,0.199525,0.068187,"0.1918, 0.2072"
9,LR,test_accuracy,0.800475,0.068187,"0.7928, 0.8082"


In [52]:
# Presentation version of the summary table: the rendered output uses Portuguese
# column labels and expresses the mean and CI in percent.
formatted_stats = comp.get_formatted_estimate_and_ci()
formatted_stats

Unnamed: 0,Modelo,Métrica,Média (%),IC (%)
0,KNN,Taxa de erro,23.45 ± 7.27,"[22.62 , 24.27]"
1,KNN,Acurácia de teste,76.55 ± 7.27,"[75.73 , 77.38]"
2,KNN,Cobertura,86.37 ± 7.46,"[85.52 , 87.21]"
3,KNN,F1-score,85.29 ± 4.90,"[84.74 , 85.85]"
4,QDA,Taxa de erro,21.98 ± 4.67,"[21.45 , 22.51]"
5,QDA,Acurácia de teste,78.02 ± 4.67,"[77.49 , 78.55]"
6,QDA,Cobertura,95.38 ± 11.49,"[94.08 , 96.68]"
7,QDA,F1-score,87.00 ± 4.46,"[86.50 , 87.51]"
8,LR,Taxa de erro,19.95 ± 6.82,"[19.18 , 20.72]"
9,LR,Acurácia de teste,80.05 ± 6.82,"[79.28 , 80.82]"


In [53]:
# Table returned by comp.friedman_test(): one row per metric with the statistic,
# p-value and an are_different flag.
friedman_results

Unnamed: 0,metric,statistic,p-value,are_different
0,error_rate,321.684443,2.270924e-68,True
1,test_accuracy,321.684443,2.270924e-68,True
2,coverage,679.374867,1.01854e-145,True
3,f1_score,392.276211,1.297399e-83,True


In [54]:
# Table returned by comp.nemenyi_test(): one row per (metric, classifier pair)
# with the p-value and an are_different flag.
nemenyi_results

Unnamed: 0,metric,classifier_pair,p-value,are_different
0,error_rate,KNN vs QDA,0.1114551,False
1,error_rate,KNN vs LR,9.90402e-08,True
2,error_rate,KNN vs KDE,1.110223e-16,True
3,error_rate,KNN vs Voting,0.4154244,False
4,error_rate,QDA vs LR,0.008071825,True
5,error_rate,QDA vs KDE,1.110223e-16,True
6,error_rate,QDA vs Voting,0.0003291866,True
7,error_rate,LR vs KDE,1.110223e-16,True
8,error_rate,LR vs Voting,8.525403e-13,True
9,error_rate,KDE vs Voting,1.110223e-16,True


In [55]:
# Show only the classifier pairs whose are_different flag is False.
nemenyi_results[nemenyi_results['are_different'].eq(False)]

Unnamed: 0,metric,classifier_pair,p-value,are_different
0,error_rate,KNN vs QDA,0.111455,False
3,error_rate,KNN vs Voting,0.415424,False
10,test_accuracy,KNN vs QDA,0.111455,False
13,test_accuracy,KNN vs Voting,0.415424,False
21,coverage,KNN vs LR,0.235131,False
34,f1_score,QDA vs LR,0.863881,False
