# Statistical comparison of multiple classifiers

Here, multiple classifiers are compared using the results of 30 repetitions of 10-fold cross-validation (30 x 10-fold).

The comparison is carried out twice: once without fine-tuning and once with fine-tuning.

In [1]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath('../modules'))
from statistical_methods import StatisticalMethods

## No finetuning

In [2]:
# Select the runs without fine-tuning; this flag is interpolated into the CSV file names.
finetune = False

In [11]:
# Read one metrics file per classifier; `finetune` selects which set of CSVs is used.
list_of_model_results = [
    pd.read_csv(f'../results/metrics_KNN_tuning_{finetune}.csv'),
    pd.read_csv(f'../results/metrics_QDA_tuning_{finetune}.csv'),
    pd.read_csv(f'../results/metrics_LR_tuning_{finetune}.csv'),
    pd.read_csv(f'../results/metrics_Parzen_tuning_{finetune}.csv'),
    pd.read_csv(f'../results/metrics_Voting_tuning_{finetune}.csv'),
]
# Keep each frame reachable under its own name as well.
(
    knn_results,
    qda_results,
    lr_results,
    parzen_results,
    voting_results,
) = list_of_model_results

# Display names (same order as the frames above) and the metric names to analyse.
list_of_model_names = ['KNN', 'QDA', 'LR', 'Parzen', 'Voting']
list_of_metrics_names = ['error_rate', 'test_accuracy', 'coverage', 'f1_score']

comp = StatisticalMethods(
    list_of_model_results,
    list_of_model_names,
    list_of_metrics_names,
)

# Friedman test first; its second return value is handed to the Nemenyi test.
friedman_results, metrics_with_difference = comp.friedman_test()
nemenyi_results = comp.nemenyi_test(metrics_with_difference)

In [12]:
# Summary per model and metric: mean, standard deviation and confidence interval.
comp.get_estimate_and_ci()

Unnamed: 0,model,metric,mean,std,ci
0,KNN,error_rate,0.255076,0.069403,"0.2472, 0.2629"
1,KNN,test_accuracy,0.744924,0.069403,"0.7371, 0.7528"
2,KNN,coverage,0.81443,0.069313,"0.8066, 0.8223"
3,KNN,f1_score,0.84036,0.046899,"0.8351, 0.8457"
4,QDA,error_rate,0.206087,0.016807,"0.2042, 0.2080"
5,QDA,test_accuracy,0.793913,0.016807,"0.7920, 0.7958"
6,QDA,coverage,0.999753,0.003014,"0.9994, 1.0001"
7,QDA,f1_score,0.885021,0.010447,"0.8838, 0.8862"
8,LR,error_rate,0.244131,0.074624,"0.2357, 0.2526"
9,LR,test_accuracy,0.755869,0.074624,"0.7474, 0.7643"


In [13]:
# Formatted version of the summary (values in percent, Portuguese column labels).
formatted_stats = comp.get_formatted_estimate_and_ci()
formatted_stats

Unnamed: 0,Modelo,Métrica,Média (%),IC (%)
0,KNN,Taxa de erro,25.51 ± 6.94,"[24.72 , 26.29]"
1,KNN,Acurácia de teste,74.49 ± 6.94,"[73.71 , 75.28]"
2,KNN,Cobertura,81.44 ± 6.93,"[80.66 , 82.23]"
3,KNN,F1-score,84.04 ± 4.69,"[83.51 , 84.57]"
4,QDA,Taxa de erro,20.61 ± 1.68,"[20.42 , 20.80]"
5,QDA,Acurácia de teste,79.39 ± 1.68,"[79.20 , 79.58]"
6,QDA,Cobertura,99.98 ± 0.30,"[99.94 , 100.01]"
7,QDA,F1-score,88.50 ± 1.04,"[88.38 , 88.62]"
8,LR,Taxa de erro,24.41 ± 7.46,"[23.57 , 25.26]"
9,LR,Acurácia de teste,75.59 ± 7.46,"[74.74 , 76.43]"


In [14]:
# Friedman test result per metric: statistic, p-value and the `are_different` flag.
friedman_results

Unnamed: 0,metric,statistic,p-value,are_different
0,error_rate,739.75974,8.559467e-159,True
1,test_accuracy,739.75974,8.559467e-159,True
2,coverage,769.612836,2.931336e-165,True
3,f1_score,770.785291,1.6335390000000002e-165,True


In [15]:
# Nemenyi test result per metric and classifier pair: p-value and the `are_different` flag.
nemenyi_results

Unnamed: 0,metric,classifier_pair,p-value,are_different
0,error_rate,KNN vs QDA,2.481118e-10,True
1,error_rate,KNN vs LR,0.2978887,False
2,error_rate,KNN vs Parzen,1.110223e-16,True
3,error_rate,KNN vs Voting,0.001073016,True
4,error_rate,QDA vs LR,2.130006e-05,True
5,error_rate,QDA vs Parzen,1.110223e-16,True
6,error_rate,QDA vs Voting,1.110223e-16,True
7,error_rate,LR vs Parzen,1.110223e-16,True
8,error_rate,LR vs Voting,6.756505e-08,True
9,error_rate,Parzen vs Voting,1.110223e-16,True


In [16]:
# Show only the classifier pairs whose `are_different` flag is False.
nemenyi_results[nemenyi_results['are_different'].eq(False)]

Unnamed: 0,metric,classifier_pair,p-value,are_different
1,error_rate,KNN vs LR,0.297889,False
11,test_accuracy,KNN vs LR,0.297889,False
21,coverage,KNN vs LR,0.840223,False
29,coverage,Parzen vs Voting,0.614631,False
31,f1_score,KNN vs LR,0.480189,False


## With finetuning

In [18]:
# Select the runs with fine-tuning; this flag is interpolated into the CSV file names.
finetune = True

In [19]:
# Read one metrics file per classifier; `finetune` selects which set of CSVs is used.
list_of_model_results = [
    pd.read_csv(f'../results/metrics_KNN_tuning_{finetune}.csv'),
    pd.read_csv(f'../results/metrics_QDA_tuning_{finetune}.csv'),
    pd.read_csv(f'../results/metrics_LR_tuning_{finetune}.csv'),
    pd.read_csv(f'../results/metrics_Parzen_tuning_{finetune}.csv'),
    pd.read_csv(f'../results/metrics_Voting_tuning_{finetune}.csv'),
]
# Keep each frame reachable under its own name as well.
(
    knn_results,
    qda_results,
    lr_results,
    parzen_results,
    voting_results,
) = list_of_model_results

# Display names (same order as the frames above) and the metric names to analyse.
list_of_model_names = ['KNN', 'QDA', 'LR', 'Parzen', 'Voting']
list_of_metrics_names = ['error_rate', 'test_accuracy', 'coverage', 'f1_score']

comp = StatisticalMethods(
    list_of_model_results,
    list_of_model_names,
    list_of_metrics_names,
)

# Friedman test first; its second return value is handed to the Nemenyi test.
friedman_results, metrics_with_difference = comp.friedman_test()
nemenyi_results = comp.nemenyi_test(metrics_with_difference)

In [20]:
# Summary per model and metric: mean, standard deviation and confidence interval.
comp.get_estimate_and_ci()

Unnamed: 0,model,metric,mean,std,ci
0,KNN,error_rate,0.242996,0.074137,"0.2346, 0.2514"
1,KNN,test_accuracy,0.757004,0.074137,"0.7486, 0.7654"
2,KNN,coverage,0.773167,0.080522,"0.7641, 0.7823"
3,KNN,f1_score,0.843606,0.05171,"0.8378, 0.8495"
4,QDA,error_rate,0.219891,0.046667,"0.2146, 0.2252"
5,QDA,test_accuracy,0.780109,0.046667,"0.7748, 0.7854"
6,QDA,coverage,0.939207,0.145701,"0.9227, 0.9557"
7,QDA,f1_score,0.869852,0.044578,"0.8648, 0.8749"
8,LR,error_rate,0.200745,0.070005,"0.1928, 0.2087"
9,LR,test_accuracy,0.799255,0.070005,"0.7913, 0.8072"


In [21]:
# Formatted version of the summary (values in percent, Portuguese column labels).
formatted_stats = comp.get_formatted_estimate_and_ci()
formatted_stats

Unnamed: 0,Modelo,Métrica,Média (%),IC (%)
0,KNN,Taxa de erro,24.30 ± 7.41,"[23.46 , 25.14]"
1,KNN,Acurácia de teste,75.70 ± 7.41,"[74.86 , 76.54]"
2,KNN,Cobertura,77.32 ± 8.05,"[76.41 , 78.23]"
3,KNN,F1-score,84.36 ± 5.17,"[83.78 , 84.95]"
4,QDA,Taxa de erro,21.99 ± 4.67,"[21.46 , 22.52]"
5,QDA,Acurácia de teste,78.01 ± 4.67,"[77.48 , 78.54]"
6,QDA,Cobertura,93.92 ± 14.57,"[92.27 , 95.57]"
7,QDA,F1-score,86.99 ± 4.46,"[86.48 , 87.49]"
8,LR,Taxa de erro,20.07 ± 7.00,"[19.28 , 20.87]"
9,LR,Acurácia de teste,79.93 ± 7.00,"[79.13 , 80.72]"


In [22]:
# Friedman test result per metric: statistic, p-value and the `are_different` flag.
friedman_results

Unnamed: 0,metric,statistic,p-value,are_different
0,error_rate,350.498246,1.368989e-74,True
1,test_accuracy,350.498246,1.368989e-74,True
2,coverage,716.628532,8.741097e-154,True
3,f1_score,435.271192,6.634106e-93,True


In [23]:
# Nemenyi test result per metric and classifier pair: p-value and the `are_different` flag.
nemenyi_results

Unnamed: 0,metric,classifier_pair,p-value,are_different
0,error_rate,KNN vs QDA,0.001614752,True
1,error_rate,KNN vs LR,1.132272e-11,True
2,error_rate,KNN vs Parzen,1.110223e-16,True
3,error_rate,KNN vs Voting,0.914131,False
4,error_rate,QDA vs LR,0.007062893,True
5,error_rate,QDA vs Parzen,1.110223e-16,True
6,error_rate,QDA vs Voting,3.981494e-05,True
7,error_rate,LR vs Parzen,1.110223e-16,True
8,error_rate,LR vs Voting,1.654232e-14,True
9,error_rate,Parzen vs Voting,1.110223e-16,True


In [24]:
# Show only the classifier pairs whose `are_different` flag is False.
nemenyi_results[nemenyi_results['are_different'].eq(False)]

Unnamed: 0,metric,classifier_pair,p-value,are_different
3,error_rate,KNN vs Voting,0.914131,False
13,test_accuracy,KNN vs Voting,0.914131,False
29,coverage,Parzen vs Voting,0.297889,False
34,f1_score,QDA vs LR,0.80801,False
