# Statistical comparison of multiple classifiers

Here, multiple classifiers are compared using the results of a 30 × 10-fold cross-validation (10-fold cross-validation repeated 30 times).

The comparison is carried out twice: once without fine-tuning and once with fine-tuning.

In [1]:
import pandas as pd
import sys
import os
# Add the sibling ../modules directory (resolved to an absolute path) to the
# import search path; presumably this is where statistical_methods lives.
sys.path.append(os.path.abspath('../modules'))
from statistical_methods import StatisticalMethods

## No finetuning

In [21]:
# This flag is interpolated into the result file names (..._tuning_{finetune}.csv),
# so False selects the "tuning_False" metrics files.
finetune = False

In [22]:
# Classifier labels; each label also selects that model's metrics file below.
list_of_model_names = ['KNN', 'GNB', 'LR', 'Parzen', 'VotingClassifier']
# Metric names handed to StatisticalMethods.
list_of_metrics_names = ['error_rate', 'test_accuracy', 'coverage', 'f1_score']

# Read one metrics CSV per classifier, in the same order as the labels.
list_of_model_results = [
    pd.read_csv(f'../results/metrics_{model_name}_tuning_{finetune}.csv')
    for model_name in list_of_model_names
]
# Keep the individual per-model frames available under their own names.
knn_results, gnb_results, lr_results, parzen_results, voting_results = list_of_model_results

comp = StatisticalMethods(list_of_model_results, list_of_model_names, list_of_metrics_names)

# friedman_test also returns metrics_with_difference, which is passed on to nemenyi_test.
friedman_results, metrics_with_difference = comp.friedman_test()
nemenyi_results = comp.nemenyi_test(metrics_with_difference)

In [23]:
# Display the table returned by get_estimate_and_ci
# (columns shown in the output: model, metric, mean, std, ci).
comp.get_estimate_and_ci()

Unnamed: 0,model,metric,mean,std,ci
0,KNN,error_rate,0.255076,0.069403,"0.2472, 0.2629"
1,KNN,test_accuracy,0.744924,0.069403,"0.7371, 0.7528"
2,KNN,coverage,0.81443,0.069313,"0.8066, 0.8223"
3,KNN,f1_score,0.84036,0.046899,"0.8351, 0.8457"
4,GNB,error_rate,0.316515,0.083352,"0.3071, 0.3259"
5,GNB,test_accuracy,0.683485,0.083352,"0.6741, 0.6929"
6,GNB,coverage,0.519611,0.086643,"0.5098, 0.5294"
7,GNB,f1_score,0.75449,0.079124,"0.7455, 0.7634"
8,LR,error_rate,0.244131,0.074624,"0.2357, 0.2526"
9,LR,test_accuracy,0.755869,0.074624,"0.7474, 0.7643"


In [24]:
# Keep the formatted summary table in formatted_stats, then display it.
formatted_stats = comp.get_formatted_estimate_and_ci()
formatted_stats

Unnamed: 0,Modelo,Métrica,Média (%),IC (%)
0,KNN,Taxa de erro,25.51 ± 6.94,"[24.72 , 26.29]"
1,KNN,Acurácia de teste,74.49 ± 6.94,"[73.71 , 75.28]"
2,KNN,Cobertura,81.44 ± 6.93,"[80.66 , 82.23]"
3,KNN,F1-score,84.04 ± 4.69,"[83.51 , 84.57]"
4,GNB,Taxa de erro,31.65 ± 8.34,"[30.71 , 32.59]"
5,GNB,Acurácia de teste,68.35 ± 8.34,"[67.41 , 69.29]"
6,GNB,Cobertura,51.96 ± 8.66,"[50.98 , 52.94]"
7,GNB,F1-score,75.45 ± 7.91,"[74.55 , 76.34]"
8,LR,Taxa de erro,24.41 ± 7.46,"[23.57 , 25.26]"
9,LR,Acurácia de teste,75.59 ± 7.46,"[74.74 , 76.43]"


In [25]:
# Display the result table returned by comp.friedman_test().
friedman_results

Unnamed: 0,metric,statistic,p-value,are_different
0,error_rate,715.487522,1.543985e-153,True
1,test_accuracy,715.487522,1.543985e-153,True
2,coverage,955.145551,1.873816e-205,True
3,f1_score,792.133692,3.883453e-170,True


In [26]:
# Display the result table returned by comp.nemenyi_test(...).
nemenyi_results

Unnamed: 0,metric,classifier_pair,p-value,are_different
0,error_rate,KNN vs GNB,7.728262e-13,True
1,error_rate,KNN vs LR,0.7658616,False
2,error_rate,KNN vs Parzen,1.110223e-16,True
3,error_rate,KNN vs VotingClassifier,4.961026e-08,True
4,error_rate,GNB vs LR,1.110223e-16,True
5,error_rate,GNB vs Parzen,1.110223e-16,True
6,error_rate,GNB vs VotingClassifier,0.4801892,False
7,error_rate,LR vs Parzen,1.110223e-16,True
8,error_rate,LR vs VotingClassifier,2.171163e-11,True
9,error_rate,Parzen vs VotingClassifier,1.110223e-16,True


In [27]:
# Show only the rows whose are_different flag is False.
nemenyi_results.loc[nemenyi_results['are_different'].eq(False)]

Unnamed: 0,metric,classifier_pair,p-value,are_different
1,error_rate,KNN vs LR,0.765862,False
6,error_rate,GNB vs VotingClassifier,0.480189,False
11,test_accuracy,KNN vs LR,0.765862,False
16,test_accuracy,GNB vs VotingClassifier,0.480189,False
21,coverage,KNN vs LR,0.972003,False
31,f1_score,KNN vs LR,0.957173,False


## With finetuning

In [28]:
# This flag is interpolated into the result file names (..._tuning_{finetune}.csv),
# so True selects the "tuning_True" metrics files.
finetune = True

In [29]:
# Classifier labels; each label also selects that model's metrics file below.
list_of_model_names = ['KNN', 'GNB', 'LR', 'Parzen', 'VotingClassifier']
# Metric names handed to StatisticalMethods.
list_of_metrics_names = ['error_rate', 'test_accuracy', 'coverage', 'f1_score']

# Read one metrics CSV per classifier, in the same order as the labels.
list_of_model_results = [
    pd.read_csv(f'../results/metrics_{model_name}_tuning_{finetune}.csv')
    for model_name in list_of_model_names
]
# Keep the individual per-model frames available under their own names.
knn_results, gnb_results, lr_results, parzen_results, voting_results = list_of_model_results

comp = StatisticalMethods(list_of_model_results, list_of_model_names, list_of_metrics_names)

# friedman_test also returns metrics_with_difference, which is passed on to nemenyi_test.
friedman_results, metrics_with_difference = comp.friedman_test()
nemenyi_results = comp.nemenyi_test(metrics_with_difference)

In [30]:
# Display the table returned by get_estimate_and_ci
# (columns shown in the output: model, metric, mean, std, ci).
comp.get_estimate_and_ci()

Unnamed: 0,model,metric,mean,std,ci
0,KNN,error_rate,0.242996,0.074137,"0.2346, 0.2514"
1,KNN,test_accuracy,0.757004,0.074137,"0.7486, 0.7654"
2,KNN,coverage,0.773167,0.080522,"0.7641, 0.7823"
3,KNN,f1_score,0.843606,0.05171,"0.8378, 0.8495"
4,GNB,error_rate,0.316515,0.083243,"0.3071, 0.3259"
5,GNB,test_accuracy,0.683485,0.083243,"0.6741, 0.6929"
6,GNB,coverage,0.519364,0.087015,"0.5095, 0.5292"
7,GNB,f1_score,0.754412,0.07917,"0.7455, 0.7634"
8,LR,error_rate,0.20076,0.069639,"0.1929, 0.2086"
9,LR,test_accuracy,0.79924,0.069639,"0.7914, 0.8071"


In [31]:
# Keep the formatted summary table in formatted_stats, then display it.
formatted_stats = comp.get_formatted_estimate_and_ci()
formatted_stats

Unnamed: 0,Modelo,Métrica,Média (%),IC (%)
0,KNN,Taxa de erro,24.30 ± 7.41,"[23.46 , 25.14]"
1,KNN,Acurácia de teste,75.70 ± 7.41,"[74.86 , 76.54]"
2,KNN,Cobertura,77.32 ± 8.05,"[76.41 , 78.23]"
3,KNN,F1-score,84.36 ± 5.17,"[83.78 , 84.95]"
4,GNB,Taxa de erro,31.65 ± 8.32,"[30.71 , 32.59]"
5,GNB,Acurácia de teste,68.35 ± 8.32,"[67.41 , 69.29]"
6,GNB,Cobertura,51.94 ± 8.70,"[50.95 , 52.92]"
7,GNB,F1-score,75.44 ± 7.92,"[74.55 , 76.34]"
8,LR,Taxa de erro,20.08 ± 6.96,"[19.29 , 20.86]"
9,LR,Acurácia de teste,79.92 ± 6.96,"[79.14 , 80.71]"


In [32]:
# Display the result table returned by comp.friedman_test().
friedman_results

Unnamed: 0,metric,statistic,p-value,are_different
0,error_rate,433.728972,1.429344e-92,True
1,test_accuracy,433.728972,1.429344e-92,True
2,coverage,995.156083,4.002537e-214,True
3,f1_score,570.172798,4.416185e-122,True


In [33]:
# Display the result table returned by comp.nemenyi_test(...).
nemenyi_results

Unnamed: 0,metric,classifier_pair,p-value,are_different
0,error_rate,KNN vs GNB,1.110223e-16,True
1,error_rate,KNN vs LR,1.933307e-08,True
2,error_rate,KNN vs Parzen,1.110223e-16,True
3,error_rate,KNN vs VotingClassifier,0.1114551,False
4,error_rate,GNB vs LR,1.110223e-16,True
5,error_rate,GNB vs Parzen,0.9992025,False
6,error_rate,GNB vs VotingClassifier,2.88658e-15,True
7,error_rate,LR vs Parzen,1.110223e-16,True
8,error_rate,LR vs VotingClassifier,5.551115e-16,True
9,error_rate,Parzen vs VotingClassifier,2.04281e-14,True


In [34]:
# Show only the rows whose are_different flag is False.
nemenyi_results.loc[nemenyi_results['are_different'].eq(False)]

Unnamed: 0,metric,classifier_pair,p-value,are_different
3,error_rate,KNN vs VotingClassifier,0.111455,False
5,error_rate,GNB vs Parzen,0.999203,False
13,test_accuracy,KNN vs VotingClassifier,0.111455,False
15,test_accuracy,GNB vs Parzen,0.999203,False
21,coverage,KNN vs LR,0.086744,False
