In [1]:
import pandas as pd
import scipy.stats as stats

from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = load_wine(as_frame=True)['frame']
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


## Teste T para a quantiade d Álcool das Classes x População

Fazemos o teste T não paramétrico para verificar se as distribuições das classes fogem a média da população.

Fazemos a Anova também para verificar se as distribuições são oriundas da mesma distribuição.


**Conclusão**:

No teste T, apenas na classe 3 não temos hipóteses suficientes para dizer que os dados pertencem a outra distribuição

Na Anova, temos hipóteses suficientes para dizer que as distribuições de das classes são diferentes.


In [3]:
alcohol = {
    'class-1': df[df.target == 0]['alcohol'],
    'class-2': df[df.target == 1]['alcohol'],
    'class-3': df[df.target == 2]['alcohol']
} 

values = []
for k, v in alcohol.items():
    res = stats.ttest_1samp(a=v, popmean=df.alcohol.mean())
    values.append(['Test-T', k, res.df, res.pvalue.round(6), res.pvalue < 0.05]) 

anova = stats.f_oneway(*alcohol.values())
values.append(['Anova', '-', '-', anova.pvalue.round(6), anova.pvalue < 0.5])

df_result = pd.DataFrame(values, columns=['Type Test', 'Label', 'Degrees of Freedom', 'P-Value', 'Reject H0?'])
df_result.style.hide()

Type Test,Label,Degrees of Freedom,P-Value,Reject H0?
Test-T,class-1,58,0.0,True
Test-T,class-2,70,0.0,True
Test-T,class-3,47,0.051204,False
Anova,-,-,0.0,True


## Teste Estatístico para 2 Classificadores

Treinamos o dataset dos vinhos em uma Regressão Logística e um classificador KNN

Foi calculada métricas pertinentes a modelos classificadores como:
- Accuracy
- Balanced Accuracy 
- Precision
- Recall
- F1-Score

In [4]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr = LogisticRegression(random_state=42, max_iter=100)
knn = KNeighborsClassifier(n_neighbors=2)

metrics, predicts = [], {}
for model in [lr, knn]:
    model_name = type(model).__name__
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predicts[model_name] = y_pred

    metrics.append({
        'Model': model_name, 
        'Accuracy': accuracy_score(y_test, y_pred), 
        'Balanced Accuracy': balanced_accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1-Score': f1_score(y_test, y_pred, average='weighted')
    }) 

df_metrics = pd.DataFrame(metrics).round(4)
df_metrics

Unnamed: 0,Model,Accuracy,Balanced Accuracy,Precision,Recall,F1-Score
0,LogisticRegression,0.9815,0.9841,0.9827,0.9815,0.9816
1,KNeighborsClassifier,0.9444,0.9524,0.9492,0.9444,0.9436


## Teste Estatístico Pareado

Utilizamos o teste Wilcoxon para validar tanto a acurácia e a distribuiçã dos valores previsto por cada modelo.

Pode-se concluir que tanto a acurácia quanto os dados previsto não possuem informações significantes para comprovar que são divergentes, logo aceitamos H0.

In [6]:
accuracy1, accuracy2 = df_metrics.Accuracy
pred1, pred2 = predicts.values()

_, p_value_acc = stats.wilcoxon([accuracy1], [accuracy2])
_, p_value_pred = stats.wilcoxon(pred1, pred2)
print(f"P-value for Accuracy: {p_value_acc:.4f}")
print(f"P-value for Predicted Data: {p_value_pred:.4f}")

P-value for Accuracy: 1.0000
P-value for Predicted Data: 0.3173
