# CLASSIFYING WINE DATASET

## General Imports and Settings

In [54]:
import numpy as np 
import scipy as sp 
from scipy import stats
from sklearn import datasets
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from datetime import datetime
import matplotlib.pyplot as plt

In [55]:
num_folds = 10
num_rep = 3
exec_time = datetime.today().strftime('%Y-%m-%d %H:%M:%S')

### Leitura do dataset

In [56]:
wine_ds = datasets.load_wine()

wine_x = wine_ds.data
wine_y = wine_ds.target

## ZERO R

### Configuração do classificador

In [57]:
from sklearn.dummy import DummyClassifier

zr = DummyClassifier(strategy="stratified")
rskf = RepeatedStratifiedKFold(n_splits = num_folds, n_repeats = num_rep, random_state = 36851234)

### Classificação e obtenção dos resultados

In [58]:
zr_scores = cross_val_score(zr, wine_x, wine_y, scoring='accuracy', cv=rskf)
#print(score)

zr_score_mean = zr_scores.mean()
zr_score_std = zr_scores.std()
zr_score_low, zr_score_upp = stats.norm.interval(0.95, loc=zr_score_mean, scale=zr_score_std/np.sqrt(len(zr_scores)))

print("Zero R (mean, std, low, upp): \n", zr_score_mean, zr_score_std, zr_score_low, zr_score_upp)

Zero R (mean, std, low, upp): 
 0.3218954248366013 0.12526688441318043 0.27707006963784997 0.3667207800353526


## K NEAREST NEIGHBOR

### Configuração do classificador

In [59]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()


In [60]:
grid_params = {'n_neighbors': [3,5,11,19]}
gs_knn = GridSearchCV(estimator=knn, param_grid = grid_params, scoring='accuracy', cv = num_folds)
rskf = RepeatedStratifiedKFold(n_splits = num_folds, n_repeats = num_rep, random_state = 36851234)

### Classificação e obtenção dos resultados

In [61]:
knn_scores = cross_val_score(gs_knn, wine_x, wine_y, scoring='accuracy', cv=rskf)
#print(score)

knn_score_mean = knn_scores.mean()
knn_score_std = knn_scores.std()
knn_score_low, knn_score_upp = stats.norm.interval(0.95, loc=knn_score_mean, scale=knn_score_std/np.sqrt(len(knn_scores)))

print("KNN (mean, std, low, upp): \n",knn_score_mean, knn_score_std, knn_score_low, knn_score_upp)

KNN (mean, std, low, upp): 
 0.6956427015250546 0.12091424757836543 0.652374888793219 0.7389105142568901


## DECISION TREE

### Configuração do classificador

In [62]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()


In [63]:
grade={'max_depth': [3,5,10]}
gs_dt = GridSearchCV(estimator=dt, param_grid = grade, scoring='accuracy', cv = num_folds)
rskf = RepeatedStratifiedKFold(n_splits = num_folds, n_repeats = num_rep, random_state = 36851234)

### Classificação e obtenção dos resultados

In [64]:
dt_scores = cross_val_score(gs_dt, wine_x, wine_y, scoring='accuracy', cv=rskf)
#print(score)

dt_score_mean = dt_scores.mean()
dt_score_std = dt_scores.std()
dt_score_low, dt_score_upp = stats.norm.interval(0.95, loc=dt_score_mean, scale=dt_score_std/np.sqrt(len(dt_scores)))

print("dt (mean, std, low, upp): \n",dt_score_mean, dt_score_std, dt_score_low, dt_score_upp)

dt (mean, std, low, upp): 
 0.9065359477124182 0.06336611700427768 0.8838610906215882 0.9292108048032481


## NAIVE BAYES GAUSSIAN

### Configuração do classificador

In [65]:
from sklearn.naive_bayes import GaussianNB

nbg = GaussianNB()
rskf = RepeatedStratifiedKFold(n_splits = num_folds, n_repeats = num_rep, random_state = 36851234)

### Classificação e obtenção dos resultados

In [66]:
nbg_scores = cross_val_score(nbg, wine_x, wine_y, scoring='accuracy', cv=rskf)
#print(score)

nbg_score_mean = nbg_scores.mean()
nbg_score_std = nbg_scores.std()
nbg_score_low, nbg_score_upp = stats.norm.interval(0.95, loc=nbg_score_mean, scale=nbg_score_std/np.sqrt(len(nbg_scores)))

print("nbg (mean, std, low, upp): \n",nbg_score_mean, nbg_score_std, nbg_score_low, nbg_score_upp)

nbg (mean, std, low, upp): 
 0.9734204793028323 0.0482047991331875 0.9561709304412 0.9906700281644645


## KMC

### Configuração do classificador

In [67]:
# # CREATE CLASSIFIER
# from sklearn import newclassifier

# kmc = newclassifier()
# rskf = RepeatedStratifiedKFold(n_splits = num_folds, n_repeats = num_rep, random_state = 36851234)

### Classificação e obtenção dos resultados

In [68]:
# kmc_scores = cross_val_score(dt, wine_x, wine_y, scoring='accuracy', cv=rskf)
# #print(score)

# kmc_score_mean = kmc_scores.mean()
# kmc_score_std = kmc_scores.std()
# kmc_score_low, kmc_score_upp = stats.norm.interval(0.95, loc=kmc_score_mean, scale=kmc_score_std/np.sqrt(len(kmc_scores)))

# print("kmc (mean, std, low, upp): \n",kmc_score_mean, kmc_score_std, kmc_score_low, kmc_score_upp)

## Results

### Imprime os Resultados

In [69]:
# Create file  results.md 
with open("../results/results.md", "w") as rf: #opening result file
    print("# Results Obtained", file=rf)
    print("## Configuration\n", file=rf)
    print("Num of folds: ", num_folds, "\n", file=rf)
    print("Num of repetitions: ", num_rep, "\n", file=rf)
    print("Test run on: ", exec_time, file=rf)
    
    print("## Zero R\n", file=rf)
    print("Mean Value: ", str(zr_score_mean), "\n", file=rf)
    print("Standart Variation: ", str(zr_score_std), "\n", file=rf)
    print("Lower limit: ", str(zr_score_low), "\n", file=rf)
    print("Upper limit: ", str(zr_score_upp), "\n", file=rf)

    print("## K NEAREST NEIGHBORS\n", file=rf)
    print("Mean Value: ", str(knn_score_mean), "\n", file=rf)
    print("Standart Variation: ", str(knn_score_std), "\n", file=rf)
    print("Lower limit: ", str(knn_score_low), "\n", file=rf)
    print("Upper limit: ", str(knn_score_upp), "\n", file=rf)

    print("## DECISION TREE\n", file=rf)
    print("Mean Value: ", str(dt_score_mean), "\n", file=rf)
    print("Standart Variation: ", str(dt_score_std), "\n", file=rf)
    print("Lower limit: ", str(dt_score_low), "\n", file=rf)
    print("Upper limit: ", str(dt_score_upp), "\n", file=rf)

    print("## NBG\n", file=rf)
    print("Mean Value: ", str(nbg_score_mean), "\n", file=rf)
    print("Standart Variation: ", str(nbg_score_std), "\n", file=rf)
    print("Lower limit: ", str(nbg_score_low), "\n", file=rf)
    print("Upper limit: ", str(nbg_score_upp), "\n", file=rf)

    # print("## KMC\n", file=rf)
    # print("Mean Value: ", str(kmc_score_mean), "\n", file=rf)
    # print("Standart Variation: ", str(kmc_score_std), "\n", file=rf)
    # print("Lower limit: ", str(kmc_score_low), "\n", file=rf)
    # print("Upper limit: ", str(kmc_score_upp), "\n", file=rf)

### Gera os graficos

In [70]:
#plot 