# Wine Quality

**Data Set Information:**
TBC

**Attribute Information:**

**-type:** red wine or white wine, nominal quantity; <br>
**-fixed acidity:** grams of tartaric acid per 100ml, ratio quantity; <br>
**-volatile acidity:** grams of acetic acid per liter of wine, ratio quantity; <br>
**-citric acid:** grams per liter, ratio quantity; <br>
**-residual sugar:** grams per liter, ratio quantity; <br>
**-chlorides:** grams of sodium chloride per liter, ratio quantity; <br>
**-free sulfur dioxide:** milligrams per liter, ratio quantity; <br>
**-total sulfur dioxide:** milligrams per liter, ratio quantity; <br>
**-density:** grams per cubic centimeter, ratio quantity; <br>
**-pH:** ratio quantity; <br>
**-sulphates:** grams of potassium sulfate per liter, ratio quantity; <br>
**-alcohol:** vol.%, ratio quantity; <br>
**-quality:** Output variable, score between 0 and 10, subjective(?), ordinal quantity 

Preprocessing:
Combine two .csv files (one for red wine, one for white wine) into one file with the new attribute "type".





# General Data Preparation
### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import SelectPercentile, chi2, SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection

### Get the Data

In [None]:
# Load the combined red/white wine table; the file is semicolon-delimited.
data = pd.read_csv('wineQualityBothTypes.csv', delimiter = ';')

In [None]:
# Bare expression: as the last statement of a notebook cell it renders the table.
data

### Basic Data Information 

In [None]:
# Print the per-column overview provided by DataFrame.info().
data.info()

In [None]:
# Descriptive statistics with pandas' default column selection.
data.describe()

In [None]:
# Descriptive statistics restricted to the object-dtype columns.
data.describe(include = 'object')

Check missing values

In [None]:
# Number of missing entries per column (axis=0 sums down the rows).
print(data.isnull().sum(axis=0))

### Exploratory Data Analysis

### Quality

In [None]:
# Partition the combined table by wine type for the per-type analyses.
dataRed = data[data['type'] == 'red']
dataWhite = data[data['type'] == 'white']

In [None]:
#### Histograms

In [None]:
# Histogram of the quality scores over both wine types.
fig = plt.figure(figsize = (20,5))
sns.set_style('darkgrid')
# plt.hist interprets `bins` as bin edges and only the last bin includes its
# right edge. One extra edge past the maximum gives every integer score its own
# bar; without it the two highest scores would be merged into a single bar.
bins = np.arange(data['quality'].min(), data['quality'].max() + 2)
plt.hist(data['quality'], bins=bins, align='left')
plt.title("Both Types")
# With align='left' the bars sit on the left edges, i.e. on the scores.
plt.xticks(bins[:-1])
plt.xlabel('quality')

In [None]:
#### Split into white wine and red wine
White wine:

In [None]:
# Histogram of the quality scores for white wine only.
fig = plt.figure(figsize = (20,5))
sns.set_style('darkgrid')
dataWhite = data.loc[data['type'] == 'white']
# plt.hist interprets `bins` as bin edges and only the last bin includes its
# right edge. One extra edge past the maximum gives every integer score its own
# bar; without it the two highest scores would be merged into a single bar.
bins = np.arange(dataWhite['quality'].min(), dataWhite['quality'].max() + 2)
plt.hist(dataWhite['quality'], bins=bins, align='left')
plt.title("White Wine")
# With align='left' the bars sit on the left edges, i.e. on the scores.
plt.xticks(bins[:-1])
plt.xlabel('quality')

In [None]:
Red wine:

In [None]:
# Histogram of the quality scores for red wine only.
fig = plt.figure(figsize = (20,5))
sns.set_style('darkgrid')
# plt.hist interprets `bins` as bin edges and only the last bin includes its
# right edge. One extra edge past the maximum gives every integer score its own
# bar; without it the two highest scores would be merged into a single bar.
bins = np.arange(dataRed['quality'].min(), dataRed['quality'].max() + 2)
plt.hist(dataRed['quality'], bins=bins, align='left')
plt.title("Red Wine")
# With align='left' the bars sit on the left edges, i.e. on the scores.
plt.xticks(bins[:-1])
plt.xlabel('quality')

### Fixed Acidity vs. Quality 

#### Both types

# Fixed acidity on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['fixed acidity'], y=data['quality'])
plt.xlabel(xlabel='fixed acidity [g(tataric acid)/100ml]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['fixed acidity'], y=dataRed['quality'])
plt.xlabel(xlabel='fixed acidity [g(tataric acid)/100ml]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['fixed acidity'], y=dataWhite['quality'])
plt.xlabel(xlabel='fixed acidity [g(tataric acid)/100ml]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### Volatile Acidity vs. Quality 

#### Both types

# Volatile acidity against the quality score, combined data set.
# The axis unit is grams of acetic acid per liter, matching the attribute
# description at the top of this notebook (the label previously said /100ml).
plt.figure(figsize=(20, 8))
plt.scatter(data['volatile acidity'], data['quality'])
plt.title("Both Types")
plt.xlabel('volatile acidity [g(acetic acid)/l]')
plt.ylabel('quality')
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(dataRed['volatile acidity'], dataRed['quality'])
plt.title("Red Wine")
plt.xlabel('volatile acidity [g(acetic acid)/l]')
plt.ylabel('quality')
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(dataWhite['volatile acidity'], dataWhite['quality'])
plt.title("White Wine")
plt.xlabel('volatile acidity [g(acetic acid)/l]')
plt.ylabel('quality')
plt.show()

### Citric Acid vs. Quality 

#### Both types

# Citric acid on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['citric acid'], y=data['quality'])
plt.xlabel(xlabel='citric acid [g/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['citric acid'], y=dataRed['quality'])
plt.xlabel(xlabel='citric acid [g/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['citric acid'], y=dataWhite['quality'])
plt.xlabel(xlabel='citric acid [g/l]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### Residual Sugar vs. Quality 

#### Both types

# Residual sugar on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['residual sugar'], y=data['quality'])
plt.xlabel(xlabel='residual sugar [g/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['residual sugar'], y=dataRed['quality'])
plt.xlabel(xlabel='residual sugar [g/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['residual sugar'], y=dataWhite['quality'])
plt.xlabel(xlabel='residual sugar [g/l]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### Chlorides vs. Quality 

#### Both types

# Chlorides on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['chlorides'], y=data['quality'])
plt.xlabel(xlabel='chlorides [g(sodium chloride)/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['chlorides'], y=dataRed['quality'])
plt.xlabel(xlabel='chlorides [g(sodium chloride)/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['chlorides'], y=dataWhite['quality'])
plt.xlabel(xlabel='chlorides [g(sodium chloride)/l]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### Free Sulfur Dioxide vs. Quality 

#### Both types

# Free sulfur dioxide on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['free sulfur dioxide'], y=data['quality'])
plt.xlabel(xlabel='free sulfur dioxide [mg/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['free sulfur dioxide'], y=dataRed['quality'])
plt.xlabel(xlabel='free sulfur dioxide [mg/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['free sulfur dioxide'], y=dataWhite['quality'])
plt.xlabel(xlabel='free sulfur dioxide [mg/l]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### Total Sulfur Dioxide vs. Quality 

#### Both types

# Total sulfur dioxide on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['total sulfur dioxide'], y=data['quality'])
plt.xlabel(xlabel='total sulfur dioxide [mg/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['total sulfur dioxide'], y=dataRed['quality'])
plt.xlabel(xlabel='total sulfur dioxide [mg/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['total sulfur dioxide'], y=dataWhite['quality'])
plt.xlabel(xlabel='total sulfur dioxide [mg/l]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### Density vs. Quality 

#### Both types

# Density on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['density'], y=data['quality'])
plt.xlabel(xlabel='density [g/cm³]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['density'], y=dataRed['quality'])
plt.xlabel(xlabel='density [g/cm³]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['density'], y=dataWhite['quality'])
plt.xlabel(xlabel='density [g/cm³]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### pH value vs. Quality 

#### Both types

# pH on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['pH'], y=data['quality'])
plt.xlabel(xlabel='pH')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['pH'], y=dataRed['quality'])
plt.xlabel(xlabel='pH')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['pH'], y=dataWhite['quality'])
plt.xlabel(xlabel='pH')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### Sulphates vs. Quality 

#### Both types

# Sulphates on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['sulphates'], y=data['quality'])
plt.xlabel(xlabel='sulphates [g(potassium sulfate)/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['sulphates'], y=dataRed['quality'])
plt.xlabel(xlabel='sulphates [g(potassium sulfate)/l]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['sulphates'], y=dataWhite['quality'])
plt.xlabel(xlabel='sulphates [g(potassium sulfate)/l]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

### Alcohol Content vs. Quality 

#### Both types

# Alcohol content on the x-axis against the quality score, combined data set.
plt.figure(figsize=(20, 8))
plt.scatter(x=data['alcohol'], y=data['quality'])
plt.xlabel(xlabel='alcohol [vol%]')
plt.ylabel(ylabel='quality')
plt.title(label="Both Types")
plt.show()

#### Red Wine

# Same plot restricted to the red-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataRed['alcohol'], y=dataRed['quality'])
plt.xlabel(xlabel='alcohol [vol%]')
plt.ylabel(ylabel='quality')
plt.title(label="Red Wine")
plt.show()

#### White Wine

# Same plot restricted to the white-wine rows.
plt.figure(figsize=(20, 8))
plt.scatter(x=dataWhite['alcohol'], y=dataWhite['quality'])
plt.xlabel(xlabel='alcohol [vol%]')
plt.ylabel(ylabel='quality')
plt.title(label="White Wine")
plt.show()

# 2) Preprocessing

In [None]:
# Numeric encoding of the wine type: white -> 0, red -> 1.
mapping = {'white': 0, 'red': 1}

# Apply the encoding to the combined table and to both per-type tables.
data, dataWhite, dataRed = (
    frame.replace({'type': mapping}) for frame in (data, dataWhite, dataRed)
)

# Separate the attributes from the class column 'quality' for each table.
X, y = data.drop(columns='quality'), data['quality']
XWhite, yWhite = dataWhite.drop(columns='quality'), dataWhite['quality']
XRed, yRed = dataRed.drop(columns='quality'), dataRed['quality']



##  2.1) Feature Selection 

**Choose one of the methods:**
1. SelectPercentile (chi2)
2. SelectFromModel (LinearSVC)
3. SelectFromModel (LogisticRegression)
4. SelectFromModel (ExtraTreesClassifier)

## 2.2) Data Pre-processing

**Choose one of the methods:**
1. Standardization (StandardScaler)
2. Standardization (RobustScaler)
3. MinMaxScaler
4. Normalization

## 2.3) Outlier Detection using the Z-Score


### All three steps are combined in the following function:

In [None]:
def featureSelection(XCurr, yCurr, featureselection_method, preprocessing_method):
    """Select features, scale them and report z-score outliers.

    Parameters
    ----------
    XCurr : attribute table; must expose ``.columns`` (used to name the
        selected attributes).
    yCurr : class labels belonging to the rows of ``XCurr``.
    featureselection_method : int
        1 = SelectPercentile(chi2, percentile=5),
        2 = SelectFromModel(LinearSVC),
        3 = SelectFromModel(LogisticRegression),
        4 = SelectFromModel(ExtraTreesClassifier(n_estimators=50)).
    preprocessing_method : int
        1 = StandardScaler, 2 = RobustScaler, 3 = MinMaxScaler, 4 = Normalizer.

    Returns
    -------
    tuple
        ``(X_scaled, yCurr)``: the selected and scaled attributes together
        with the unchanged labels. Outliers are only reported, not removed.

    Raises
    ------
    ValueError
        If either method code is not one of 1, 2, 3, 4.
    """
    # Validate up front so an unknown code fails with a clear message instead
    # of an unbound local variable further down.
    if featureselection_method not in (1, 2, 3, 4):
        raise ValueError(
            "featureselection_method must be 1, 2, 3 or 4, got %r"
            % (featureselection_method,))
    if preprocessing_method not in (1, 2, 3, 4):
        raise ValueError(
            "preprocessing_method must be 1, 2, 3 or 4, got %r"
            % (preprocessing_method,))

    # Feature selection. With prefit=False, SelectFromModel fits the estimator
    # itself inside fit_transform, so the estimators are passed unfitted.
    if featureselection_method == 1:
        selection = SelectPercentile(chi2, percentile=5)
    elif featureselection_method == 2:
        selection = SelectFromModel(LinearSVC(), prefit=False)
    elif featureselection_method == 3:
        selection = SelectFromModel(LogisticRegression(), prefit=False)
    else:
        selection = SelectFromModel(ExtraTreesClassifier(n_estimators=50), prefit=False)

    X_transformed = selection.fit_transform(XCurr, yCurr)
    columns = np.asarray(XCurr.columns.values)
    support = np.asarray(selection.get_support())
    columns_with_support = columns[support]
    print("X_transformed.shape",X_transformed.shape)
    print("selected attributes: ", columns_with_support)

    # Data pre-processing: every scaler is fitted and applied the same way.
    if preprocessing_method == 1:
        scaler = StandardScaler()
    elif preprocessing_method == 2:
        scaler = RobustScaler()
    elif preprocessing_method == 3:
        scaler = MinMaxScaler()
    else:
        scaler = Normalizer()
    X_scaled = scaler.fit(X_transformed).transform(X_transformed)
    print("X_scaled = ",X_scaled)

    # Outlier detection: report the entries whose absolute z-score exceeds the
    # threshold. np.where returns the row numbers in its first array and the
    # matching column numbers in its second.
    z = np.abs(stats.zscore(X_scaled))
    threshold = 20
    outliers_rows = np.where(z > threshold)
    print("\n z > threshold = ", outliers_rows)
    print("\n number of outliers = ",len(set(outliers_rows[0])))

    return(X_scaled, yCurr)

In [None]:
# Method codes passed to featureSelection(): 4 selects SelectFromModel with an
# ExtraTreesClassifier, 1 selects the StandardScaler.
featureselection_method = 4
preprocessing_method = 1

In [None]:
# Run feature selection and scaling on each data set; the results replace the
# raw attribute tables and labels under the same names.
# NOTE(review): featureSelection() reads XCurr.columns, so re-running this cell
# after X/XRed/XWhite were overwritten presumably fails if the returned value is
# a plain array -- confirm, and re-run the preprocessing cell first if so.
print('\n-------------------both types ------------------\n')
X, y = featureSelection(X, y, featureselection_method, preprocessing_method)

print('\n--------------------red wine -------------------\n')
XRed, yRed = featureSelection(XRed, yRed, featureselection_method, preprocessing_method)

print('\n-------------------white wine------------------\n')
XWhite, yWhite = featureSelection(XWhite, yWhite, featureselection_method, preprocessing_method)

In [None]:
# Neighbour count for the k-nearest-neighbours entry; it is also part of the
# entry's display name.
NN = 1

# Classifiers under comparison, keyed by display name, in insertion order.
d = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Gaussian Naive Bayes": GaussianNB(),
    "KNearest Neighbors ("+ str(NN) + ")": KNeighborsClassifier(n_neighbors=NN),
    "SVM rbf": SVC(),
    "SGD Classifier": SGDClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Multi-layer Perceptron Classifier": MLPClassifier(max_iter=1000),
}

### General Remark:
Depending on the split into test and training sets, the accuracy ranking of the 8 classifiers sometimes changes a bit. Therefore I decided to run the classifiers with their standard settings N times, each time with a different test/training split. 
From the average accuracy I then chose the top three. 
For those three I then do a hyperparameter optimization.
This way it is easier to pick three classifiers for the report.

## 3.1) Comparison of Classifiers with standard settings

In [None]:
def compareClassifiers(X, y, d):
    """Rank the classifiers in ``d`` by mean accuracy over repeated splits.

    Every classifier is fitted and scored on N = 15 random 70/30
    train/test splits. Accuracy and fit-plus-predict time are averaged per
    classifier and the best single accuracy is remembered.

    Parameters
    ----------
    X, y : attributes and class labels handed to ``train_test_split``.
    d : dict mapping a display name to a classifier; the classifiers are
        fitted in place.

    Returns
    -------
    tuple
        ``(y_test, y_train, X_test, X_train, ranking)``. The four arrays are
        the last split that was drawn; ``ranking`` is a list of
        ``(average score, name, best score, average time)`` tuples sorted
        from best to worst.
    """
    N = 15
    n_models = len(d.items())

    score_sums = [0] * n_models
    best_scores = [0] * n_models
    labels = ['a'] * n_models
    time_sums = [0] * n_models

    for _ in range(0, N):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

        for slot, (label, model) in enumerate(d.items()):
            began = time.time()
            model.fit(X_train, y_train)
            predicted = model.predict(X_test)
            finished = time.time()
            time_sums[slot] += finished - began

            accuracy = accuracy_score(y_test, predicted)
            score_sums[slot] += accuracy
            if accuracy > best_scores[slot]:
                best_scores[slot] = accuracy
            labels[slot] = label

    mean_scores = [total / N for total in score_sums]
    mean_times = [total / N for total in time_sums]

    # Ascending sort followed by a reversal puts the best classifier first.
    ranking = list(zip(mean_scores, labels, best_scores, mean_times))
    ranking.sort()
    ranking.reverse()

    print(ranking)
    print("\nClassifiers from best to worst:")
    for place, (mean_score, label, best_score, mean_time) in enumerate(ranking, start=1):
        print(place, ') {:34} averageScore: {:.5} bestScore: {:.5} averageTime: {:.5} s'.format(label, str(mean_score), str(best_score), str(mean_time)))
    return(y_test, y_train, X_test, X_train, ranking)

#old version:
# Single-split comparison of every classifier in `d`, kept for reference.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpacking it in any other order fits the models on mismatched arrays.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
scoreList = []
nameList = []

for name, clf in d.items():
    start = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scoreList.append(score)
    nameList.append(name)

    end = time.time()
    print("\n--------------",name,"---------------\n")
    print("- Accuracy: %f" % score, "- Time: %0.2f" % (end - start), "seconds")
    print("\n Number of mislabeled points out of a total %d points : %d \n\n"% (X_test.shape[0], (y_test != y_pred).sum()))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))



print("###############")
# Ascending sort reversed: best accuracy first.
ranking = sorted(zip(scoreList,nameList))[::-1]
print("\nClassifiers from best to worst:")
for i in range(0, len(ranking)):
    print(i+1, ') {0:35} score: {1}'.format(ranking[i][1], ranking[i][0]))

#return(y_test, y_train, X_test, X_train, ranking)

### 3.1.1) Both Wines

In [None]:
# compareClassifiers returns (y_test, y_train, X_test, X_train, ranking); the
# four arrays are the last split it drew.
y_test, y_train, X_test, X_train, rankingBoth = compareClassifiers(X, y, d)

### 3.1.2) Red Wine only

In [None]:
# Same comparison on the red-wine data; return order is
# (y_test, y_train, X_test, X_train, ranking).
yRed_test, yRed_train, XRed_test, XRed_train, rankingRed = compareClassifiers(XRed, yRed, d)

### 3.1.3) White Wine only

In [None]:
# Same comparison on the white-wine data; return order is
# (y_test, y_train, X_test, X_train, ranking).
yWhite_test, yWhite_train, XWhite_test, XWhite_train, rankingWhite = compareClassifiers(XWhite, yWhite, d)

## 3.2) Hyperparameter Optimization

In [None]:
def hyperparameterOptimization(classifierName, param_grid, ranking, y_test, y_train, X_test, X_train):
    """Grid-search one classifier and print how it performs on the test set.

    Parameters
    ----------
    classifierName : display name of the classifier; it must occur in
        ``ranking`` and is used to fetch the estimator from the global ``d``.
    param_grid : parameter grid handed to ``GridSearchCV``.
    ranking : list of ``(average score, name, best score, average time)``
        tuples; the best score of the matching entry is printed for comparison.
    y_test, y_train, X_test, X_train : the split to fit and evaluate on.

    Returns
    -------
    tuple
        Always the empty tuple; all results are printed.

    Raises
    ------
    ValueError
        If ``classifierName`` does not occur in ``ranking``.
    """
    position = [label for _, label, _, _ in ranking].index(classifierName)
    ranked_entry = ranking[position]

    # The timed section covers the search, the prediction and the scoring.
    start = time.time()
    clf_gridsearch = GridSearchCV(d.get(ranked_entry[1]), param_grid, verbose=0)
    clf_gridsearch.fit(X_train, y_train)
    print(clf_gridsearch.best_params_)
    predictions = clf_gridsearch.predict(X_test)
    score_gridsearch = accuracy_score(y_test, predictions)
    end = time.time()

    print("GridSearchCV - Accuracy: %f" % score_gridsearch, "- Time: %0.2f" % (end - start), "seconds")
    print("Previous best Accuracy: ", ranked_entry[2])

    print("\n\nAdditional statistics for prediction using best parameters:\n")
    n_points = X_test.shape[0]
    n_wrong = (y_test != predictions).sum()
    print("Number of mislabeled points out of a total %d points : %d \n"% (n_points, n_wrong))
    print("Confusion Matrix:\n",confusion_matrix(y_test, predictions))
    print("Classsification Report: \n",classification_report(y_test, predictions))

    return()

In [None]:
def defaultPrediction(clf, y_test, y_train, X_test, X_train):
    """Fit ``clf`` on the training split and print its test accuracy.

    The classifier is fitted in place. Always returns the empty tuple.
    """
    clf.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, clf.predict(X_test))
    print("accuracy with standard parameters:",accuracy)
    return()

### 3.2.1) RandomForest

#### Parameter Grid

In [None]:
# Search space for the RandomForestClassifier grid search.
# 'auto' is no longer accepted for max_features (removed in scikit-learn 1.3);
# for classifiers it was a synonym for 'sqrt', so the old grid searched the
# same setting twice. 'log2' keeps two distinct candidates.
param_grid = {'criterion' : ['gini', 'entropy'],
              'max_depth': [10, 50, None],
              'max_features': ['sqrt', 'log2'],
              'min_samples_leaf': [1, 2]}
#takes very long to calculate, if param grid is too large

#### 3.2.1.1) Both Wines

In [None]:
# Re-run the comparison to obtain a fresh split and ranking (all classifiers in
# `d` are refitted), then report the default accuracy and the grid-search result
# for the random forest on that same split.
y_test, y_train, X_test, X_train, rankingBoth = compareClassifiers(X, y, d)
defaultPrediction(RandomForestClassifier(), y_test, y_train, X_test, X_train)
hyperparameterOptimization('Random Forest', param_grid, rankingBoth, y_test, y_train, X_test, X_train)

#### 3.2.1.2) Red Wine

In [None]:
# Fresh split and ranking for red wine, then default vs. grid-searched random
# forest on that same split.
yRed_test, yRed_train, XRed_test, XRed_train, rankingRed = compareClassifiers(XRed, yRed, d)
defaultPrediction(RandomForestClassifier(), yRed_test, yRed_train, XRed_test, XRed_train)
hyperparameterOptimization('Random Forest', param_grid, rankingRed, yRed_test, yRed_train, XRed_test, XRed_train)

#### 3.2.1.3) White Wine

In [None]:
# Fresh split and ranking for white wine, then default vs. grid-searched random
# forest on that same split.
yWhite_test, yWhite_train, XWhite_test, XWhite_train, rankingWhite = compareClassifiers(XWhite, yWhite, d)
defaultPrediction(RandomForestClassifier(), yWhite_test, yWhite_train, XWhite_test, XWhite_train)
hyperparameterOptimization('Random Forest', param_grid, rankingWhite, yWhite_test, yWhite_train, XWhite_test, XWhite_train)

### 3.2.2) KNearest Neighbors

#### Parameter Grid

In [None]:
# Search space for the KNeighborsClassifier grid search.
param_grid = dict(
    n_neighbors=[1, 2, 5, 10, 25, 50, 75, 100, 125],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

#### 3.2.2.1) Both Wines

In [None]:
# Default vs. grid-searched k-nearest-neighbours on the current split of the
# combined data. The ranking label is built from NN exactly as the key in `d`
# is, so changing NN does not break the lookup.
defaultPrediction(KNeighborsClassifier(n_neighbors=NN), y_test, y_train, X_test, X_train)
hyperparameterOptimization("KNearest Neighbors ("+ str(NN) + ")", param_grid, rankingBoth, y_test, y_train, X_test, X_train)

#### 3.2.2.2) Red Wine

In [None]:
# Default vs. grid-searched k-nearest-neighbours on the current red-wine split.
# The ranking label is built from NN exactly as the key in `d` is, so changing
# NN does not break the lookup.
defaultPrediction(KNeighborsClassifier(n_neighbors=NN), yRed_test, yRed_train, XRed_test, XRed_train)
hyperparameterOptimization("KNearest Neighbors ("+ str(NN) + ")", param_grid, rankingRed, yRed_test, yRed_train, XRed_test, XRed_train)

#### 3.2.2.3) White Wine

In [None]:
# Default vs. grid-searched k-nearest-neighbours on the current white-wine
# split. The ranking label is built from NN exactly as the key in `d` is, so
# changing NN does not break the lookup.
defaultPrediction(KNeighborsClassifier(n_neighbors=NN), yWhite_test, yWhite_train, XWhite_test, XWhite_train)
hyperparameterOptimization("KNearest Neighbors ("+ str(NN) + ")", param_grid, rankingWhite, yWhite_test, yWhite_train, XWhite_test, XWhite_train)

### 3.2.3) Decision Tree

#### Parameter Grid

In [None]:
# Search space for the DecisionTreeClassifier grid search.
# 'auto' is no longer accepted for max_features (removed in scikit-learn 1.3);
# for classifiers it was a synonym for 'sqrt', which is still in the grid.
param_grid = {'criterion' : ['gini', 'entropy'],
              'max_depth': [10, 50, 100, None],
              'max_features' : ['sqrt', 'log2', None],
              'splitter' : ['best', 'random'],
              'min_samples_leaf': [1, 2, 4]}

#### 3.2.3.1) Both Wines

In [None]:
# Fresh split and ranking for the combined data (all classifiers in `d` are
# refitted), then default vs. grid-searched decision tree on that same split.
y_test, y_train, X_test, X_train, rankingBoth = compareClassifiers(X, y, d)
defaultPrediction(DecisionTreeClassifier(), y_test, y_train, X_test, X_train)
hyperparameterOptimization('Decision Tree', param_grid, rankingBoth, y_test, y_train, X_test, X_train)

#### 3.2.3.2) Red Wine

In [None]:
# Fresh split and ranking for red wine, then default vs. grid-searched decision
# tree on that same split.
yRed_test, yRed_train, XRed_test, XRed_train, rankingRed = compareClassifiers(XRed, yRed, d)
defaultPrediction(DecisionTreeClassifier(), yRed_test, yRed_train, XRed_test, XRed_train)
hyperparameterOptimization('Decision Tree', param_grid, rankingRed, yRed_test, yRed_train, XRed_test, XRed_train)

#### 3.2.3.3) White Wine

In [None]:
# Fresh split and ranking for white wine, then default vs. grid-searched
# decision tree on that same split.
yWhite_test, yWhite_train, XWhite_test, XWhite_train, rankingWhite = compareClassifiers(XWhite, yWhite, d)
defaultPrediction(DecisionTreeClassifier(), yWhite_test, yWhite_train, XWhite_test, XWhite_train)
hyperparameterOptimization('Decision Tree', param_grid, rankingWhite, yWhite_test, yWhite_train, XWhite_test, XWhite_train)

## 3.3) Compare with k-fold cross validation

In [None]:
def kFoldComp(X, y):
    """Compare three classifiers with 10-fold cross-validation.

    KNN (k=1), a random forest and a decision tree are scored by accuracy on
    the same ten folds. Mean and standard deviation are printed per model and
    the fold scores are shown as a box plot.

    Parameters
    ----------
    X, y : attributes and class labels handed to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(results, names)``: the per-fold accuracy arrays and the matching
        model names, in evaluation order.
    """
    # Fixed seed so the shuffled folds are reproducible between runs.
    seed = 7
    scoring = 'accuracy'

    # Models under comparison. GaussianNB, SVC and LogisticRegression were
    # left out of this comparison.
    models = [
        ('KNN', KNeighborsClassifier(n_neighbors=1)),
        ('RandomForest', RandomForestClassifier()),
        ('DTree', DecisionTreeClassifier()),
    ]

    # Shuffle before splitting: without it the folds are contiguous slices, so
    # any ordering of the rows leaks into the fold composition. With an integer
    # random_state every model is evaluated on the same folds.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

    # evaluate each model in turn
    results = []
    names = []
    for name, model in models:
        cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()
    return(results, names)

In [None]:
# 10-fold cross-validation comparison on the combined data set.
results, names = kFoldComp(X, y)

In [None]:
# 10-fold cross-validation comparison on the red-wine data.
resultsRed, namesRed = kFoldComp(XRed, yRed)

In [None]:
# 10-fold cross-validation comparison on the white-wine data.
resultsWhite, namesWhite = kFoldComp(XWhite, yWhite)

In [None]:
# boxplot algorithm comparison: one panel per data set, side by side.
# This is a top-level cell, so it is neither indented nor ends in a return.
fig = plt.figure(figsize=(20, 5))
fig.suptitle('Algorithm Comparison')

# add_subplot(rows, columns, index): one row, three columns, panels 1 to 3.
# plt.boxplot draws into the most recently added panel.
ax1 = fig.add_subplot(1, 3, 1)
ax1.set_title("Both Wines")
plt.boxplot(results)
ax1.set_xticklabels(names)

ax2 = fig.add_subplot(1, 3, 2)
ax2.set_title("Red Wines")
plt.boxplot(resultsRed)
ax2.set_xticklabels(namesRed)

ax3 = fig.add_subplot(1, 3, 3)
ax3.set_title("White Wines")
plt.boxplot(resultsWhite)
ax3.set_xticklabels(namesWhite)

plt.show()