# Machine Learning - project milestone 3

In [1]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd
from numpy import mean

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer releases
# no longer accept the old alias, so prefer the new name and fall back on older installs.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

# Machine learning
import catboost
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool, cv
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Let's be rebels and ignore warnings for now
import warnings
warnings.filterwarnings('ignore')

### Loading in the data

Load the downloaded data from the "Resources" folder into a variable named `data`.

In [2]:
data=pd.read_csv("Resources/winequalityN.csv")

In [3]:
data.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


### Data features

   Input variables (based on physicochemical tests):
   
       1 - fixed acidity
       2 - volatile acidity
       3 - citric acid
       4 - residual sugar
       5 - chlorides
       6 - free sulfur dioxide
       7 - total sulfur dioxide
       8 - density
       9 - pH
       10 - sulphates
       11 - alcohol
   Output variable (based on sensory data): 
   
       12 - quality (score between 0 and 10)

In [4]:
# Visualizing the main statistical metrics for the whole dataset
data.describe()

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

In [None]:
# Split the dataset by wine type so each type's statistics can be evaluated individually
redWine = data.loc[data['type'].eq("red")]
whiteWine = data.loc[data['type'].eq("white")]

In [None]:
redWine.describe()

In [None]:
whiteWine.describe()

As we can observe from the descriptions of the two types of wine, although the average quality is very similar, the chemical components have completely different profiles. This indicates that what makes a white wine good or bad is probably not the same as what makes a red wine good or bad.

## What missing values are there?

Where are the holes in our data?

These are rows which are missing a value or have NaN instead of something like the rest of the column.

In [None]:
# Plot graphic of missing values
missingno.matrix(data, figsize = (50,15))

This is a rare scenario: a dataset that is almost absolutely complete, with hardly any missing values. This will allow more time to focus on finding the best model performance later on. We can see in the graph above that a tiny number of pH values are missing, along with a few holes in fixed acidity and citric acid, almost too small to see. We will be able to drop those rows without compromising the dataset.

In [None]:
# Alternatively, you can see the number of missing values like this
data.isnull().sum()

From an Excel spreadsheet I could anticipate that a few of these missing values are in the same rows, which will further minimize the impact of their removal.

## Visualize and analyse each feature and classify them

Here we will visualize the frequency distribution of each feature, remove the few missing values, separate out the categorical one (the one and only output), and add them to our empty DataFrames. Everything will be done in duplicate: once for the white wine dataset and once for the red wine dataset. The empty DataFrames are created in the cell below.

In [None]:
#Creating empty dataframes to separate the types of wines and transforming categorical into binary
red_bin = pd.DataFrame()
whi_bin = pd.DataFrame() 


### What datatypes are in the dataframe?

In [None]:
# Different data types in the dataset
data.dtypes

## Let's explore each of these features individually
We'll go through each column iteratively and see which ones to use in our first models.
Some may need more preprocessing than others to get ready.

In [None]:
data.head()

### Target Feature: Quality

Description: Wine grade given by specialists based on taste test.
    
Ranges from 0 to 10. Initially we will consider wines with a grade of 6 or greater to be good wines and all others to be bad wines. Although the grades are numerical, we will first transform them into a categorical/binary variable in order to simplify our models later on.
    
This is the variable we want our machine learning model to predict based off all the others.

In [None]:
# Are there any missing values in the quality column?
data.quality.isnull().sum()

In [None]:
# Frequency of grade for white wine
fig = plt.figure(figsize=(7,5))
sns.countplot(y='quality', data=whiteWine);
print(whiteWine.quality.value_counts())

In [None]:
# Add the binarised target to the white-wine subset: 1 when quality >= 6 ("good"), otherwise 0
whi_bin['quality'] = whiteWine['quality'].ge(6).astype(int)

In [None]:
whi_bin.head()

In [None]:
# Frequency of grade for red wine
fig = plt.figure(figsize=(20,10))
sns.countplot(y='quality', data=redWine);
print(redWine.quality.value_counts())

In [None]:
# Add the binarised target to the red-wine subset: 1 when quality >= 6 ("good"), otherwise 0
red_bin['quality'] = redWine['quality'].ge(6).astype(int)

In [None]:
red_bin.head()

### Feature: Alcohol

Description: Wine alcohol grade.
    
The numerical value actually represents the percentage of alcohol.

In [None]:
# Are there any missing values in the alcohol column?
data.alcohol.isnull().sum()

In [None]:
sns.distplot(whiteWine.alcohol)

In [None]:
sns.distplot(redWine.alcohol)

In [None]:
#Add alcohol to sub df
whi_bin['alcohol'] = whiteWine['alcohol']

In [None]:
#same
red_bin['alcohol'] = redWine['alcohol']

### Feature: sulphates

Description: Wine sulphates level.
    

In [None]:
# Are there any missing values in the sulphates column?
whiteWine.sulphates.isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='sulphates', data=whiteWine);

In [None]:
#Add sulphates to sub df
whi_bin['sulphates'] = whiteWine['sulphates']

In [None]:
#Remove missing values
whi_bin = whi_bin.dropna(subset=['sulphates'])

In [None]:
# Are there any missing values in the sulphates column?
redWine.sulphates.isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='sulphates', data=redWine);

In [None]:
#Add sulphates to sub df
red_bin['sulphates'] = redWine['sulphates']

In [None]:
#Remove missing values
red_bin = red_bin.dropna(subset=['sulphates'])
print(len(red_bin))

### Feature: pH

Description: Wine pH level.

In [None]:
# Are there any missing values in the pH column?
whiteWine.pH.isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='pH', data=whiteWine);

In [None]:
#Add PH to sub df
whi_bin['pH'] = whiteWine['pH']

In [None]:
#Remove missing values
whi_bin = whi_bin.dropna(subset=['pH'])
print(len(whi_bin))

In [None]:
# Are there any missing values in the pH column?
redWine.pH.isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='pH', data=redWine);

In [None]:
#Add to sub dfs
red_bin['pH'] = redWine['pH']

In [None]:
#Remove missing values
red_bin = red_bin.dropna(subset=['pH'])
print(len(red_bin))

### Feature: Density

Description: Wine density level.

In [None]:
# Are there any missing values in the column?
data.density.isnull().sum()

In [None]:
sns.distplot(whiteWine.density)

In [None]:
#Add density to sub df
whi_bin['density'] = whiteWine['density']

In [None]:
sns.distplot(redWine.density)

In [None]:
#same
red_bin['density'] = redWine['density']

### Feature: total sulfur dioxide

Description: Wine total sulfur dioxide level.

In [None]:
# Are there any missing values in the column?
data['total sulfur dioxide'].isnull().sum()

In [None]:
sns.distplot(whiteWine['total sulfur dioxide'])

In [None]:
#Add to sub df
whi_bin['total sulfur dioxide'] = whiteWine['total sulfur dioxide']

In [None]:
sns.distplot(redWine['total sulfur dioxide'])

In [None]:
#same
red_bin['total sulfur dioxide'] = redWine['total sulfur dioxide']

### Feature: free sulfur dioxide

Description: Wine free sulfur dioxide level.

In [None]:
# Are there any missing values in the column?
data['free sulfur dioxide'].isnull().sum()

In [None]:
sns.distplot(whiteWine['free sulfur dioxide'])

In [None]:
#Add to sub df
whi_bin['free sulfur dioxide'] = whiteWine['free sulfur dioxide']

In [None]:
sns.distplot(redWine['free sulfur dioxide'])

In [None]:
#same
red_bin['free sulfur dioxide'] = redWine['free sulfur dioxide']

### Feature: Chlorides

Description: Wine chloride level.

In [None]:
# Are there any missing values in the chlorides column?
whiteWine.chlorides.isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='chlorides', data=whiteWine);

In [None]:
#Add chlorides to sub df
whi_bin['chlorides'] = whiteWine['chlorides']

In [None]:
#Remove missing values
whi_bin = whi_bin.dropna(subset=['chlorides'])
print(len(whi_bin))

In [None]:
# Are there any missing values in the chlorides column?
redWine.chlorides.isnull().sum()

In [None]:
# What do the counts look like?
sns.distplot(redWine.chlorides)

In [None]:
#Add to sub dfs
red_bin['chlorides'] = redWine['chlorides']

### Feature: residual sugar

Description: Wine residual sugar level.

In [None]:
# Are there any missing values in the residual sugar column?
whiteWine['residual sugar'].isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='residual sugar', data=whiteWine);

In [None]:
#Add residual sugar to sub df
whi_bin['residual sugar'] = whiteWine['residual sugar']

In [None]:
#Remove missing values
whi_bin = whi_bin.dropna(subset=['residual sugar'])
print(len(whi_bin))

In [None]:
# Are there any missing values in the residual sugar column?
redWine['residual sugar'].isnull().sum()

In [None]:
# What does the distribution look like?
sns.distplot(redWine['residual sugar'])

In [None]:
#Add to sub dfs
red_bin['residual sugar'] = redWine['residual sugar']

### Feature: citric acid

Description: Wine citric acid level.

In [None]:
# Are there any missing values in the citric acid column?
whiteWine['citric acid'].isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='citric acid', data=whiteWine);

In [None]:
#Add citric acid to sub df
whi_bin['citric acid'] = whiteWine['citric acid']

In [None]:
#Remove missing values
whi_bin = whi_bin.dropna(subset=['citric acid'])
print(len(whi_bin))

In [None]:
# Are there any missing values in the citric acid column?
redWine['citric acid'].isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='citric acid', data=redWine);

In [None]:
#Add to sub dfs
red_bin['citric acid'] = redWine['citric acid']

In [None]:
#Remove missing values
red_bin = red_bin.dropna(subset=['citric acid'])
print(len(red_bin))

### Feature: volatile acidity

Description: Wine volatile acidity level.

In [None]:
# Are there any missing values in the volatile acidity column?
whiteWine['volatile acidity'].isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='volatile acidity', data=whiteWine);

In [None]:
#Add volatile acidity to sub df
whi_bin['volatile acidity'] = whiteWine['volatile acidity']

In [None]:
#Remove missing values
whi_bin = whi_bin.dropna(subset=['volatile acidity'])
print(len(whi_bin))

In [None]:
# Are there any missing values in the volatile acidity column?
redWine['volatile acidity'].isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='volatile acidity', data=redWine);

In [None]:
#Add to sub dfs
red_bin['volatile acidity'] = redWine['volatile acidity']

In [None]:
#Remove missing values
red_bin = red_bin.dropna(subset=['volatile acidity'])
print(len(red_bin))

### Feature: fixed acidity

Description: Wine fixed acidity level.

In [None]:
# Are there any missing values in the fixed acidity column?
whiteWine['fixed acidity'].isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='fixed acidity', data=whiteWine);

In [None]:
#Add fixed acidity to sub df
whi_bin['fixed acidity'] = whiteWine['fixed acidity']

In [None]:
#Remove missing values
whi_bin = whi_bin.dropna(subset=['fixed acidity'])
print(len(whi_bin))

In [None]:
# Are there any missing values in the fixed acidity column?
redWine['fixed acidity'].isnull().sum()

In [None]:
# What do the counts look like?
sns.countplot(y='fixed acidity', data=redWine);

In [None]:
#Add to sub dfs
red_bin['fixed acidity'] = redWine['fixed acidity']

In [None]:
#Remove missing values
red_bin = red_bin.dropna(subset=['fixed acidity'])
print(len(red_bin))

## Feature Encoding

 There is no need for this step since there was no categorical data other than the output in the previous step. The output variable has already been converted to binary.

In [None]:
whi_bin.head()

In [None]:
red_bin.head()

## Start Building Machine Learning Models

Now that our data has been manipulated and cleaned, we can run a series of different machine learning algorithms over it to find which yields the best results.

### Let's separate the data
First we will separate the data into X and y variables (predictors and response), and then split the data into train and test datasets in order to fit our models on the training dataset and check them against the test dataset afterwards.

In [None]:
# Predictors (XW) and binary target (yw) for the WHITE wine dataset
XW = whi_bin.drop(columns='quality')
yw = whi_bin['quality']

In [None]:
# Predictors (XR) and binary target (yr) for the RED wine dataset
XR = red_bin.drop(columns='quality')
yr = red_bin['quality']

In [None]:
#X_train,test and y for WHITE wine dataset
XW_train, XW_test, yw_train, yw_test = train_test_split(XW, yw, test_size=0.3, random_state=42)

In [None]:
#X_train,test and y for RED wine dataset
XR_train, XR_test, yr_train, yr_test = train_test_split(XR, yr, test_size=0.3, random_state=42)

### Define a function to fit machine learning algorithms

To prevent writing code multiple times, we will functionise fitting the model and returning the accuracy scores.

In [None]:
def fit_ml_algo(algo, X_train, y_train, cv):
    """Fit ``algo`` once on the given data and also cross-validate it on that same data.

    Parameters
    ----------
    algo : estimator object providing ``fit`` (whose return value provides ``score``)
    X_train : predictors passed to ``fit``, ``score`` and ``cross_val_predict``
    y_train : labels passed alongside ``X_train``
    cv : forwarded unchanged as the ``cv`` argument of ``cross_val_predict``

    Returns
    -------
    tuple ``(train_pred, acc, acc_cv)``:
        train_pred -- the predictions returned by ``cross_val_predict``
        acc        -- ``score`` on the data the model was fitted on, as a percentage (2 decimals)
        acc_cv     -- accuracy of ``train_pred`` against ``y_train``, as a percentage (2 decimals)
    """
    # Single pass: fit on every row and score on those same rows
    fitted = algo.fit(X_train, y_train)
    in_sample_score = fitted.score(X_train, y_train)
    acc = round(in_sample_score * 100, 2)

    # Cross-validated predictions, then their accuracy against the true labels
    train_pred = model_selection.cross_val_predict(algo, X_train, y_train, cv=cv)
    cv_score = metrics.accuracy_score(y_train, train_pred)
    acc_cv = round(cv_score * 100, 2)

    return train_pred, acc, acc_cv

### Logistic Regression

In [None]:
# Logistic Regression for WHITE WINE
start_time = time.time()
train_pred_logW, acc_logW, acc_cv_logW = fit_ml_algo(LogisticRegression(), XW_train, yw_train,10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_logW)
print("Accuracy CV 10-Fold: %s" % acc_cv_logW)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

In [None]:
# Logistic Regression for RED WINE
start_time = time.time()
train_pred_logR, acc_logR, acc_cv_logR = fit_ml_algo(LogisticRegression(), XR_train, yr_train,10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_logR)
print("Accuracy CV 10-Fold: %s" % acc_cv_logR)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

### K-Nearest Neighbours

In [None]:
# k-Nearest Neighbours for WHITE WINE
start_time = time.time()
train_pred_knnW, acc_knnW, acc_cv_knnW = fit_ml_algo(KNeighborsClassifier(), 
                                                  XW_train, 
                                                  yw_train, 
                                                  10)
knn_time = (time.time() - start_time)
print("Accuracy: %s" % acc_knnW)
print("Accuracy CV 10-Fold: %s" % acc_cv_knnW)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))

In [None]:
# k-Nearest Neighbours for RED WINE
start_time = time.time()
train_pred_knnR, acc_knnR, acc_cv_knnR = fit_ml_algo(KNeighborsClassifier(), 
                                                  XR_train, 
                                                  yr_train, 
                                                  10)
knn_time = (time.time() - start_time)
print("Accuracy: %s" % acc_knnR)
print("Accuracy CV 10-Fold: %s" % acc_cv_knnR)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))

### Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes for WHITE WINE
start_time = time.time()
train_pred_gaussianW, acc_gaussianW, acc_cv_gaussianW = fit_ml_algo(GaussianNB(), 
                                                                      XW_train, 
                                                                      yw_train, 
                                                                           10)
gaussian_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gaussianW)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussianW)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))

In [None]:
# Gaussian Naive Bayes for RED WINE
start_time = time.time()
train_pred_gaussianR, acc_gaussianR, acc_cv_gaussianR = fit_ml_algo(GaussianNB(), 
                                                                      XR_train, 
                                                                      yr_train, 
                                                                           10)
gaussian_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gaussianR)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussianR)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))

### Linear Support Vector Machines (SVC)

In [None]:
# Linear SVC for WHITE WINE
start_time = time.time()
train_pred_svcW, acc_linear_svcW, acc_cv_linear_svcW = fit_ml_algo(LinearSVC(),
                                                                XW_train, 
                                                                yw_train, 
                                                                10)
linear_svc_time = (time.time() - start_time)
print("Accuracy: %s" % acc_linear_svcW)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svcW)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))

In [None]:
# Linear SVC for RED WINE
start_time = time.time()
train_pred_svcR, acc_linear_svcR, acc_cv_linear_svcR = fit_ml_algo(LinearSVC(),
                                                                XR_train, 
                                                                yr_train, 
                                                                10)
linear_svc_time = (time.time() - start_time)
print("Accuracy: %s" % acc_linear_svcR)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svcR)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))

### Stochastic Gradient Descent

In [None]:
# Stochastic Gradient Descent for WHITE WINE
start_time = time.time()
train_pred_sgdW, acc_sgdW, acc_cv_sgdW = fit_ml_algo(SGDClassifier(), 
                                                  XW_train, 
                                                  yw_train,
                                                  10)
sgd_time = (time.time() - start_time)
print("Accuracy: %s" % acc_sgdW)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgdW)
print("Running Time: %s" % datetime.timedelta(seconds=sgd_time))

In [None]:
# Stochastic Gradient Descent for RED WINE
start_time = time.time()
train_pred_sgdR, acc_sgdR, acc_cv_sgdR = fit_ml_algo(SGDClassifier(), 
                                                  XR_train, 
                                                  yr_train,
                                                  10)
sgd_time = (time.time() - start_time)
print("Accuracy: %s" % acc_sgdR)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgdR)
print("Running Time: %s" % datetime.timedelta(seconds=sgd_time))

### Decision Tree Classifier

In [None]:
# Decision Tree Classifier for WHITE WINE
start_time = time.time()
train_pred_dtW, acc_dtW, acc_cv_dtW = fit_ml_algo(DecisionTreeClassifier(), 
                                                                XW_train, 
                                                                yw_train,
                                                                10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dtW)
print("Accuracy CV 10-Fold: %s" % acc_cv_dtW)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))

In [None]:
# Decision Tree Classifier for RED WINE
start_time = time.time()
train_pred_dtR, acc_dtR, acc_cv_dtR = fit_ml_algo(DecisionTreeClassifier(), 
                                                                XR_train, 
                                                                yr_train,
                                                                10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dtR)
print("Accuracy CV 10-Fold: %s" % acc_cv_dtR)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))

### Gradient Boost Trees

In [None]:
# Gradient Boosting Trees for WHITE WINE
start_time = time.time()
train_pred_gbtW, acc_gbtW, acc_cv_gbtW = fit_ml_algo(GradientBoostingClassifier(), 
                                                                       XW_train, 
                                                                       yw_train,
                                                                       10)
gbt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gbtW)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbtW)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))

In [None]:
# Gradient Boosting Trees for RED WINE
start_time = time.time()
train_pred_gbtR, acc_gbtR, acc_cv_gbtR = fit_ml_algo(GradientBoostingClassifier(), 
                                                                       XR_train, 
                                                                       yr_train,
                                                                       10)
gbt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gbtR)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbtR)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))

### Regular accuracy scores for White Wines on train dataset

In [None]:
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Gradient Boosting Trees'],
    'Score': [
        acc_knnW, 
        acc_logW,  
        acc_gaussianW, 
        acc_sgdW, 
        acc_linear_svcW, 
        acc_dtW,
        acc_gbtW
    ]})
print("---Regular Accuracy Scores---")
models.sort_values(by='Score', ascending=False)

In [None]:
cv_models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Gradient Boosting Trees'],
    'Score': [
        acc_cv_knnW, 
        acc_cv_logW,      
        acc_cv_gaussianW, 
        acc_cv_sgdW, 
        acc_cv_linear_svcW, 
        acc_cv_dtW,
        acc_cv_gbtW
        
    ]})
print('---Cross-validation Accuracy Scores---')
cv_models.sort_values(by='Score', ascending=False)

### Regular accuracy scores for Red Wines on Train Dataset

In [None]:
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Gradient Boosting Trees'],
    'Score': [
        acc_knnR, 
        acc_logR,  
        acc_gaussianR, 
        acc_sgdR, 
        acc_linear_svcR, 
        acc_dtR,
        acc_gbtR 
    ]})

# Header spelling fixed ("Reguglar" -> "Regular") to match the white-wine table header
print("---Regular Accuracy Scores---")
models.sort_values(by='Score', ascending=False)

In [None]:
cv_models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Gradient Boosting Trees'],
    'Score': [
        acc_cv_knnR, 
        acc_cv_logR,      
        acc_cv_gaussianR, 
        acc_cv_sgdR, 
        acc_cv_linear_svcR, 
        acc_cv_dtR,
        acc_cv_gbtR
    ]})
print('---Cross-validation Accuracy Scores---')
cv_models.sort_values(by='Score', ascending=False)

Gradient boosting trees was the model that displayed the best performance on both the red and white wine datasets. Going forward, the work should focus on improving the features used by this model and comparing its performance against the dataset.

## Applying model to Test dataset


### Logistic Regression

In [None]:
# Logistic Regression for WHITE WINE
start_time = time.time()
# The CV score is stored in acc_cv_logW (previously acc_cv_log5), so the value printed
# below comes from this run instead of the stale result of the earlier training cell.
train_pred_logW, acc_logW, acc_cv_logW = fit_ml_algo(LogisticRegression(), XW_test, yw_test,10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_logW)
print("Accuracy CV 10-Fold: %s" % acc_cv_logW)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

In [None]:
# Logistic Regression for RED WINE
start_time = time.time()
train_pred_logR, acc_logR, acc_cv_logR = fit_ml_algo(LogisticRegression(), XR_test, yr_test,10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_logR)
print("Accuracy CV 10-Fold: %s" % acc_cv_logR)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

### Decision Tree Classifier

In [None]:
# Decision Tree Classifier for WHITE WINE
start_time = time.time()
test_pred_dtWT, acc_dtWT, acc_cv_dtWT = fit_ml_algo(DecisionTreeClassifier(), 
                                                                XW_test, 
                                                                yw_test,
                                                                10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dtWT)
print("Accuracy CV 10-Fold: %s" % acc_cv_dtWT)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))

In [None]:
# Decision Tree Classifier for RED WINE
start_time = time.time()
test_pred_dtRT, acc_dtRT, acc_cv_dtRT = fit_ml_algo(DecisionTreeClassifier(), 
                                                                XR_test, 
                                                                yr_test,
                                                                10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dtRT)
print("Accuracy CV 10-Fold: %s" % acc_cv_dtRT)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))

### Gradient Boost Trees

In [None]:
# Gradient Boosting Trees for WHITE WINE
start_time = time.time()
test_pred_gbtWT, acc_gbtWT, acc_cv_gbtWT = fit_ml_algo(GradientBoostingClassifier(), 
                                                                       XW_test, 
                                                                       yw_test,
                                                                       10)
gbt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gbtWT)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbtWT)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))

In [None]:
# Gradient Boosting Trees for RED WINE
start_time = time.time()
test_pred_gbtRT, acc_gbtRT, acc_cv_gbtRT = fit_ml_algo(GradientBoostingClassifier(), 
                                                                       XR_test, 
                                                                       yr_test,
                                                                       10)

gbt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gbtRT)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbtRT)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))

We can see here that the cross_validation avoids overfitting in the test dataset

---------------------------

## What attributes in terms of chemical components contributes for a good wine? What would make the best wine possible? 

### Best Features for White wine

In [None]:
## Feature Importance
best_modelW = GradientBoostingClassifier( )
best_modelW.fit(XW_train, yw_train)

In [None]:
def feature_importance(model, data):
    """Plot a fitted model's feature importances as a horizontal bar chart.

    Defined here because the notebook calls ``feature_importance`` without ever
    defining or importing it, which raises NameError on a fresh kernel.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``
    data : DataFrame whose columns are the features ``model`` was fitted on

    Returns
    -------
    DataFrame with columns 'imp' (importance) and 'col' (feature name),
    sorted by ascending importance.
    """
    fea_imp = pd.DataFrame({'imp': model.feature_importances_, 'col': data.columns})
    fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False])
    fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 6), legend=False)
    return fea_imp

# Plot the feature importance scores
feature_importance(best_modelW, XW_train)

### Best Features for Red wine

In [None]:
## Feature Importance
best_modelR = GradientBoostingClassifier( )
best_modelR.fit(XR_train, yr_train)

In [None]:
# Plot the feature importance scores
feature_importance(best_modelR, XR_train)

-------------------------------

## Are there any differences between the wine composition, quality perceived and whether the wine is red or white?

In [None]:
# Frequency of grade for white wine
fig = plt.figure(figsize=(15,5))
sns.countplot(y='quality', data=whiteWine);
print(whiteWine.quality.value_counts())

In [None]:
# Frequency of grade for red wine
fig = plt.figure(figsize=(15,5))
sns.countplot(y='quality', data=redWine);
print(redWine.quality.value_counts())


In [None]:
whiteWine.describe()

In [None]:
redWine.describe()

In [None]:
bestWhites = whiteWine.loc[(whiteWine.quality >= 6)]

In [None]:
bestReds =  redWine.loc[(redWine.quality >= 6)]

In [None]:
bestWhites.describe()

In [None]:
bestReds.describe()

----------------------------

# Try to improve accuracy of the best models

## Applying best performing models only on the 5 most relevant predictors

In [None]:
# WHITE wine: drop the target and six predictors, keeping the remaining columns as features
XW5 = whi_bin.drop(
    columns=['quality','chlorides','pH','sulphates','fixed acidity','density','total sulfur dioxide']
)
yw5 = whi_bin['quality']

# RED wine: drop the target and five predictors
# NOTE(review): one fewer predictor is dropped here than for white, so red keeps one more
# feature than white -- confirm this matches the "5 most relevant predictors" intent.
XR5 = red_bin.drop(
    columns=['quality','citric acid','pH','residual sugar', 'density','fixed acidity']
)
yr5 = red_bin['quality']

# 70/30 train/test split for WHITE wine, using the same seed as the full-feature split
XW5_train, XW5_test, yw5_train, yw5_test = train_test_split(
    XW5, yw5, test_size=0.3, random_state=42
)

# 70/30 train/test split for RED wine
XR5_train, XR5_test, yr5_train, yr5_test = train_test_split(
    XR5, yr5, test_size=0.3, random_state=42
)

### Gradient Boosting Classifier

In [None]:
# Try the best models on the reduced data set.
# Gradient Boosting Trees for WHITE WINE.
# NOTE(review): fit_ml_algo is handed the *test* split (XW5_test / yw5_test);
# XW5_train is not used in this cell -- confirm this is intended.
# The trailing 10 is presumably the number of CV folds (the printout says
# "CV 10-Fold") -- verify against fit_ml_algo.
start_time = time.time()
test_pred_gbtWT5, acc_gbtWT5, acc_cv_gbtWT5 = fit_ml_algo(
    GradientBoostingClassifier(), XW5_test, yw5_test, 10
)
gbt_time = time.time() - start_time
print(
    "Accuracy: %s" % acc_gbtWT5,
    "Accuracy CV 10-Fold: %s" % acc_cv_gbtWT5,
    "Running Time: %s" % datetime.timedelta(seconds=gbt_time),
    sep="\n",
)


In [None]:
# Gradient Boosting Trees for RED WINE on the reduced predictor set.
# NOTE(review): fit_ml_algo is handed the *test* split (XR5_test / yr5_test);
# XR5_train is not used in this cell -- confirm this is intended.
# The trailing 10 is presumably the number of CV folds (the printout says
# "CV 10-Fold") -- verify against fit_ml_algo.
start_time = time.time()
test_pred_gbtRT5, acc_gbtRT5, acc_cv_gbtRT5 = fit_ml_algo(
    GradientBoostingClassifier(), XR5_test, yr5_test, 10
)
gbt_time = time.time() - start_time
print(
    "Accuracy: %s" % acc_gbtRT5,
    "Accuracy CV 10-Fold: %s" % acc_cv_gbtRT5,
    "Running Time: %s" % datetime.timedelta(seconds=gbt_time),
    sep="\n",
)


### Logistic Regression

In [None]:
# Logistic Regression for WHITE WINE on the reduced predictor set.
# NOTE(review): the result is stored as train_pred_logW5 although the data
# passed to fit_ml_algo is the *test* split (XW5_test / yw5_test) -- confirm
# which split is intended.
start_time = time.time()
train_pred_logW5, acc_logW5, acc_cv_logW5 = fit_ml_algo(
    LogisticRegression(), XW5_test, yw5_test, 10
)
log_time = time.time() - start_time
print(
    "Accuracy: %s" % acc_logW5,
    "Accuracy CV 10-Fold: %s" % acc_cv_logW5,
    "Running Time: %s" % datetime.timedelta(seconds=log_time),
    sep="\n",
)

In [None]:
# Logistic Regression for RED WINE on the reduced predictor set.
# NOTE(review): the result is stored as train_pred_logR5 although the data
# passed to fit_ml_algo is the *test* split (XR5_test / yr5_test) -- confirm
# which split is intended.
start_time = time.time()
train_pred_logR5, acc_logR5, acc_cv_logR5 = fit_ml_algo(
    LogisticRegression(), XR5_test, yr5_test, 10
)
log_time = time.time() - start_time
print(
    "Accuracy: %s" % acc_logR5,
    "Accuracy CV 10-Fold: %s" % acc_cv_logR5,
    "Running Time: %s" % datetime.timedelta(seconds=log_time),
    sep="\n",
)

### Decision Tree Classifier

In [None]:
# Decision Tree Classifier for WHITE WINE on the reduced predictor set.
# NOTE(review): fit_ml_algo is handed the *test* split (XW5_test / yw5_test);
# XW5_train is not used in this cell -- confirm this is intended.
start_time = time.time()
test_pred_dtWT5, acc_dtWT5, acc_cv_dtWT5 = fit_ml_algo(
    DecisionTreeClassifier(), XW5_test, yw5_test, 10
)
dt_time = time.time() - start_time
print(
    "Accuracy: %s" % acc_dtWT5,
    "Accuracy CV 10-Fold: %s" % acc_cv_dtWT5,
    "Running Time: %s" % datetime.timedelta(seconds=dt_time),
    sep="\n",
)

In [None]:
# Decision Tree Classifier for RED WINE on the reduced predictor set.
# NOTE(review): fit_ml_algo is handed the *test* split (XR5_test / yr5_test);
# XR5_train is not used in this cell -- confirm this is intended.
start_time = time.time()
test_pred_dtRT5, acc_dtRT5, acc_cv_dtRT5 = fit_ml_algo(
    DecisionTreeClassifier(), XR5_test, yr5_test, 10
)
dt_time = time.time() - start_time
print(
    "Accuracy: %s" % acc_dtRT5,
    "Accuracy CV 10-Fold: %s" % acc_cv_dtRT5,
    "Running Time: %s" % datetime.timedelta(seconds=dt_time),
    sep="\n",
)

### Tuning gradient boost parameters for white wine

In [None]:
# Stage 1 of the white-wine search: learning rate x number of trees
# (6 x 8 = 48 combinations, each scored by 5-fold CV accuracy), with the
# remaining tree settings held fixed.
p_test = {
    'learning_rate': [0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
    'n_estimators': [100, 250, 500, 750, 1000, 1250, 1500, 1750],
}

tuning = GridSearchCV(
    estimator=GradientBoostingClassifier(
        max_depth=4,
        min_samples_split=2,
        min_samples_leaf=1,
        subsample=1,
        max_features='sqrt',
        random_state=10,
    ),
    param_grid=p_test,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)
tuning.fit(XW_train, yw_train)

In [None]:
# Report the stage-1 winner for white wine: best_score_ is the mean
# cross-validated accuracy of the best parameter combination, best_params_
# the combination itself.
print(F"THE BEST SCORE IN  THE TRAINING DATASET AND ITS PARAMETERS ARE AS FOLOWS: {tuning.best_score_},{tuning.best_params_}")

In [None]:
# Stage 2 of the white-wine search: tune the tree depth only.
p_test2 = {'max_depth': [2, 3, 4, 5, 6, 7]}

# learning_rate / n_estimators are taken from the stage-1 winner
# (tuning.best_params_) instead of hand-copied numbers, so the depth is tuned
# under the same settings the stage-1 search actually selected and the cell
# stays correct if the data or the stage-1 grid changes.
tuning2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        **tuning.best_params_,
        min_samples_split=2,
        min_samples_leaf=1,
        subsample=1,
        max_features='sqrt',
        random_state=10,
    ),
    param_grid=p_test2,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)
tuning2.fit(XW_train, yw_train)

In [None]:
# Report the stage-2 (max_depth) winner for white wine: best_score_ is the
# mean cross-validated accuracy, best_params_ the selected depth.
print(F"THE BEST SCORE IN  THE TRAINING DATASET AND ITS PARAMETERS ARE AS FOLOWS: {tuning2.best_score_},{tuning2.best_params_}")

### Apply optimized model to test

In [None]:
# Final white-wine model. The tuned hyperparameters come straight from the two
# grid searches (learning_rate / n_estimators from `tuning`, max_depth from
# `tuning2`) rather than being copied by hand from their printed output, so
# they cannot go stale when the searches are re-run. The remaining settings
# match the estimator used inside the searches.
optimized_model1 = GradientBoostingClassifier(
    **{**tuning.best_params_, **tuning2.best_params_},
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=10,
)
optimized_model1.fit(XW_train, yw_train)

In [None]:
# Mean accuracy of the tuned white-wine model on XW_test / yw_test
# (presumably the held-out white-wine split -- defined outside this section).
print('Accuracy of the GBM on test set: {:.3f}'.format(optimized_model1.score(XW_test, yw_test)))

-----------------------------------------

### Tuning gradient boost parameters for red wine

In [None]:
# Stage 1 of the red-wine search: learning rate x number of trees
# (6 x 8 = 48 combinations, each scored by 5-fold CV accuracy), with the
# remaining tree settings held fixed.
p_test3 = {
    'learning_rate': [0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
    'n_estimators': [100, 250, 500, 750, 1000, 1250, 1500, 1750],
}

tuning3 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        max_depth=4,
        min_samples_split=2,
        min_samples_leaf=1,
        subsample=1,
        max_features='sqrt',
        random_state=10,
    ),
    param_grid=p_test3,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)
tuning3.fit(XR_train, yr_train)

In [None]:
# Report the stage-1 winner for red wine: best_score_ is the mean
# cross-validated accuracy of the best parameter combination, best_params_
# the combination itself.
print(F"THE BEST SCORE IN  THE TRAINING DATASET AND ITS PARAMETERS ARE AS FOLOWS: {tuning3.best_score_},{tuning3.best_params_}")

In [None]:
# Stage 2 of the red-wine search: tune the tree depth only.
p_test4 = {'max_depth': [2, 3, 4, 5, 6, 7]}

# learning_rate / n_estimators are taken from the stage-1 winner
# (tuning3.best_params_) instead of hand-copied numbers, so the depth is tuned
# under the settings the stage-1 search actually selected and the cell stays
# correct if the data or the stage-1 grid changes.
tuning4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        **tuning3.best_params_,
        min_samples_split=2,
        min_samples_leaf=1,
        subsample=1,
        max_features='sqrt',
        random_state=10,
    ),
    param_grid=p_test4,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)
tuning4.fit(XR_train, yr_train)

In [None]:
# Report the stage-2 (max_depth) winner for red wine: best_score_ is the
# mean cross-validated accuracy, best_params_ the selected depth.
print(F"THE BEST SCORE IN  THE TRAINING DATASET AND ITS PARAMETERS ARE AS FOLOWS: {tuning4.best_score_},{tuning4.best_params_}")

### Apply optimized model to test

In [None]:
# Final red-wine model. The tuned hyperparameters come straight from the two
# grid searches (learning_rate / n_estimators from `tuning3`, max_depth from
# `tuning4`) rather than being copied by hand from their printed output, so
# they cannot go stale when the searches are re-run. The remaining settings
# match the estimator used inside the searches.
optimized_model2 = GradientBoostingClassifier(
    **{**tuning3.best_params_, **tuning4.best_params_},
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=10,
)
optimized_model2.fit(XR_train, yr_train)

In [None]:
# Mean accuracy of the tuned red-wine model on XR_test / yr_test
# (presumably the held-out red-wine split -- defined outside this section).
print('Accuracy of the GBM on test set: {:.3f}'.format(optimized_model2.score(XR_test, yr_test)))