# Wine Quality Predictor
---
**Author:** Dylan Tulett, Brian Johnston  
**Version:** 1.0  
**Date** April 2021  
**Solution** Produce a machine learning model to judge wine quality.

# 1.0 Reading in Data

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
reds = pd.read_csv("winequality-red.csv")

In [4]:
reds.sample(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1513,6.4,0.56,0.15,1.8,0.078,17.0,65.0,0.99294,3.33,0.6,10.5,6
427,9.5,0.78,0.22,1.9,0.077,6.0,32.0,0.9988,3.26,0.56,10.6,6
1278,8.0,0.715,0.22,2.3,0.075,13.0,81.0,0.99688,3.24,0.54,9.5,6


In [5]:
reds

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [6]:
reds.columns = ['fixed_acid', 'vol_acid', 'cit_acid','sugar', 'chlorides','free_so2',
                'tot_so2', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

# 2.0 Splitting Data

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = np.array(reds[['fixed_acid','sugar','alcohol']])
y = np.array(reds['quality'])

x_train, x_test_prime, y_train, y_test_prime = train_test_split(X, y, test_size=.40, random_state=49)
x_validate, x_test, y_validate, y_test = train_test_split(x_test_prime, y_test_prime, test_size=.50, random_state=49)

NOTE: multiple column combinations were tried for the X array. The one that was selected is the one that got the best score in part 4.0. This happened to be the fixed acid, sugar, and alcohol columns together

In [9]:
print(len(x_train), len(x_validate), len(x_test))

959 320 320


# 3.0 Normalizing the Data
- For this, the standard deviation will be used to scale the data
- Don't need to normalize the y data because this is what we are guessing for

In [37]:
def std_scaler(x):
    """This function takes an np.array and normalizes the data 
    by standard deviation"""
    mean = np.mean(x)
    std = np.std(x)
    output = (x-mean)/std
    return output

In [11]:
x_train_scaled = std_scaler(x_train)
x_validate_scaled = std_scaler(x_validate)
x_test_scaled = std_scaler(x_test)

# 4.0 Training Different Models

## 4.1 kn Neighbour

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [13]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train, y_train)

KNeighborsClassifier()

In [14]:
#validate
y_val_pred_kn = knn_model.predict(x_validate)
y_val_pred_kn

array([6, 5, 6, 5, 7, 6, 7, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 6, 5, 5, 4, 6,
       6, 6, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 5, 6, 6, 7, 6, 5, 5, 6, 6, 5,
       7, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 6, 5, 6, 6, 5, 5,
       5, 6, 6, 6, 5, 7, 6, 6, 6, 6, 5, 6, 6, 5, 6, 5, 5, 5, 6, 5, 6, 6,
       6, 5, 5, 5, 5, 6, 6, 5, 6, 5, 6, 6, 5, 6, 6, 7, 5, 6, 5, 5, 5, 6,
       5, 5, 5, 6, 5, 5, 5, 5, 5, 6, 6, 7, 5, 5, 5, 6, 7, 6, 5, 5, 5, 6,
       6, 7, 5, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 7, 6, 6, 5, 5, 5, 5, 5,
       5, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6, 7, 5, 5, 6, 5, 6, 6, 6, 5, 5, 5,
       7, 5, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 7, 6,
       7, 7, 5, 6, 6, 7, 6, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6, 5, 7, 6, 5,
       6, 7, 6, 5, 5, 6, 5, 5, 6, 5, 5, 5, 5, 6, 5, 6, 7, 5, 6, 7, 6, 6,
       5, 5, 6, 5, 5, 6, 5, 6, 6, 6, 5, 6, 6, 5, 5, 5, 6, 5, 6, 6, 6, 5,
       5, 6, 6, 5, 7, 5, 7, 6, 6, 5, 6, 5, 5, 6, 5, 5, 7, 4, 6, 5, 6, 6,
       5, 5, 5, 7, 5, 6, 6, 5, 6, 5, 7, 7, 5, 7, 5,

In [15]:
accuracy = metrics.accuracy_score(y_validate,y_val_pred_kn)
precision = metrics.precision_score(y_validate,y_val_pred_kn, average='weighted')
recall = metrics.recall_score(y_validate,y_val_pred_kn, average='weighted')
f1 = metrics.f1_score(y_validate,y_val_pred_kn, average='weighted')

print(f'{accuracy}, {precision}, {recall}, {f1}')

0.540625, 0.5274205988164805, 0.540625, 0.5303196809325342


  _warn_prf(average, modifier, msg_start, len(result))


## 4.2 Naive Bayes (Gaussian)

In [16]:
from sklearn.naive_bayes import GaussianNB

In [17]:
gnb_model = GaussianNB()
gnb_model.fit(x_train, y_train)

GaussianNB()

In [18]:
#validate
y_val_pred_nb = gnb_model.predict(x_validate)
y_val_pred_nb

array([6, 6, 5, 6, 6, 6, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 5, 5, 6,
       5, 6, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6, 6, 5, 6, 5, 5, 5,
       6, 5, 6, 6, 6, 6, 5, 6, 5, 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5,
       5, 6, 6, 6, 5, 6, 6, 6, 7, 6, 5, 6, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5,
       4, 5, 5, 5, 5, 7, 5, 5, 6, 5, 6, 6, 5, 5, 5, 6, 5, 5, 5, 5, 5, 7,
       5, 5, 6, 7, 5, 5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 6, 5, 5, 5, 6,
       5, 6, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5,
       7, 6, 5, 5, 5, 5, 5, 6, 5, 6, 5, 6, 6, 4, 6, 5, 6, 6, 6, 5, 6, 5,
       6, 5, 6, 6, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 6,
       7, 6, 5, 5, 7, 6, 5, 5, 5, 5, 6, 4, 6, 5, 5, 5, 5, 6, 5, 6, 6, 5,
       6, 6, 5, 5, 5, 6, 5, 5, 6, 5, 5, 6, 5, 6, 5, 5, 6, 5, 6, 6, 6, 6,
       5, 5, 5, 5, 4, 6, 5, 7, 6, 6, 5, 5, 6, 5, 5, 6, 6, 5, 6, 6, 7, 5,
       5, 5, 4, 5, 7, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 5, 6, 5,
       5, 6, 5, 6, 5, 6, 5, 6, 5, 5, 6, 6, 5, 6, 5,

In [19]:
accuracy = metrics.accuracy_score(y_validate,y_val_pred_nb)
precision = metrics.precision_score(y_validate,y_val_pred_nb, average='weighted')
recall = metrics.recall_score(y_validate,y_val_pred_nb, average='weighted')
f1 = metrics.f1_score(y_validate,y_val_pred_nb, average='weighted')

print(f'{accuracy}, {precision}, {recall}, {f1}')

0.525, 0.4775787601626017, 0.525, 0.4892222614561893


  _warn_prf(average, modifier, msg_start, len(result))


## 4.3 Random Forest
- Best So far

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rfc_model1 = RandomForestClassifier(max_depth=3, random_state=49)
rfc_model1.fit(x_train,y_train)

#validate
y_val_pred_rf1 = rfc_model1.predict(x_validate)

#score
accuracy = metrics.accuracy_score(y_validate,y_val_pred_rf1)
precision = metrics.precision_score(y_validate,y_val_pred_rf1, average='weighted')
recall = metrics.recall_score(y_validate,y_val_pred_rf1, average='weighted')
f1 = metrics.f1_score(y_validate,y_val_pred_rf1, average='weighted')

print(f'{accuracy}, {precision}, {recall}, {f1}')

0.565625, 0.530237407101569, 0.565625, 0.5261792220048618


  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
rfc_model2 = RandomForestClassifier(max_depth=20, n_estimators=200, random_state=49)
rfc_model2.fit(x_train,y_train)

#validate
y_val_pred_rf2 = rfc_model2.predict(x_validate)

#score
accuracy = metrics.accuracy_score(y_validate,y_val_pred_rf2)
precision = metrics.precision_score(y_validate,y_val_pred_rf2, average='weighted')
recall = metrics.recall_score(y_validate,y_val_pred_rf2, average='weighted')
f1 = metrics.f1_score(y_validate,y_val_pred_rf2, average='weighted')

print(f'{accuracy}, {precision}, {recall}, {f1}')

0.6375, 0.6303055436643835, 0.6375, 0.630524460935


  _warn_prf(average, modifier, msg_start, len(result))


This last model works the best of all the models that have been tried. This will be used for the analysis

### 4.3.1 Testing Our Model

In [23]:
y_pred_rf2 = rfc_model2.predict(x_test)

In [24]:
accuracy = metrics.accuracy_score(y_test,y_pred_rf2)
precision = metrics.precision_score(y_test,y_pred_rf2, average='weighted')
recall = metrics.recall_score(y_test,y_pred_rf2, average='weighted')
f1 = metrics.f1_score(y_test,y_pred_rf2, average='weighted')

print(f'{accuracy}, {precision}, {recall}, {f1}')

0.590625, 0.5556495098039216, 0.590625, 0.5691413805609284


  _warn_prf(average, modifier, msg_start, len(result))


NOTE: It is not useful to run any ROC Curve analysis on this model because ROC curves are used for binary classifiers, and this is a non-binary classifier. Because of this, it is not a good idea to do an ROC curve. 

## 5.0 Testing Other Experiment Parameters

### 5.1
50/25/25 split for train, validate, and test  
Standard deviation normalization

In [33]:
#groups
X = np.array(reds[['fixed_acid','sugar','alcohol']])
y = np.array(reds['quality'])

#split
x_train, x_test_prime, y_train, y_test_prime = train_test_split(X, y, test_size=.50, random_state=49)
x_validate, x_test, y_validate, y_test = train_test_split(x_test_prime, y_test_prime, test_size=.50, random_state=49)

#normalization
x_train_scaled = std_scaler(x_train)
x_validate_scaled = std_scaler(x_validate)
x_test_scaled = std_scaler(x_test)

#producing a model
rfc_model3 = RandomForestClassifier(max_depth=20, n_estimators=200, random_state=49)
rfc_model3.fit(x_train,y_train)

#validate
y_val_pred_rf3 = rfc_model3.predict(x_validate)

#score
accuracy = metrics.accuracy_score(y_validate,y_val_pred_rf3)
precision = metrics.precision_score(y_validate,y_val_pred_rf3, average='weighted')
recall = metrics.recall_score(y_validate,y_val_pred_rf3, average='weighted')
f1 = metrics.f1_score(y_validate,y_val_pred_rf3, average='weighted')

print(f'Validate: {accuracy}, {precision}, {recall}, {f1}')

#test
y_pred_rf3 = rfc_model3.predict(x_test)

accuracy = metrics.accuracy_score(y_validate,y_pred_rf3)
precision = metrics.precision_score(y_validate,y_pred_rf3, average='weighted')
recall = metrics.recall_score(y_validate,y_pred_rf3, average='weighted')
f1 = metrics.f1_score(y_validate,y_pred_rf3, average='weighted')

print(f'Test: {accuracy}, {precision}, {recall}, {f1}')

Validate: 0.575, 0.5603941441441441, 0.575, 0.5640605527284872
Test: 0.3825, 0.3633584724005135, 0.3825, 0.3712857142857143


  _warn_prf(average, modifier, msg_start, len(result))


### 5.2 
70/15/15 split for train, validate, and test  
Standard deviation normalization

In [34]:
#groups
X = np.array(reds[['fixed_acid','sugar','alcohol']])
y = np.array(reds['quality'])

#split
x_train, x_test_prime, y_train, y_test_prime = train_test_split(X, y, test_size=.20, random_state=49)
x_validate, x_test, y_validate, y_test = train_test_split(x_test_prime, y_test_prime, test_size=.50, random_state=49)

#normalization
x_train_scaled = std_scaler(x_train)
x_validate_scaled = std_scaler(x_validate)
x_test_scaled = std_scaler(x_test)

#producing a model
rfc_model3 = RandomForestClassifier(max_depth=30, n_estimators=200, random_state=49)
rfc_model3.fit(x_train,y_train)

#validate
y_val_pred_rf3 = rfc_model3.predict(x_validate)

#score
accuracy = metrics.accuracy_score(y_validate,y_val_pred_rf3)
precision = metrics.precision_score(y_validate,y_val_pred_rf3, average='weighted')
recall = metrics.recall_score(y_validate,y_val_pred_rf3, average='weighted')
f1 = metrics.f1_score(y_validate,y_val_pred_rf3, average='weighted')

print(f'Validate: {accuracy}, {precision}, {recall}, {f1}')

#test
y_pred_rf3 = rfc_model3.predict(x_test)

accuracy = metrics.accuracy_score(y_validate,y_pred_rf3)
precision = metrics.precision_score(y_validate,y_pred_rf3, average='weighted')
recall = metrics.recall_score(y_validate,y_pred_rf3, average='weighted')
f1 = metrics.f1_score(y_validate,y_pred_rf3, average='weighted')

print(f'Test: {accuracy}, {precision}, {recall}, {f1}')

Validate: 0.56875, 0.5699542523684304, 0.56875, 0.5651513016557617
Test: 0.38125, 0.37325549450549456, 0.38125, 0.367952753014437


### 5.3
50/25/25 split train, validate, test  
Raw features

In [35]:
#groups
X = np.array(reds[['fixed_acid','sugar','alcohol']])
y = np.array(reds['quality'])

#split
x_train, x_test_prime, y_train, y_test_prime = train_test_split(X, y, test_size=.20, random_state=49)
x_validate, x_test, y_validate, y_test = train_test_split(x_test_prime, y_test_prime, test_size=.50, random_state=49)

#producing a model
rfc_model3 = RandomForestClassifier(max_depth=30, n_estimators=200, random_state=49)
rfc_model3.fit(x_train,y_train)

#validate
y_val_pred_rf3 = rfc_model3.predict(x_validate)

#score
accuracy = metrics.accuracy_score(y_validate,y_val_pred_rf3)
precision = metrics.precision_score(y_validate,y_val_pred_rf3, average='weighted')
recall = metrics.recall_score(y_validate,y_val_pred_rf3, average='weighted')
f1 = metrics.f1_score(y_validate,y_val_pred_rf3, average='weighted')

print(f'Validate :{accuracy}, {precision}, {recall}, {f1}')

#test
y_pred_rf3 = rfc_model3.predict(x_test)

accuracy = metrics.accuracy_score(y_validate,y_pred_rf3)
precision = metrics.precision_score(y_validate,y_pred_rf3, average='weighted')
recall = metrics.recall_score(y_validate,y_pred_rf3, average='weighted')
f1 = metrics.f1_score(y_validate,y_pred_rf3, average='weighted')

print(f'Test: {accuracy}, {precision}, {recall}, {f1}')

Validate :0.56875, 0.5699542523684304, 0.56875, 0.5651513016557617
Test: 0.38125, 0.37325549450549456, 0.38125, 0.367952753014437


### 5.4
60/20/20 split train, validate, test  
Raw features

In [36]:
#groups
X = np.array(reds[['fixed_acid','sugar','alcohol']])
y = np.array(reds['quality'])

#split
x_train, x_test_prime, y_train, y_test_prime = train_test_split(X, y, test_size=.40, random_state=49)
x_validate, x_test, y_validate, y_test = train_test_split(x_test_prime, y_test_prime, test_size=.50, random_state=49)

#producing a model
rfc_model3 = RandomForestClassifier(max_depth=30, n_estimators=200, random_state=49)
rfc_model3.fit(x_train,y_train)

#validate
y_val_pred_rf3 = rfc_model3.predict(x_validate)

#score
accuracy = metrics.accuracy_score(y_validate,y_val_pred_rf3)
precision = metrics.precision_score(y_validate,y_val_pred_rf3, average='weighted')
recall = metrics.recall_score(y_validate,y_val_pred_rf3, average='weighted')
f1 = metrics.f1_score(y_validate,y_val_pred_rf3, average='weighted')

print(f'Validate: {accuracy}, {precision}, {recall}, {f1}')

#test
y_pred_rf3 = rfc_model3.predict(x_test)

accuracy = metrics.accuracy_score(y_validate,y_pred_rf3)
precision = metrics.precision_score(y_validate,y_pred_rf3, average='weighted')
recall = metrics.recall_score(y_validate,y_pred_rf3, average='weighted')
f1 = metrics.f1_score(y_validate,y_pred_rf3, average='weighted')

print(f'Test: {accuracy}, {precision}, {recall}, {f1}')

Validate: 0.634375, 0.6270324946671999, 0.634375, 0.6268971578151386
Test: 0.409375, 0.3870747202201871, 0.409375, 0.3970484511008705


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Discussion
As you can see, the original model with random forest was the best. This model used:  
- Random Forest
- 60/20/20 Train, validate, and test data spliting
- max 30 depth of tree
- 200 trees max per forest
- fixed acid, sugar, alcohol for the features
