In [1]:
# Imports and data loading
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import datetime

# Load the red and white wine-quality tables (both files are semicolon-delimited).
red_wine = pd.read_csv('../input/winequality-red.csv', delimiter=';')
white_wine = pd.read_csv('../input/winequality-white.csv', delimiter=';')
# Quality is the target variable, so wine colour is just another feature.
# Add colour indicator features: each row gets a 1 in the column matching its
# source file and a 0 in the other.
red_wine['red'] = 1
red_wine['white'] = 0
white_wine['red'] = 0
white_wine['white'] = 1
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# default 0-based row labels of the two files overlap, so label-based lookups on
# all_wine (e.g. .loc[0]) would match one red and one white row.
all_wine = pd.concat([red_wine, white_wine], ignore_index=True)

In [2]:
# Split the combined frame into model inputs (every column except the label)
# and the label column itself. The train/test split and the standardisation
# are not done in this cell.
target_column = 'quality'
wine_target = all_wine[target_column]
wine_feature = all_wine.drop(target_column, axis=1)

# Column labels of the feature frame, kept for later reference.
feature_names = wine_feature.columns

EDA takeaways: 
- The data needs to be standardised for all models where that matters. 
- Prior to that, four of the features need to be converted to their logarithms. Those four features are: fixed acidity, chlorides, free sulfur dioxide, and sulphates. Density wasn't meaningfully affected. 

In [3]:
# Logarithms taken to reduce statistical imbalances.
# The raw values are read from all_wine (not modified after it is built in the
# visible cells) instead of from wine_feature itself, so re-running this cell
# does not apply the logarithm a second time.
log_columns = ['fixed acidity', 'chlorides', 'free sulfur dioxide', 'sulphates']
for column in log_columns:
    # .values drops the index so the assignment is purely positional;
    # wine_feature is all_wine minus 'quality', i.e. same rows in the same order.
    wine_feature[column] = np.log(all_wine[column].values)

In [4]:
# Train/test split (70% / 30%, fixed seed) followed by standardisation.
x_train, x_test, y_train, y_test = train_test_split(
    wine_feature,
    wine_target,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

# Directly confirm their sizes
for template, split in (
    ("x_train.shape: %s", x_train),
    ("x_test.shape: %s", x_test),
    ("y_train.shape: %s", y_train),
    ("y_test.shape: %s", y_test),
):
    print(template % str(split.shape))

# The scaler is fitted on the training split only and then applied to both
# splits, so the test rows do not contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(x_train)
standardised_x_train = scaler.transform(x_train)
standardised_x_test = scaler.transform(x_test)

x_train.shape: (4547, 13)
x_test.shape: (1950, 13)
y_train.shape: (4547,)
y_test.shape: (1950,)


In [5]:
# Logistic Regression with default hyperparameters, on the standardised features.
# NOTE(review): "OOB" in the printed labels appears to mean "out of the box" — confirm.
lr = LogisticRegression(random_state=4).fit(standardised_x_train, y_train)
lr_pred = lr.predict(standardised_x_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print('OOB Accuracy Score: %f' % lr_accuracy)
print('OOB Classification Report:')
print(classification_report(y_test, lr_pred))

# Grid search over the convergence tolerance and the regularisation parameter C.
# best_score_ is the cross-validated score on the training split, not a
# test-set accuracy.
lr_gs_params = dict(
    tol=[3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3],
    C=[0.03, 0.1, 0.3, 1, 3, 10, 30],
)
lr_grid = GridSearchCV(
    estimator=LogisticRegression(random_state=4),
    param_grid=lr_gs_params,
)
lr_grid.fit(standardised_x_train, y_train)
print('GridSearchCV best score: %f' % lr_grid.best_score_)
print('GridSearchCV best estimator: %s' % lr_grid.best_estimator_)

OOB Accuracy Score: 0.518462
OOB Classification Report:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        12
          4       0.50      0.02      0.03        64
          5       0.56      0.57      0.57       643
          6       0.50      0.72      0.59       845
          7       0.48      0.10      0.17       329
          8       0.00      0.00      0.00        54
          9       0.00      0.00      0.00         3

avg / total       0.50      0.52      0.47      1950



  'precision', 'predicted', average, warn_for)


GridSearchCV best score: 0.547174
GridSearchCV best estimator: LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=4, solver='liblinear', tol=3e-06,
          verbose=0, warm_start=False)


In [6]:
# Random Forest with default hyperparameters, fitted on the unscaled features.
# NOTE(review): "OOB" in the printed labels looks like "out of the box" rather
# than the forest's out-of-bag estimate (oob_score is never enabled here) — confirm.
rf = RandomForestClassifier(random_state=4).fit(x_train, y_train)
rf_pred = rf.predict(x_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print('OOB Accuracy Score: %f' % rf_accuracy)
print('OOB Classification Report:')
print(classification_report(y_test, rf_pred))

# Grid search over forest size (5 to 15 trees) and the minimum number of
# samples needed to split a node.
rf_gs_params = dict(
    n_estimators=np.arange(5, 16),
    min_samples_split=[2, 10, 20, 40, 80, 120],
)
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=4),
    param_grid=rf_gs_params,
)
rf_grid.fit(x_train, y_train)
print('GridSearchCV best score: %f' % rf_grid.best_score_)
print('GridSearchCV best estimator: %s' % rf_grid.best_estimator_)

OOB Accuracy Score: 0.628718
OOB Classification Report:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        12
          4       0.27      0.09      0.14        64
          5       0.65      0.69      0.67       643
          6       0.61      0.71      0.66       845
          7       0.68      0.48      0.56       329
          8       0.79      0.28      0.41        54
          9       0.00      0.00      0.00         3

avg / total       0.62      0.63      0.62      1950



  'precision', 'predicted', average, warn_for)


GridSearchCV best score: 0.626787
GridSearchCV best estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=14, n_jobs=1,
            oob_score=False, random_state=4, verbose=0, warm_start=False)


In [7]:
# K Nearest Neighbors with default hyperparameters, on the standardised features.
# NOTE(review): "OOB" in the printed labels appears to mean "out of the box" — confirm.
knn = KNeighborsClassifier().fit(standardised_x_train, y_train)
knn_pred = knn.predict(standardised_x_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print('OOB Accuracy Score: %f' % knn_accuracy)
print('OOB Classification Report:')
print(classification_report(y_test, knn_pred))

# Grid search over the neighbour count (1 to 15) and the vote weighting scheme.
knn_gs_params = dict(
    n_neighbors=np.arange(1, 16),
    weights=['uniform', 'distance'],
)
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_gs_params,
)
knn_grid.fit(standardised_x_train, y_train)
print('GridSearchCV best score: %f' % knn_grid.best_score_)
print('GridSearchCV best estimator: %s' % knn_grid.best_estimator_)

OOB Accuracy Score: 0.542051
OOB Classification Report:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        12
          4       0.22      0.08      0.11        64
          5       0.56      0.60      0.58       643
          6       0.55      0.62      0.58       845
          7       0.51      0.43      0.47       329
          8       0.29      0.07      0.12        54
          9       0.00      0.00      0.00         3

avg / total       0.52      0.54      0.53      1950



  'precision', 'predicted', average, warn_for)


GridSearchCV best score: 0.625687
GridSearchCV best estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='distance')


In [8]:
# Multi-layer Perceptron with default hyperparameters, on the standardised features.
# A timestamp is printed at the start and end of the cell so the wall-clock
# cost of the grid search can be read off the output.
print(datetime.datetime.now())
mlp = MLPClassifier(random_state=4).fit(standardised_x_train, y_train)
mlp_pred = mlp.predict(standardised_x_test)
mlp_accuracy = accuracy_score(y_test, mlp_pred)
print('OOB Accuracy Score: %f' % mlp_accuracy)
print('OOB Classification Report:')
print(classification_report(y_test, mlp_pred))

# Grid search over network shape, activation function, solver and the
# regularisation term alpha (3 * 4 * 3 * 7 parameter combinations).
mlp_gs_params = dict(
    hidden_layer_sizes=[(100,), (100, 100), (130, 80, 40)],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3],
)
mlp_grid = GridSearchCV(
    estimator=MLPClassifier(random_state=4),
    param_grid=mlp_gs_params,
)
mlp_grid.fit(standardised_x_train, y_train)
print('GridSearchCV best score: %f' % mlp_grid.best_score_)
print('GridSearchCV best estimator: %s' % mlp_grid.best_estimator_)
print(datetime.datetime.now())

2018-09-28 18:08:31.270254


  'precision', 'predicted', average, warn_for)


OOB Accuracy Score: 0.557949
OOB Classification Report:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        12
          4       0.36      0.08      0.13        64
          5       0.59      0.61      0.60       643
          6       0.55      0.64      0.59       845
          7       0.54      0.44      0.48       329
          8       0.25      0.02      0.03        54
          9       0.00      0.00      0.00         3

avg / total       0.54      0.56      0.54      1950















GridSearchCV best score: 0.581042
GridSearchCV best estimator: MLPClassifier(activation='relu', alpha=3e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(130, 80, 40), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=4, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
2018-09-28 18:51:37.675426


In [9]:
# Multi-layer Perceptron again, this time with max_iter raised to 500 for both
# the baseline model and the grid-searched estimator.
# Note: this cell rebinds mlp, mlp_pred, mlp_gs_params and mlp_grid.
# A timestamp is printed at the start and end of the cell.
print(datetime.datetime.now())
mlp = MLPClassifier(max_iter=500, random_state=4).fit(standardised_x_train, y_train)
mlp_pred = mlp.predict(standardised_x_test)
mlp_accuracy = accuracy_score(y_test, mlp_pred)
print('OOB Accuracy Score: %f' % mlp_accuracy)
print('OOB Classification Report:')
print(classification_report(y_test, mlp_pred))

# Grid search over network shape, activation function, solver and the
# regularisation term alpha, with max_iter=500 fixed on the base estimator.
mlp_gs_params = dict(
    hidden_layer_sizes=[(100,), (100, 100), (130, 80, 40)],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3],
)
mlp_grid = GridSearchCV(
    estimator=MLPClassifier(max_iter=500, random_state=4),
    param_grid=mlp_gs_params,
)
mlp_grid.fit(standardised_x_train, y_train)
print('GridSearchCV best score: %f' % mlp_grid.best_score_)
print('GridSearchCV best estimator: %s' % mlp_grid.best_estimator_)
print(datetime.datetime.now())

2018-09-28 18:51:37.707053
OOB Accuracy Score: 0.559487
OOB Classification Report:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        12
          4       0.33      0.06      0.11        64
          5       0.59      0.61      0.60       643
          6       0.54      0.66      0.60       845
          7       0.56      0.40      0.46       329
          8       0.50      0.06      0.10        54
          9       0.00      0.00      0.00         3

avg / total       0.55      0.56      0.54      1950



  'precision', 'predicted', average, warn_for)




GridSearchCV best score: 0.582362
GridSearchCV best estimator: MLPClassifier(activation='tanh', alpha=3e-06, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=4, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
2018-09-28 20:01:04.117318




In [10]:
# Support Vector Machine with default hyperparameters, on the standardised features.
# NOTE(review): "OOB" in the printed labels appears to mean "out of the box" — confirm.
svm = SVC(random_state=4).fit(standardised_x_train, y_train)
svm_pred = svm.predict(standardised_x_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print('OOB Accuracy Score: %f' % svm_accuracy)
print('OOB Classification Report:')
print(classification_report(y_test, svm_pred))

# Grid search over the penalty parameter C and the kernel type.
svm_gs_params = dict(
    C=[0.03, 0.1, 0.3, 1, 3, 10, 30],
    kernel=['rbf', 'poly', 'sigmoid'],
)
svm_grid = GridSearchCV(
    estimator=SVC(random_state=4),
    param_grid=svm_gs_params,
)
svm_grid.fit(standardised_x_train, y_train)
print('GridSearchCV best score: %f' % svm_grid.best_score_)
print('GridSearchCV best estimator: %s' % svm_grid.best_estimator_)

OOB Accuracy Score: 0.547179
OOB Classification Report:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        12
          4       0.00      0.00      0.00        64
          5       0.60      0.58      0.59       643
          6       0.52      0.75      0.61       845
          7       0.56      0.19      0.29       329
          8       0.00      0.00      0.00        54
          9       0.00      0.00      0.00         3

avg / total       0.52      0.55      0.51      1950



  'precision', 'predicted', average, warn_for)


GridSearchCV best score: 0.581482
GridSearchCV best estimator: SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=4, shrinking=True,
  tol=0.001, verbose=False)


In [11]:
# Decision Tree with default hyperparameters, fitted on the unscaled features.
# NOTE(review): "OOB" in the printed labels appears to mean "out of the box" — confirm.
dtc = DecisionTreeClassifier(random_state=4).fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)
dtc_accuracy = accuracy_score(y_test, dtc_pred)
print('OOB Accuracy Score: %f' % dtc_accuracy)
print('OOB Classification Report:')
print(classification_report(y_test, dtc_pred))

# Grid search over the split-quality criterion only.
dtc_gs_params = dict(criterion=['gini', 'entropy'])
dtc_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=4),
    param_grid=dtc_gs_params,
)
dtc_grid.fit(x_train, y_train)
print('GridSearchCV best score: %f' % dtc_grid.best_score_)
print('GridSearchCV best estimator: %s' % dtc_grid.best_estimator_)

OOB Accuracy Score: 0.584615
OOB Classification Report:
             precision    recall  f1-score   support

          3       0.00      0.00      0.00        12
          4       0.19      0.17      0.18        64
          5       0.63      0.64      0.63       643
          6       0.61      0.63      0.62       845
          7       0.54      0.51      0.52       329
          8       0.34      0.43      0.38        54
          9       0.00      0.00      0.00         3

avg / total       0.58      0.58      0.58      1950





GridSearchCV best score: 0.553112
GridSearchCV best estimator: DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=4,
            splitter='best')


In [12]:
# Gaussian Naive Bayes with default settings, on the standardised features.
# NOTE(review): "OOB" in the printed labels appears to mean "out of the box" — confirm.
gnb = GaussianNB().fit(standardised_x_train, y_train)
gnb_pred = gnb.predict(standardised_x_test)
gnb_accuracy = accuracy_score(y_test, gnb_pred)
print('OOB Accuracy Score: %f' % gnb_accuracy)
print('OOB Classification Report:')
print(classification_report(y_test, gnb_pred))

# Naive Bayes with Grid Search
# No grid search was performed for GaussianNB. The original note presumably
# meant that there were no simple parameters to search through — TODO confirm.

OOB Accuracy Score: 0.441026
OOB Classification Report:
             precision    recall  f1-score   support

          3       0.08      0.08      0.08        12
          4       0.24      0.12      0.16        64
          5       0.48      0.43      0.45       643
          6       0.46      0.50      0.48       845
          7       0.41      0.44      0.42       329
          8       0.13      0.13      0.13        54
          9       0.00      0.00      0.00         3

avg / total       0.44      0.44      0.44      1950

