# Question 4

## How well can the quality of wine be predicted from physicochemical measurements?

In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

red_wine = pd.read_csv('../data/winequality-red.csv', delimiter=';')
white_wine = pd.read_csv('../data/winequality-white.csv', delimiter=';')

red_x = red_wine.iloc[:,:11]
red_y = red_wine.iloc[:,-1:]

white_x = white_wine.iloc[:,:11]
white_y = white_wine.iloc[:,-1:]

`winequality.names` in the `data` directory contains information on the datasets, including some concerning prior analyses, where SVMs achieved the best results. 

In [2]:
# train/test split
red_x_train, red_x_test, red_y_train, red_y_test = train_test_split(red_x, red_y, train_size=0.7)
white_x_train, white_x_test, white_y_train, white_y_test = train_test_split(white_x, white_y, train_size=0.7, test_size=0.3)



In [3]:
# mean normalisation
red_scaler = StandardScaler().fit(red_x_train)
standardised_red_x_train = red_scaler.transform(red_x_train)
standardised_red_x_test = red_scaler.transform(red_x_test)

white_scaler = StandardScaler().fit(white_x_train)
standardised_white_x_train = white_scaler.transform(white_x_train)
standardised_white_x_test = white_scaler.transform(white_x_test)

In [4]:
# obligatory linear regression 
red_lr = LinearRegression()
red_lr.fit(standardised_red_x_train, red_y_train)
red_lr_pred = red_lr.predict(standardised_red_x_test)
print('red lr pred mae: %f' % mean_absolute_error(red_y_test, red_lr_pred))

white_lr = LinearRegression()
white_lr.fit(standardised_white_x_train, white_y_train)
white_lr_pred = white_lr.predict(standardised_white_x_test)
print('white lr pred mae: %f' % mean_absolute_error(white_y_test, white_lr_pred))

red lr pred mae: 0.502297
white lr pred mae: 0.587126


In [5]:
# red svr 
red_svr_ploy = SVR(kernel='poly')
red_svr_rbf = SVR(kernel='rbf')
red_svr_sigmoid = SVR(kernel='sigmoid')

red_svr_ploy.fit(standardised_red_x_train, red_y_train.values.ravel())
red_svr_rbf.fit(standardised_red_x_train, red_y_train.values.ravel())
red_svr_sigmoid.fit(standardised_red_x_train, red_y_train.values.ravel())

red_svr_poly_predictions = red_svr_ploy.predict(standardised_red_x_test)
red_svr_rbf_predictions = red_svr_rbf.predict(standardised_red_x_test)
red_svr_sigmoid_predictions = red_svr_sigmoid.predict(standardised_red_x_test)

print('mae red svr poly: %f' % mean_absolute_error(red_y_test,red_svr_poly_predictions))
print('mae red svr rbf: %f' % mean_absolute_error(red_y_test,red_svr_rbf_predictions))
print('mae red svr sigmoid: %f' % mean_absolute_error(red_y_test,red_svr_sigmoid_predictions))

mae red svr poly: 0.529313
mae red svr rbf: 0.453978
mae red svr sigmoid: 5.844613


In [6]:
# white svr
white_svr_ploy = SVR(kernel='poly')
white_svr_rbf = SVR(kernel='rbf')
white_svr_sigmoid = SVR(kernel='sigmoid')

white_svr_ploy.fit(standardised_white_x_train, white_y_train.values.ravel())
white_svr_rbf.fit(standardised_white_x_train, white_y_train.values.ravel())
white_svr_sigmoid.fit(standardised_white_x_train, white_y_train.values.ravel())

white_svr_poly_predictions = white_svr_ploy.predict(standardised_white_x_test)
white_svr_rbf_predictions = white_svr_rbf.predict(standardised_white_x_test)
white_svr_sigmoid_predictions = white_svr_sigmoid.predict(standardised_white_x_test)

print('mae white svr poly: %f' % mean_absolute_error(white_y_test,white_svr_poly_predictions))
print('mae white svr rbf: %f' % mean_absolute_error(white_y_test,white_svr_rbf_predictions))
print('mae white svr sigmoid: %f' % mean_absolute_error(white_y_test,white_svr_sigmoid_predictions))

mae white svr poly: 0.661716
mae white svr rbf: 0.514259
mae white svr sigmoid: 14.632742


For both datasets the RBF kernel resulted in the most accurate predictions, so a gridsearch will be run to fine tune it. 

In [7]:
# svr gridsearches
parameters = {'C':np.arange(0.1, 2.0, 0.1), 'gamma': np.arange(0.01, 1, 0.01)}

red_rbf_gridsearchcv = GridSearchCV(SVR(kernel='rbf'), parameters, n_jobs=4)
white_rbf_gridsearchcv = GridSearchCV(SVR(kernel='rbf'), parameters, n_jobs=4)

red_rbf_gridsearchcv.fit(standardised_red_x_train, red_y_train.values.ravel())
white_rbf_gridsearchcv.fit(standardised_white_x_train, white_y_train.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9]), 'gamma': array([0.01, 0.02, ..., 0.98, 0.99])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
print('Red Wine:')
print('\tred rbf gridsearchCV best score: %f' % red_rbf_gridsearchcv.best_score_)
print('\tred rbf gridsearchCV best params: %s' % red_rbf_gridsearchcv.best_params_)
print()
print('White Wine:')
print('\twhite rbf gridsearchCV best score: %f' % white_rbf_gridsearchcv.best_score_)
print('\twhite rbf gridsearchCV best params: %s' % white_rbf_gridsearchcv.best_params_)

Red Wine:
	red rbf gridsearchCV best score: 0.365047
	red rbf gridsearchCV best params: {'C': 0.6, 'gamma': 0.060000000000000005}

White Wine:
	white rbf gridsearchCV best score: 0.382712
	white rbf gridsearchCV best params: {'C': 1.4000000000000001, 'gamma': 0.15000000000000002}


So the best SVM can get mean absolute error (which is a useful error measurement because it relates directly to the target variable which we understand intuitively) between 0.36 (for red wine) and 0.37 (for white wine.) 

Fortunately, that is better than default linear regression. 