Python Machine Learning with Scikit-Learn: Wine Snob Edition

In [2]:
#Numpy
import numpy as np
#Pandas
import pandas as pd


In [3]:
#Import sampling helper
from sklearn.model_selection import train_test_split


In [4]:
#Import preprocessing modules

from sklearn import preprocessing

In [6]:
#Import random forest model
from sklearn.ensemble import RandomForestRegressor

In [7]:
#Import cross-validation pipeline

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [10]:
#Import evaluation metrics

from sklearn.metrics import mean_squared_error, r2_score

In [11]:
#Import module for saving scikit-learn models
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone `joblib` package is the supported replacement. The
# vendored copy is used only as a fallback on old installs without `joblib`.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [12]:
#loading the data from a .csv file
# The file uses ';' as its field separator. With pandas' default ',' every row
# would be parsed into a single column, so the separator is passed explicitly.
data = pd.read_csv('winequality-red.csv', sep=';')

In [13]:
# Read the wine data, telling pandas that the fields are ';'-separated
data = pd.read_csv(
    'winequality-red.csv',
    sep=';',
)

In [14]:
# Preview the first five rows to sanity-check how the columns were parsed
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [15]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(1599, 12)

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
data.describe()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


Split data into training and test sets.

In [18]:
# The 'quality' column is the prediction target; every other column is a feature
y = data['quality']
X = data.drop(columns='quality')

In [19]:
# Hold out 20% of the rows as a test set for the final evaluation.
# Stratifying on y keeps the mix of quality scores similar in both splits,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

Declare data preprocessing steps.

In [25]:
# Learn per-feature mean and standard deviation from the training split only
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

In [26]:
# Standardize the training features with the statistics stored in `scaler`
X_train_scaled = scaler.transform(X_train)
 



In [29]:
# Column-wise means of the scaled training data
print(X_train_scaled.mean(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]


In [30]:
# Column-wise standard deviations of the scaled training data
print(X_train_scaled.std(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [31]:
# Standardize the test features with the scaler that was fit on the training
# split (the scaler is not refit on the test data)
X_test_scaled = scaler.transform(X_test)

In [32]:
# Column-wise means of the scaled test data
print(X_test_scaled.mean(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]


In [33]:
# Column-wise standard deviations of the scaled test data
print(X_test_scaled.std(axis=0))

[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [34]:
#Pipeline with preprocessing and model
# The scaler is a pipeline step, so it is refit together with the model
# whenever the pipeline is fit. The forest is seeded so that repeated runs
# pick the same hyperparameters and report the same test scores.
pipeline = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestRegressor(n_estimators=100, random_state=123),
)

Declare hyperparameters to tune

In [35]:
#Declare hyperparameters to tune
# Keys use the `<step name>__<parameter>` form so the grid search can route
# each value to the random-forest step of the pipeline.
# `max_features=1.0` (consider every feature) is what 'auto' meant for
# regressors; the 'auto' alias was deprecated in scikit-learn 1.1 and removed
# in 1.3, so the explicit value is used instead.
hyperparameters = {
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],
}

Tune model using a cross-validation pipeline.

In [36]:
# Exhaustive search over the hyperparameter grid, scored with 10-fold CV
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    cv=10,
)

In [37]:
# Fit and tune model: run the cross-validated grid search on the training
# split (unscaled X_train is passed because the pipeline scales internally)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [39]:


print (clf.best_params_)
# NOTE(review): the dict below appears to be a reference result from another
# run; it does not match the output recorded in this notebook, so the winning
# combination evidently varies between runs.
# {'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


Refit on the entire training set.

In [41]:
# Check the `refit` setting on the grid search
print(clf.refit)

True


Evaluate model pipeline on test data

In [42]:
# Predict quality scores for the held-out test features
y_pred = clf.predict(X_test)

In [43]:
# Coefficient of determination (R^2) on the test set
print(r2_score(y_test, y_pred))

0.4687993219928568


In [44]:
# Mean squared error on the test set
print(mean_squared_error(y_test, y_pred))

0.34276968750000003


Save model for future use.

In [45]:
# Save the fitted grid-search object (`clf`) to a .pkl file
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']