In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import os

In [2]:
file = os.path.join('data','winequality-red.csv')
data = pd.read_csv(file)

In [3]:
column_name = 'quality_score'
data.loc[data['quality'] >= 6, 'quality_score'] = 'high'
data.loc[data['quality'] <= 5, 'quality_score'] = 'med'

In [4]:
data['Q'] = np.where(data['quality_score'] == 'low', 0,
                    np.where(data['quality_score'] == 'med', 1, np.where(data['quality_score'] == 'high', 3,4)))
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_score,Q
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,med,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,med,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,med,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,high,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,med,1


In [5]:
y = data.Q

In [6]:
data.drop('quality_score', axis=1, inplace=True)
data.drop('quality', axis=1, inplace=True)

In [7]:
data.drop('Q', axis=1, inplace=True)

In [8]:
X = data

**train_test_split function:**

As you can see, we'll set aside 20% of the data as a test set for evaluating our model.

**_"stratify=y"_** to ensure your training set looks similar to your test set, making your evaluation metrics more reliable.

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

Next, we'll import the entire preprocessing module. This contains utilities for scaling, transforming, and wrangling data.

**_Instead of directly invoking the scale function, we'll be using a feature in Scikit-Learn called the Transformer API. The Transformer API allows you to "fit" a preprocessing step using the training data the same way you'd fit a model..._**

...and then use the same transformation on future data sets!

Here's what that process looks like:

Fit the transformer on the training set (saving the means and standard deviations)
Apply the transformer to the training set (scaling the training data)
Apply the transformer to the test set (using the same means and standard deviations)
This makes your final estimate of model performance more realistic, and it allows to insert your preprocessing steps into a cross-validation pipeline

In [10]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)

**_Now, the scaler object has the saved means and standard deviations for each feature in the training set._**

In [11]:
X_train_scaled = scaler.transform(X_train)

Confirm it worked:

In [12]:
print (X_train_scaled.mean(axis=0))

[ -6.86098732e-16  -2.02774119e-16  -2.77772766e-18  -9.65260362e-17
   3.05550043e-17  -6.24988724e-17  -1.38886383e-17  -2.90300318e-14
   1.72774661e-15   8.33318298e-18  -6.27766451e-16]


In [13]:
print (X_train_scaled.std(axis=0))

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


Note how we're taking the scaler object and using it to transform the training set. Now, we can **_transform the test set using the exact same means and standard deviations used to transform the training set:_**

In [14]:
X_test_scaled = scaler.transform(X_test)
 
print (X_test_scaled.mean(axis=0)) 
print (X_test_scaled.std(axis=0))

[ 0.00913734 -0.03460033  0.05467551  0.00918805 -0.0739019   0.01591236
  0.08358537  0.05279943 -0.01422127  0.0811219  -0.01247969]
[ 0.94371544  1.02383393  0.99427317  0.97352596  0.80113527  0.94831112
  1.02026416  0.97439713  0.96027612  1.1174815   1.02197151]


Notice how the scaled features in the test set are not perfectly centered at zero with unit variance! This is exactly what we'd expect, as we're transforming the test set using the means from the training set, not from the test set itself.

**_In practice, when we set up the cross-validation pipeline, we won't even need to manually fit the Transformer API. Instead, we'll simply declare the class object, like so:_**

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

This is exactly what it looks like: a modeling pipeline that first transforms the data using StandardScaler() and then fits a model using a random forest regressor.

**Hyperperameters:**
    
There are two types of parameters we need to worry about: model parameters and hyperparameters. Models parameters can be learned directly from the data (i.e. regression coefficients), while hyperparameters cannot.

Hyperparameters express "higher-level" structural information about the model, and they are typically set before training the model. 

**_We can list the tunable hyperparameters like so:_**

In [16]:
print (pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))], 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=Fals

**Declare the hyperparameters we want to tune through cross-validation:**

In [17]:
hyperparameters_rfg = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

**Cross-validation** for reliably estimating the performance of a method for building a model by training and evaluating your model multiple times using the same method.

The best practice when performing CV is to include your data preprocessing steps inside the cross-validation loop. This prevents accidentally tainting your training folds with influential data from your test fold.

Fortunately, **_Scikit-Learn makes it simple to set this up with "GridSearchCV" which essentially performs cross-validation across the entire "grid" (all possible permutations) of hyperparameters._**

It takes in your model (in this case, we're using a model pipeline), the hyperparameters you want to tune, and the number of folds to create:

In [18]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(pipeline, hyperparameters_rfg, cv=5)

In [19]:
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

**Now, you can see the best set of parameters found using CV:**

In [20]:
print (clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


**Conveniently, GridSearchCV from sklearn will automatically refit the model with the best set of hyperparameters using the entire training set... to confirm:**

In [21]:
print (clf.refit)

True


In [22]:
y_pred = clf.predict(X_test)

In [23]:
from sklearn.metrics import mean_squared_error, r2_score
print (r2_score(y_test, y_pred))
print (mean_squared_error(y_test, y_pred))

0.488168295459
0.5094125


In [24]:
clf.score(X_test,y_test)

0.48816829545900536

In [25]:
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         GradientBoostingClassifier())

In [26]:
print (pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gradientboostingclassifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))], 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'gradientboostingclassifier': GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min

In [27]:
hyperparameters_gb = { 'gradientboostingclassifier__max_depth' : [3, 5, 7, 9],
                    'gradientboostingclassifier__subsample' : [.6, .7, .8, .9, 1.0]} 

In [28]:
grad_boost = GridSearchCV(pipeline, hyperparameters_gb, cv=5)

In [29]:
grad_boost.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gradientboostingclassifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
   ...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gradientboostingclassifier__max_depth': [3, 5, 7, 9], 'gradientboostingclassifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
print (grad_boost.best_params_)

{'gradientboostingclassifier__max_depth': 7, 'gradientboostingclassifier__subsample': 0.8}


In [31]:
print (grad_boost.refit)

True


In [32]:
y_pred = grad_boost.predict(X_test)

In [33]:
print (r2_score(y_test, y_pred)) 
print (mean_squared_error(y_test, y_pred))

0.246438243259
0.75


In [34]:
grad_boost.score(X_test,y_test)

0.8125

In [35]:
from sklearn.metrics import classification_report
target_names = ['high quality', 'medium quality']
print(classification_report(y_test, grad_boost.predict(X_test), target_names=target_names))

                precision    recall  f1-score   support

  high quality       0.79      0.81      0.80       149
medium quality       0.83      0.82      0.82       171

   avg / total       0.81      0.81      0.81       320



**Save model for future use in a pickle file:**

In [36]:
from sklearn.externals import joblib 
joblib.dump(grad_boost, 'grad_boost.pkl')

['grad_boost.pkl']