In [56]:
import numpy as np
import pandas
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

In [57]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pandas.read_csv(dataset_url)
data.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


In [58]:
#it looks like the CSV file is actually using semicolons to separate the data
data = pandas.read_csv(dataset_url, sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [59]:
data.shape

(1599, 12)

In [60]:
#We have 1,599 samples and 12 features, including our target(quality) feature. Let's print some summary statistics
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [61]:
#they have some very different scales, so let's make a mental note to "standardize" the data later
#Step 4: Split data into training and test sets
#First, let's separate our target (y) features from our input (X) features:
y = data.quality
X = data.drop('quality', axis=1)

In [62]:
#Split data into train and test sets
"""it's good practice to stratify your sample by the target variable. 
This will ensure your training set looks similar to your test set, making your evaluation metrics more reliable."""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)
X_test.head() #y_test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
797,9.3,0.37,0.44,1.6,0.038,21.0,42.0,0.99526,3.24,0.81,10.8
871,6.9,0.56,0.03,1.5,0.086,36.0,46.0,0.99522,3.53,0.57,10.6
1333,9.1,0.775,0.22,2.2,0.079,12.0,48.0,0.9976,3.18,0.51,9.6
1463,6.9,0.63,0.01,2.4,0.076,14.0,39.0,0.99522,3.34,0.53,10.8
1058,9.9,0.53,0.57,2.4,0.093,30.0,52.0,0.9971,3.19,0.76,11.6


In [63]:
#Step 5: Declare data preprocessing steps
#Remember, in Step 3, we made the mental note to standardize our features because they were on different scales
#Standardization is the process of subtracting the means from each feature and then dividing by the feature standard deviations
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled.mean(axis=0)
X_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [64]:
"""Note how we're taking the scaler object and using it to transform the training set. 
we can transform the test set using the exact same means and standard deviations used to transform the training set"""
#Applying transformer to test data
X_test_scaled = scaler.transform(X_test)
X_test_scaled.mean(axis=0)
X_test_scaled.std(axis=0)

array([1.02160495, 1.00135689, 0.97456598, 0.91099054, 0.86716698,
       0.94193125, 1.03673213, 1.03145119, 0.95734849, 0.83829505,
       1.0286218 ])

In [65]:
#modeling pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))
pipeline.get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestregressor',
   RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                         max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False))],
 'verbose': False,
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1,

In [66]:
#Step 6: Declare hyperparameters to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [67]:
#Step 7: Tune model using a cross-validation pipeline
"""Before dive into fitting our models, lets talk about cross-validation 
that helps you maximize model performance while reducing the chance of overfitting"""
#Cross-validation with pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
#Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [68]:
#Step 8: Refit on the entire training set
#Confirm model will be retained
clf.refit
#Now, we can simply use the  clf object as your model when applying it to other sets of data

True

In [69]:
#Step 9: Evaluate model pipeline on test data
#let's predict a new set of data
y_pred = clf.predict(X_test)

In [71]:
#Now we can use the metrics we imported earlier to evaluate our model performance.
print('r2_score: {}'.format(r2_score(y_test, y_pred)))
print('mean square error: {}'.format(mean_squared_error(y_test, y_pred)))

r2_score: 0.47221550941340285
mean square error: 0.3405653125


In [74]:
#Step 10: Save model for future use
#save the model to a .pkl file
#joblib.dump(clf, 'rf_regressor.pkl')
#--------------------------------------------------------------------#
#When you want to load the model again, simply use this function
#Load model from .pkl file
#clf2 = joblib.load('rf_regressor.pkl')
#Predict data set using loaded model
#clf2.predict(X_test)