In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib



In [5]:
ds = pd.read_csv('winequality-red.csv')
ds

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [6]:
ds.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [7]:
ds.shape

(1599, 12)

In [8]:
ds.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [9]:
y = ds.quality
x = ds.drop('quality', axis=1)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.3,random_state=111,stratify=y)

In [11]:
# Standardization
x_train_scaled = preprocessing.scale(x_train)
x_train_scaled

array([[ 0.91168023,  1.13222431,  0.80083432, ..., -1.02323948,
        -0.53259588,  0.52107073],
       [ 0.74440265,  0.12514716, -0.52858904, ..., -1.08754754,
        -0.2822389 , -0.95355985],
       [ 0.52136588, -0.88192999,  0.85196599, ..., -0.44446696,
         0.969546  ,  0.33674191],
       ...,
       [-1.54172422,  0.48881391, -1.39782738, ...,  1.74200703,
        -0.84554211, -0.49273779],
       [-0.37078119,  1.24412177, -0.01727236, ..., -0.44446696,
        -1.28366683, -1.04572426],
       [-0.76109553,  2.5029682 , -1.19330071, ...,  0.26292169,
        -0.34482815, -0.95355985]])

In [12]:
x_train_scaled.mean(axis=0)

array([ 5.73863268e-16, -1.11121518e-16,  5.39733088e-17,  5.71482093e-17,
       -1.84144230e-16,  8.73097642e-18,  6.66729109e-17, -1.58951394e-14,
       -4.38771251e-15,  1.26202296e-16, -6.52442056e-16])

In [13]:
x_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [14]:
#Cross validation Pipeline:
scaler = preprocessing.StandardScaler().fit(x_train)

In [15]:
x_train_scaled = scaler.transform(x_train)

In [16]:
x_train_scaled.mean(axis=0)

array([ 5.73863268e-16, -1.11121518e-16,  5.39733088e-17,  5.71482093e-17,
       -1.84144230e-16,  8.73097642e-18,  6.66729109e-17, -1.58951394e-14,
       -4.38771251e-15,  1.26202296e-16, -6.52442056e-16])

In [17]:
x_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [20]:
x_test_scaled = scaler.transform(x_test)
x_test_scaled.mean(axis=0)

array([-0.08420217,  0.00351695, -0.04092076, -0.05840553,  0.08278128,
       -0.14078784, -0.03968702, -0.01433399,  0.04280056,  0.06369707,
       -0.03575592])

In [21]:
x_test_scaled.std(axis=0)

array([0.89526952, 1.00499383, 0.98514009, 0.81161148, 1.31574843,
       0.85613274, 0.9679061 , 0.90336649, 0.97419788, 1.18875515,
       0.93773011])

In [22]:
pipeline = make_pipeline(preprocessing.StandardScaler(),RandomForestRegressor(n_estimators=120))

In [23]:
# Hyperparameter
pipeline.get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestregressor',
   RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                         max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=120,
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False))],
 'verbose': False,
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1,

In [24]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],'randomforestregressor__max_depth': [None,4,2,1]}

In [26]:
# cross-validation
rp= GridSearchCV(pipeline,hyperparameters, cv=9)
rp.fit(x_train, y_train)

GridSearchCV(cv=9, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split=

In [27]:
rp.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'log2'}

In [28]:
# confirming Model be Retained
rp.refit

True

In [29]:
# start predicting the data 
y_pred = rp.predict(x_test)

In [30]:
#Apply metrics on the completed Model
r2_score(y_test, y_pred)

0.4536499860529988

In [31]:
mean_squared_error(y_test, y_pred)

0.35421455439814814

In [32]:
# To save the model
joblib.dump(rp, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [33]:
# To load the model from pickle file
rp2 = joblib.load('rf_regressor.pkl')

In [35]:
rp2.predict(x_test)

array([5.13333333, 5.54166667, 6.2       , 5.225     , 5.73333333,
       5.825     , 5.88333333, 6.95833333, 5.45      , 5.68333333,
       5.51666667, 5.25833333, 6.13333333, 5.35833333, 5.28333333,
       5.25      , 5.40833333, 5.13333333, 5.40833333, 4.91666667,
       5.94166667, 6.43333333, 5.9       , 5.525     , 5.225     ,
       5.68333333, 5.825     , 5.775     , 5.925     , 5.325     ,
       5.5       , 5.18333333, 5.19166667, 5.20833333, 5.18333333,
       4.93333333, 5.90833333, 5.39166667, 5.85833333, 5.6       ,
       5.15833333, 4.88333333, 5.28333333, 5.09166667, 4.85833333,
       5.55833333, 5.625     , 5.25833333, 5.475     , 5.95833333,
       6.39166667, 5.33333333, 6.73333333, 5.93333333, 5.225     ,
       6.84166667, 6.20833333, 6.00833333, 5.73333333, 5.66666667,
       5.15833333, 5.76666667, 5.06666667, 5.88333333, 5.33333333,
       6.58333333, 6.15      , 6.93333333, 5.76666667, 5.25      ,
       5.375     , 5.9       , 6.68333333, 6.525     , 5.86666