In [9]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

try:
    # scikit-learn 0.23+ no longer ships sklearn.externals.joblib;
    # prefer the standalone joblib package when it is installed.
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [10]:
# Location of the red-wine quality CSV; the file uses ';' as its field separator.
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(filepath_or_buffer=dataset_url, sep=';')

In [12]:
# Preview the first five rows of the loaded frame.
print(data.head(n=5))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [13]:
# (number of rows, number of columns) of the loaded frame.
print((len(data.index), len(data.columns)))

(1599, 12)


In [14]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
print(data.describe(percentiles=[0.25, 0.5, 0.75]))

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [15]:
# Target is the 'quality' column; the features are every remaining column.
y = data['quality']
X = data.drop(columns=['quality'])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # hold out 20% of the rows for evaluation
    random_state=123,  # fixed seed so the split is repeatable
    stratify=y,        # stratify the split on the target values
)

In [17]:
# Standardise the training features column-wise with the functional helper
# and show the resulting array.
X_train_scaled = preprocessing.scale(X_train, axis=0)
print(X_train_scaled)

[[ 0.51358886  2.19680282 -0.164433   ...  1.08415147 -0.69866131
  -0.58608178]
 [-1.73698885 -0.31792985 -0.82867679 ...  1.46964764  1.2491516
   2.97009781]
 [-0.35201795  0.46443143 -0.47100705 ... -0.13658641 -0.35492962
  -0.20843439]
 ...
 [-0.98679628  1.10708533 -0.93086814 ...  0.24890976 -0.98510439
   0.35803669]
 [-0.69826067  0.46443143 -1.28853787 ...  1.08415147 -0.35492962
  -0.68049363]
 [ 3.1104093  -0.62528606  2.08377675 ... -1.61432173  0.79084268
  -0.39725809]]


In [18]:
# Column means of the scaled training features.
print(np.mean(X_train_scaled, axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]


In [19]:
# Column standard deviations of the scaled training features.
print(np.std(X_train_scaled, axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [20]:
# Build the scaler, then fit it on the training features only.
# fit() returns the estimator itself, so `scaler` ends up as the fitted object.
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(X_train)

In [21]:
# Apply the fitted scaler to the training features.
X_train_scaled = scaler.transform(X=X_train)

In [22]:
# Column means after transforming with the fitted scaler.
print(np.mean(X_train_scaled, axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]


In [23]:
# Column standard deviations after transforming with the fitted scaler.
print(np.std(X_train_scaled, axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [24]:
# Modelling pipeline: standardise the features, then fit a 100-tree random forest.
# random_state is fixed (same seed as the train/test split) so that the forest's
# random sampling, and therefore the grid-search result and the reported scores,
# are reproducible from one run to the next.
pipeline = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestRegressor(n_estimators=100, random_state=123),
)

In [27]:
# Grid of random-forest settings to search over.
# Keys follow the '<pipeline step name>__<parameter>' convention.
# max_features=None means "use all features", which is what 'auto' meant for
# regressors; 'auto' itself was deprecated in scikit-learn 1.1 and removed in
# 1.3, so None keeps the same search space on old and new versions.
hyperparameters = {
    'randomforestregressor__max_features': [None, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],
}

In [28]:
# 10-fold cross-validated grid search over the pipeline's hyperparameters.
clf = GridSearchCV(estimator=pipeline, param_grid=hyperparameters, cv=10)

In [29]:
# Run the search on the training split (the pipeline does its own scaling).
clf.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [30]:
# Best hyperparameter combination found by the search.
print(dict(clf.best_params_))

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


In [35]:
# Show the search's `refit` setting.
print(clf.get_params(deep=False)['refit'])

True


In [36]:
# Predict quality for the held-out test features.
y_pred = clf.predict(X=X_test)

In [37]:
# R^2 of the predictions on the test split.
print(r2_score(y_true=y_test, y_pred=y_pred))

0.47110503057085784


In [38]:
# Mean squared error of the predictions on the test split.
print(mean_squared_error(y_true=y_test, y_pred=y_pred))

0.34128187500000007


In [39]:
# Persist the fitted search object to disk in the working directory.
joblib.dump(value=clf, filename='rf_regressor.pkl')

['rf_regressor.pkl']

In [40]:
# Reload the object saved above and predict on the test features again.
# NOTE: joblib files are pickles; only load files from a trusted source.
clf2 = joblib.load(filename='rf_regressor.pkl')
clf2.predict(X=X_test)

array([6.55, 5.59, 5.05, 5.47, 6.35, 5.51, 5.2 , 4.73, 5.01, 6.01, 5.38,
       5.73, 5.79, 5.03, 5.79, 5.67, 6.57, 5.7 , 5.72, 6.97, 5.38, 5.65,
       5.1 , 5.99, 5.93, 5.05, 5.37, 5.15, 5.96, 5.95, 5.94, 6.5 , 5.99,
       5.04, 5.  , 5.95, 5.07, 6.1 , 4.97, 5.95, 4.9 , 5.95, 6.58, 5.14,
       6.2 , 5.34, 5.47, 5.49, 5.15, 6.35, 5.92, 5.3 , 5.8 , 5.25, 5.46,
       5.72, 5.36, 5.44, 5.  , 5.3 , 5.29, 5.19, 5.11, 5.85, 5.9 , 5.36,
       6.44, 5.04, 5.19, 6.67, 5.66, 5.87, 5.14, 5.04, 5.33, 6.03, 5.31,
       5.16, 5.17, 5.34, 6.4 , 5.66, 6.13, 6.33, 5.12, 6.12, 6.46, 6.44,
       5.71, 5.86, 5.86, 5.35, 6.51, 5.76, 5.72, 5.83, 6.78, 6.86, 5.49,
       6.77, 5.06, 5.4 , 5.12, 6.56, 5.08, 4.72, 5.66, 4.99, 5.56, 6.  ,
       5.85, 5.47, 6.12, 5.47, 5.2 , 5.13, 5.93, 5.09, 4.86, 6.02, 5.85,
       5.06, 5.84, 6.12, 5.34, 5.52, 5.22, 5.87, 5.45, 5.46, 5.86, 6.22,
       5.24, 5.28, 5.04, 6.34, 5.04, 5.2 , 6.56, 5.58, 5.18, 5.05, 5.55,
       6.02, 5.37, 5.39, 5.16, 6.5 , 5.78, 5.13, 5.