In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.externals import joblib




In [2]:
#Load and read the data set 
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [3]:
print (dataset_url)


http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv


In [5]:
print (data.head())

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [6]:
data = pd.read_csv(dataset_url, sep=';')
print (data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [7]:
print (data.shape)

(1599, 12)


In [8]:
print (data.describe)

<bound method NDFrame.describe of       fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.0

In [8]:
#Extract input features (1-10 and output feature column quality)
y = data.quality
X = data.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

In [9]:
print (X_train)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
691             9.2             0.920         0.24             2.6      0.087   
1475            5.3             0.470         0.11             2.2      0.048   
1065            7.7             0.610         0.18             2.4      0.083   
1159           10.2             0.410         0.43             2.2      0.110   
227             9.0             0.820         0.14             2.6      0.089   
...             ...               ...          ...             ...        ...   
463             8.1             0.660         0.70             2.2      0.098   
558            10.9             0.530         0.49             4.6      0.118   
1094            6.6             0.725         0.09             5.5      0.117   
792             7.1             0.610         0.02             2.5      0.081   
381            13.7             0.415         0.68             2.9      0.085   

      free sulfur dioxide  

In [10]:
print (y_train)

691     5
1475    7
1065    6
1159    5
227     5
       ..
463     5
558     6
1094    6
792     6
381     6
Name: quality, Length: 1279, dtype: int64


In [11]:
print (y_test)

797     7
871     5
1333    5
1463    6
1058    7
       ..
211     6
162     6
748     6
914     6
557     5
Name: quality, Length: 320, dtype: int64


In [12]:
#Standardize the datasets 
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

#Transform the datasets
X_test_scaled = scaler.transform(X_test)

In [13]:
print (X_train_scaled.mean(axis=0))


[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]


In [14]:
print (X_train_scaled.std(axis=0))


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [15]:
print (X_test_scaled.mean(axis=0))


[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]


In [16]:
print (X_test_scaled.std(axis=0))


[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [17]:
#Setup the pipeline for the model - RandomForestRegressor
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [18]:
print (pipeline.get_params())


{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
           

In [19]:
#Assign the params for the model
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [20]:
#Cross validation the model
clf = GridSearchCV(pipeline, hyperparameters, cv=10)


In [21]:
#Fit and tune the model (long process time)
clf.fit(X_train, y_train)


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [22]:
#see what the computer gives us the param
print (clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


In [23]:
print (clf.refit)

True


In [24]:
# Now predict a new set of data
y_pred = clf.predict(X_test)

In [25]:
#Get the r2 score for test vs the predict
print (r2_score(y_test, y_pred))

0.4601668381863311


In [26]:
#Get f1 score (but not working!!!)
# f1_score(y_test, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None,zero_division="warn")
#Update - f1 score is for classification problem, R2 is for regression problem.  Thus for this problem, it doesn't make sense for F1 scroe

In [27]:
print (mean_squared_error(y_test, y_pred))


0.34834


In [28]:
#save work (pickle the python to .pkl as winesnob_rf_regression.pkl)
joblib.dump(clf, 'winesnob_rf_regressor.pkl')

['winesnob_rf_regressor.pkl']

In [29]:
#to run, load model from .pkl file and execute
clf2 = joblib.load('winesnob_rf_regressor.pkl')
clf2.predict(X_test)

array([6.4 , 5.54, 5.  , 5.46, 6.2 , 5.59, 4.96, 4.78, 5.  , 6.03, 5.32,
       5.68, 5.79, 5.1 , 5.8 , 5.71, 6.57, 5.67, 5.68, 6.95, 5.37, 5.58,
       5.08, 6.04, 5.89, 5.11, 5.29, 5.14, 5.95, 6.02, 5.87, 6.53, 5.99,
       5.07, 5.  , 5.92, 5.06, 6.27, 4.98, 5.8 , 4.88, 6.13, 6.6 , 5.09,
       6.22, 5.22, 5.57, 5.52, 5.11, 6.53, 6.14, 5.29, 5.75, 5.13, 5.65,
       5.67, 5.32, 5.35, 4.97, 5.38, 5.21, 5.21, 5.05, 5.72, 5.88, 5.31,
       6.47, 5.06, 5.08, 6.68, 5.8 , 5.72, 5.13, 5.  , 5.34, 5.99, 5.27,
       5.13, 5.23, 5.31, 6.37, 5.47, 6.19, 6.33, 5.11, 5.93, 6.51, 6.32,
       5.86, 5.81, 5.88, 5.37, 6.3 , 5.66, 5.59, 5.76, 6.76, 6.72, 5.52,
       6.86, 5.05, 5.56, 5.13, 6.4 , 5.05, 4.72, 5.67, 4.99, 5.66, 5.98,
       5.77, 5.48, 6.09, 5.37, 5.21, 5.23, 5.93, 5.1 , 4.81, 6.07, 5.82,
       5.1 , 5.86, 6.22, 5.31, 5.38, 5.32, 6.01, 5.33, 5.42, 5.81, 6.27,
       5.13, 5.33, 5.04, 6.26, 5.01, 5.14, 6.66, 5.56, 5.22, 5.04, 5.55,
       6.15, 5.35, 5.35, 5.12, 6.53, 5.83, 5.18, 5.

In [30]:
#compare above to the expected valute
print (y_test)

797     7
871     5
1333    5
1463    6
1058    7
       ..
211     6
162     6
748     6
914     6
557     5
Name: quality, Length: 320, dtype: int64


In [31]:
#Try to compare the predicted value to the y_test data sets
#Not working yet
#from difflib import Differ