In [11]:
!pip install pandas --quiet
!pip install numpy --quiet
!pip install -U scikit-learn
!pip install joblib 



In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [20]:
# Data downloaded from https://archive.ics.uci.edu/dataset/186/wine+quality
wine = pd.read_csv("winequality-red.csv", sep=";")
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [21]:
wine.shape

(1599, 12)

In [45]:
y= wine.quality
X = wine.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123, stratify=y)

In [46]:
# Transformer API
scaler = preprocessing.StandardScaler().fit(X_train)

#Applying transformer to training dataPython
X_train_scaled = scaler.transform(X_train)

print(X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))

[-1.85165758e-17  5.80186042e-17  8.76451255e-17  2.34851903e-16
  9.38173174e-17 -8.14729336e-17  1.48132606e-17 -3.06566601e-14
  1.40664254e-15 -1.77759128e-16  1.25172052e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [47]:
#List tunable Hyperparameters
pipeline = make_pipeline(preprocessing.StandardScaler(),RandomForestRegressor(n_estimators=10))
print(pipeline.get_params ())

{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(n_estimators=10))], 'verbose': False, 'standardscaler': StandardScaler(), 'randomforestregressor': RandomForestRegressor(n_estimators=10), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'randomforestregressor__bootstrap': True, 'randomforestregressor__ccp_alpha': 0.0, 'randomforestregressor__criterion': 'squared_error', 'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 1.0, 'randomforestregressor__max_leaf_nodes': None, 'randomforestregressor__max_samples': None, 'randomforestregressor__min_impurity_decrease': 0.0, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__min_weight_fraction_leaf': 0.0, 'randomforestregressor__n_estimators': 10, 'randomforestregressor__n_jobs': None, 'randomforestregressor__oob_score': False, 'randomfo

In [48]:
# Declare hyperparameters to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt'],
                    'randomforestregressor__max_depth': [None, 1, 2, 4]
                  }

In [49]:
# Cross-validation
clf = GridSearchCV (pipeline, hyperparameters, cv=3)

# Fit and tune model
clf.fit(X_train, y_train)

12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lalo/Applications/MiniConda3/envs/ml/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lalo/Applications/MiniConda3/envs/ml/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lalo/Applications/MiniConda3/envs/ml/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_pa

In [50]:
# Cross-validation
clf = GridSearchCV (pipeline, hyperparameters, cv=3)

# Fit and tune model
clf.fit(X_train, y_train)

12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lalo/Applications/MiniConda3/envs/ml/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lalo/Applications/MiniConda3/envs/ml/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lalo/Applications/MiniConda3/envs/ml/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_pa

In [51]:
y_pred = clf.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test, y_pred))

0.4805802638648441
0.33987499999999987


In [53]:
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

# References

URL: https://www.simplilearn.com/tutorials/python-tutorial/scikit-learn