# Fitting a model

In [1]:
###
# Code adopted from https://github.com/WillKoehrsen/machine-learning-project-walkthrough/blob/master/Machine%20Learning%20Project%20Part%202.ipynb
# By WillKoehrsen
###
import pandas as pd
import numpy as np
from pprint import pprint
pd.set_option('display.max_columns', None)

from sklearn.datasets import load_boston
data = load_boston()
df = pd.concat([pd.DataFrame(data['data'], columns = data['feature_names']), 
                pd.DataFrame(data['target'], columns=['MEDV'])], axis=1)

In [2]:
from sklearn.model_selection import train_test_split
X = df.drop(['MEDV'], axis=1)
y = df['MEDV']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [3]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
#Algorithms
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler

In [4]:
model=make_pipeline(RobustScaler(), RandomForestRegressor(random_state=42))
model_fit=model.fit(X_train, y_train)

In [5]:
y_pred=model_fit.predict(X_test)
print('Model Score(R2 Score):', round(model_fit.score(X_test, y_test), 3))
print('MAE:', round(mean_absolute_error(y_test, y_pred), 3))

Model Score(R2 Score): 0.871
MAE: 2.084


In [6]:
SGDRegressor(random_state=42).get_params()

{'alpha': 0.0001,
 'average': False,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.01,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'invscaling',
 'loss': 'squared_loss',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'penalty': 'l2',
 'power_t': 0.25,
 'random_state': 42,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [7]:
RandomForestRegressor(random_state=42).get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [8]:
KNeighborsRegressor().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [9]:
XGBRegressor(random_state=42).get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

## SGDRegressor

In [10]:
grid = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    'max_iter': [1000], # number of epochs
    'loss': ['squared_loss'], # logistic regression,
    'penalty': ['l2']
}

In [11]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Create the model to use for hyperparameter tuning
model = SGDRegressor(random_state=42)

# Set up the random search with 4-fold cross validation
random_sgdr = RandomizedSearchCV(estimator=model,
                                 param_distributions=grid,
                                 scoring = 'roc_auc',
                                 n_jobs = -1,
                                 verbose = 1,
                                 random_state=42)

In [12]:
random_sgdr.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




RandomizedSearchCV(estimator=SGDRegressor(random_state=42), n_jobs=-1,
                   param_distributions={'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0,
                                                  10.0, 100.0, 1000.0],
                                        'loss': ['squared_loss'],
                                        'max_iter': [1000], 'penalty': ['l2']},
                   random_state=42, scoring='roc_auc', verbose=1)

In [13]:
sgdrmodel=random_sgdr.best_estimator_
sgdrmodel=make_pipeline(RobustScaler(), sgdrmodel)
sgdrmodel_fit=sgdrmodel.fit(X_train, y_train)

In [14]:
y_pred=sgdrmodel_fit.predict(X_test)
print('Model Score(R2 Score):', round(sgdrmodel_fit.score(X_test, y_test), 3))
print('MAE:', round(mean_absolute_error(y_test, y_pred), 3))

Model Score(R2 Score): 0.686
MAE: 3.245


In [15]:
sgdrmodel

Pipeline(steps=[('robustscaler', RobustScaler()),
                ('sgdregressor', SGDRegressor(random_state=42))])

## RandomForestRegressor

In [16]:
# Number of trees used in the boosting process
n_estimators = [500, 900, 1100]

# Maximum depth of each tree
max_depth = [3, 5, 10]

# Minimum number of samples per leaf
min_samples_leaf = [2, 4, 6]

# Minimum number of samples to split a node
min_samples_split = [4, 6]

# Maximum number of features to consider for making splits
max_features = ['auto', 'sqrt', 'log2', None]

# Define the grid of hyperparameters to search
hyperparameter_grid = {'n_estimators': n_estimators,
                       'max_depth': max_depth,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [17]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Create the model to use for hyperparameter tuning
model = RandomForestRegressor(random_state = 42)

# Set up the random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=model,
                         param_distributions=hyperparameter_grid,
                         scoring = 'neg_mean_absolute_error',
                         n_jobs = -1, 
                         verbose = 1,
                         return_train_score = True)

In [18]:
random_cv.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
                   param_distributions={'max_depth': [3, 5, 10],
                                        'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [2, 4, 6],
                                        'min_samples_split': [4, 6],
                                        'n_estimators': [500, 900, 1100]},
                   return_train_score=True, scoring='neg_mean_absolute_error',
                   verbose=1)

In [19]:
hmodel=random_cv.best_estimator_
hmodel=make_pipeline(RobustScaler(), hmodel)
hmodel_fit=hmodel.fit(X_train, y_train)

In [20]:
y_pred=hmodel_fit.predict(X_test)
print('Model Score(R2 Score):', round(hmodel_fit.score(X_test, y_test), 3))
print('MAE:', round(mean_absolute_error(y_test, y_pred), 3))

Model Score(R2 Score): 0.857
MAE: 2.136


## KNeighborsRegressor

In [21]:
# Define our candidate hyperparameters
hp_candidates = [{'n_neighbors': [2,3,4,5,6], 
                  'weights': ['uniform','distance']}]

In [22]:
# Create the model to use for hyperparameter tuning
model = KNeighborsRegressor()

# Set up the random search with 4-fold cross validation
random_knr = RandomizedSearchCV(estimator=model,
                             param_distributions=hp_candidates,
                             scoring = 'neg_mean_absolute_error',
                             n_jobs = -1, 
                             verbose = 1,
                             return_train_score = True)

In [23]:
random_knr.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(estimator=KNeighborsRegressor(), n_jobs=-1,
                   param_distributions=[{'n_neighbors': [2, 3, 4, 5, 6],
                                         'weights': ['uniform', 'distance']}],
                   return_train_score=True, scoring='neg_mean_absolute_error',
                   verbose=1)

In [24]:
knnmodel=random_knr.best_estimator_
knnmodel=make_pipeline(RobustScaler(), knnmodel)
knnmodel_fit=knnmodel.fit(X_train, y_train)

In [25]:
y_pred=knnmodel_fit.predict(X_test)
print('Model Score(R2 Score):', round(knnmodel_fit.score(X_test, y_test), 3))
print('MAE:', round(mean_absolute_error(y_test, y_pred), 3))

Model Score(R2 Score): 0.759
MAE: 2.755


## XGBRegressor

In [26]:
parameters = {'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

In [27]:
# Create the model to use for hyperparameter tuning
model = XGBRegressor(random_state=42)

# Set up the random search with 4-fold cross validation
random_xgbr = RandomizedSearchCV(estimator=model,
                             param_distributions=parameters,
                             scoring = 'neg_mean_absolute_error',
                             n_jobs = -1, 
                             verbose = 1,
                             n_iter=10,
                             return_train_score = True)

In [28]:
random_xgbr.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n_jobs...
                                          scale_pos_weight=None, subsample=None,
                                          tree_method=None,
                                          validate_parameters=None,
 

In [29]:
xgbrmodel=random_xgbr.best_estimator_
xgbrmodel=make_pipeline(RobustScaler(), xgbrmodel)
xgbrmodel_fit=xgbrmodel.fit(X_train, y_train)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [30]:
y_pred=xgbrmodel_fit.predict(X_test)
print('Model Score(R2 Score):', round(xgbrmodel_fit.score(X_test, y_test), 3))
print('MAE:', round(mean_absolute_error(y_test, y_pred), 3))

Model Score(R2 Score): 0.879
MAE: 1.949
