# Download data Set and load

In [1]:
print("LAB 2: Evaluation of Machine Learning Regressors: Implementation and Analysis")
print("Python ≥3.5 and Scikit-Learn ≥0.20 are required.")

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

import numpy as np
import os
import tarfile
import urllib.request
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Funtion create for download and extract the data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()

#import data to Pandas DataFrame 
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

#Load housing Data
housing = load_housing_data()


LAB 2: Evaluation of Machine Learning Regressors: Implementation and Analysis
Python ≥3.5 and Scikit-Learn ≥0.20 are required.


# Test set generation

Adding data set splitting category based on median income

In [2]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [4]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [5]:
housing = strat_train_set.copy()

# Prepare the Data

In [6]:
housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

In [7]:
housing_num = housing.drop("ocean_proximity", axis=1)

Fill the missing data with median value of the each attributes.

In [8]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
#Apply the imputation (Transform the training set)
housing_num_imputed = imputer.transform(housing_num)

Handling text attribute in the data set with OneHotEncoder

In [9]:
from sklearn.preprocessing import OneHotEncoder
housing_cat = housing[["ocean_proximity"]]
cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)


In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

# column index
col_names = "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing.columns.get_loc(c) for c in col_names] # get the column indices

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [11]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND,5.485836,3.168555
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN,6.927083,2.623698
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND,5.393333,2.223333
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN,3.886128,1.859213
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN,6.096552,3.167241


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [13]:
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

In [14]:
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

In [15]:
housing_prepared.shape

(16512, 16)

## Train Linear Regression model on prepared data set 

In [16]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

In [17]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [ 85120. 305856. 152256. 186624. 244480.]


In [18]:
print("Labels:", list(some_labels))

Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


 Root-Mean-Squared-Error

In [19]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68634.44789635955

In [20]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

49466.61415939922

Cross Validation: Linear Regression

In [21]:
from sklearn.model_selection import cross_val_score

def display_scores(scores): # define the function display_scores
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [71778.64956897 64114.99166359 67771.17124356 68613.18677014
 66828.8611363  72551.43826898 73997.08050233 68830.50397661
 66443.28836884 70144.65874699]
Mean: 69107.38302462916
Standard deviation: 2886.202548783683


In [22]:
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

count       10.000000
mean     69107.383025
std       3042.324614
min      64114.991664
25%      67064.438663
50%      68721.845373
75%      71370.151863
max      73997.080502
dtype: float64

## Train Decision Tree Regression model on prepared data set

In [23]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(random_state=42)

In [24]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

Evaluation Using Cross-Validation

In [25]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [26]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [72831.45749112 69973.18438322 69528.56551415 72517.78229792
 69145.50006909 79094.74123727 68960.045444   73344.50225684
 69826.02473916 71077.09753998]
Mean: 71629.89009727491
Standard deviation: 2914.035468468928


In [27]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [71778.64956897 64114.99166359 67771.17124356 68613.18677014
 66828.8611363  72551.43826898 73997.08050233 68830.50397661
 66443.28836884 70144.65874699]
Mean: 69107.38302462916
Standard deviation: 2886.202548783683


## Train Random Forest Regression model on prepared data set

In [28]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor(random_state=42)

In [29]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

18650.698705770003

Evaluation Using Cross-Validation

In [30]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [51559.63379638 48737.57100062 47210.51269766 51875.21247297
 47577.50470123 51863.27467888 52746.34645573 50065.1762751
 48664.66818196 54055.90894609]
Mean: 50435.58092066179
Standard deviation: 2203.3381412764606


In [31]:
scores_lin_reg = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores_lin_reg)).describe()

count       10.000000
mean     69107.383025
std       3042.324614
min      64114.991664
25%      67064.438663
50%      68721.845373
75%      71370.151863
max      73997.080502
dtype: float64

In [32]:
scores_forest_reg = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores_forest_reg)).describe()

count       10.000000
mean     50435.580921
std       2322.522327
min      47210.512698
25%      48682.893887
50%      50812.405036
75%      51872.228024
max      54055.908946
dtype: float64

In [33]:
scores_tree_reg = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores_tree_reg)).describe()

count       10.000000
mean     71629.890097
std       3071.663088
min      68960.045444
25%      69602.930320
50%      70525.140962
75%      72753.038693
max      79094.741237
dtype: float64

Support Vector Machine regressor

In [34]:
from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

111095.06635291968

In [35]:
scores_svm_reg = cross_val_score(svm_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores_svm_reg)).describe()

count        10.000000
mean     111814.106926
std        2741.526360
min      106998.562270
25%      110848.005913
50%      112450.979115
75%      113251.558304
max      116063.778167
dtype: float64

# Fine-Tune Model

# Grid search

## RandomForestRegressor

In [72]:
from sklearn.model_selection import GridSearchCV

param_grid = [    
   { 'n_estimators': [3, 10, 30, 50,100],                      # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],         # Number of features to consider when looking for the best split
    'max_depth': [None, 10, 20, 30],                 # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],                 # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4,10],                   # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False] }
  ]

forest_reg = RandomForestRegressor(random_state=42)
 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,n_jobs=-1,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid=[{'bootstrap': [True, False],
                          'max_depth': [None, 10, 20, 30],
                          'max_features': ['auto', 'sqrt', 'log2'],
                          'min_samples_leaf': [1, 2, 4, 10],
                          'min_samples_split': [2, 5, 10],
                          'n_estimators': [3, 10, 30, 50, 100]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [73]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [74]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      random_state=42)

In [90]:
best_grid_search_RandomForestRegressor=RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      random_state=42)

In [39]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

63895.161577951665 {'max_features': 2, 'n_estimators': 3}
54916.32386349543 {'max_features': 2, 'n_estimators': 10}
52885.86715332332 {'max_features': 2, 'n_estimators': 30}
60075.3680329983 {'max_features': 4, 'n_estimators': 3}
52495.01284985185 {'max_features': 4, 'n_estimators': 10}
50187.24324926565 {'max_features': 4, 'n_estimators': 30}
58064.73529982314 {'max_features': 6, 'n_estimators': 3}
51519.32062366315 {'max_features': 6, 'n_estimators': 10}
49969.80441627874 {'max_features': 6, 'n_estimators': 30}
58895.824998155826 {'max_features': 8, 'n_estimators': 3}
52459.79624724529 {'max_features': 8, 'n_estimators': 10}
49898.98913455217 {'max_features': 8, 'n_estimators': 30}
62381.765106921855 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54476.57050944266 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59974.60028085155 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52754.5632813202 {'bootstrap': False, 'max_features': 3, 'n_estimators': 1

## DecisionTreeRegressor

In [85]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Initialize the Decision Tree Regressor
tree_reg = DecisionTreeRegressor(random_state=42)

# Define the parameter grid for Grid Search
param_grid = {
    'max_depth': [None, 10, 20, 30, 40],               # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],                  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],                    # Minimum number of samples required to be at a leaf node
    'max_features': [None, 'auto', 'sqrt', 'log2'],    # Number of features to consider when looking for the best split
}

# Perform Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(tree_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

# Fit the model with the prepared data
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
             param_grid={'max_depth': [None, 10, 20, 30, 40],
                         'max_features': [None, 'auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [86]:
grid_search.best_estimator_

DecisionTreeRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10,
                      random_state=42)

In [89]:
best_grid_search_DecisionTreeRegressor=DecisionTreeRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10,
                      random_state=42)

In [87]:
grid_search.best_params_

{'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10}

In [88]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

70911.53911695824 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
68813.57786263606 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
66569.9365618405 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
68954.53990376847 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
68081.85537599486 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5}
65453.62533767724 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
63849.08245892427 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
63849.08245892427 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 5}
63151.06887800804 {'max_depth': None, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
70911.53911695824 {'max_depth': Non

## SVR

In [80]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Initialize the SVR model
svr_reg = SVR()

# Define the parameter grid for Grid Search
param_grid = [
        {'kernel': ['linear'], 'C': [1000., 100., 1000., 10000.]},
        {'kernel': ['rbf'], 'C': [1.0, 10., 100.,1000.0],
         'gamma': [0.01, 0.1, 1.0]},
    ]
        


# Perform Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(svr_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',n_jobs=-1,
                           return_train_score=True)

# Fit the model with the prepared data
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=SVR(), n_jobs=-1,
             param_grid=[{'C': [1000.0, 100.0, 1000.0, 10000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 10.0, 100.0, 1000.0],
                          'gamma': [0.01, 0.1, 1.0], 'kernel': ['rbf']}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [81]:
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

70292.4276756548

In [82]:
grid_search.best_params_

{'C': 10000.0, 'kernel': 'linear'}

In [83]:
grid_search.best_estimator_

SVR(C=10000.0, kernel='linear')

In [84]:
best_grid_searchSVR=SVR(C=10000.0, kernel='linear')

In [47]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

84649.6069847477 {'C': 10.0, 'kernel': 'linear'}
71635.55363138489 {'C': 100.0, 'kernel': 'linear'}
70396.49756558672 {'C': 1000.0, 'kernel': 'linear'}
70292.4276756548 {'C': 10000.0, 'kernel': 'linear'}
118819.34364522224 {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
118643.66544284696 {'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}
118898.89058474178 {'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'}
117862.25734600889 {'C': 10.0, 'gamma': 0.01, 'kernel': 'rbf'}
116181.25173057283 {'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'}
118591.6498917307 {'C': 10.0, 'gamma': 1.0, 'kernel': 'rbf'}
109136.51158303731 {'C': 100.0, 'gamma': 0.01, 'kernel': 'rbf'}
98574.6823262157 {'C': 100.0, 'gamma': 0.1, 'kernel': 'rbf'}
115840.14747601148 {'C': 100.0, 'gamma': 1.0, 'kernel': 'rbf'}
78409.42560534432 {'C': 1000.0, 'gamma': 0.01, 'kernel': 'rbf'}
71916.27653729175 {'C': 1000.0, 'gamma': 0.1, 'kernel': 'rbf'}
100822.48562769855 {'C': 1000.0, 'gamma': 1.0, 'kernel': 'rbf'}


## LinearRegression

In [51]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

from sklearn.model_selection import GridSearchCV

# Remove max_features and other parameters not applicable to LinearRegression
param_grid = [
    
    {'fit_intercept': [True, False]}
  ]

grid_search = GridSearchCV(lin_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=LinearRegression(),
             param_grid=[{'fit_intercept': [True, False]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [52]:
grid_search.best_params_

{'fit_intercept': False}

In [53]:
grid_search.best_estimator_

LinearRegression(fit_intercept=False)

In [91]:
best_LinearRegression=LinearRegression(fit_intercept=False)

In [54]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

4184280755014805.5 {'fit_intercept': True}
69166.4308990913 {'fit_intercept': False}


# Randomized Search

## Random forest

In [75]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=100, high=300),
        'max_features': randint(low=1, high=8),
        'max_features': ['auto', 'sqrt', 'log2'],         # Number of features to consider when looking for the best split
        'max_depth': randint(low=10, high=100),                 # Maximum depth of the tree
        'min_samples_split': randint(low=5, high=50),                 # Minimum number of samples required to split a node
        'min_samples_leaf': randint(low=1, high=20),                   # Minimum number of samples required to be at a leaf node
        'bootstrap': [True, False] 
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5,n_jobs=-1, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C7CC10C820>,
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C7B7141340>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C7D0154550>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C7D01BBB20>},
                   random_state=42, scoring='neg_mean_squared_error')

In [76]:
rnd_search.best_params_

{'bootstrap': True,
 'max_depth': 30,
 'max_features': 'auto',
 'min_samples_leaf': 7,
 'min_samples_split': 22,
 'n_estimators': 231}

In [77]:
rnd_search.best_estimator_

RandomForestRegressor(max_depth=30, min_samples_leaf=7, min_samples_split=22,
                      n_estimators=231, random_state=42)

In [79]:
best_rnd_search_RandomForestRegressor=RandomForestRegressor(max_depth=30, min_samples_leaf=7, min_samples_split=22,
                      n_estimators=231, random_state=42)

In [78]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

53550.805474104374 {'bootstrap': True, 'max_depth': 61, 'max_features': 'auto', 'min_samples_leaf': 15, 'min_samples_split': 47, 'n_estimators': 171}
54012.08948261838 {'bootstrap': True, 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 19, 'min_samples_split': 27, 'n_estimators': 174}
52625.07981182363 {'bootstrap': True, 'max_depth': 97, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 44, 'n_estimators': 251}
51731.8726250901 {'bootstrap': True, 'max_depth': 31, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 28, 'n_estimators': 257}
61325.769226241144 {'bootstrap': False, 'max_depth': 11, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 16, 'n_estimators': 157}
59441.05994059053 {'bootstrap': False, 'max_depth': 98, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 32, 'n_estimators': 287}
52564.88985374141 {'bootstrap': False, 'max_depth': 24, 'max_features': 'sqrt', 'min_samples_leaf': 15, 'm

## DecisionTreeRegressor

In [59]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Initialize the Decision Tree Regressor
tree_reg = DecisionTreeRegressor(random_state=42)

# Define the parameter distributions for Randomized Search
param_distribs = {
    'max_depth': [None, 10, 20, 30, 40, 50],               # Maximum depth of the tree
    'min_samples_split': randint(2, 20),                   # Minimum number of samples required to split a node
    'min_samples_leaf': randint(1, 20),                    # Minimum number of samples required to be at a leaf node
    'max_features': [None, 'auto', 'sqrt', 'log2'],        # Number of features to consider when looking for the best split
}

# Perform Randomized Search with 5-fold cross-validation
rnd_search = RandomizedSearchCV(tree_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                random_state=42, n_jobs=-1,return_train_score=True)

# Fit the model with the prepared data
rnd_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'max_depth': [None, 10, 20, 30, 40, 50],
                                        'max_features': [None, 'auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C7CF4208B0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C7B78609D0>},
                   random_state=42, return_train_score=True,
                   scoring='neg_mean_squared_error')

In [60]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

4184280755014805.5 {'fit_intercept': True}
69166.4308990913 {'fit_intercept': False}


In [61]:
rnd_search.best_estimator_

DecisionTreeRegressor(max_depth=50, min_samples_leaf=18, min_samples_split=13,
                      random_state=42)

In [92]:
best_rnd_search_DecisionTreeRegressor=DecisionTreeRegressor(max_depth=50, min_samples_leaf=18, min_samples_split=13,
                      random_state=42)

In [62]:
rnd_search.best_params_

{'max_depth': 50,
 'max_features': None,
 'min_samples_leaf': 18,
 'min_samples_split': 13}

## SVR

In [63]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Initialize the SVR model
svr_reg = SVR()

# Define the parameter distributions for Randomized Search
param_distribs = {
    'C':  uniform(1000, 100000),                  # Regularization parameter (sampled from uniform distribution)
    'kernel': ['linear', 'rbf'],             # Kernel type: 'linear' or 'rbf' (Radial Basis Function)
    'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],              # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
}

# Perform Randomized Search with 5-fold cross-validation
rnd_search = RandomizedSearchCV(svr_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                random_state=42,n_jobs=-1, return_train_score=True)

# Fit the model with the prepared data
rnd_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=SVR(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C7D013EF40>,
                                        'gamma': [0.01, 0.03, 0.1, 0.3, 1.0,
                                                  3.0],
                                        'kernel': ['linear', 'rbf']},
                   random_state=42, return_train_score=True,
                   scoring='neg_mean_squared_error')

In [68]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

70287.70685340828 {'C': 38454.01188473625, 'gamma': 1.0, 'kernel': 'linear'}
70285.02591223882 {'C': 74199.39418114051, 'gamma': 1.0, 'kernel': 'linear'}
70290.6991268377 {'C': 16601.86404424365, 'gamma': 0.1, 'kernel': 'linear'}
79906.3091238576 {'C': 6808.361216819946, 'gamma': 1.0, 'kernel': 'rbf'}
59957.27268323541 {'C': 15286.681792194078, 'gamma': 0.1, 'kernel': 'rbf'}
63674.949423369726 {'C': 6641.157902710025, 'gamma': 0.3, 'kernel': 'rbf'}
70287.63987027176 {'C': 22233.911067827616, 'gamma': 0.3, 'kernel': 'linear'}
60603.93832010895 {'C': 62748.15096277165, 'gamma': 0.03, 'kernel': 'rbf'}
70346.07805864267 {'C': 1706.6305219717406, 'gamma': 0.01, 'kernel': 'linear'}
60820.49911977125 {'C': 53477.46602583891, 'gamma': 0.03, 'kernel': 'rbf'}
86398.5787774544 {'C': 30214.464853521815, 'gamma': 3.0, 'kernel': 'rbf'}
57123.34053999592 {'C': 79517.59613930136, 'gamma': 0.1, 'kernel': 'rbf'}
70285.63770230168 {'C': 52423.44384136116, 'gamma': 0.01, 'kernel': 'linear'}
70284.99994948

In [65]:
rnd_search.best_estimator_

SVR(C=87310.34258755935, gamma=0.3)

In [93]:
best_rnd_search_SVR=SVR(C=87310.34258755935, gamma=0.3)

In [66]:
rnd_search.best_params_

{'C': 87310.34258755935, 'gamma': 0.3, 'kernel': 'rbf'}

In [95]:
from sklearn.metrics import mean_squared_error

best_models=[best_LinearRegression,best_grid_search_DecisionTreeRegressor, best_rnd_search_DecisionTreeRegressor,
             best_grid_search_RandomForestRegressor,best_rnd_search_RandomForestRegressor,
             best_grid_searchSVR,best_rnd_search_SVR]

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)

for model in best_models:
    final_model = model
    final_model.fit(housing_prepared, housing_labels)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print(f"{final_model} RMSE :{final_rmse}")

LinearRegression(fit_intercept=False) RMSE :66913.44191320929
DecisionTreeRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10,
                      random_state=42) RMSE :58772.103556837006
DecisionTreeRegressor(max_depth=50, min_samples_leaf=18, min_samples_split=13,
                      random_state=42) RMSE :55964.47646697768
RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      random_state=42) RMSE :46507.42330066248
RandomForestRegressor(max_depth=30, min_samples_leaf=7, min_samples_split=22,
                      n_estimators=231, random_state=42) RMSE :49103.77937788657
SVR(C=10000.0, kernel='linear') RMSE :68229.034147709
SVR(C=87310.34258755935, gamma=0.3) RMSE :53152.23567835587
