# Model Hyperparameters Optimization

From the tested models, Random Forest and XGBoost seem to be the best ones, consistently getting high
metrics on the test split. So we are going to run a gridsearch to find the best parameters for these two models on the v1 of te data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Models & Normalization
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Evaluation
import statsmodels.api as sm

# Extra
from utils import *

In [2]:
filename = "Data/v1_house_sales.csv"

df = pd.read_csv(filename)

In [3]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,zipcode,lat,long,sqft_living15,sqft_lot15,year_sold,month_sold,day_sold,q_99,q_95
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,...,98178,47.5112,-122.257,1340,5650,2014,10,13,1,1
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,...,98125,47.721,-122.319,1690,7639,2014,12,9,1,1
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,...,98028,47.7379,-122.233,2720,8062,2015,2,25,1,1
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,...,98136,47.5208,-122.393,1360,5000,2014,12,9,1,1
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,...,98074,47.6168,-122.045,1800,7503,2015,2,18,1,1


In [4]:
# Split into train and test
seed = 13
# The price is the target variable
y = df["price"]

# All other variables are the features for the baseline model
X = df.drop(["price"], axis=1)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [5]:
X_train.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,...,zipcode,lat,long,sqft_living15,sqft_lot15,year_sold,month_sold,day_sold,q_99,q_95
1571,4,1.5,2000,6778,1.0,0,0,4,7,1170,...,98198,47.3708,-122.311,1940,7531,2015,3,23,1,1
16330,4,2.5,2630,48706,2.0,0,0,3,8,2630,...,98072,47.775,-122.125,2680,48706,2014,5,21,1,1
12786,4,2.5,2620,9525,2.5,0,0,4,9,2620,...,98040,47.5631,-122.219,2580,9525,2014,8,5,1,1
12524,3,2.5,1610,6000,2.0,0,0,4,7,1610,...,98038,47.349,-122.036,1570,6000,2014,8,26,1,1
16179,3,1.0,880,18205,1.0,0,0,4,6,880,...,98178,47.5013,-122.244,1110,16115,2014,6,24,1,1


## Metrics dataframe

In [6]:
from utils import *

metrics_df = create_metrics_df()

## Random Forest

### Grid Search

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Define the model with a fixed, high number of trees
# n_estimators=300 is a good balance of speed vs stability
rf_regressor = RandomForestRegressor(n_estimators=300, random_state=seed)

# Improved Parameter Grid
param_dist = {
    # Tree Depth: Control complexity
    'max_depth': [None, 10, 20, 30, 40, 50],
    
    # Split Criteria: Higher values prevent overfitting
    'min_samples_split': [2, 5, 10, 15, 20],
    
    # Leaf Size: Critical for regression smoothness
    'min_samples_leaf': [1, 2, 4, 8, 12],
    
    # Feature Selection: 'sqrt' is standard, but try fractions (0.3, 0.5)
    # Using floats (0.3) means "use 30% of features"
    'max_features': ['sqrt', 'log2', 0.3, 0.5, None],
    
    # Bootstrapping: Usually True is best, but False can work for small data
    'bootstrap': [True, False]
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=param_dist,
    n_iter=50,                  # 50 iterations is plenty for random search
    cv=3,
    scoring='neg_root_mean_squared_error', # Use RMSE (easier to interpret)
    n_jobs=-1,
    verbose=1,
    random_state=seed
)

# Fit
random_search.fit(X_train, y_train)

# Results
print(f"Best parameters: {random_search.best_params_}")
best_rf = random_search.best_estimator_


Fitting 3 folds for each of 50 candidates, totalling 150 fits




Best parameters: {'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 0.5, 'max_depth': 20, 'bootstrap': False}


In [8]:
# Evaluate on train and test
best_params_str = f"Best parameters: {random_search.best_params_}"

metrics_df = add_new_metrics(metrics_df,
                             best_rf,
                             X_train,
                             y_train,
                             split = "train",
                             comments=best_params_str)

metrics_df = add_new_metrics(metrics_df,
                             best_rf,
                             X_test,
                             y_test,
                             split = "test",
                             comments=best_params_str)

In [9]:
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,MAPE,RMSE,Comments
0,RandomForestRegressor,train,0.9824,0.9824,24386.7475,0.0499,48930.8643,"Best parameters: {'min_samples_split': 5, 'min..."
1,RandomForestRegressor,test,0.9323,0.9319,59310.3415,0.1198,93538.6406,"Best parameters: {'min_samples_split': 5, 'min..."


## XGBoost

### Grid Search

In [10]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Define the model
xgb_clf = xgb.XGBRegressor(seed=seed, objective='reg:squarederror')

param_dist = {
    'n_estimators': [100, 200, 300, 500, 1000], # More trees is usually okay with early stopping
    'max_depth': [3, 5, 7, 10],                 # XGBoost prefers shallower trees than RF
    'learning_rate': [0.01, 0.05, 0.1, 0.3],    # Critical for XGBoost
    
    # "min_samples_leaf" equivalent:
    'min_child_weight': [1, 3, 5, 10],          
    
    # "max_features" equivalent:
    'colsample_bytree': [0.5, 0.7, 1.0],        
    
    # "min_samples_split" equivalent (approximate):
    'gamma': [0, 0.1, 0.5, 1],                  
    
    # Regularization (optional but good)
    'subsample': [0.6, 0.8, 1.0]                
}

# Randomized search
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=50,                  # 50 is usually enough for random search
    cv=3,
    scoring='neg_root_mean_squared_error', # Better metric for regression
    n_jobs=-1,
    verbose=1,
    random_state=seed
)

# Fit
random_search.fit(X_train, y_train)

# Results
print(f"Best parameters: {random_search.best_params_}")
best_xgb = random_search.best_estimator_


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'subsample': 0.6, 'n_estimators': 1000, 'min_child_weight': 10, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.5, 'colsample_bytree': 0.5}


In [11]:
# Evaluate on train and test
best_params_str = f"Best parameters: {random_search.best_params_}"

metrics_df = add_new_metrics(metrics_df,
                             best_xgb,
                             X_train,
                             y_train,
                             split = "train",
                             comments="Best parameters, outliers flagging, no normalization.")

metrics_df = add_new_metrics(metrics_df,
                             best_xgb,
                             X_test,
                             y_test,
                             split = "test",
                             comments="Best parameters, outliers flagging, no normalization.")

In [12]:
metrics_df

Unnamed: 0,Model,Split,R2,Adjusted_R2,MAE,MAPE,RMSE,Comments
0,RandomForestRegressor,train,0.9824,0.9824,24386.7475,0.0499,48930.8643,"Best parameters: {'min_samples_split': 5, 'min..."
1,RandomForestRegressor,test,0.9323,0.9319,59310.3415,0.1198,93538.6406,"Best parameters: {'min_samples_split': 5, 'min..."
2,XGBRegressor,train,0.9724,0.9724,43582.308,0.0964,61287.6651,"Best parameters, outliers flagging, no normali..."
3,XGBRegressor,test,0.9366,0.9362,57887.7513,0.1155,90536.7618,"Best parameters, outliers flagging, no normali..."


## Gradient Boosting

In [13]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Define the base model
gb_regressor = GradientBoostingRegressor(random_state=seed)

# Parameter Grid optimized for Gradient Boosting
param_dist = {
    # 1. Boosting Parameters (Critical Pair: Rate vs Trees)
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    
    # 2. Tree Structure
    # GB trees are usually shallow (depth 3-5 works best)
    'max_depth': [3, 4, 5, 6, 8],
    
    # 3. Regularization (Prevents overfitting)
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 10],
    
    # 4. Stochastic Boosting (Using < 1.0 helps reduce variance)
    'subsample': [0.7, 0.8, 0.9, 1.0],
    
    # 5. Feature Randomness (Like RF, helps if features are correlated)
    'max_features': ['sqrt', 'log2', 0.5, None]
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=gb_regressor,
    param_distributions=param_dist,
    n_iter=50,                  # 50 iterations is plenty
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=seed
)

# Fit
random_search.fit(X_train, y_train)

# Results
print(f"Best parameters: {random_search.best_params_}")
best_gb = random_search.best_estimator_


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'subsample': 0.8, 'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'learning_rate': 0.1}


In [14]:
# Evaluate on train and test
best_params_str = f"Best parameters: {random_search.best_params_}"

metrics_df = add_new_metrics(metrics_df,
                             best_gb,
                             X_train,
                             y_train,
                             split = "train",
                             comments=best_params_str )

metrics_df = add_new_metrics(metrics_df,
                             best_gb,
                             X_test,
                             y_test,
                             split = "test",
                             comments=best_params_str)

## Saving Best Parameters

In [15]:
filename = "Metrics/best_model.csv"
metrics_df.to_csv(filename, index=False)