# Gradient Boosting Regressor

In [1]:
import pandas as pd

trainDF = pd.read_csv('../data/filtered/processedData.csv')

In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np


df = trainDF.dropna(subset=['SalePrice'])
df['SalePrice'] = np.log1p(df['SalePrice'])

# split features and target
X = df.drop(columns='SalePrice')
y = df['SalePrice']

# split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# identify categorical columns
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns

# fill NaNs in categorical columns with 'N/A'
X_train[cat_cols] = X_train[cat_cols].fillna('N/A')
X_val[cat_cols] = X_val[cat_cols].fillna('N/A')

# apply one-hot encoding
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_val_encoded = pd.get_dummies(X_val, drop_first=True)

# align columns between training and validation data
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# initialize the model
gb_model = GradientBoostingRegressor(random_state=42)

# fit the model on training data
gb_model.fit(X_train_encoded, y_train)

# predict on validation data
val_preds_log = gb_model.predict(X_val_encoded)

# evaluate using RMSE on log scale
val_rmse_log = np.sqrt(mean_squared_error(y_val, val_preds_log))
print(f'Validation RMSE (log-scale): {val_rmse_log:.4f}')

# convert predictions back from log scale and compute real RMSE
val_preds_actual = np.expm1(val_preds_log)
y_val_actual = np.expm1(y_val)

# RMSE on the actual scale (log-reversed)
val_rmse_actual = np.sqrt(mean_squared_error(y_val_actual, val_preds_actual))
print(f'Validation RMSE (log-reversed): {val_rmse_actual:,.2f}')

Validation RMSE (log-scale): 0.1388
Validation RMSE (log-reversed): 30,790.01


This model would be off target by about 30,790.01 USD. Now let's see how it does after hyperparameter tuning.

# Gradient Boosting with HyperParameter Tuning

In [4]:
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 5, 10, 20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2']
}

# initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=20,  # Adjust n_iter to control the number of iterations
    cv=5,       # 5-fold cross-validation
    verbose=2,  # Print progress
    n_jobs=-1,  # Use all cores
    random_state=42
)


In [5]:
# fit the model with RandomizedSearchCV
random_search.fit(X_train_encoded, y_train)

# print the best parameters and best score
print('Best parameters:', random_search.best_params_)
print('Best cross-validation score:', random_search.best_score_)

# use the best estimator to make predictions
best_gb_model = random_search.best_estimator_

# predict on the validation set
val_preds_log = best_gb_model.predict(X_val_encoded)

# evaluate the performance using RMSE
val_rmse_log = np.sqrt(mean_squared_error(y_val, val_preds_log))
print(f'Validation RMSE (log-scale): {val_rmse_log:.4f}')


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[CV] END learning_rate=0.05, max_depth=20, max_features=log2, n_estimators=200, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.05, max_depth=20, max_features=log2, n_estimators=200, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.05, max_depth=20, max_features=log2, n_estimators=200, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.05, max_depth=40, max_features=sqrt, n_estimators=400, subsample=0.9; total time=   1.4s
[CV] END learning_rate=0.01, max_depth=5, max_features=log2, n_estimators=400, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=20, max_features=log2, n_estimators=200, subsample=1.0; total time=   0.4s
[CV] END learning_rate=0.05, max_depth=20, max_features=log2, n_estimators=200, subsample=1.0; total time=   0.4s
[CV] END learning_rate=0.05, max_depth=40, max_features=sqrt, n_estimators=400, subsample=0.9; total time=   1.3s
[CV] END learning_rate=0.05, max_depth=40, max_features=sqrt, n_estimators=400, subsample

In [6]:
# convert predictions back from log scale to actual values
val_preds_actual = np.expm1(val_preds_log)
y_val_actual = np.expm1(y_val)

# compute RMSE for the actual values (log-reversed)
val_rmse_actual = np.sqrt(mean_squared_error(y_val_actual, val_preds_actual))
print(f'Validation RMSE (log-reversed): {val_rmse_actual:,.2f}')

Validation RMSE (log-reversed): 29,577.51


This model would be off target by about 29,577.51 USD which is an improvement over 30,790.01 before hyperparam tuning. 

# Test Dataset

In [7]:
testDF = pd.read_csv('../data/raw/test.csv')

In [8]:
# Fill missing values in categorical columns
testDF[cat_cols] = testDF[cat_cols].fillna('N/A')

# Fill missing values in numeric columns with median
num_cols = testDF.select_dtypes(include=[np.number]).columns
testDF[num_cols] = testDF[num_cols].fillna(testDF[num_cols].median())

# One-hot encode
test_encoded = pd.get_dummies(testDF, drop_first=True)

# Align columns with training set
test_encoded = test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Predict
test_preds_log = best_gb_model.predict(test_encoded)
test_preds_actual = np.expm1(test_preds_log)