# **Import Required Libraries**


In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# **Load and Preprocess Data**
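We drop the 'Town' and 'Address' columns, then split the dataset into train (80%) and test (20%) sets. A preprocessing step that one-hot encodes the categorical feature ('Flat_Type') is also defined here; it is applied later inside each model pipeline.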

In [None]:
# Read the resale dataset and discard the two location columns in a single chain
df = pd.read_csv('../datasets/Final_ResaleData.csv').drop(columns=['Town', 'Address'])

# 'Price' is the regression target; every other remaining column is a feature
y = df['Price']
X = df.drop(columns=['Price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Preprocessing step: one-hot encode the categorical column(s) and pass every
# other column through unchanged. Categories unseen during fit are ignored
# (handle_unknown='ignore') instead of raising at transform time.
categorical_columns = ["Flat_Type"]
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
)

# **Train Random Forest Model**
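A pipeline is built that chains the preprocessing step with a Random Forest regressor. Hyperparameter fine-tuning is then carried out using HalvingGridSearchCV, which efficiently narrows down the best combination of parameters (tree depth, number of estimators, and the minimum number of samples required per split and per leaf) by progressively focusing on the most promising configurations: every candidate is first evaluated on a small subset of the training data, and only the best-scoring candidates are re-evaluated on larger subsets.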

After fine-tuning, here are the best hyperparameters:
- 'model__max_depth': None
- 'model__min_samples_leaf': 2
- 'model__min_samples_split': 2
- 'model__n_estimators': 200


In [8]:
# Pipeline for Random Forest: shared preprocessing step followed by the regressor
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Hyperparameter grid for Random Forest (2 * 3 * 2 * 2 = 24 candidates).
# The 'model__' prefix routes each parameter to the 'model' step of the pipeline.
rf_params = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [2, 4],
}

# HalvingGridSearchCV (successive halving) for Random Forest.
# factor=2 keeps roughly the best half of the candidates after each iteration
# while doubling the number of training samples they are evaluated on.
# random_state seeds the row subsampling done at each iteration, so repeated
# runs evaluate the candidates on the same subsets (the train/test split and
# the model are already seeded with 42; the search itself was not).
rf_grid_search = HalvingGridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    cv=3,
    factor=2,
    scoring='neg_mean_squared_error',
    random_state=42,
    verbose=1,
)
rf_grid_search.fit(X_train, y_train)

# Best Parameters for Random Forest
best_rf_params = rf_grid_search.best_params_
print(f'Best Random Forest Parameters: {best_rf_params}')

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 10847
max_resources_: 173556
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 24
n_resources: 10847
Fitting 3 folds for each of 24 candidates, totalling 72 fits
----------
iter: 1
n_candidates: 12
n_resources: 21694
Fitting 3 folds for each of 12 candidates, totalling 36 fits
----------
iter: 2
n_candidates: 6
n_resources: 43388
Fitting 3 folds for each of 6 candidates, totalling 18 fits
----------
iter: 3
n_candidates: 3
n_resources: 86776
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 4
n_candidates: 2
n_resources: 173552
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best Random Forest Parameters: {'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}


# **Evaluate Random Forest Model**
Evaluation is carried out on the held-out test set using RMSE (38243.07), MAE (25831.06) and R^2 (0.95665) to assess prediction accuracy.

In [9]:
# Take the best pipeline found by the search and predict on the held-out test set
rf_best_model = rf_grid_search.best_estimator_
y_pred_rf = rf_best_model.predict(X_test)

# Error metrics: RMSE is the square root of the mean squared error
rf_mse = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(rf_mse)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Report one metric per line
print(
    f"Random Forest RMSE: {rmse_rf}",
    f"Random Forest MAE: {mae_rf}",
    f"Random Forest R^2: {r2_rf}",
    sep="\n",
)

Random Forest RMSE: 38243.07451691749
Random Forest MAE: 25831.06009914324
Random Forest R^2: 0.956652309835798


# **Train XGBoost Model**
A pipeline is built that chains the preprocessing step with an XGBRegressor. Hyperparameter fine-tuning is then carried out using GridSearchCV, which exhaustively evaluates every combination of parameters in the grid (tree depth, number of estimators, learning rate, and row/column subsampling ratios) with 3-fold cross-validation and keeps the best-scoring combination.

After fine-tuning, here are the best hyperparameters:
- 'model__colsample_bytree': 1.0
- 'model__learning_rate': 0.1
- 'model__max_depth': 10
- 'model__n_estimators': 250
- 'model__subsample': 0.8

In [13]:
# XGBoost pipeline: the shared preprocessing step feeding a seeded XGBRegressor
# that uses the 'hist' tree method
xgb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(tree_method='hist', random_state=42)),
    ]
)

# Search space for XGBoost (3 * 3 * 2 * 2 * 2 = 72 candidates).
# The 'model__' prefix routes each parameter to the 'model' step of the pipeline.
xgb_params = {
    'model__n_estimators': [100, 200, 250],
    'model__max_depth': [5, 10, 15],
    'model__learning_rate': [0.05, 0.1],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
}

# Exhaustive grid search: every candidate is scored with 3-fold cross-validation
# on negative mean squared error
xgb_grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_params,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
xgb_grid_search.fit(X_train, y_train)

# Report the winning parameter combination
best_xgb_params = xgb_grid_search.best_params_
print(f'Best XGBoost Parameters: {best_xgb_params}')

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best XGBoost Parameters: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 250, 'model__subsample': 0.8}


# **Evaluate XGBoost Model**
Evaluation is carried out on the held-out test set using RMSE (32223.05), MAE (22707.33) and R^2 (0.96923) to assess prediction accuracy.

In [14]:
# Take the best pipeline found by the search and predict on the held-out test set
xgb_best_model = xgb_grid_search.best_estimator_
y_pred_xgb = xgb_best_model.predict(X_test)

# Error metrics: RMSE is the square root of the mean squared error
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(xgb_mse)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Report one metric per line
print(
    f"XGBoost RMSE: {rmse_xgb}",
    f"XGBoost MAE: {mae_xgb}",
    f"XGBoost R^2: {r2_xgb}",
    sep="\n",
)

XGBoost RMSE: 32223.0457240883
XGBoost MAE: 22707.330887877393
XGBoost R^2: 0.9692253206776796
