In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df = pd.read_csv(url)

df.fillna(df.select_dtypes(include=[np.number]).median(), inplace=True)

if df.select_dtypes(include=['object']).shape[1] > 0:
    label_encoders = {}
    for col in df.select_dtypes(include=['object']).columns:
        label_encoders[col] = LabelEncoder()
        df[col] = label_encoders[col].fit_transform(df[col])

X = df.drop(columns=['median_house_value'])
y = df['median_house_value']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

default_model = XGBRegressor()
default_model.fit(X_train, y_train)
y_pred_default = default_model.predict(X_test)

mse_default = mean_squared_error(y_test, y_pred_default)
print(f"Default Model MSE: {mse_default:.4f}")

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.7, 1.0]
}

grid_search = GridSearchCV(XGBRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

mse_best = mean_squared_error(y_test, y_pred_best)
print(f"Tuned Model MSE: {mse_best:.4f}")

improvement = (mse_default - mse_best) / mse_default * 100
print(f"Hyperparameter tuning improved performance by {improvement:.2f}%")


Default Model MSE: 2338148776.6440
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}
Tuned Model MSE: 2151622046.1227
Hyperparameter tuning improved performance by 7.98%
