In [2]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

# Step 1: Build the toy housing dataset
# Column names are declared once so the feature/target selection below
# cannot drift from the dictionary keys.
FEATURE_COLUMNS = ["Size (sq ft)", "Bedrooms", "Age (years)"]
TARGET_COLUMN = "Price (USD)"

# One tuple per house: (size in sq ft, bedrooms, age in years, price in USD)
house_records = [
    (1200, 2, 10, 200000),
    (1500, 3, 15, 250000),
    (1800, 3, 20, 300000),
    (2500, 4, 5, 400000),
    (1100, 2, 25, 150000),
    (1400, 3, 30, 220000),
    (1600, 3, 15, 270000),
    (2000, 4, 10, 350000),
    (2200, 4, 8, 370000),
    (3000, 5, 4, 500000),
    (800, 1, 50, 100000),
    (1000, 2, 45, 120000),
    (2700, 5, 3, 480000),
    (1900, 3, 12, 320000),
    (2100, 4, 7, 400000),
    (1700, 3, 20, 290000),
    (2300, 4, 6, 450000),
    (3200, 5, 2, 600000),
    (2800, 4, 9, 500000),
    (3500, 5, 1, 700000),
]

# Transpose the records into the column-oriented dict: {column name: list of values}
data = {
    column: list(values)
    for column, values in zip(FEATURE_COLUMNS + [TARGET_COLUMN], zip(*house_records))
}

# Tabular view of the dataset (one row per house, one column per key of `data`)
df = pd.DataFrame(data)

# Step 2: Separate the model inputs (X) from the value to predict (y)
X = df[FEATURE_COLUMNS]  # Features
y = df[TARGET_COLUMN]  # Target variable

# Step 3: Split the data, then normalize features using Min-Max Scaling
# The split happens BEFORE the scaler is fitted so that the held-out rows do not
# contribute to the min/max used for scaling (fitting on all rows first would
# leak test-set statistics into preprocessing).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the per-feature min/max from the training rows only ...
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
# ... and reuse those training statistics for the test rows. Scaled test values
# may therefore fall slightly outside [0, 1]; that is expected.
X_test = scaler.transform(X_test_raw)

# Kept so any later code that refers to `X_normalized` still works; it is the
# full feature table scaled with the training-only statistics.
X_normalized = scaler.transform(X)

# Step 4: Tune and train an XGBoost Regressor
# Candidate values per hyperparameter. GridSearchCV evaluates every combination:
# 3 * 3 * 3 * 3 * 2 * 2 = 324 settings, each scored with 5-fold cross-validation.
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],      # Number of boosting rounds
    learning_rate=[0.01, 0.1, 0.2],   # Step size shrinkage
    max_depth=[3, 5, 7],              # Maximum depth of trees
    min_child_weight=[1, 3, 5],       # Minimum sum of instance weights required in a child
    subsample=[0.8, 1.0],             # Fraction of rows sampled per tree
    colsample_bytree=[0.8, 1.0],      # Fraction of features sampled per tree
)

# Base estimator whose hyperparameters are overridden by the grid
xgb_base = XGBRegressor(objective="reg:squarederror", random_state=42)

# Exhaustive search, ranked by (negated) mean squared error, run on all cores
grid_search_xgb = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid_xgb,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# Winning hyperparameter combination and the corresponding fitted model
best_xgb_params = grid_search_xgb.best_params_
best_xgb_model = grid_search_xgb.best_estimator_

# Step 5: Score the tuned XGBoost model on the held-out test rows
y_pred_xgb = best_xgb_model.predict(X_test)

# Regression metrics comparing the true test prices with the predictions
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)  # same units as the target (USD), unlike MSE

# Report the chosen hyperparameters, then each metric on its own line
print("Best Hyperparameters (XGBoost):")
print(best_xgb_params)
print("\nModel Performance Metrics (XGBoost):")
metric_report = (
    ("Mean Squared Error (MSE)", mse_xgb),
    ("Root Mean Squared Error (RMSE)", rmse_xgb),
    ("Mean Absolute Error (MAE)", mae_xgb),
    ("R-squared (R^2)", r2_xgb),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value}")

# Step 6: Predict the price for a new house using the XGBoost model
new_house = [[2000, 3, 10]]  # Features of the new house: size (sq ft), bedrooms, age (years)

# The scaler was fitted on a DataFrame with named columns, so the new row is
# wrapped in a DataFrame carrying the same column names. This keeps the
# feature order explicit and avoids scikit-learn's "X does not have valid
# feature names" warning that a bare nested list triggers.
new_house_df = pd.DataFrame(new_house, columns=list(X.columns))
new_house_normalized = scaler.transform(new_house_df)  # Apply the already-fitted scaling

predicted_price_xgb = best_xgb_model.predict(new_house_normalized)
print("\nPredicted Price for the new house (2000 sq ft, 3 bedrooms, 10 years old):", predicted_price_xgb[0])


Best Hyperparameters (XGBoost):
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}

Model Performance Metrics (XGBoost):
Mean Squared Error (MSE): 624373057.1668091
Root Mean Squared Error (RMSE): 24987.457997299545
Mean Absolute Error (MAE): 20608.76171875
R-squared (R^2): 0.9744371175765991

Predicted Price for the new house (2000 sq ft, 3 bedrooms, 10 years old): 342338.72


