In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

# Step 1: Assemble the toy housing dataset.
# Each tuple is one house: (size in sq ft, bedrooms, age in years, price in USD).
column_names = ["Size (sq ft)", "Bedrooms", "Age (years)", "Price (USD)"]
house_rows = [
    (1200, 2, 10, 200000),
    (1500, 3, 15, 250000),
    (1800, 3, 20, 300000),
    (2500, 4, 5, 400000),
    (1100, 2, 25, 150000),
    (1400, 3, 30, 220000),
    (1600, 3, 15, 270000),
    (2000, 4, 10, 350000),
    (2200, 4, 8, 370000),
    (3000, 5, 4, 500000),
    (800, 1, 50, 100000),
    (1000, 2, 45, 120000),
    (2700, 5, 3, 480000),
    (1900, 3, 12, 320000),
    (2100, 4, 7, 400000),
    (1700, 3, 20, 290000),
    (2300, 4, 6, 450000),
    (3200, 5, 2, 600000),
    (2800, 4, 9, 500000),
    (3500, 5, 1, 700000),
]

# Pivot the rows into the column-oriented dict (column name -> list of values)
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*house_rows))
}

# Load the dict into a DataFrame; columns keep the order of `column_names`
df = pd.DataFrame(data)

# Step 2: Separate the predictors (X) from the value being predicted (y)
X = df.loc[:, ["Size (sq ft)", "Bedrooms", "Age (years)"]]  # Features
y = df.loc[:, "Price (USD)"]  # Target variable

# Step 3: Split the data, then normalize features using Min-Max Scaling.
# The split happens BEFORE the scaler is fitted: fitting on the full dataset would
# let the test rows' min/max values leak into preprocessing and make the held-out
# evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn each feature's min/max from the training rows only
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)

# Apply the training-set scaling to the held-out rows (values may fall outside [0, 1])
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the same training-fitted scaling; nothing in this step
# uses it, but the name is kept defined for any other code that references it
X_normalized = scaler.transform(X)

# Step 4: Tune and train a Random Forest Regressor
# Candidate values per hyperparameter; the grid search tries every combination.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],    # how many trees make up the forest
    max_depth=[4, 6, 8, None],      # depth limit for each tree (None = unlimited)
    min_samples_split=[2, 5, 10],   # samples a node needs before it may be split
    min_samples_leaf=[1, 2, 4],     # samples that must remain in every leaf
)

# Build the search and fit it on the training data in one expression
# (`fit` returns the fitted search object itself).
grid_search_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),  # fixed seed for repeatable forests
    param_grid_rf,
    scoring="neg_mean_squared_error",  # rank candidates by (negated) MSE
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # run on all available cores
).fit(X_train, y_train)

# Winning hyperparameter combination and the estimator fitted with it
best_rf_params = grid_search_rf.best_params_
best_rf_model = grid_search_rf.best_estimator_

# Step 5: Score the tuned Random Forest on the held-out test set
y_pred_rf = best_rf_model.predict(X_test)

# Error metrics comparing the predictions with the true test prices
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # square root of the MSE

# Report the chosen hyperparameters, then the metrics, one item per line
print("Best Hyperparameters (Random Forest):", best_rf_params, sep="\n")
print(
    "\nModel Performance Metrics (Random Forest):",
    f"Mean Squared Error (MSE): {mse_rf}",
    f"Root Mean Squared Error (RMSE): {rmse_rf}",
    f"Mean Absolute Error (MAE): {mae_rf}",
    f"R-squared (R^2): {r2_rf}",
    sep="\n",
)

# Step 6: Predict the price for a new house using the Random Forest
new_house = [[2000, 3, 10]]  # Size (sq ft), Bedrooms, Age (years)

# The scaler was fitted on a DataFrame, so give it a frame carrying the same
# column names. A bare list is matched to the features by position only and makes
# scikit-learn (1.0+) warn that the input lacks valid feature names.
new_house_df = pd.DataFrame(new_house, columns=X.columns)
new_house_normalized = scaler.transform(new_house_df)  # Normalize the new house features

# The model was trained on the scaled array, so predict from the scaled row
predicted_price_rf = best_rf_model.predict(new_house_normalized)
print("\nPredicted Price for the new house (2000 sq ft, 3 bedrooms, 10 years old):", predicted_price_rf[0])


Best Hyperparameters (Random Forest):
{'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}

Model Performance Metrics (Random Forest):
Mean Squared Error (MSE): 452250000.0
Root Mean Squared Error (RMSE): 21266.170318136737
Mean Absolute Error (MAE): 18550.0
R-squared (R^2): 0.9814841351074719

Predicted Price for the new house (2000 sq ft, 3 bedrooms, 10 years old): 336800.0


