In [2]:
import time
import joblib
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint
from math import sqrt

# === Fetch the California housing dataset as a single DataFrame ===
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Separate the target column from the predictor columns.
y = df["MedHouseVal"]
X = df.drop(columns="MedHouseVal")

# === Hold out 20% of the rows for final evaluation (seeded so the split repeats) ===
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# === Base estimator and the hyperparameter distributions to sample from ===
rf = RandomForestRegressor(random_state=42)

# Lists are sampled uniformly; scipy's randint(low, high) draws integers
# from [low, high), i.e. the upper bound is excluded.
param_dist = dict(
    n_estimators=randint(50, 300),
    max_depth=[None, 10, 20, 30, 50],
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 10),
    max_features=[1.0, "sqrt", "log2"],
)

# Randomized search: 20 sampled candidates, each scored with 3-fold CV on
# negative RMSE (higher is better), parallelised with n_jobs=-1.
random_forest_model = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# === Fit the model ===
# Run the randomized search on the training split and report how long it took.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; unlike time.time() it cannot jump if the system clock
# is adjusted while the (long-running) fit is in progress.
start = time.perf_counter()
random_forest_model.fit(X_train, y_train)
end = time.perf_counter()
print(f"\n⏱ Model fit time: {end - start:.2f} seconds")

# === Take the winning estimator from the search ===
best_rf = random_forest_model.best_estimator_

# === Score it on the held-out test rows ===
y_pred = best_rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))  # RMSE = square root of the MSE

# Report the selected hyperparameters first, then the test error.
print("\n🎯 Best parameters found:", random_forest_model.best_params_, sep="\n")
print(f"\n📉 Test RMSE: {rmse:.3f}")



Fitting 3 folds for each of 20 candidates, totalling 60 fits

⏱ Model fit time: 99.05 seconds

🎯 Best parameters found:
{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 185}

📉 Test RMSE: 0.499


In [3]:
# === Save model ===
# Keep the file name in one place so the path written to disk and the path
# reported in the message below can never drift apart.
model_path = "random_forest.pkl"

# NOTE: joblib files are pickle-based. Only load them from trusted sources,
# and preferably with the same scikit-learn version that produced them.
joblib.dump(best_rf, model_path)
print(f"\n💾 Model saved as '{model_path}'")


💾 Model saved as 'random_forest.pkl'
