# 🏠 Boston Housing Price Prediction
**Objective:** Build and evaluate a machine learning regression model to predict housing prices (MEDV) using the Boston Housing dataset.

**Tech Stack:** Python, Pandas, NumPy, Matplotlib, Seaborn, Scikit-Learn, Joblib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:
# Load the housing data (presumably already cleaned, going by the file name).
# The path is relative to the current working directory.
df = pd.read_csv("cleaned_housing_data.csv")

# A notebook cell only echoes its *last* expression, so a bare `df.head()`
# here would be evaluated and silently thrown away. Print it explicitly.
print(df.head())

# DataFrame.info() writes its summary to stdout itself; no print needed.
df.info()

# Kept as the final expression so the notebook renders the summary table.
df.describe()

In [None]:
# Target distribution: 30-bin histogram of MEDV with a KDE curve overlaid.
hist_ax = sns.histplot(df['MEDV'], bins=30, kde=True)
hist_ax.set_title('Distribution of MEDV')
plt.show()

# Correlation heatmap: pairwise correlations between all columns of df,
# annotated to two decimals on the 'coolwarm' palette.
heat_fig, heat_ax = plt.subplots(figsize=(12, 10))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Separate the target column (MEDV) from the predictor columns.
y = df['MEDV']
X = df.drop(columns='MEDV')

# Standardise the predictors with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row of X, so its statistics
# also cover any rows that are later held out for evaluation.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the partition is reproducible.
# The split is taken on the raw features X (not the pre-scaled X_scaled) so
# that the scaler below learns its mean/variance from training rows only.
# Fitting it on all rows leaks test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows, then apply that same transform to the
# held-out rows. Both results are NumPy arrays, as they were before.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Baseline: ordinary least-squares regression trained on the training split.
# fit() returns the estimator itself, so construction and fitting can chain.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Score the held-out predictions: R², RMSE (square root of MSE) and MAE.
lr_r2 = r2_score(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
lr_mae = mean_absolute_error(y_test, y_pred_lr)

print(f"R² Score: {lr_r2:.4f}")
print(f"RMSE: {lr_rmse:.2f}")
print(f"MAE: {lr_mae:.2f}")

In [None]:
# Untuned random forest: 100 trees, seeded so repeated runs give the same model.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model = rf_model.fit(X_train, y_train)  # fit() returns the same estimator
y_pred_rf = rf_model.predict(X_test)

# Same three held-out metrics: R², RMSE (square root of MSE) and MAE.
rf_r2 = r2_score(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)
rf_mae = mean_absolute_error(y_test, y_pred_rf)

print(f"Random Forest R²: {rf_r2:.4f}")
print(f"Random Forest RMSE: {rf_rmse:.2f}")
print(f"Random Forest MAE: {rf_mae:.2f}")

In [None]:
# Hyper-parameter grid for the forest: 2 * 3 * 2 * 2 = 24 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Exhaustive search with 5-fold cross-validation on the training split.
# Candidates are ranked by negated MSE (scikit-learn maximises scores, so the
# error is negated). n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)
# With GridSearchCV's default refit=True, best_estimator_ is the winning
# configuration retrained on the whole training split.
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)

print("Best Params:", grid_search.best_params_)
print(f"Tuned RF R²: {r2_score(y_test, y_pred_best):.4f}")
# Report RMSE and MAE as well, so the tuned model can be compared with the
# baseline models on the same three metrics rather than on R² alone.
print(f"Tuned RF RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_best)):.2f}")
print(f"Tuned RF MAE: {mean_absolute_error(y_test, y_pred_best):.2f}")

In [None]:
# The forest was trained on standardised features, so anyone loading it must
# apply the identical transform to new inputs first. Persist the fitted scaler
# next to the model; without it the saved model cannot be used correctly.
joblib.dump(scaler, 'feature_scaler.joblib')

# Persist the tuned forest itself (file name unchanged). Kept as the last
# expression so the cell still echoes this artifact's path.
joblib.dump(best_rf_model, 'best_random_forest_model.joblib')

In [None]:
# Pair each held-out target value with the tuned forest's prediction.
# y_test.values drops the pandas index so both columns align by position.
prediction_columns = {
    'Actual MEDV': y_test.values,
    'Predicted MEDV': y_pred_best,
}
predictions_df = pd.DataFrame(data=prediction_columns)

# Write the comparison table to disk without the row index, then preview it.
predictions_df.to_csv('rf_predictions.csv', index=False)
predictions_df.head()

## ✅ Conclusion

- **Best Model:** Tuned Random Forest Regressor  
- **R² Score:** ~0.864  
- **RMSE:** ~2.75 in MEDV units (MEDV is recorded in $1,000s, so roughly $2,750)  
- The model performs well and could be used for deployment or integration into a dashboard.

---

**Next Steps:**
- Try XGBoost or LightGBM
- Deploy with Streamlit or Flask
- Build a Power BI Dashboard from predictions

👨‍💻 Created by: [Your Name]