In [0]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [0]:
# Where the raw housing extract lives: storage folder URI, then the file name.
# The load cell concatenates the two to build the full path.
source, data = (
    "abfss://raw@cloudinfrastg.dfs.core.windows.net/00_data_source/",
    "housing.csv",
)

In [0]:
# Load the housing CSV into a Spark DataFrame. The first row is treated as the
# header and column types are inferred from the data.
# NOTE(review): `spark` is not created anywhere in this notebook -- presumably
# it is the session object injected by the notebook runtime; confirm before
# running elsewhere.
housing = spark.read.csv(source + data, header=True, inferSchema=True)
housing.display()

In [0]:
# Convert the Spark DataFrame to pandas for the scikit-learn work below, then
# show how many rows fall into each ocean_proximity category.
housing_pd = housing.toPandas()
housing_pd["ocean_proximity"].value_counts()

In [0]:
# Drop categorical feature for now (ocean_proximity).
# errors='ignore' keeps this cell re-runnable: housing_pd is reassigned here,
# so on a second run the column is already gone and a plain drop() would raise
# KeyError.
housing_pd = housing_pd.drop(columns=['ocean_proximity'], errors='ignore')

# Split dataset into features and target
X = housing_pd.drop(columns=['median_house_value'])
y = housing_pd['median_house_value']

# Train-test split. This happens BEFORE imputation so that rows in the test
# set cannot influence the fill values (filling with whole-dataset medians
# leaks test information into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: medians are learned from the training features only
# and then applied to both splits.
# NOTE(review): only feature columns are imputed now; the previous version
# also median-filled the target. Confirm median_house_value has no missing
# values in this extract.
feature_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(feature_medians)
X_test = X_test.fillna(feature_medians)

In [0]:
# 1. Gradient Boosted Tree Model (HistGradientBoostingRegressor)
# fit() returns the estimator itself, so construction and training are chained.
gbm_model = HistGradientBoostingRegressor(
    max_iter=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out split using mean absolute error.
y_pred_gbm = gbm_model.predict(X_test)
mae_gbm = mean_absolute_error(y_true=y_test, y_pred=y_pred_gbm)
print(f'Gradient Boosted Model MAE: ${mae_gbm:,.2f}')


In [0]:
# 2. Random Forest Regressor (used in place of an LSTM)
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out split using mean absolute error.
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
print(f'Random Forest Model MAE: ${mae_rf:,.2f}')

## Why Use PCA in Our Housing Dataset?
Our dataset has multiple numerical features, such as:

- **Longitude, Latitude** → geographical location
- **Housing Median Age** → age of the neighborhood
- **Total Rooms, Bedrooms, Population, Households** → structural information
- **Median Income** → economic factor

Instead of using all these features directly, PCA finds a smaller number of new features (principal components) that capture most of the variance in the data.

In [0]:
# 3. PCA-based Dimensionality Reduction
# PCA picks the directions of largest variance, so the features are
# standardised first. On raw values the components would simply follow
# whichever columns have the largest numeric spread, regardless of how
# informative they are.
pca_scaler = StandardScaler()
X_train_scaled = pca_scaler.fit_transform(X_train)  # statistics come from training rows only
X_test_scaled = pca_scaler.transform(X_test)

# Project onto 5 principal components (fitted on the training split only).
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Ordinary linear regression on the reduced feature space.
pca_regressor = LinearRegression()
pca_regressor.fit(X_train_pca, y_train)

y_pred_pca = pca_regressor.predict(X_test_pca)
mae_pca = mean_absolute_error(y_test, y_pred_pca)
print(f'PCA Model MAE: ${mae_pca:,.2f}')

In [0]:
# Stacking Ensemble
# StackingRegressor clones every base estimator and refits the clone on the X
# given to fit(). Passing the bare `pca_regressor` therefore stacked a plain
# LinearRegression on the raw features -- the PCA projection was silently
# dropped. The PCA member is instead a self-contained pipeline
# (scale -> PCA -> linear regression) so the projection is refit inside the
# stack, including inside its internal cross-validation folds.
pca_member = make_pipeline(
    StandardScaler(),
    PCA(n_components=5),
    LinearRegression(),
)

stacking_model = StackingRegressor(
    estimators=[
        ('gbm', gbm_model),   # cloned and refit by the stack; the fitted state is not reused
        ('rf', rf_model),
        ('pca', pca_member)
    ],
    final_estimator=LinearRegression()  # meta-learner that combines the base predictions
)

stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
mae_stack = mean_absolute_error(y_test, y_pred_stack)
print(f'Stacking Ensemble MAE: ${mae_stack:,.2f}')

## Explanation of Choice
We chose Stacking as the ensemble method since it leverages multiple diverse models
(GBM, Random Forest, and a PCA-based linear regression) and uses a meta-learner (Linear Regression) to combine their predictions.
This can improve generalization and often matches or beats the individual models, though that is not guaranteed — compare the MAE values printed above to confirm.

In [0]:
# Plot Actual vs Predicted Prices
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred_stack, alpha=0.5)

# Dashed red y = x reference line: a point on it is a perfect prediction.
price_range = [min(y_test), max(y_test)]
ax.plot(price_range, price_range, color='red', linestyle='dashed')

ax.set_xlabel('Actual Price ($)')
ax.set_ylabel('Predicted Price ($)')
ax.set_title('Actual vs Predicted House Prices (Stacking Ensemble)')
plt.show()
