In [5]:
# =========================================
# Housing Price Prediction Capstone Project
# =========================================
#
# **Author:** Krishnan Ramaswami  
# 
# 
# ## Summary
# Initial setup: load the dataset and inspect its structure, feature and task understanding, data cleaning and preparation,
# preprocessing and scaling, feature engineering, exploratory data analysis (EDA), outlier handling (IQR method),
# and baseline model performance calculation.

# =========================================
# Import Libraries
# =========================================
import os
import sys
import time
import warnings
from datetime import datetime
from math import pi

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV, learning_curve
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

warnings.filterwarnings("ignore", category=ConvergenceWarning)


# Inline plots
%matplotlib inline
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10,6)

# =========================================
# Load and Inspect Dataset
# =========================================

housing = fetch_california_housing()

# Assemble predictors and target into a single DataFrame; the target array is
# attached as the 'MedHouseValue' column.
housing_df = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
).assign(MedHouseValue=housing.target)

display(housing_df.head())
# For a deeper look, inspect housing_df.info(), housing_df.describe()
# and housing_df.isnull().sum().

# =========================================
# Feature & Task Understanding
# =========================================
#

# Split the columns into the prediction target and the predictor set, then
# classify the predictors by dtype.
target = 'MedHouseValue'
predictors = [col for col in housing_df.columns if col != target]
# 'number' matches every numeric dtype, not only the 64-bit variants.
numerical_features = housing_df[predictors].select_dtypes(include='number').columns.tolist()
categorical_features = housing_df[predictors].select_dtypes(include=['object','category']).columns.tolist()
print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)

# savefig does not create missing directories, so make sure the report folder
# exists before the first figure is written.
os.makedirs("images/initial_report", exist_ok=True)

# Pairwise correlations of all columns (predictors and target), saved to disk
# and closed without being displayed inline.
heatmap_fig, heatmap_ax = plt.subplots()
sns.heatmap(housing_df.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
heatmap_fig.savefig("images/initial_report/Correlation Heatmap.png")
plt.close(heatmap_fig)


# =========================================
# Data Clean up and Preprocessing
# =========================================


X = housing_df[predictors]
y = housing_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies so the column assignments below operate on independent
# frames and do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# =========================================
# Feature Engineering
# =========================================

# The ratios are computed from the raw columns, BEFORE standardization.
# Standardized columns are centred on zero, so a quotient of two scaled columns
# divides by values close to zero and no longer represents a per-household or
# per-room quantity.
engineered_features = ['Rooms_per_Household', 'Bedrooms_per_Room', 'Population_per_Household']
for split_frame in (X_train, X_test):
    split_frame['Rooms_per_Household'] = split_frame['AveRooms'] / split_frame['AveOccup']
    split_frame['Bedrooms_per_Room'] = split_frame['AveBedrms'] / split_frame['AveRooms']
    split_frame['Population_per_Household'] = split_frame['Population'] / split_frame['AveOccup']

# =========================================
# Scaling and Encoding
# =========================================

# Scale numerical features (statistics are fitted on the training split only).
# `scaler` keeps covering exactly `numerical_features`, as before.
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# The engineered ratios get their own scaler so that `scaler` is unchanged for
# any code that reuses it on the original numerical columns.
engineered_scaler = StandardScaler()
X_train[engineered_features] = engineered_scaler.fit_transform(X_train[engineered_features])
X_test[engineered_features] = engineered_scaler.transform(X_test[engineered_features])

# Encode categorical features
if categorical_features:
    # The dense-output keyword was renamed from `sparse` to `sparse_output`
    # across scikit-learn releases; keeping the default sparse result and
    # densifying it explicitly works regardless of the installed version.
    encoder = OneHotEncoder(drop='first')
    X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train[categorical_features]).toarray(),
                                   columns=encoder.get_feature_names_out(categorical_features),
                                   index=X_train.index)
    X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_features]).toarray(),
                                  columns=encoder.get_feature_names_out(categorical_features),
                                  index=X_test.index)
    X_train = X_train.drop(categorical_features, axis=1).join(X_train_encoded)
    X_test = X_test.drop(categorical_features, axis=1).join(X_test_encoded)

# =========================================
# Exploratory Data Analysis (EDA)
# =========================================

sns.set(style="whitegrid")
palette = sns.color_palette("Set2")

# %% [code]
# Histograms
# Add the three ratio columns to the full frame as well.
housing_df['Rooms_per_Household'] = housing_df['AveRooms'] / housing_df['AveOccup']
housing_df['Bedrooms_per_Room'] = housing_df['AveBedrms'] / housing_df['AveRooms']
housing_df['Population_per_Household'] = housing_df['Population'] / housing_df['AveOccup']

derived_features = ['Rooms_per_Household', 'Bedrooms_per_Room', 'Population_per_Household']
all_features = numerical_features + derived_features

# Size the grid from the number of features instead of hard-coding 4x3, so a
# longer feature list cannot run past the last axis. With the current eleven
# features this still yields a 4x3 grid of 18x16 inches.
n_cols = 3
n_rows = max(1, -(-len(all_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows))
axes = axes.flatten()

# One histogram (with KDE overlay) per feature, drawn from the training split.
for i, feature in enumerate(all_features):
    ax = axes[i]
    sns.histplot(X_train[feature], bins=30, kde=True, ax=ax, color=palette[i % len(palette)])
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel('')
    ax.set_ylabel('Count')

# Drop the axes left over when the feature count does not fill the grid.
for unused_ax in axes[len(all_features):]:
    fig.delaxes(unused_ax)

fig.tight_layout()
fig.savefig("images/initial_report/housing_feature_histograms.png")
plt.close(fig)

# Scatter plots
# Both figures are written to disk and closed without being shown inline.

# Median income against the target, using the training split.
income_fig, income_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=X_train['MedInc'],
    y=y_train,
    alpha=0.5,
    color='royalblue',
    ax=income_ax,
)
income_ax.set_title("Median Income vs Median House Value")
income_ax.set_xlabel("Median Income (scaled)")
income_ax.set_ylabel("Median House Value")
income_fig.tight_layout()
income_fig.savefig("images/initial_report/median_income_vs_housevalue.png")
plt.close(income_fig)

# Location of each training row, coloured by the target value. The coordinates
# are read from X_train, i.e. after the scaling step.
geo_fig, geo_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=X_train['Longitude'],
    y=X_train['Latitude'],
    hue=y_train,
    palette='viridis',
    alpha=0.5,
    ax=geo_ax,
)
geo_ax.set_title("Geographic Distribution of House Values")
geo_ax.set_xlabel("Longitude")
geo_ax.set_ylabel("Latitude")
# Anchor the legend outside the axes so it does not cover the points.
geo_ax.legend(title="MedHouseValue", bbox_to_anchor=(1.05,1), loc='upper left')
geo_fig.tight_layout()
geo_fig.savefig("images/initial_report/geographic_distribution.png", dpi=300, bbox_inches='tight')
plt.close(geo_fig)

# =========================================
# Outlier Handling
# =========================================

def cap_outliers(df, features, reference=None):
    """Cap values outside the 1.5*IQR fences for the given columns.

    For each column in ``features`` the first and third quartiles are computed,
    and values below ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR`` are replaced by
    the corresponding fence. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are capped.
    features : iterable of str
        Names of the columns to cap; other columns are returned unchanged.
    reference : pandas.DataFrame, optional
        Frame from which the quartiles are computed. Defaults to ``df`` itself,
        which matches the previous behaviour. Pass the training split here when
        capping a validation/test split so that the fences are learned from
        training data only.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns capped.
    """
    bounds_source = df if reference is None else reference
    capped = df.copy()
    for feature in features:
        q1 = bounds_source[feature].quantile(0.25)
        q3 = bounds_source[feature].quantile(0.75)
        iqr = q3 - q1
        # clip applies both fences in one pass and leaves NaN entries untouched.
        capped[feature] = df[feature].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return capped

# Cap every feature in both splits.
# NOTE(review): each split is capped with fences computed from that same split,
# so the test fences are derived from test data — consider deriving them from
# the training split instead.
X_train_capped, X_test_capped = (
    cap_outliers(split_frame, all_features) for split_frame in (X_train, X_test)
)

# =========================================
# Baseline Model (Median Predictor)
# =========================================

# The baseline predicts the training-set median for every test row.
baseline_pred = np.median(y_train)
baseline_predictions = np.full(len(y_test), baseline_pred)

baseline_mse = mean_squared_error(y_test, baseline_predictions)
baseline_mae = mean_absolute_error(y_test, baseline_predictions)
baseline_r2 = r2_score(y_test, baseline_predictions)

# Single-row summary table of the three metrics.
baseline_results = pd.DataFrame([
    {
        'Model': 'Baseline (Median Predictor)',
        'MSE': baseline_mse,
        'MAE': baseline_mae,
        'R2': baseline_r2,
    }
])

# Table styling: coloured bold header cells, centred body cells.
header_style = {
    'selector': 'th',
    'props': [('background-color', '#4C72B0'), ('color', 'white'), ('font-weight', 'bold')],
}
cell_style = {'selector': 'td', 'props': [('text-align', 'center')]}
metric_formats = {'MSE': '{:.4f}', 'MAE': '{:.4f}', 'R2': '{:.4f}'}

print("\n📊 Baseline Model Performance Summary")
display(
    baseline_results.style
    .set_table_styles([header_style, cell_style])
    .format(metric_formats)
)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseValue
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


Numerical Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Categorical Features: []

📊 Baseline Model Performance Summary


Unnamed: 0,Model,MSE,MAE,R2
0,Baseline (Median Predictor),1.3762,0.874,-0.0502
