In [3]:
#Data Loading & Preprocessing

# Required Libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the California Housing dataset into a DataFrame (features + target column)
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['MedHouseVal'] = data.target

# Check for missing values
print("Missing values in dataset:\n", df.isnull().sum())

# Separate predictors from the target
features = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']

# Split into train/test BEFORE scaling, so the scaler never sees test rows.
# Fitting the scaler on the full dataset would leak test-set mean/variance
# into preprocessing and bias the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Feature Scaling (Standardization): learn statistics on the training split only,
# then apply the same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept so that `X`
# remains defined for any later code that refers to it.
X = scaler.transform(features)

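# Preprocessing Explanation:
# No missing values in this dataset.
# StandardScaler standardizes each feature to zero mean and unit variance. This matters for
# scale-sensitive algorithms such as SVR; tree-based models (Decision Tree, Random Forest,
# Gradient Boosting) are largely insensitive to feature scaling.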

#------------------------------------------------------------------------------------------------------------

# Regression Algorithm Implementation

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Collects one metrics record per model, keyed by the model's display name
results = {}


def evaluate_model(name, model):
    """Fit *model*, score it on the held-out split, and report the metrics.

    The model is trained on the module-level ``X_train``/``y_train`` and its
    predictions on ``X_test`` are compared with ``y_test``.

    Args:
        name: Label used as the key in ``results`` and in the printed report.
        model: Estimator object exposing ``fit(X, y)`` and ``predict(X)``.

    Side effects:
        Stores ``{"MSE", "MAE", "R2"}`` for this model in the module-level
        ``results`` dict (overwriting any earlier entry with the same name)
        and prints the three metrics rounded to four decimals.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    scores = {
        "MSE": mean_squared_error(y_test, predicted),
        "MAE": mean_absolute_error(y_test, predicted),
        "R2": r2_score(y_test, predicted),
    }
    results[name] = scores

    print(f"\n{name} Results:")
    print(f"Mean Squared Error: {scores['MSE']:.4f}")
    print(f"Mean Absolute Error: {scores['MAE']:.4f}")
    print(f"R² Score: {scores['R2']:.4f}")

# Candidate regressors, evaluated in list order: a linear baseline, a single
# decision tree, two tree ensembles, and a support vector regressor.
# Seeded estimators use random_state=42 so repeated runs are reproducible.
candidate_models = [
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(random_state=42, n_estimators=100)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ("SVR", SVR()),
]

# Train, score and record each candidate in turn
for model_name, estimator in candidate_models:
    evaluate_model(model_name, estimator)

#----------------------------------------------------------------------------------------------------------

# Model Evaluation & Comparison

# Turn the per-model metrics into a table: one row per model, one column per metric
results_df = pd.DataFrame(results).transpose()

# Rank models from highest to lowest R2 for the printed report
ranked_results = results_df.sort_values(by="R2", ascending=False)

print("\nModel Performance Comparison:\n")
print(ranked_results)

#-----------------------------------------------------------------------------------------------------------

# Explanation of Algorithms

# Linear Regression
# How it works: Models a linear relationship between features and the target.
# Suitability: Simple and interpretable, good as a baseline model.

# Decision Tree Regressor
# How it works: Splits data into regions based on feature thresholds.
# Suitability: Captures non-linear relationships, but prone to overfitting.

# Random Forest Regressor
# How it works: Ensemble of decision trees using bootstrapping and averaging.
# Suitability: Reduces overfitting and improves accuracy.

# Gradient Boosting Regressor
# How it works: Sequentially builds trees to correct errors of previous ones.
# Suitability: Excellent for structured/tabular data.

# Support Vector Regressor (SVR)
# How it works: Fits a function that keeps as many points as possible within an epsilon-wide margin around it; kernels let it model non-linear relationships.
# Suitability: Good with smaller datasets, sensitive to scaling and slower on large data.

#-------------------------------------------------------------------------------------------------------------

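# Best Performing Model:
# Random Forest (R² ≈ 0.805 in the recorded run), with Gradient Boosting close behind (R² ≈ 0.776).
# Why: Tree ensembles handle non-linear relationships and perform well on complex datasets like housing data.

# Worst Performing Model:
# Linear Regression (R² ≈ 0.576 in the recorded run).
# Why: It can only model a linear relationship between features and target, so it underfits this data.
# Note: Support Vector Regressor ranked in the middle (R² ≈ 0.729); it is computationally expensive
# on large datasets and sensitive to scaling and hyperparameters.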


Missing values in dataset:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

Linear Regression Results:
Mean Squared Error: 0.5559
Mean Absolute Error: 0.5332
R² Score: 0.5758

Decision Tree Results:
Mean Squared Error: 0.4943
Mean Absolute Error: 0.4538
R² Score: 0.6228

Random Forest Results:
Mean Squared Error: 0.2555
Mean Absolute Error: 0.3276
R² Score: 0.8050

Gradient Boosting Results:
Mean Squared Error: 0.2940
Mean Absolute Error: 0.3717
R² Score: 0.7756

SVR Results:
Mean Squared Error: 0.3552
Mean Absolute Error: 0.3978
R² Score: 0.7289

Model Performance Comparison:

                        MSE       MAE        R2
Random Forest      0.255498  0.327613  0.805024
Gradient Boosting  0.293999  0.371650  0.775643
SVR                0.355198  0.397763  0.728941
Decision Tree      0.494272  0.453784  0.622811
Linear Regression  0.555892  0.533200  0.575788
