In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

import warnings
warnings.filterwarnings("ignore")

# LOADING AND PREPROCESSING
# Fetch the California Housing data; the returned object exposes the
# .data, .target and .feature_names attributes used below.
housing = fetch_california_housing()

# Build a DataFrame of the features and attach the target as "MedHouseVal".
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df["MedHouseVal"] = housing.target
print("First 5 rows of dataset:\n")
print(df.head())

print("\nDataset Shape:", df.shape)
print("\nDataset Info:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the output.
df.info()

print("\nChecking Missing Values:\n")
print(df.isnull().sum())

# Handle Missing Values: replace any NaN with the mean of its column.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split performed later in this script; if the data ever contains
# NaNs, test-set statistics would influence training. Consider imputing
# after the split in that case.
df.fillna(df.mean(), inplace=True)

print("\nMissing values handled (if any existed).")

# Train/test split followed by feature standardization

print("\nPerforming Feature Scaling using StandardScaler...\n")

# Target is the "MedHouseVal" column; every other column is a predictor.
y = df["MedHouseVal"]
X = df.drop(columns="MedHouseVal")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train-Test Split Done.")
print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)


# Explanation of Preprocessing
# Emit a human-readable recap of the preprocessing steps, one line per item.
print(
    "\nExplanation of Preprocessing Steps:",
    "1. Loaded dataset using fetch_california_housing().",
    "2. Converted dataset into pandas DataFrame.",
    "3. Checked and handled missing values (filled with mean if any).",
    "4. Split data into training and testing sets (80-20 split).",
    "5. Applied StandardScaler for feature scaling.",
    "   Scaling is necessary for algorithms like SVR and Linear Regression.",
    "   It ensures all features are on same scale.\n",
    sep="\n",
)

#  REGRESSION ALGORITHM IMPLEMENTATION

# Collects one [model name, MSE, MAE, R2] row per fitted model.
results = list()

#  Linear Regression
print("Applying Linear Regression...")

# Trained and evaluated on the standardized features.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

print(
    "Linear Regression Explanation:",
    "Linear Regression models the relationship between features and target",
    "using a straight-line equation. Suitable for predicting continuous values.\n",
    sep="\n",
)

# Score the held-out predictions with all three metrics.
mse_lr, mae_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

results.append(["Linear Regression", mse_lr, mae_lr, r2_lr])


# Decision Tree Regressor

print("Applying Decision Tree Regressor...")

# Trained on the unscaled features; the seed fixes the tree's randomness.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

print(
    "Decision Tree Explanation:",
    "Decision Tree splits the dataset into smaller subsets",
    "based on feature values. Good for capturing non-linear relationships.\n",
    sep="\n",
)

# Score the held-out predictions with all three metrics.
mse_dt, mae_dt, r2_dt = (
    mean_squared_error(y_test, y_pred_dt),
    mean_absolute_error(y_test, y_pred_dt),
    r2_score(y_test, y_pred_dt),
)

results.append(["Decision Tree", mse_dt, mae_dt, r2_dt])


# Random Forest Regressor

print("Applying Random Forest Regressor...")

# Trained on the unscaled features with a fixed seed for reproducibility.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print(
    "Random Forest Explanation:",
    "Random Forest is an ensemble of multiple decision trees.",
    "It reduces overfitting and improves accuracy.\n",
    sep="\n",
)

# Score the held-out predictions with all three metrics.
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

results.append(["Random Forest", mse_rf, mae_rf, r2_rf])

#  Gradient Boosting Regressor

print("Applying Gradient Boosting Regressor...")

# Trained on the unscaled features with a fixed seed for reproducibility.
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

print(
    "Gradient Boosting Explanation:",
    "Gradient Boosting builds trees sequentially.",
    "Each new tree corrects the errors of previous trees.\n",
    sep="\n",
)

# Score the held-out predictions with all three metrics.
mse_gb, mae_gb, r2_gb = (
    mean_squared_error(y_test, y_pred_gb),
    mean_absolute_error(y_test, y_pred_gb),
    r2_score(y_test, y_pred_gb),
)

results.append(["Gradient Boosting", mse_gb, mae_gb, r2_gb])


# Support Vector Regressor (SVR)

print("Applying Support Vector Regressor (SVR)...")

# Default-configured SVR, trained and evaluated on the standardized features.
svr = SVR().fit(X_train_scaled, y_train)
y_pred_svr = svr.predict(X_test_scaled)

print(
    "SVR Explanation:",
    "Support Vector Regressor tries to fit the best line within a margin.",
    "Works well for complex relationships but sensitive to scaling.\n",
    sep="\n",
)

# Score the held-out predictions with all three metrics.
mse_svr, mae_svr, r2_svr = (
    metric(y_test, y_pred_svr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

results.append(["SVR", mse_svr, mae_svr, r2_svr])



#  MODEL EVALUATION AND COMPARISON

print("\nModel Evaluation Results:\n")

# One row per model, in the order the rows were appended to `results`.
results_df = pd.DataFrame(data=results, columns=["Model", "MSE", "MAE", "R2 Score"])

print(results_df)

# Identify Best & Worst Model
# Selection looks at the "R2 Score" column only: the row with the highest
# value is the best model, the row with the lowest value is the worst.
_r2_column = results_df["R2 Score"]
best_model = results_df.loc[_r2_column.idxmax()]
worst_model = results_df.loc[_r2_column.idxmin()]

print("\nBest Performing Model:", best_model, sep="\n")

print("\nWorst Performing Model:", worst_model, sep="\n")

print(
    "\nComparison Justification:",
    "Best model is selected based on highest R2 Score and lowest errors.",
    "Worst model is selected based on lowest R2 Score and higher errors.",
    sep="\n",
)

# Closing recap: dataset, the five algorithms that were run, and the
# metrics used to compare them. Each entry is printed on its own line.
print(
    "\n================ FINAL SUMMARY ================\n",
    "Dataset Used: California Housing Dataset from sklearn",
    "Algorithms Implemented:",
    "1. Linear Regression",
    "2. Decision Tree Regressor",
    "3. Random Forest Regressor",
    "4. Gradient Boosting Regressor",
    "5. Support Vector Regressor (SVR)",
    "\nEvaluation Metrics Used:",
    "- Mean Squared Error (MSE)",
    "- Mean Absolute Error (MAE)",
    "- R2 Score",
    sep="\n",
)

First 5 rows of dataset:

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  

Dataset Shape: (20640, 9)

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-nu