In [1]:
# Importing Required Libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score




In [None]:
# Loading and Preprocessing

In [3]:
# Step 1: Load the Dataset
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Target'] = data.target

In [7]:
display(df.head())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [9]:
# Step 2: Check for Missing Values
print("Missing values in each column:\n", df.isnull().sum())


Missing values in each column:
 MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64


In [23]:
# Step 2: Evaluate Models
results = []
for model_name, model in [('Linear Regression', lin_reg), 
                          ('Decision Tree', dt_reg), 
                          ('Random Forest', rf_reg),
                          ('Gradient Boosting', gb_reg), 
                          ('SVR', svr)]:
    # Get predictions
    y_pred = model.predict(X_test)
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Append metrics to results
    results.append({'Model': model_name, 'MSE': mse, 'MAE': mae, 'R²': r2})
    
   # Step 3: Convert Results to DataFrame
    results_df = pd.DataFrame(results)
    print(results_df)

               Model       MSE     MAE        R²
0  Linear Regression  0.555892  0.5332  0.575788
               Model       MSE       MAE        R²
0  Linear Regression  0.555892  0.533200  0.575788
1      Decision Tree  0.494272  0.453784  0.622811
               Model       MSE       MAE        R²
0  Linear Regression  0.555892  0.533200  0.575788
1      Decision Tree  0.494272  0.453784  0.622811
2      Random Forest  0.255498  0.327613  0.805024
               Model       MSE       MAE        R²
0  Linear Regression  0.555892  0.533200  0.575788
1      Decision Tree  0.494272  0.453784  0.622811
2      Random Forest  0.255498  0.327613  0.805024
3  Gradient Boosting  0.293999  0.371650  0.775643
               Model       MSE       MAE        R²
0  Linear Regression  0.555892  0.533200  0.575788
1      Decision Tree  0.494272  0.453784  0.622811
2      Random Forest  0.255498  0.327613  0.805024
3  Gradient Boosting  0.293999  0.371650  0.775643
4                SVR  0.355198  0.3

In [62]:
# Step 4: Feature Scaling
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df.iloc[:, :-1]), columns=data.feature_names)
df_scaled['Target'] = df['Target']
display(df.Target)

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: Target, Length: 20640, dtype: float64

In [None]:
#Regression Algorithm Implementation

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 1: Define and Train Models
lin_reg = LinearRegression()
dt_reg = DecisionTreeRegressor(random_state=42)
rf_reg = RandomForestRegressor(random_state=42)
gb_reg = GradientBoostingRegressor(random_state=42)
svr = SVR()

# Train the models
lin_reg.fit(X_train, y_train)
dt_reg.fit(X_train, y_train)
rf_reg.fit(X_train, y_train)
gb_reg.fit(X_train, y_train)
svr.fit(X_train, y_train)

# Step 2: Evaluate Models
results = []
for model_name, model in [('Linear Regression', lin_reg), 
                          ('Decision Tree', dt_reg), 
                          ('Random Forest', rf_reg),
                          ('Gradient Boosting', gb_reg), 
                          ('SVR', svr)]:
    # Get predictions
    y_pred = model.predict(X_test)
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Append metrics to results
    results.append({'Model': model_name, 'MSE': mse, 'MAE': mae, 'R²': r2})

# Step 3: Convert Results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)


               Model       MSE       MAE        R²
0  Linear Regression  0.555892  0.533200  0.575788
1      Decision Tree  0.494272  0.453784  0.622811
2      Random Forest  0.255498  0.327613  0.805024
3  Gradient Boosting  0.293999  0.371650  0.775643
4                SVR  0.355198  0.397763  0.728941


In [None]:
#Model Evaluation and Comparison

In [30]:
# Identify best and worst models
sorted_results = sorted(results, key=lambda x: x['MSE'])

print("\nModel Performance Comparison:")
for metrics in sorted_results:
    print(f"Model: {metrics['Model']}, MSE: {metrics['MSE']:.4f}, MAE: {metrics['MAE']:.4f}, R²: {metrics['R²']:.4f}")

# Best-performing model
best_model = sorted_results[0]
print("\nBest-performing model:")
print(f"Model: {best_model['Model']}, MSE: {best_model['MSE']:.4f}, MAE: {best_model['MAE']:.4f}, R²: {best_model['R²']:.4f}")

# Worst-performing model
worst_model = sorted_results[-1]
print("\nWorst-performing model:")
print(f"Model: {worst_model['Model']}, MSE: {worst_model['MSE']:.4f}, MAE: {worst_model['MAE']:.4f}, R²: {worst_model['R²']:.4f}")




Model Performance Comparison:
Model: Random Forest, MSE: 0.2555, MAE: 0.3276, R²: 0.8050
Model: Gradient Boosting, MSE: 0.2940, MAE: 0.3717, R²: 0.7756
Model: SVR, MSE: 0.3552, MAE: 0.3978, R²: 0.7289
Model: Decision Tree, MSE: 0.4943, MAE: 0.4538, R²: 0.6228
Model: Linear Regression, MSE: 0.5559, MAE: 0.5332, R²: 0.5758

Best-performing model:
Model: Random Forest, MSE: 0.2555, MAE: 0.3276, R²: 0.8050

Worst-performing model:
Model: Linear Regression, MSE: 0.5559, MAE: 0.5332, R²: 0.5758
