Import used packages

In [8]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from algorithms import *

Load Dataset

In [9]:
# Fetch the California housing data; as_frame=True is requested so the
# 'data' and 'target' entries can be copied and indexed by column name below.
california = fetch_california_housing(as_frame=True)

# Pristine snapshot of the features, taken before any values are removed.
X_original = california.data.copy()

# Working copies: X is the frame that gets modified later on, y is the target.
X = california.data.copy()
y = california.target.copy()

# Separate snapshot of the complete 'MedInc' column.
MedInc_original = X['MedInc'].copy()

Add null values randomly

In [10]:
# Replace a percentage of the values in the 'MedInc' column of X with NaNs.
# The rows are drawn from a seeded generator so that the same rows are blanked
# on every Restart & Run All, and so that re-running this cell selects the same
# rows again instead of piling additional NaNs onto X.
SEED = 42
rng = np.random.default_rng(SEED)

percent_missing = 20
num_missing = int(percent_missing / 100 * len(X))

# Sample row labels without replacement so exactly num_missing rows are hit.
missing_indices = rng.choice(X.index, size=num_missing, replace=False)
X.loc[missing_indices, 'MedInc'] = np.nan

# Overwrite the requested percentage with the percentage actually missing.
percent_missing = X['MedInc'].isna().sum() / len(X) * 100

In [11]:
# Show X after NaNs were written into 'MedInc'; the notebook renders a
# truncated view (first and last rows) of the frame.
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


Perform Global and Piecewise Interpolation

In [12]:
# Global interpolation of 'MedInc', then report the share of NaNs left over
# (relative to the number of rows in X).
df_global = global_interp(X, 'MedInc')
nan_pct_global = df_global['MedInc'].isna().sum() / len(X) * 100
print("Percentage of NANs after global interpolation:", nan_pct_global)

# Piecewise interpolation of 'MedInc', with the same leftover-NaN report.
df_piecewise = piecewise_interp(X, 'MedInc')
nan_pct_piecewise = df_piecewise['MedInc'].isna().sum() / len(X) * 100
print("Percentage of NANs after piece-wise interpolation:", nan_pct_piecewise)

Percentage of NANs after global interpolation: 0.0
Percentage of NANs after piece-wise interpolation: 0.0


Perform Least Squares Approximation using the Pseudoinverse

In [13]:
# Build the least-squares version of the data for the 'MedInc' column.
# least_squares is not defined in this notebook (presumably it comes from the
# `algorithms` star import — confirm).
# NOTE(review): the recorded output of this cell is a pandas
# SettingWithCopyWarning pointing at `df_missing[colname] = ...` inside the
# helper; it presumably needs an explicit .copy() or a .loc assignment there —
# verify in the helper's source.
df_least_squares = least_squares(X,'MedInc')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_missing[colname] = df_missing.index.values.reshape(-1, 1).dot(y_pred)


Linear Regression Implementation

In [14]:
# Run `model` once per feature matrix, in a fixed order, against the same target y.
# The recorded output shows it printing R-squared and MSE under each label.
# y_pred, r2 and mse are overwritten on every pass, so after the loop they hold
# the values returned for the 'Least Squares' run.
for features, label in (
    (X_original, 'Original'),
    (df_global, 'Global'),
    (df_piecewise, 'Piecewise'),
    (df_least_squares, 'Least Squares'),
):
    y_pred, r2, mse = model(features, y, label)


-------------Original--------------------
R-squared:  0.6456819729261931
Mean Squared Error:  0.4643015238301188

-------------Global--------------------
R-squared:  0.6404190715510336
Mean Squared Error:  0.4711980770437242

-------------Piecewise--------------------
R-squared:  0.6404190715510318
Mean Squared Error:  0.47119807704372646

-------------Least Squares--------------------
R-squared:  0.5414139393861555
Mean Squared Error:  0.6009352911245073



R-squared (R²): The R-squared value is a metric that measures the proportion of the variance in the dependent variable (y) that can be explained by the independent variables (X) in the model. It typically ranges from 0 to 1, with higher values indicating a better fit; on held-out data it can be negative when the model predicts worse than simply using the mean of y.

Mean Squared Error (MSE): The mean squared error is a metric that measures the average squared difference between the predicted and actual values. It is widely used in linear regression models to evaluate their accuracy.

Residual Plot: A residual plot is a scatter plot that shows the differences between the predicted and actual values (i.e., the residuals) against the predicted values. It can help identify any patterns or outliers in the residuals, which can indicate issues with the model.