Import used packages

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from algorithms import *

Load Dataset

In [None]:
# Load the California housing dataset; as_frame=True requests pandas objects
# so the features can be addressed by column name below.
california = fetch_california_housing(as_frame=True)

# Work on copies so the objects held by the fetched bunch are not mutated
# when NaNs are injected into X later on.
X = california['data'].copy()
y = california['target'].copy()

# Snapshot of the untouched 'MedInc' values. The call parentheses matter:
# without them this name would hold the bound method `Series.copy` instead
# of the column data.
original_MedInc = X['MedInc'].copy()

Add null values randomly

In [None]:
# Replace a percentage of values in the 'MedInc' column of X with NaNs
percent_missing = 20
num_missing = int(percent_missing / 100 * len(X))

# Draw the rows to blank out from a locally seeded generator so that
# "Restart & Run All" (and re-running this cell) always selects the same
# rows; an unseeded draw would change every downstream metric between runs
# and would blank out additional rows each time the cell is re-executed.
rng = np.random.default_rng(42)
missing_indices = rng.choice(X.index, size=num_missing, replace=False)
X.loc[missing_indices, 'MedInc'] = np.nan

# Overwrite the target percentage with the share of NaNs actually present.
percent_missing = X['MedInc'].isna().sum() / len(X) * 100

Perform Global and Piecewise Interpolation

In [None]:
# Fill the missing 'MedInc' values twice, once per interpolation strategy.
# NOTE(review): global_interp and piecewise_interp are not defined in this
# notebook; presumably they come from the `algorithms` star import.
df_global = global_interp(X, 'MedInc')
df_piecewise = piecewise_interp(X, 'MedInc')

# Percentage of 'MedInc' entries that are still NaN after each method
# (global first, then piecewise).
for interpolated in (df_global, df_piecewise):
    still_missing = interpolated['MedInc'].isna().sum()
    print(still_missing / len(X) * 100)

Linear Regression Implementation

In [None]:
# Hold out 20% of the globally interpolated data for evaluation; the fixed
# random_state keeps the split reproducible.
X_train_global, X_test_global, y_train, y_test = train_test_split(
    df_global,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training part, then predict the held-out rows.
model = LinearRegression().fit(X_train_global, y_train)
y_pred_global = model.predict(X_test_global)

In [None]:
# Same 80/20 split (identical test_size and random_state) applied to the
# piecewise-interpolated features.
X_train_piecewise, X_test_piecewise, y_train, y_test = train_test_split(
    df_piecewise,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a fresh linear regression and predict the held-out rows.
# Note: this rebinds `model`, replacing the previously fitted estimator.
model = LinearRegression().fit(X_train_piecewise, y_train)
y_pred_piecewise = model.predict(X_test_piecewise)

R-squared (R²): The R-squared value is a metric that measures the proportion of the variance in the dependent variable (y) that can be explained by the independent variables (X) in the model. Its best possible value is 1, with higher values indicating a better fit; on held-out data it can fall below 0 when the model predicts worse than always guessing the mean of y.

In [None]:
# Coefficient of determination on the held-out rows for the model trained
# on globally interpolated data.
r2_global = r2_score(y_true=y_test, y_pred=y_pred_global)
print("R-squared: ", r2_global)

In [None]:
# Coefficient of determination on the held-out rows for the model trained
# on piecewise-interpolated data.
r2_piecewise = r2_score(y_true=y_test, y_pred=y_pred_piecewise)
print("R-squared: ", r2_piecewise)

Mean Squared Error (MSE): The mean squared error is a metric that measures the average squared difference between the predicted and actual values. It is widely used in linear regression models to evaluate their accuracy.

In [None]:
# Mean squared error on the held-out rows for the model trained on
# globally interpolated data.
mse_global = mean_squared_error(y_true=y_test, y_pred=y_pred_global)
print("Mean Squared Error: ", mse_global)

In [None]:
# Mean squared error on the held-out rows for the model trained on
# piecewise-interpolated data.
mse_piecewise = mean_squared_error(y_true=y_test, y_pred=y_pred_piecewise)
print("Mean Squared Error: ", mse_piecewise)

Residual Plot: A residual plot is a scatter plot that shows the differences between the predicted and actual values (i.e., the residuals) against the predicted values. It can help identify any patterns or outliers in the residuals, which can indicate issues with the model.

In [None]:
# Residual diagnostics for the model trained on globally interpolated data.
# NOTE(review): residual_plot is not defined in this notebook; presumably it
# is provided by the `algorithms` star import — confirm its argument order.
residual_plot(y_test, y_pred_global)

In [None]:
# Residual diagnostics for the model trained on piecewise-interpolated data.
# NOTE(review): residual_plot is not defined in this notebook; presumably it
# is provided by the `algorithms` star import — confirm its argument order.
residual_plot(y_test, y_pred_piecewise)