In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the California housing data through scikit-learn.
# A CSV variant of the data set (disabled here) lives at:
#   https://github.com/ageron/handson-ml/raw/refs/heads/master/datasets/housing/housing.csv
# and could be read with pd.read_csv(<that URL>) instead.

housing = fetch_california_housing()

# Features and target are wrapped as DataFrames so they carry column names.
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.DataFrame(housing.target, columns=housing.target_names)

# Single table (features followed by the target) used by the exploration cells.
data = X.join(y)

In [None]:
# Preview the first rows of the combined feature/target table.
data.head()

In [None]:
# Discard every row that contains a missing value.
data = data.dropna()

# The "ocean_proximity" column only exists when the data comes from a source
# that provides it; when present, one-hot encode it and drop the first level.
if "ocean_proximity" in data:
  data = pd.get_dummies(data, columns=['ocean_proximity'], drop_first=True)

In [None]:
# Summary statistics of the table after the cleaning step.
data.describe()

In [None]:
# Pairwise correlation of all columns, annotated with two-decimal values.
corrFig, corrAx = plt.subplots(figsize=(9, 7))
corrMatrix = data.corr()
sns.heatmap(corrMatrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corrAx)
plt.show()

In [None]:
# One histogram per column (50 bins each) to inspect the distributions.
data.hist(bins=50, figsize=(15, 10))
plt.show()

In [None]:
# Draw one box plot per column on a grid that is at most three panels wide.
# Columns whose name contains "ocean_proximity" (one-hot dummies, when present)
# are left out.
boxdata = data.drop(columns=data.filter(like="ocean_proximity").columns)
nFeatures = len(boxdata.columns)
rows = max(1, math.ceil(nFeatures / 3))
cols = max(1, min(3, nFeatures))

# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# flattening below works for every grid shape (a single row with several
# columns would otherwise come back as a 1-D array and break indexing).
fig, axes = plt.subplots(
    nrows=rows, ncols=cols, figsize=(15, 5 * rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, boxdata.columns):
  # Pass the values as `x` explicitly so the box is horizontal and the
  # x-axis label set below describes the value axis.
  sns.boxplot(x=boxdata[col], ax=ax)
  ax.set_title(f'Boxplot of {col}')
  ax.set_xlabel(col)

# Hide the panels left over when the column count is not a multiple of the
# grid width, instead of showing empty frames.
for ax in axes[nFeatures:]:
  ax.set_visible(False)

plt.show()

In [None]:
# Build the feature matrix and target from the cleaned `data` frame.
#
# This is done unconditionally rather than checking whether X / y already
# exist in the kernel namespace: such a check depends on which cells were run
# before, and re-using the X / y created at load time would skip the row
# dropping and encoding applied to `data` afterwards.
#
# The target column name depends on the data source: "median_house_value" is
# the name this notebook used for the CSV variant; "MedHouseVal" is presumably
# what scikit-learn's loader reports in `housing.target_names` - TODO confirm.
targetCol = next(
    (name for name in ("MedHouseVal", "median_house_value")
     if name in data.columns),
    None,
)
if targetCol is None:
  raise KeyError(
      "No target column found in data; expected 'MedHouseVal' or "
      "'median_house_value'")

X = data.drop(columns=[targetCol])
# Kept as a one-column DataFrame, matching the shape of the y built when the
# data is loaded, so fitted parameters and predictions keep their 2-D shape.
y = data[[targetCol]]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
XTrain, XTest, yTrain, yTest = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [None]:
# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator itself, so the call can be chained).
model = LinearRegression().fit(XTrain, yTrain)

# Report the learned coefficients followed by the intercept, one per line.
print("Model Params:", model.coef_, model.intercept_, sep="\n")

In [None]:
# Predict on the held-out split and score the predictions.
yPred = model.predict(XTest)

# Each metric takes (true values, predictions) in that order.
mae, mse, r2 = (
    score(yTest, yPred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print(
    "Model Metrics:",
    f" Mean Absolute Error (MAE): {mae}",
    f" Mean Squared Error (MSE): {mse}",
    f" R-Squared Score: {r2}",
    sep="\n",
)

In [None]:
# Scatter of held-out actual values against the model's predictions;
# semi-transparent, outline-free markers keep dense regions readable.
scatterFig, scatterAx = plt.subplots(figsize=(8, 5))
scatterAx.scatter(x=yTest, y=yPred, alpha=0.3, s=20, linewidth=0)
scatterAx.set_xlabel('Actual House Value')
scatterAx.set_ylabel('Predicted House Value')
scatterAx.set_title('Actual vs. Predicted House Prices')
plt.show()