## Import Libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Load Dataset

In [None]:
# Fetch the California housing data as pandas objects; the combined frame
# (predictors plus the "MedHouseVal" target) is what the later cells use.
housing = fetch_california_housing(as_frame=True)
df = housing["frame"]
df.head(n=5)

## Basic Dataset Info

In [None]:
# Report the (rows, columns) shape first, then the per-column summary
# produced by DataFrame.info().
frame_shape = df.shape
print("Shape: ", frame_shape)
df.info()

## Check Missing Values

In [None]:
# Total number of missing cells across every column of the frame.
missing = df.isnull().sum().sum()

if missing <= 0:
    print("No missing values found.")
else:
    # Impute each numeric column with its own median.
    column_medians = df.median(numeric_only=True)
    df = df.fillna(column_medians)
    print(f"Filled missing values: {missing}")


## Features and Target

In [None]:
# "MedHouseVal" is the regression target; every other column is a predictor.
target_column = "MedHouseVal"
y = df[target_column]
X = df.drop(columns=[target_column])

# Sanity check: X and y must have the same number of rows.
print("X shape:", X.shape)
print("y shape:", y.shape)

## Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore every metric below) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Train Linear Regression Model

In [None]:
# Fit a linear regression on the training split only.
# The fit call stays a separate, final statement so the notebook still
# displays the fitted estimator as the cell output.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

## Predictions

In [None]:
# Predict the target for the held-out rows and preview the first five values.
y_predict = model.predict(X=X_test)
y_predict[0:5]

## Evaluate Model (MSE, RMSE, R²)

In [None]:
# Score the held-out predictions: mean squared error, its square root (RMSE),
# and the coefficient of determination (R²).
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)
# Store the score under its own name. Assigning it to `r2_score` would rebind
# the imported metric function to a number, so re-running this cell (or calling
# r2_score anywhere later) would fail because the name is no longer callable.
r2 = r2_score(y_test, y_predict)

print("MSE:", mse)
print("RMSE:", rmse)
print("R² Score:", r2)

## Results

In [None]:
# Side-by-side comparison of true and predicted targets for the test rows.
# y_test is reduced to its raw values so the two columns line up by position
# rather than by y_test's index labels.
# NOTE(review): both column names end with a trailing space; kept as-is because
# they become the header of the exported CSV. Confirm whether that is intended.
comparison_columns = {
    "Actual ": y_test.values,
    "Predicted ": y_predict,
}
results_df = pd.DataFrame(comparison_columns)

results_df.head(10)

## Save Sample Predictions

In [None]:
from pathlib import Path

# Export the first 20 actual/predicted pairs as a small sample file.
# to_csv does not create missing parent directories, so make sure "results/"
# exists first; otherwise a fresh checkout fails here with an OSError.
output_path = Path("results") / "sample_predictions.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)

results_df.head(20).to_csv(output_path, index=False)
print("Saved: sample_predictions.csv")

## Predicted vs Actual Scatter Plot

In [None]:
# Scatter of predicted against actual values on the test split: actual on the
# x-axis, predicted on the y-axis, with semi-transparent markers so dense
# regions remain readable.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_predict, alpha=0.5)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted House Prices")
plt.show()


## Model Coefficients

In [None]:
# Pair every predictor column with its fitted coefficient, then order the
# table from the largest coefficient to the smallest.
coefficient_table = pd.DataFrame(
    {
        "features": X.columns,
        "coefficient": model.coef_,
    }
)
coef_df = coefficient_table.sort_values(by="coefficient", ascending=False)

coef_df.head()