# Linear Regression: Predicting California Housing Prices

## 1. Load and Explore the Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import importlib

from sklearn.datasets import fetch_california_housing
from tensorflow import keras
import ex1_helper as helper

In [None]:
# Fetch the California housing dataset with its pandas representation.
housing = fetch_california_housing(as_frame=True)
full_df = pd.DataFrame(housing.frame)

# First experiment: a single feature (median income) against the target
# (median house value).
relevant_cols = ['MedInc', 'MedHouseVal']
df = full_df.loc[:, relevant_cols]

# Quick sanity check of the container type, its dimensions and contents.
print(type(df), df.shape)
print(df)

In [None]:
# Pull both columns out as Series; later cells reuse these two names.
medInc, medHouseVal = df['MedInc'], df['MedHouseVal']

# One small, semi-transparent dot per row so dense regions stay readable.
plt.scatter(x=medInc, y=medHouseVal, s=2, alpha=0.5)
plt.title("House Value vs. Income")
plt.xlabel("Household income in a district (x $10,000s)")
plt.ylabel("House value in a district (x $100,000)")
plt.show()

In [None]:
# Run the helper's statistics routine on each raw (unfiltered) column.
# NOTE(review): calculate_stats is defined in ex1_helper and not visible
# here — what it prints or returns should be confirmed against the helper.
helper.calculate_stats(medInc)
helper.calculate_stats(medHouseVal)

## 2. Preprocess the Data

In [None]:
# Name of the column to predict; reused by the extended experiment below.
target = ['MedHouseVal']

# Split the frame into feature/target parts, then into train/test sets.
# NOTE(review): mask_df and transform_data live in ex1_helper; presumably
# mask_df filters rows and transform_data scales and splits — confirm there.
X_filtered, y_filtered = helper.mask_df(df, ['MedInc'], target)
X_train, X_test, y_train, y_test = helper.transform_data(X_filtered, y_filtered)
print(X_train, y_train)

In [None]:
# Same statistics routine as before, now on the output of helper.mask_df,
# so the numbers can be compared with those of the raw columns.
helper.calculate_stats(X_filtered['MedInc'])
helper.calculate_stats(y_filtered['MedHouseVal'])

In [None]:
# Overlay the raw columns and the training split returned by
# helper.transform_data to see what preprocessing changed.
plt.scatter(
    df['MedInc'],
    df['MedHouseVal'],
    s=2,
    alpha=0.5,
    label="Non-Processed",
)
plt.scatter(
    X_train,
    y_train,
    s=2,
    alpha=0.5,
    label="Processed",
)
plt.title("Data comparison")
plt.xlabel("Household income in a district")
plt.ylabel("House value in a district")
plt.legend()
plt.show()

## 3. Build a Linear Regression Model

[TensorFlow documentation](https://www.tensorflow.org/tutorials/keras/regression)

### Stochastic Gradient Descent (SGD)

In [None]:
# Learning rate 0.01; the second argument is the number of feature columns.
sgd_model = helper.create_model(0.01, X_train.shape[1])

# batch_size=1 means one weight update per training sample.
sgd_history = sgd_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=1,
    verbose=0,
)
helper.show_history(sgd_history)

### Batch Gradient Descent (BGD)

In [None]:
# Same learning rate and feature count as the SGD model, fresh weights.
bgd_model = helper.create_model(0.01, X_train.shape[1])

# A batch as large as the training set means one weight update per epoch.
bgd_history = bgd_model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=len(X_train),
    verbose=0,
)
helper.show_history(bgd_history)

In [None]:
# Training loss per epoch for both optimisation flavours on shared axes.
# The SGD curve is shorter: it was trained for 10 epochs, BGD for 200.
plt.plot(bgd_history.history['loss'], label='BGD Loss')
plt.plot(sgd_history.history['loss'], label='SGD Loss')
plt.xlim([0, 200])
plt.ylim([0, 10])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
# Render explicitly, like the other plotting cells, so the figure is also
# shown (and closed) when the code runs outside an inline notebook backend.
plt.show()

## 4. Make Predictions

In [None]:
# Predictions of both trained models on the held-out test features; these
# are reused below for the MSE computation and the final plot.
y_pred_bgd = bgd_model.predict(X_test)
y_pred_sgd = sgd_model.predict(X_test)

In [None]:
# Predict for one hand-picked input value, shaped as a 1x1 array
# (one sample, one feature).
# NOTE(review): the model was trained on the output of
# helper.transform_data; if that helper rescales features, this raw 8.0
# must go through the same transformation first — confirm against the helper.
param = np.array([[8.0]])
y_pred_given = bgd_model.predict(param)

In [None]:
# Show the BGD model's prediction for the single hand-picked input.
print(y_pred_given)

In [None]:
# Mean squared error on the test split for each model.
mse_bgd, mse_sgd = (
    keras.losses.MeanSquaredError()(y_test, y_pred_bgd),
    keras.losses.MeanSquaredError()(y_test, y_pred_sgd),
)

# The losses are tensors; .numpy() extracts the plain value for printing.
print("MSE (BGD):", mse_bgd.numpy())
print("MSE (SGD):", mse_sgd.numpy())

## 5. Visualize the Results

In [None]:
# Test points as a scatter cloud, with each model's predictions drawn
# over them as a line.
plt.scatter(X_test, y_test, s=2, alpha=0.5)
plt.plot(X_test, y_pred_bgd, label='Batch GD')
plt.plot(X_test, y_pred_sgd, label='Stochastic GD')
plt.title("Comparison of GD flavours")
plt.xlabel("Household income in a district (x $10,000s)")
plt.ylabel("House value in a district (x $100,000)")
plt.legend()
plt.show()

## Discussion Points
• Why does income alone not fully explain house prices? Consider geographic factors (e.g.,
coastal vs. inland) and household size.

• Add further features that you think might improve the model, and discuss how they affect the results.


In [None]:
# Re-import ex1_helper so edits made to the helper file take effect
# without restarting the kernel.
importlib.reload(helper)

In [None]:
# Heatmap over every column of the full dataset, drawn by the helper.
# NOTE(review): presumably a correlation heatmap — confirm in ex1_helper.
helper.plot_heatmap(full_df, "California Housing - Initial heatmap")

In [None]:
# Columns for the extended experiment: three features plus the target.
extended_cols = ['MedInc', 'AveRooms', 'HouseAge', 'MedHouseVal']
# Take an explicit copy: a later cell adds a 'LocationCluster' column to
# this frame, and assigning into a bare slice of full_df triggers pandas'
# SettingWithCopyWarning.
extended_df = full_df[extended_cols].copy()

In [None]:
# Cluster Longitude and Latitude
from sklearn.cluster import KMeans

# Group districts into five geographic clusters from their coordinates;
# the fixed random_state keeps the labelling reproducible.
geo_df = full_df[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=5, random_state=3103)

# Store each row's cluster index as an extra column.
extended_df['LocationCluster'] = kmeans.fit(geo_df).labels_

Relying solely on income to predict house prices oversimplifies the problem. Although each available variable exhibits a mild correlation with median house values, their individual contributions are limited. Moreover, key factors such as interior finishes and furnished status, which are not recorded in the dataset, also affect prices. This, combined with the unavoidable arbitrariness of the data, introduces additional noise that limits prediction accuracy.

In [None]:
# Plot the correlation heatmap
# Same helper as for the initial heatmap, now restricted to the extended
# feature set (which includes the LocationCluster column).
helper.plot_heatmap(extended_df, "California Housing - Refined Heatmap")

In [None]:
# Preprocess
# Same two helper steps as in the single-feature experiment. Note that this
# rebinds X_filtered/y_filtered and the train/test variables, so every
# later cell sees the four-feature data.
# NOTE(review): LocationCluster is a nominal label (0-4) passed as a numeric
# feature; unless transform_data encodes it, a linear model will treat the
# labels as ordered magnitudes — consider one-hot encoding.
extended_features = ['MedInc', 'AveRooms', 'HouseAge', 'LocationCluster']
X_filtered, y_filtered = helper.mask_df(extended_df, extended_features, target)
X_train, X_test, y_train, y_test = helper.transform_data(X_filtered, y_filtered)

In [None]:
# Build model
# Derive the feature count from the training data, as the other model cells
# do, instead of hard-coding 4 — the model then stays in sync if the list
# of extended features changes.
advanced_model = helper.create_model(0.01, X_train.shape[1])
# Full-batch training: one weight update per epoch, 200 epochs.
advanced_history = advanced_model.fit(X_train, y_train, epochs=200, batch_size=len(X_train), verbose=0)

In [None]:
# Visualize results
# Test-set MSE of the multi-feature model.
y_pred_adv = advanced_model.predict(X_test)
mse_adv = keras.losses.MeanSquaredError()(y_test, y_pred_adv)
print("MSE (adv):", mse_adv.numpy())
helper.show_history(advanced_history)

# Loss curves of all three models on shared axes. The BGD/SGD curves come
# from the earlier single-feature runs, the advanced curve from the
# four-feature run. Plain string labels: there is nothing to interpolate,
# so the f-prefix was dropped.
plt.plot(advanced_history.history['loss'], label='Advanced - Loss')
plt.plot(bgd_history.history['loss'], label='BGD - Loss')
plt.plot(sgd_history.history['loss'], label='SGD - Loss')
plt.xlim([0, 200])
plt.ylim([0, 10])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
# Render explicitly, like the other plotting cells.
plt.show()

In [None]:
# Visualize clusters
# Each district is drawn at its coordinates and coloured by the cluster
# index stored in extended_df['LocationCluster'].
plt.figure(figsize=(10, 6))
scatter = plt.scatter(
    full_df['Longitude'],
    full_df['Latitude'],
    c=extended_df['LocationCluster'],
    s=2,
    alpha=0.5,
)
plt.colorbar(scatter, label='Cluster Label')
plt.title('KMeans Clusters - based on Latitude and Longitude')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

In [None]:
# Plot for Learning rate 0.1
# Repeat the SGD/BGD comparison with a ten times larger learning rate.
# NOTE(review): when the cells run top to bottom, X_train/y_train are by now
# the four-feature split from the extended experiment, so these curves are
# not directly comparable with the single-feature 0.01 runs — confirm that
# this is intended.
sgd_model_2 = helper.create_model(0.1, X_train.shape[1])
sgd_history_2 = sgd_model_2.fit(X_train, y_train, epochs=10, batch_size=1, verbose=0)
bgd_model_2 = helper.create_model(0.1, X_train.shape[1])
bgd_history_2 = bgd_model_2.fit(X_train, y_train, epochs=200, batch_size=len(X_train), verbose=0)

plt.plot(bgd_history_2.history['loss'], label='BGD Loss')
plt.plot(sgd_history_2.history['loss'], label='SGD Loss')
plt.xlim([0, 200])
plt.ylim([0, 10])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
# Render explicitly, like the other plotting cells.
plt.show()