## California Housing Prices Regression

A sample analysis using various other linear models is available here: https://sherbold.github.io/intro-to-data-science/exercises/Solution_Regression.html


Another analysis here: https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html


... or here https://scikit-learn-extra.readthedocs.io/en/stable/auto_examples/robust/plot_robust_regression_california_houses.html


### Resources

- Python 101: https://python101.pythonlibrary.org/
- StackOverflow: https://stackoverflow.com/questions/tagged/python
- Pandas User Guide: https://pandas.pydata.org/docs/user_guide/index.html#user-guide
- Pandas API: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html

In [None]:
import pandas as pd
import numpy as np

### Load the housing data set

https://scikit-learn.org/stable/datasets/real_world.html

In [None]:
from sklearn.datasets import fetch_california_housing
# as_frame=True asks for pandas objects; the cells below rely on this when they
# call DataFrame methods on .data and attach .target as a column.
california_housing = fetch_california_housing(as_frame=True)

In [None]:
# Print the description text that ships with the dataset bundle
print(california_housing.DESCR)

In [None]:
# Features plus target in a single frame. assign() returns a new DataFrame,
# so the bundle's own .data frame is not modified.
df_raw = california_housing.data.assign(med_house_value=california_housing.target)
df_raw.head(5)

### Helper functions

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

def print_metrics(y_test, y_pred):
    """Display R2, MAE, RMSE and MSE for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.

    Notes
    -----
    RMSE is computed as the square root of the MSE instead of calling
    ``mean_squared_error(..., squared=False)``. The ``squared`` argument was
    deprecated in scikit-learn 1.4 and removed in 1.6, so passing it raises a
    ``TypeError`` on current releases. For a single target column (the case
    in this notebook) the two computations give the same value.
    """
    # Compute the MSE once; RMSE is derived from it.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    display("R2: {0}".format(r2_score(y_test, y_pred)))
    display("MAE: {0}".format(mean_absolute_error(y_test, y_pred)))
    display("RMSE: {0}".format(rmse))
    display("MSE: {0}".format(mse))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_reg(y_test, y_pred):
    """Scatter predictions against actual values with a fitted line and R-squared.

    Draws, on a new figure:
      * a scatter of ``y_pred`` (vertical axis) against ``y_test`` (horizontal axis),
      * a dashed green horizontal line at the mean of ``y_test``,
      * a red degree-1 least-squares fit of ``y_pred`` on ``y_test``,
      * the R-squared score as red text anchored at data coordinates (0, 0).

    Parameters
    ----------
    y_test : array-like
        Actual target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.
    """
    plt.figure(figsize=(10, 8), dpi=80)

    plt.scatter(y_test, y_pred)
    plt.xlabel('Actual values')
    plt.ylabel('Predicted values')

    # Constant series at the mean of the actual values, one entry per point
    mean_line = [np.mean(y_test)] * len(y_test)
    plt.plot(y_test, mean_line, linestyle='--', color="green")

    # Degree-1 polynomial fit, evaluated at the sorted distinct actual values
    fitted_line = np.poly1d(np.polyfit(y_test, y_pred, 1))
    distinct_actuals = np.unique(y_test)
    plt.plot(distinct_actuals, fitted_line(distinct_actuals), color="red")

    score_label = 'R-squared = %0.2f' % r2_score(y_test, y_pred)
    plt.text(0, 0, score_label, color="red")

    plt.show()

### Inspect the data

In [None]:
# Set the default figure size (in inches) through seaborn's rc override;
# 11.7 x 8.27 is presumably A4 landscape.
sns.set(rc={"figure.figsize": (11.7, 8.27)})
# NOTE(review): this opens a new figure with no axes and nothing is drawn on it
# in this cell. Whether later cells reuse it depends on the matplotlib backend;
# confirm it is still needed.
plt.figure(figsize=(12, 8), dpi=80)

In [None]:
# Column dtypes and non-null counts; all features are numeric
df_raw.info()

In [None]:
# Summary statistics, transposed so that each column of df_raw becomes a row
df_raw.describe().transpose()

In [None]:
# Pairwise correlations between all columns of df_raw (features and target)
corr = df_raw.corr()

# The same column names label both axes of the heatmap
axis_labels = corr.columns.values
sns.heatmap(corr, annot=True, fmt=".2f", xticklabels=axis_labels, yticklabels=axis_labels, cmap="Greens")
plt.title("Correlation Heatmap")

In [None]:
# Regression plots of the target against two features on a 100-row subsample.
# The sample is seeded so the same rows (and the same plot) come back on every run.
sns.pairplot(df_raw.sample(100, random_state=42), x_vars=["HouseAge", "AveRooms"], y_vars="med_house_value", height=8, kind="reg");

In [None]:
# this will take a few seconds to run
# please wait

# Seeded 1000-row subsample so the plot is reproducible. sample() already
# returns a new frame, so df_raw is not copied in full first.
df_plot = df_raw.sample(1000, random_state=42)

# Drop the coordinate columns from the pair plot
df_plot = df_plot.drop(columns=["Longitude", "Latitude"])

# Quantize the target and keep the midpoint for each interval
df_plot["med_house_value"] = pd.qcut(df_plot["med_house_value"], 6, retbins=False)
df_plot["med_house_value"] = df_plot["med_house_value"].apply(lambda x: x.mid)

_ = sns.pairplot(data=df_plot, hue="med_house_value", palette="viridis")

### Scikit Learn OLS

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Features and target taken directly from the dataset bundle
X, y = california_housing.data, california_housing.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit ordinary least squares on the training split, then predict the held-out rows
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred_ols = reg.predict(X_test)

In [None]:
# Predicted vs. actual values for the scikit-learn OLS model on the test split
plot_reg(y_test, y_pred_ols)

In [None]:
# Test-split error metrics for the scikit-learn OLS model
print_metrics(y_test, y_pred_ols)

In [None]:
# Pair each feature name with its fitted coefficient in one frame for a nicer display
df_coeff = pd.DataFrame({"Feature": X_train.columns, "Coefficient": reg.coef_})

display(df_coeff)

# The intercept is stored separately from the per-feature coefficients
print("Intercept term = {0}".format(reg.intercept_))

### Statsmodels OLS

https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.html#statsmodels.regression.linear_model.OLS

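Note: unlike scikit-learn's `LinearRegression`, statsmodels `OLS` does not add an intercept term automatically, so a constant column is added to the features explicitly with `sm.add_constant`.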

In [None]:
import statsmodels.api as sm

# statsmodels OLS does not add an intercept term on its own, so the features
# are passed through add_constant before fitting
X_sm = sm.add_constant(california_housing.data)
y_sm = california_housing.target
# Same test_size and random_state as the scikit-learn OLS split, so the two
# models should be trained and evaluated on the same rows
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)

# Note the argument order: target first, then the feature matrix
model = sm.OLS(y_train_sm, X_train_sm)
mdf = model.fit()
display(mdf.summary())

y_pred_sm = mdf.predict(X_test_sm)

print_metrics(y_test_sm, y_pred_sm)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Depth is capped at 3 and the seed is fixed
dtree = DecisionTreeRegressor(random_state=42, max_depth=3)
# %time reports how long the fit takes
%time dtree = dtree.fit(X_train, y_train)

# Scored on the training data itself, not on the held-out test split
display("R2 score on training data: {0}".format(dtree.score(X_train, y_train)))

In [None]:
# Evaluate the fitted decision tree on the held-out test split
y_pred_dt = dtree.predict(X_test)
print_metrics(y_test, y_pred_dt)

In [None]:
# Best-effort rendering of the fitted tree: any exception raised inside the
# block (including a failed graphviz import) is printed instead of propagated.
try:
    import graphviz
    from sklearn import tree
    # out_file=None is passed so the DOT source comes back as the return value
    dot_data = tree.export_graphviz(dtree, feature_names=X_train.columns.values, out_file=None, filled=True)
    graph = graphviz.Source(dot_data, format="svg") 
    display(graph)
except Exception as e:
    print("Failed rendering to graphviz. Verify that graphviz is installed. Error is: {0}".format(e))

### XGBoost

In [None]:
from xgboost import XGBRegressor

# train using the default parameters
xgb = XGBRegressor()

# %time reports how long the fit takes.
# NOTE(review): `rf` receives the return value of fit() and is not used again
# in this cell; the name looks like a leftover. Confirm before renaming, since
# later cells may reference it.
%time rf = xgb.fit(X_train, y_train)
# Scored on the training data itself, not on the held-out test split
print("R2 on training data: {0}".format(xgb.score(X_train, y_train)))

# Evaluate on the held-out test split
y_pred_xgb = xgb.predict(X_test)
print_metrics(y_test, y_pred_xgb)