In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
import sys
import pandas as pd
import pymc as pm
import numpy as np
sys.path.append("../src/")
from models import create_blr_full_pooling, create_blr_no_pooling

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [101]:
from sklearn.preprocessing import LabelEncoder

# Read the preprocessed housing table and keep only rows without missing values.
housing_complete = pd.read_csv("../data/california_housing_pre.csv").dropna()

# The regression target is held separately from the feature frame.
y = housing_complete["median_house_value"]

# Turn the "county" column into integer codes.
label_encoder = LabelEncoder()
county_codes = label_encoder.fit_transform(housing_complete["county"])

# Feature frame: the numeric predictors, the county coordinates and the county code.
feature_columns = [
    "MedInc",
    "HouseAge",
    "AveRooms",
    "AveBedrms",
    "Population",
    "AveOccup",
    "county_lon",
    "county_lat",
    "county_nr",
]
data = housing_complete.assign(county_nr=county_codes)[feature_columns]

data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 20637 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20637 non-null  float64
 1   HouseAge    20637 non-null  float64
 2   AveRooms    20637 non-null  float64
 3   AveBedrms   20637 non-null  float64
 4   Population  20637 non-null  float64
 5   AveOccup    20637 non-null  float64
 6   county_lon  20637 non-null  float64
 7   county_lat  20637 non-null  float64
 8   county_nr   20637 non-null  int32  
dtypes: float64(8), int32(1)
memory usage: 1.5 MB


In [102]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    random_state=0,
)
# Preview the first rows of the full feature frame.
data.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,county_lon,county_lat,county_nr
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,-122.226577,37.772457,230
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,-122.226577,37.772457,230
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,-122.226577,37.772457,230
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,-122.226577,37.772457,230
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,-122.226577,37.772457,230


### Random Forest Baseline

In [116]:
# Random-forest baseline, trained and scored on the same split as the Bayesian models.
# A fixed random_state pins the forest's internal randomness so the metrics
# printed below are reproducible after a kernel restart.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(x_train, y_train)
y_pred = rfr.predict(x_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

MAE: 0.34778482248062026
MSE: 0.27485948682222744


### BLR - Full Pooling

In [76]:
# Build the fully pooled Bayesian linear regression from the training split.
# NOTE(review): create_blr_full_pooling is imported from ../src/models and is not
# visible here; presumably it returns a PyMC model, since the result is used as a
# `with` context for pm.sample further down -- confirm against the module.
blr_full_pooling = create_blr_full_pooling(x_train, y_train)

In [77]:
# Draw posterior samples for the full-pooling model with PyMC's default
# sampler settings. The seed is fixed so that re-running the notebook
# reproduces the same draws (and therefore the same downstream metrics).
with blr_full_pooling:
    idata = pm.sample(random_seed=0)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_medinc, beta_house_age, beta_ave_rooms, beta_ave_bedrms, beta_population, beta_ave_occup, sigma]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 256 seconds.


#### Predict on test set

In [79]:
with blr_full_pooling:
    # Point the model's data containers at the held-out features.
    # NOTE(review): the keys presumably match the data-container names declared
    # inside create_blr_full_pooling (not visible here) -- confirm.
    new_data = {
        "medinc": x_test.MedInc,
        "house_age": x_test.HouseAge,
        "ave_rooms": x_test.AveRooms,
        "ave_bedrms": x_test.AveBedrms,
        "population": x_test.Population,
        "ave_occup": x_test.AveOccup,
        # Placeholder target: only its shape is taken from y_test.
        "median_house_value": np.zeros(shape=y_test.shape),
    }
    pm.set_data(new_data)
    # Keep the fresh predictive draws in their own object. With its default join,
    # `idata.extend` keeps a `posterior_predictive` group that idata already has,
    # so reading predictions back from idata can return stale draws on a re-run.
    posterior_pred = pm.sample_posterior_predictive(idata, random_seed=0)
    idata.extend(posterior_pred)

# Point prediction: posterior-predictive mean over all chains and draws.
y_pred = posterior_pred.posterior_predictive["y"].mean(dim=("chain", "draw")).values
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


Sampling: [y]


MAE: 0.5776323067376261
MSE: 0.6214313199199963


### BLR - No Pooling

In [126]:
# One coordinate label per county code, in ascending order.
# `unique()` returns codes in order of first appearance; sorting them makes the
# label at position i equal to code i, so labels line up with the integer codes
# that are later passed to the model as the group index.
# NOTE(review): assumes create_blr_no_pooling indexes its per-group parameters
# positionally by county_nr -- confirm against ../src/models.
coords = {"spatial_groups": sorted(data["county_nr"].unique().tolist())}


In [128]:
# Build the no-pooling Bayesian linear regression from the training split,
# grouped by the integer county code column.
# NOTE(review): create_blr_no_pooling is imported from ../src/models and is not
# visible here; how it uses `coords` and `spatial_grouping_var` should be
# confirmed against the module.
blr_no_pooling = create_blr_no_pooling(x_train, y_train, coords, spatial_grouping_var="county_nr")

In [138]:
# Draw posterior samples for the no-pooling model with PyMC's default
# sampler settings. The seed is fixed so that re-running the notebook
# reproduces the same draws (and therefore the same downstream metrics).
# Note: this rebinds `idata`, replacing the full-pooling samples.
with blr_no_pooling:
    idata = pm.sample(random_seed=0)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, beta_medinc, beta_house_age, beta_ave_rooms, beta_ave_bedrms, beta_population, beta_ave_occup, sigma]


In [137]:
with blr_no_pooling:
    # Point the model's data containers at the held-out features.
    # NOTE(review): the keys presumably match the data-container names declared
    # inside create_blr_no_pooling (not visible here) -- confirm.
    new_data = {
        "medinc": x_test.MedInc,
        "house_age": x_test.HouseAge,
        "ave_rooms": x_test.AveRooms,
        "ave_bedrms": x_test.AveBedrms,
        "population": x_test.Population,
        "ave_occup": x_test.AveOccup,
        # Integer county code of each held-out row.
        "spatial_group_idx": x_test.county_nr,
        # Placeholder target: only its shape is taken from y_test.
        "median_house_value": np.zeros(shape=y_test.shape),
    }
    pm.set_data(new_data)
    # Keep the fresh predictive draws in their own object. With its default join,
    # `idata.extend` keeps a `posterior_predictive` group that idata already has,
    # so reading predictions back from idata can return stale draws on a re-run.
    posterior_pred = pm.sample_posterior_predictive(idata, random_seed=0)
    idata.extend(posterior_pred)

# Point prediction: posterior-predictive mean over all chains and draws.
y_pred = posterior_pred.posterior_predictive["y"].mean(dim=("chain", "draw")).values
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

Sampling: [y]


MAE: 11.219145448190293
MSE: 337.83597810262995
