In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import config

In [2]:
#Load google.cloud.bigquery
%load_ext google.cloud.bigquery

In [3]:
#Select path to credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS

In [4]:
%%bigquery --use_rest_api ZRI_MF
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [5]:
from data_setup import ZRI_format

In [86]:
%%time
ZRI_new = ZRI_format(ZRI_MF, time_unit = 'Year', window_size = 3, future_time = 4)

Wall time: 2.6 s


In [87]:
ZRI_new

Unnamed: 0,Target_index,Target_ZRI,Year,ZRI_minus_5_Y,ZRI_minus_6_Y,ZRI_minus_7_Y,ZipCode
0,201001013,994.25,2010.0,,,,01013
1,201001020,1053.00,2010.0,,,,01020
2,201001040,1006.00,2010.0,,,,01040
3,201001085,,2010.0,,,,01085
4,201001089,,2010.0,,,,01089
...,...,...,...,...,...,...,...
20466,202099501,1281.00,2020.0,1301.083333,1308.363636,,99501
20467,202099504,1443.00,2020.0,1529.583333,1506.916667,1513.916667,99504
20468,202099508,1283.00,2020.0,1334.500000,1325.416667,1294.000000,99508
20469,202099654,,2020.0,1228.000000,1218.545455,,99654


In [88]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [73]:
#Imputation Strategy
#Dropna for now
ZRI_new = ZRI_new.dropna()

In [74]:
#Find feature columns
feature_columns = [x for x in ZRI_new.columns if 'minus' in x]

In [75]:
#Train test split, test data is above a given year
test_year = 2019
training_data = ZRI_new[ZRI_new.Year < test_year]
final_test_data = ZRI_new[ZRI_new.Year > test_year]

In [76]:
 X_train, X_test, y_train, y_test = train_test_split(training_data[feature_columns],
                                                     training_data['Target_ZRI'],
                                                     test_size = .1
                                                    ) 

In [78]:
lr = RidgeCV()

In [79]:
lr.fit(X_train,y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

In [80]:
lr.score(X_test,y_test), lr.score(X_train, y_train)

(0.9914497763739754, 0.9914387094055347)

In [81]:
prediction_error = final_test_data['Target_ZRI'] - lr.predict(final_test_data[feature_columns])

In [82]:
prediction_error.describe()

count    1392.000000
mean      -18.023939
std        62.773345
min      -497.541831
25%       -46.867705
50%       -11.610827
75%        18.241855
max       163.849840
Name: Target_ZRI, dtype: float64

In [84]:
lr.coef_, lr.alpha_

(array([ 2.2353101 , -1.90489661,  0.67738555]), 10.0)

In [141]:
window_sizes = list(range(1,5))
future_time = 1

num_obs = defaultdict()
errors = defaultdict()
scores = defaultdict()

for window_size in window_sizes:
    ZRI_new = ZRI_format(ZRI_MF, time_unit = 'Quarter', 
                         window_size = window_size,
                         future_time = future_time)
    ZRI_new = ZRI_new.dropna()
    num_obs[window_size] = ZRI_new.shape[0]
    feature_columns = [x for x in ZRI_new.columns if 'minus' in x]
    test_year = 2019
    training_data = ZRI_new[ZRI_new.Year < test_year]
    final_test_data = ZRI_new[ZRI_new.Year > test_year]
    X_train, X_test, y_train, y_test = train_test_split(training_data[feature_columns],
                                                     training_data['Target_ZRI'],
                                                     test_size = .1
                                                    ) 
    lr = LinearRegression()
    lr.fit(X_train,y_train)
    scores[window_size] = (lr.score(X_test,y_test), lr.score(X_train, y_train))
    errors[window_size] = final_test_data['Target_ZRI'] - lr.predict(final_test_data[feature_columns])
    

In [None]:
plt.boxplot(list(map(lambda x: x.apply(lambda y: np.log10(abs(y))),errors.values())))

In [None]:
pd.DataFrame(errors).describe()