In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import config

In [None]:
# Load the google.cloud.bigquery IPython extension; it provides the
# `%%bigquery` cell magic used further down to run the query.
%load_ext google.cloud.bigquery

In [None]:
# Export the credentials location kept in the local `config` module under the
# GOOGLE_APPLICATION_CREDENTIALS environment variable for this kernel process.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = config.GOOGLE_APPLICATION_CREDENTIALS

In [None]:
%%bigquery --use_rest_api ZRI_MF
# Fetch every column of the Multi_Family table; the result is bound to the
# notebook variable ZRI_MF named on the magic line above.
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [None]:
from data_setup import ZRI_format

In [None]:
%%time
ZRI_new = ZRI_format(ZRI_MF, time_unit = 'Year', window_size = 3, future_time = 1)

In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [None]:
# Imputation strategy: none yet -- any row containing a missing value is
# simply discarded.
ZRI_new = ZRI_new.dropna(axis=0, how='any')

In [None]:
# Feature columns are the ones whose name contains the substring 'minus'.
feature_columns = [
    column_name
    for column_name in ZRI_new.columns
    if 'minus' in column_name
]

In [None]:
# Time-based split: rows dated before `test_year` are available for fitting,
# rows from `test_year` onward form the final hold-out set.
test_year = 2019
training_data = ZRI_new.loc[ZRI_new['Year'] < test_year]
final_test_data = ZRI_new.loc[ZRI_new['Year'] >= test_year]

In [None]:
 # Randomly hold out 10% of the pre-test-year rows as a validation set.
 # random_state pins the shuffle so that re-running the notebook reproduces
 # the same split (and therefore the same scores and coefficients).
 X_train, X_test, y_train, y_test = train_test_split(
     training_data[feature_columns],
     training_data['Target_ZRI'],
     test_size=0.1,
     random_state=42,
 )

In [None]:
# Ordinary least-squares model; the intercept setting is spelled out but is
# the library default.
lr = LinearRegression(fit_intercept=True)

In [None]:
# Fit on the training portion of the random split.
lr.fit(X=X_train, y=y_train)

In [None]:
# Model score on the held-out split first, then on the rows used for fitting.
tuple(
    lr.score(features, target)
    for features, target in ((X_test, y_test), (X_train, y_train))
)

In [None]:
# Signed error on the final hold-out years: actual target minus model prediction.
prediction_error = final_test_data['Target_ZRI'].sub(
    lr.predict(final_test_data[feature_columns])
)

In [None]:
# Summary statistics of the hold-out errors (quartiles listed explicitly; they
# match the defaults).
prediction_error.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Fitted coefficients plus the selected regularisation strength, if any.
# A plain LinearRegression exposes no `alpha_` attribute (only the
# cross-validated regularised estimators such as RidgeCV set it), so reading it
# directly raised AttributeError; fall back to None when it is absent.
lr.coef_, getattr(lr, 'alpha_', None)

In [None]:
# Sweep the `window_size` argument of ZRI_format and, for each setting, record
# the usable row count, the model scores, the fitted coefficients and the
# relative errors on the hold-out years.
window_sizes = list(range(1, 13))
future_time = 24
time_unit = 'Month'
test_year = 2019  # rows from this year onward are never used for fitting

# Results keyed by window size.
num_obs = {}
errors = {}
scores = {}
coefficients = {}

# Column used to scale the errors. The name pattern is presumably what
# ZRI_format emits for the most recent lag -- TODO confirm against data_setup.
most_recent_feature = f'ZRI_minus_{future_time}{time_unit[0]}'

# NOTE: the loop rebinds ZRI_new, feature_columns, training_data,
# final_test_data, the X_/y_ splits and lr, so once it finishes those names
# refer to the last window size.
for window_size in window_sizes:
    ZRI_new = ZRI_format(
        ZRI_MF,
        time_unit=time_unit,
        window_size=window_size,
        future_time=future_time,
    )
    ZRI_new = ZRI_new.dropna()
    num_obs[window_size] = ZRI_new.shape[0]

    feature_columns = [x for x in ZRI_new.columns if 'minus' in x]
    training_data = ZRI_new[ZRI_new.Year < test_year]
    final_test_data = ZRI_new[ZRI_new.Year >= test_year]

    # Seeded so that every window size, and every re-run, is scored on a
    # reproducible split.
    X_train, X_test, y_train, y_test = train_test_split(
        training_data[feature_columns],
        training_data['Target_ZRI'],
        test_size=0.1,
        random_state=42,
    )

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # Keep the coefficients labelled by feature name; previously this slot was
    # left as an empty mapping and the fitted values were discarded.
    coefficients[window_size] = pd.Series(lr.coef_, index=feature_columns)
    scores[window_size] = (lr.score(X_test, y_test), lr.score(X_train, y_train))

    # Hold-out error expressed as a fraction of the scaling column.
    holdout_residuals = (
        final_test_data['Target_ZRI']
        - lr.predict(final_test_data[feature_columns])
    )
    errors[window_size] = holdout_residuals.div(final_test_data[most_recent_feature])
    

In [None]:
# `lr` is rebound on every pass of the window-size loop, so this displays the
# coefficients of the model fitted in its last iteration.
lr.coef_

In [None]:
# One box per window size, drawn from the per-window relative errors.
plt.boxplot(list(errors.values()))

In [None]:
# Box plot of log10(error + 1250) per window size. np.log10 is applied to each
# Series as a whole instead of element by element through Series.apply.
# NOTE(review): the 1250 shift presumably keeps the log argument positive;
# confirm it is still appropriate given that the errors are divided by a ZRI
# column when they are computed.
plt.boxplot([np.log10(window_errors + 1250) for window_errors in errors.values()])

In [None]:
# Side-by-side summary statistics of the errors, one column per window size.
pd.DataFrame(data=errors).describe()