In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import config

In [None]:
#Load the google.cloud.bigquery IPython extension; it provides the %%bigquery
#cell magic that the query cells in this notebook rely on
%load_ext google.cloud.bigquery

In [None]:
#Export GOOGLE_APPLICATION_CREDENTIALS for this process, taking the value from the
#local config module so it is not hard-coded in the notebook
#(presumably a path to a credentials file - confirm against config.py)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS

In [None]:
%%bigquery --use_rest_api ZRI_MF
-- Pull every column of the ZRI Multi_Family table; the result is stored in the
-- notebook variable ZRI_MF, which is later passed to ZRI_format
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [None]:
%%bigquery --use_rest_api Zip_5yr
-- Pull every column of the public ACS 2018 5-year zip-code table into Zip_5yr;
-- only the columns listed in zip_columns are used when merging onto the ZRI data
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2018_5yr` 

In [None]:
from data_setup import ZRI_format

In [None]:
%%time
#Build the modelling table from the raw ZRI data with the project helper ZRI_format
#(imported from data_setup). Later cells rely on the result exposing ZipCode, Year,
#Target_ZRI and lag columns whose names contain 'minus'.
#NOTE(review): window_size / future_time are presumably counted in time_unit steps
#(months here) - confirm against data_setup.ZRI_format
ZRI_new = ZRI_format(ZRI_MF, time_unit = 'Month', window_size = 3, future_time = 4)

In [None]:
#Census columns carried into the final analysis.
#geo_id is the key used when these columns are merged onto the ZRI data.
zip_columns = [
    'geo_id',
    'unemployed_pop',
    'white_pop',
    'vacant_housing_units',
    'total_pop',
    'worked_at_home',
    'poverty',
    'percent_income_spent_on_rent',
    'occupied_housing_units',
    'median_year_structure_built',
    'median_age',
    'married_households',
    'masters_degree',
    'male_pop',
    'female_pop',
    'income_per_capita',
    'housing_units',
    'employed_pop',
    'black_pop',
    'asian_pop',
    'amerindian_pop',
    'graduate_professional_degree',
]

In [None]:
#Merge zip code data onto the ZRI data (left join: every ZRI row is kept, census
#columns are NaN where no geo_id matches)
#NOTE(review): the join only matches if ZipCode and geo_id share the same dtype and
#formatting - confirm
final_data = ZRI_new.merge(Zip_5yr[zip_columns],how = 'left',left_on='ZipCode',right_on ='geo_id')

#Convert raw counts to shares
#Columns to divide by total population
pop_columns = ['unemployed_pop','white_pop','masters_degree',
               'graduate_professional_degree','employed_pop','black_pop',
              'asian_pop','amerindian_pop','poverty','worked_at_home']

#Columns to divide by total housing units
house_columns = ['vacant_housing_units','occupied_housing_units']

#Division
#Numerators and denominators are read from final_data itself so every row stays
#aligned with the merged frame (the earlier code referenced `static_data`, a name that
#is never defined in this notebook).
#Zero totals are mapped to NaN first so the ratios come out as NaN rather than +/-inf;
#the dropna() in a later cell then removes those rows instead of letting inf reach the model.
final_data.loc[:,pop_columns] = final_data[pop_columns].div(
    final_data['total_pop'].replace(0, np.nan), axis = 0)
final_data.loc[:,house_columns] = final_data[house_columns].div(
    final_data['housing_units'].replace(0, np.nan), axis = 0)

In [None]:
#Inspect the merged frame (rendered as this cell's output)
final_data

In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [None]:
#Missing-value handling
#No imputation yet: any row containing at least one NaN is discarded
final_data = final_data.dropna(axis = 0, how = 'any')

In [None]:
#Full feature set, in order: the lag columns (names containing 'minus'), the
#population-share columns, the housing-share columns, then three census columns used as-is
full_feature_columns = [
    *(col for col in final_data.columns if 'minus' in col),
    *pop_columns,
    *house_columns,
    'income_per_capita',
    'percent_income_spent_on_rent',
    'median_age',
]

In [None]:
#Lag-only feature set: every column of final_data whose name contains 'minus'
ZRI_feature_columns = [col for col in final_data.columns if 'minus' in col]

In [None]:
#Time-based hold-out: rows from test_year onwards form the final test set,
#rows from earlier years are available for training
test_year = 2019
training_data = final_data.loc[final_data['Year'] < test_year]
final_test_data = final_data.loc[final_data['Year'] >= test_year]

In [None]:
 #Random 90/10 split of the pre-test_year rows for the full-feature model.
 #Selects full_feature_columns: the bare name `feature_columns` is only assigned inside
 #the window-size loop near the end of the notebook, so on a top-to-bottom run it does
 #not exist yet. full_feature_columns is also what lr_full later predicts with.
 X_train, X_test, y_train, y_test = train_test_split(training_data[full_feature_columns],
                                                      training_data['Target_ZRI'],
                                                      test_size = .1,
                                                      random_state = 42
                                                     )

In [None]:
 #Split for the lag-only model: same test_size and random_state as the full-feature
 #split, but restricted to the ZRI lag columns
 X_train_zri, X_test_zri, y_train_zri, y_test_zri = train_test_split(
     training_data[ZRI_feature_columns],
     training_data['Target_ZRI'],
     test_size = .1,
     random_state = 42,
 )

In [None]:
#Two separate ordinary least-squares models: lr_zri is fitted on the lag-only
#features, lr_full on the full feature set
lr_zri, lr_full = LinearRegression(), LinearRegression()

In [None]:
#Fit each model on its own training split
lr_full.fit(X_train, y_train)
lr_zri.fit(X_train_zri, y_train_zri)

In [None]:
#Lag-only model: (R^2 on the random 10% split, R^2 on the training split)
lr_zri.score(X_test_zri,y_test_zri) , lr_zri.score(X_train_zri, y_train_zri)

In [None]:
#Full-feature model: (R^2 on the random 10% split, R^2 on the training split)
lr_full.score(X_test,y_test), lr_full.score(X_train, y_train)

In [None]:
#Residuals (actual minus predicted) of the full-feature model on the held-out years
prediction_error_full = final_test_data['Target_ZRI'].sub(
    lr_full.predict(final_test_data[full_feature_columns])
)

In [None]:
#Residuals (actual minus predicted) of the lag-only model on the held-out years
prediction_error_ZRI = final_test_data['Target_ZRI'].sub(
    lr_zri.predict(final_test_data[ZRI_feature_columns])
)

In [None]:
#Summary statistics of both residual series, shown as a (full-feature, lag-only) tuple
prediction_error_full.describe(), prediction_error_ZRI.describe()

In [None]:
prediction_error = final_test_data['Target_ZRI'] - lr.predict(final_test_data[feature_columns])

In [None]:
prediction_error.describe()

In [None]:
lr.coef_, lr.alpha_

In [None]:
#Sweep over window_size with future_time fixed: for each value rebuild the ZRI table,
#refit a lag-only linear model and record, keyed by window_size,
#  num_obs       - number of rows left after dropna
#  scores        - (R^2 on the random 10% split, R^2 on the training split)
#  coefficients  - fitted coefficient per lag column
#  errors        - hold-out residuals divided by the most recent lag column
#Note: this loop rebinds ZRI_new, feature_columns, training_data, final_test_data,
#X_train/X_test/y_train/y_test and lr, so after it runs they refer to the last window size.
window_sizes = list(range(1,13))
future_time = 24
time_unit = 'Month'
test_year = 2019  #same hold-out cutoff for every window size
#Plain dicts: defaultdict() without a default factory behaves exactly like dict
num_obs = {}
errors = {}
scores = {}
coefficients = {}

for window_size in window_sizes:
    ZRI_new = ZRI_format(ZRI_MF, time_unit = time_unit,
                         window_size = window_size,
                         future_time = future_time)
    ZRI_new = ZRI_new.dropna()
    num_obs[window_size] = ZRI_new.shape[0]
    feature_columns = [x for x in ZRI_new.columns if 'minus' in x]
    training_data = ZRI_new[ZRI_new.Year < test_year]
    final_test_data = ZRI_new[ZRI_new.Year >= test_year]
    #NOTE(review): assumes ZRI_format names the most recent lag column
    #'ZRI_minus_<future_time><first letter of time_unit>' - confirm against data_setup
    most_recent_feature = f'ZRI_minus_{future_time}{time_unit[0]}'
    #Seeded like the other splits in this notebook so scores are reproducible and
    #comparable across window sizes
    X_train, X_test, y_train, y_test = train_test_split(training_data[feature_columns],
                                                     training_data['Target_ZRI'],
                                                     test_size = .1,
                                                     random_state = 42
                                                    )
    lr = LinearRegression()
    lr.fit(X_train,y_train)
    #Keep the fitted coefficients per feature (previously an empty container was stored)
    coefficients[window_size] = dict(zip(feature_columns, lr.coef_))
    scores[window_size] = (lr.score(X_test,y_test), lr.score(X_train, y_train))
    errors[window_size] = (final_test_data['Target_ZRI'] -
                           lr.predict(final_test_data[feature_columns])).div(final_test_data[most_recent_feature])
    

In [None]:
#lr still refers to the last model fitted in the loop above (the largest window_size),
#so these are that model's coefficients only
lr.coef_

In [None]:
#Distribution of the relative hold-out errors, one box per window size.
#Tick labels, axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.boxplot(list(errors.values()))
ax.set_xticklabels(list(errors.keys()))
ax.set_xlabel('window_size')
ax.set_ylabel('(Target_ZRI - prediction) / most recent ZRI lag')
ax.set_title('Relative hold-out error by window size');

In [None]:
#Same box plot after a shifted log10 transform. np.log10 is applied to each whole
#Series at once instead of element by element through nested lambdas; values are unchanged.
#NOTE(review): the +1250 shift presumably keeps the argument positive before the log -
#confirm the constant still suits errors that are now relative (divided by a ZRI lag).
plt.boxplot([np.log10(window_errors + 1250) for window_errors in errors.values()])

In [None]:
#Summary statistics of the relative errors, one column per window_size key
pd.DataFrame(errors).describe()