In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import config

In [None]:
# Register the BigQuery IPython extension, which provides the %%bigquery cell magic used below
%load_ext google.cloud.bigquery

In [None]:
# Export the credentials path taken from the imported config module as the
# GOOGLE_APPLICATION_CREDENTIALS environment variable, so the path itself is not written in this notebook
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS

In [None]:
%%bigquery --use_rest_api ZRI_MF
-- Load every column of the Multi_Family table; the result is stored in the notebook variable ZRI_MF
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [None]:
# Preview the first five rows of the loaded table
ZRI_MF.head(n=5)

In [None]:
# Names of the ZRI time-series columns: every column whose name contains '20'
# (presumably the year part of a date-like column name -- verify against the table schema)
year_columns = [column_name for column_name in ZRI_MF.columns if '20' in column_name]

In [None]:
# Plot the ZRI history of the 50 largest zip codes (SizeRank 1 through 50), one line per zip code.
# Selecting the ranks in a single inclusive filter includes rank 50 (range(1, 50) stopped at 49)
# and simply skips ranks that are absent from the table instead of trying to plot an empty frame.
TOP_N = 50
fig, ax = plt.subplots()
top_regions = ZRI_MF[ZRI_MF.SizeRank.between(1, TOP_N)].sort_values('SizeRank')
# Transpose so that time runs along the x-axis and each region becomes its own line;
# the legend is suppressed because dozens of entries would bury the plot.
top_regions.loc[:, year_columns].transpose().plot(kind='line', ax=ax, legend=False)
ax.set(title='ZRI over time for the %d largest zip codes' % TOP_N, xlabel='Month', ylabel='ZRI');

Next steps:
Use various time-series analysis techniques to forecast ZRI based purely on its own history.


In [None]:
from sklearn.linear_model import LinearRegression
# Single shared model instance: lregress() refits it on every call, so after a
# DataFrame.apply it only holds the fit of the last row processed.
lr = LinearRegression()

In [None]:
def lregress(row):
    '''
    Fits a linear trend to one region's (row) ZRI over time.

    The last non-missing month is held out as a test point; the line is fit on
    all earlier non-missing months, using each month's position in year_columns
    as the x value. Relies on the notebook-level year_columns list and on the
    shared lr model, which is refit on every call.

    Outputs a 4-tuple (slope, intercept, score, error):
        slope, intercept -- coefficients of the fitted line
        score            -- R^2 of the fit on the training months (lr.score; not an MSE)
        error            -- predicted minus actual ZRI for the held-out month
    Four Nones are returned when the row has fewer than 3 non-missing months.
    '''
    years = row[year_columns].reset_index().dropna()
    # Need at least two training points plus the held-out one
    if years.shape[0] < 3:
        return (None, None, None, None)
    # dropna() keeps the original positional index, so months that were missing
    # remain gaps on the x-axis instead of being squeezed together
    positions = np.asarray(years.index).reshape(-1, 1)
    # The row may arrive with object dtype when the frame mixes column types,
    # so make the target explicitly numeric before fitting
    values = years.iloc[:, 1].values.astype(float)
    X, y = positions[:-1], values[:-1]
    X_test, y_test = positions[-1:], values[-1]
    lr.fit(X, y)
    test_residual = lr.predict(X_test)[0] - y_test
    return (lr.coef_[0], lr.intercept_, lr.score(X, y), test_residual)

In [None]:
# Fit the per-region trend: lregress is called once per row and yields one
# (slope, intercept, score, error) tuple for each region
lr_data = ZRI_MF.apply(lregress, axis='columns')

In [None]:
# Unpack the lregress result tuples into one dataframe column per tuple position
for position, column_name in enumerate(('slope', 'intercept', 'score', 'error')):
    ZRI_MF[column_name] = [result[position] for result in lr_data]

In [None]:
# Summary statistics of the per-region regression outputs
ZRI_MF.loc[:, ['slope','intercept','score','error']].describe()