### Description
Multiple linear regression on a static value (a single slice of time) of the ZRI. Working with a static value allows us to use the most accurate geographic data of the ACS without doing any of our own transformations. 

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import config

#Load the google.cloud.bigquery IPython extension, which provides the %%bigquery cell magic used in the cells below
%load_ext google.cloud.bigquery

#Point the Google client libraries at the credentials location kept in the local config module,
#so that no credential path is hardcoded in the notebook itself
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS

In [None]:
%%bigquery --use_rest_api ZRI_MF
-- Load every column of the Multi_Family ZRI table; the magic stores the result in the variable ZRI_MF
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [None]:
%%bigquery --use_rest_api Zip_5yr
-- Load every column of the public ACS zip-code table (2018 5-year, per the table name);
-- the magic stores the result in the variable Zip_5yr
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2018_5yr` 

### ZIP Data
Massage data to be used in the first linear regression. 
### Note:
For this first pass, the columns from the zip code data were chosen quickly. There is much more work to be done in picking columns and performing feature engineering to find the best columns to use. 

In [None]:
#Convert geo_id column to int datatype so it can be matched against the ZRI RegionName key.
#The column is replaced by direct assignment: on pandas >= 2.0, `.loc[:, 'geo_id'] = ...`
#writes into the existing column in place, so a string (object) column would keep its
#object dtype and the conversion would not take effect.
Zip_5yr['geo_id'] = Zip_5yr['geo_id'].astype(int)

In [None]:
#Preview the first rows of the ZRI table to check which columns were loaded
ZRI_MF.head()

In [None]:
#Preview five random ACS rows; the seed is fixed so the same rows are shown on every run
Zip_5yr.sample(5, random_state=42)

In [None]:
#ACS zip-code columns carried into the analysis, one per line for easier editing
zip_columns = [
    'geo_id',
    'unemployed_pop',
    'white_pop',
    'vacant_housing_units',
    'total_pop',
    'worked_at_home',
    'poverty',
    'percent_income_spent_on_rent',
    'occupied_housing_units',
    'median_year_structure_built',
    'median_age',
    'married_households',
    'masters_degree',
    'male_pop',
    'female_pop',
    'income_per_capita',
    'housing_units',
    'employed_pop',
    'black_pop',
    'asian_pop',
    'amerindian_pop',
    'graduate_professional_degree',
]

In [None]:
#Attach the selected ACS columns to every ZRI row.
#A left join keeps all ZRI rows, including those without a matching zip code.
acs_subset = Zip_5yr[zip_columns]
static_data = ZRI_MF.merge(
    acs_subset,
    how='left',
    left_on='RegionName',
    right_on='geo_id',
)

#Turn raw counts into fractions of their respective totals
#Counts expressed as a fraction of total population
pop_columns = [
    'unemployed_pop', 'white_pop', 'masters_degree',
    'graduate_professional_degree', 'employed_pop', 'black_pop',
    'asian_pop', 'amerindian_pop', 'poverty', 'worked_at_home',
]
#Counts expressed as a fraction of total housing units
house_columns = ['vacant_housing_units', 'occupied_housing_units']

#Row-wise division by the matching total, then write the fractions back over the counts
pop_share = static_data[pop_columns].div(static_data['total_pop'], axis=0)
house_share = static_data[house_columns].div(static_data['housing_units'], axis=0)
static_data.loc[:, pop_columns] = pop_share
static_data.loc[:, house_columns] = house_share

### ZRI Data
Need to decide what 'static' value of ZRI to use. 

The first attempt uses the average over all available months. 

In [None]:
#Monthly ZRI columns are picked out by the '20' of the year in their name (e.g. '_2020_01')
zri_months = [x for x in ZRI_MF.columns if ('20' in x)]
#Static ZRI = mean over all monthly columns for each row. DataFrame.mean skips NaN by
#default, giving the same values as a per-row np.nanmean but vectorised and without a
#"Mean of empty slice" warning for rows that have no ZRI data at all (those stay NaN).
static_data['ZRI_Static'] = static_data[zri_months].mean(axis=1)

In [None]:
#Trying with different ZRI static value, with only data before 2020.
#The month list is rebuilt from ZRI_MF.columns so this cell does not depend on the
#zri_months variable left behind by another cell.
zri_months = [x for x in ZRI_MF.columns if ('20' in x) and ('2020' not in x)]
#Row-wise mean that skips NaN (DataFrame.mean default); rows with no data stay NaN
static_data['ZRI_Static'] = static_data[zri_months].mean(axis=1)

In [None]:
#Use a single month's ZRI (January 2020, per the column name) as the static value.
#When the notebook is run top to bottom this assignment is the one that remains in
#effect, replacing any ZRI_Static value computed earlier.
latest_zri = static_data['_2020_01']
static_data.loc[:, 'ZRI_Static'] = latest_zri

In [None]:
#Inspect the ZRI_Static column, which the regression cells use as the target variable
static_data.ZRI_Static

### Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
#Model inputs: the population-share columns plus three ACS summary measures.
#The target column is appended last so that features and target can be filtered together.
target_column = 'ZRI_Static'
regression_columns = pop_columns + [
    'income_per_capita',
    'percent_income_spent_on_rent',
    'median_age',
    target_column,
]
#Keep only rows in which every selected column has a value
Xy = static_data[regression_columns].dropna()
#Separate the independent variables from the target; y stays a one-column DataFrame
X = Xy.drop(columns=target_column)
y = Xy[[target_column]]
#Hold out a test set using the default split size; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
#Linear regression estimator, kept under its own name as well as inside the pipeline
lr = LinearRegression()
#Pipeline: standardise the features first, then fit the linear regression on the scaled values
static_model = make_pipeline(
    StandardScaler(),
    lr,
)

In [None]:
#Fit the pipeline (scaler, then linear regression) on the training split only
static_model.fit(X_train,y_train)

In [None]:
#Score the fitted pipeline on both splits (R^2 for a regression pipeline);
#a large gap between the two values would indicate overfitting
train_score = static_model.score(X_train, y_train)
test_score = static_model.score(X_test, y_test)
train_score, test_score

In [None]:
#Pair every feature name with its fitted coefficient, sorted from most negative to most positive.
#y was passed as a one-column DataFrame, so coef_ is 2-D and row 0 holds the coefficients.
fitted_lr = static_model.named_steps.linearregression
coefficients = pd.Series(dict(zip(X_train.columns, fitted_lr.coef_[0]))).sort_values()

#Bar chart with explicit figure/axes, a title and axis labels so the figure stands on its own.
#Features were standardised in the pipeline, so each bar is the change in ZRI_Static
#per one standard deviation of that feature.
fig, ax = plt.subplots()
coefficients.plot.bar(ax=ax)
ax.set_title('Linear regression coefficients for ZRI_Static')
ax.set_xlabel('Feature')
ax.set_ylabel('Coefficient (ZRI change per 1 SD of feature)')
#Trailing semicolon suppresses the text repr of the last expression
fig.tight_layout();