In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
from data_setup import ZRI_format

In [None]:
#Load Data
#Each pickle below is read in the order listed and bound to the name noted beside it.
#NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
ZRI_MF, ACS, crime, dominant_county, weather = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './pickles/ZRI_filtered.p',         # -> ZRI_MF
        './acs_data/ACS.p',                 # -> ACS
        './pickles/crime.p',                # -> crime
        './pickles/dominant_county_zip.p',  # -> dominant_county
        './pickles/weather.p',              # -> weather
    )
]

In [None]:
%%time
ZRI_new = ZRI_format(ZRI_MF, time_unit = 'Month', window_size = 6, future_time = 36)

In [None]:
#Display the object returned by ZRI_format for a visual sanity check.
ZRI_new

In [None]:
#Number of years added to an ACS survey year to get the year its figures are treated as available.
#This cell previously set acs_lag = 1 but never used it and hard-coded "+ 2" instead; the value that
#was actually applied (2) is kept here, so the downstream merge is unchanged.
#NOTE(review): confirm whether 1 or 2 is the intended lag.
acs_lag = 2
#year_avail is stored as a string; it is used later as a merge key next to Predict_Year.
ACS = ACS.assign(year_avail = (ACS.year.astype(int) + acs_lag).astype(str))

In [None]:
#Left-join the ACS rows onto the ZRI table: ZipCode is matched with geo_id and
#Predict_Year with year_avail. Being a left join, every row of ZRI_new is kept.
ZRI = ZRI_new.merge(
    ACS,
    how='left',
    left_on=['ZipCode', 'Predict_Year'],
    right_on=['geo_id', 'year_avail'],
)

In [None]:
#Attach each zip code's dominant county, then left-join that county's crime rate.
#NOTE: this cell rebinds ZRI from itself, so running it twice merges the crime column a second time.
county_of_zip = ZRI.ZipCode.apply(lambda zip_code: dominant_county[zip_code])
ZRI = ZRI.assign(dominant_county = county_of_zip)
county_crime = crime[['crime_rate_per_100000','county_fips_code']]
ZRI = ZRI.merge(county_crime, how = 'left',
                left_on = 'dominant_county', right_on = 'county_fips_code')
#The join key from the crime table duplicates dominant_county, so it is removed again.
ZRI = ZRI.drop(columns = 'county_fips_code')


In [None]:
#Columns to use in the final analysis
#(geo_id is the key the ACS table is merged on; the remaining entries are ACS columns.)
zip_columns = [
    'geo_id',
    'unemployed_pop',
    'white_pop',
    'vacant_housing_units',
    'total_pop',
    'worked_at_home',
    'poverty',
    'percent_income_spent_on_rent',
    'occupied_housing_units',
    'median_year_structure_built',
    'median_age',
    'married_households',
    'masters_degree',
    'male_pop',
    'female_pop',
    'income_per_capita',
    'housing_units',
    'employed_pop',
    'black_pop',
    'asian_pop',
    'amerindian_pop',
    'graduate_professional_degree',
]

In [None]:
#Convert columns to percentage
#NOTE: this cell overwrites the count columns of ZRI with ratios, so it is not idempotent -
#running it a second time divides the ratios again. Re-run the merge cells first if needed.

#Columns to divide by total population
pop_columns = ['unemployed_pop','white_pop','masters_degree',
               'graduate_professional_degree','employed_pop','black_pop',
               'asian_pop','amerindian_pop','poverty','worked_at_home']

#Columns to divide by total housing units
house_columns = ['vacant_housing_units','occupied_housing_units']

#Division
#The columns are replaced outright (ZRI[cols] = ...) instead of written through ZRI.loc[:, cols]:
#.loc tries to store the values in the existing columns, so integer count columns would have to be
#upcast to hold the float ratios, which recent pandas versions deprecate.
ZRI[pop_columns] = ZRI[pop_columns].div(ZRI['total_pop'], axis = 0)
ZRI[house_columns] = ZRI[house_columns].div(ZRI['housing_units'], axis = 0)

In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from collections import defaultdict

In [None]:
#Find feature columns
#Features = every ZRI column whose name contains 'minus' + the ACS columns + the county crime rate.
#geo_id is the key the ACS table is merged on (an identifier, not a measurement), so it is
#left out of the predictors; zip_columns itself is not modified.
full_feature_columns = [x for x in ZRI.columns if 'minus' in x] +\
                       [col for col in zip_columns if col != 'geo_id'] +\
                       ['crime_rate_per_100000']

In [None]:
#Feature subset restricted to the ZRI columns whose name contains 'minus'.
ZRI_feature_columns = list(filter(lambda column_name: 'minus' in column_name, ZRI.columns))

In [None]:
#Train test split: rows from test_year onward are held out as test data
test_year = 2019
#Keep only the model inputs, the target and the split key, then discard incomplete rows.
model_columns = full_feature_columns + ['Target_ZRI','Year']
data_4_model = ZRI.loc[:, model_columns].dropna()
#Rows before test_year are used for training; rows at or after it for testing.
in_training_period = data_4_model['Year'] < test_year
in_test_period = data_4_model['Year'] >= test_year
training_data = data_4_model[in_training_period]
test_data = data_4_model[in_test_period]

In [None]:
#Display the feature columns of the modelling table for a visual check.
data_4_model[full_feature_columns]

In [None]:
#Training inputs (all feature columns) and training target.
X_train_full = training_data[full_feature_columns]
y_train_full = training_data['Target_ZRI']

In [None]:
#Held-out inputs (all feature columns) and held-out target.
X_test_full = test_data[full_feature_columns]
y_test_full = test_data['Target_ZRI']

In [None]:
#Seed for the random forest, so a Restart & Run All reproduces the same scores and importances.
RF_RANDOM_STATE = 42

#Lasso: alpha is tuned by a grid search that sits behind a StandardScaler step.
#NOTE(review): the scaler is fitted on the whole training set before the grid search makes its
#internal CV splits; the structure is kept because later cells read named_steps.gridsearchcv.
lasso_params = {'alpha': [0.1,1,2,3,4,5]}
lasso_grid = GridSearchCV(Lasso(), param_grid=lasso_params)
lasso_model = make_pipeline(StandardScaler(),lasso_grid)

#Random forest: grid search over an unrestricted depth (None) versus a depth cap of 10.
rf_params = {'max_depth' : [None,10]}
rf_model = GridSearchCV(RandomForestRegressor(n_jobs = -1, random_state = RF_RANDOM_STATE),
                        param_grid= rf_params)

In [None]:
#Run the random-forest grid search on the training split (the fitted search object is the cell output).
rf_model.fit(X = X_train_full, y = y_train_full)

In [None]:
#Importance of every training column according to the best forest found, sorted ascending.
rf_coef_importance = pd.Series(
    rf_model.best_estimator_.feature_importances_,
    index = X_train_full.columns,
).sort_values()

In [None]:
#Random-forest scores, shown as a (test, train) tuple.
(
    rf_model.score(X_test_full, y_test_full),    # held-out split
    rf_model.score(X_train_full, y_train_full),  # training split
)

In [None]:
#Bar chart of the forest importances, leaving out the 'minus' columns and geo_id.
rf_plot_features = [
    name for name in X_train_full.columns
    if not ('minus' in name or name == 'geo_id')
]
rf_coef_importance.loc[rf_plot_features].sort_values().plot(kind = 'bar')

In [None]:
#Fit the scaler + Lasso grid-search pipeline on the training split (the fitted pipeline is the cell output).
lasso_model.fit(X = X_train_full, y = y_train_full)

In [None]:
#Coefficient of every training column in the best Lasso found by the grid search, sorted ascending.
best_lasso = lasso_model.named_steps['gridsearchcv'].best_estimator_
coefficients = pd.Series(best_lasso.coef_, index = X_train_full.columns).sort_values()

In [None]:
#Bar chart of the Lasso coefficients, leaving out the 'minus' columns and geo_id.
lasso_plot_features = [
    name for name in X_train_full.columns
    if not ('minus' in name or name == 'geo_id')
]
coefficients.loc[lasso_plot_features].sort_values().plot(kind='bar')

In [None]:
#Lasso pipeline scores, shown as a (test, train) tuple.
(
    lasso_model.score(X_test_full, y_test_full),    # held-out split
    lasso_model.score(X_train_full, y_train_full),  # training split
)

In [None]:
#Residuals: actual Target_ZRI minus lr_full's prediction from full_feature_columns.
#NOTE(review): final_test_data and lr_full are not defined in any cell visible in this notebook;
#on a fresh kernel this cell raises NameError unless they are created elsewhere - confirm or replace.
prediction_error_full = final_test_data['Target_ZRI'] - lr_full.predict(final_test_data[full_feature_columns])

In [None]:
#Residuals: actual Target_ZRI minus lr_zri's prediction from ZRI_feature_columns only.
#NOTE(review): final_test_data and lr_zri are not defined in any cell visible in this notebook;
#on a fresh kernel this cell raises NameError unless they are created elsewhere - confirm or replace.
prediction_error_ZRI = final_test_data['Target_ZRI'] - lr_zri.predict(final_test_data[ZRI_feature_columns])

In [None]:
#Show the describe() summaries of both residual sets as a tuple (full features, ZRI-only features).
prediction_error_full.describe(), prediction_error_ZRI.describe()

In [None]:
#Residuals: actual Target_ZRI minus lr's prediction from feature_columns.
#NOTE(review): final_test_data, lr and feature_columns are not defined in any cell visible in this
#notebook; on a fresh kernel this cell raises NameError unless they are created elsewhere - confirm.
prediction_error = final_test_data['Target_ZRI'] - lr.predict(final_test_data[feature_columns])

In [None]:
#Show the describe() summary of the residuals computed in the previous cell.
prediction_error.describe()

In [None]:
#Show lr's fitted coefficients together with its alpha_ attribute.
#NOTE(review): lr is not defined in any visible cell; the alpha_ attribute suggests a cross-validated
#regularised model (e.g. the imported RidgeCV) - confirm where it is fitted.
lr.coef_, lr.alpha_

In [None]:
#Show lr's fitted coefficients on their own.
#NOTE(review): lr is not defined in any visible cell - confirm where it is fitted.
lr.coef_

In [None]:
#Box plot of the values held in the errors mapping.
#NOTE(review): errors is not defined in any visible cell (presumably a dict/defaultdict of
#residual collections, given .values()) - confirm where it is populated.
plt.boxplot(errors.values())

In [None]:
#Summary statistics of errors after loading it into a DataFrame.
#NOTE(review): errors is not defined in any visible cell - confirm where it is populated.
pd.DataFrame(errors).describe()