## imports and options

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import bigquery
import warnings

In [4]:
# Silence the notice raised when the session authenticates with end-user credentials.
warnings.filterwarnings(
    action="ignore",
    message="Your application has authenticated using end user credentials",
)

# Allow up to 100 rows in a rendered frame before pandas truncates it.
pd.set_option("display.max_rows", 100)

# BigQuery client shared by the query cells in this notebook.
client = bigquery.Client()

## SQL queries

# ZCTA-level demographics: join race counts from the public ACS 2017 5-year
# table to age-bracket counts from a project table, keyed on geo_id = zcta5.
#
# Fix: a comma was missing after total_20_to_24, which made SQL read
# `total_20_to_24 total_25_to_29` as "total_20_to_24 AS total_25_to_29" and so
# returned the 20-24 counts under the 25-29 name while dropping the real column.
# The string is no longer an f-string because nothing is interpolated.
#
# NOTE(review): the age list skips total_35_to_39 and every bracket below 10;
# confirm against the table schema whether that is intentional.
sql_query = """
WITH race_data AS
    (SELECT geo_id, total_pop, black_pop
    FROM `bigquery-public-data.census_bureau_acs.zcta5_2017_5yr`),
    age_data AS
    (SELECT zcta5, total AS total_pop, total_10_to_14, total_15_to_19, total_20_to_24,
    total_25_to_29, total_30_to_34, total_40_to_44, total_45_to_49, total_50_to_54, total_55_to_59,
    total_60_to_64, total_65_to_69, total_70_to_74, total_75_to_79, total_80_to_84, total_85_and_over
    FROM `w2ohcwork.census.acs_2017_5yr_s0101_zcta5_population_age_sex`)
SELECT age_data.*, race_data.black_pop
FROM race_data
JOIN age_data ON race_data.geo_id = age_data.zcta5
"""
# Run the query and materialise the result as a DataFrame.
res = client.query(sql_query)
demo_df = res.to_dataframe()

In [6]:
# County-level population totals from the public ACS 2018 5-year table;
# geo_id is returned under the name `fips`. This rebinds `demo_df`.
sql_query = """
SELECT geo_id as fips, total_pop, black_pop
FROM `bigquery-public-data.census_bureau_acs.county_2018_5yr`
"""
res = client.query(sql_query)
demo_df = res.to_dataframe()
demo_df.head()

Unnamed: 0,fips,total_pop,black_pop
0,35039,39307.0,164.0
1,72133,22066.0,0.0
2,72043,39265.0,8.0
3,72151,34149.0,0.0
4,72071,42420.0,98.0


In [15]:
# Cast the county FIPS key to int64 so it can be joined against integer keys,
# then show the resulting dtypes. The dtypes expression must be the last line
# of the cell, otherwise the notebook evaluates it and discards the result.
demo_df["fips"] = demo_df["fips"].astype(np.int64)
demo_df.dtypes

In [9]:
# Load the county-to-ZIP crosswalk workbook from the local data folder.
# A forward-slash relative path is used because the previous backslash form
# only resolves on Windows; forward slashes work on every platform.
fips_zip = pd.read_excel('data/COUNTY_ZIP_032018.xlsx')
fips_zip.head()

Unnamed: 0,county,zip,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,72001,601,1.0,1.0,1.0,1.0
1,72093,606,1.0,1.0,1.0,1.0
2,72013,688,0.096351,0.037037,0.008299,0.090974
3,72013,616,0.090904,0.006119,0.014523,0.083784
4,72013,650,0.018933,0.001288,0.008299,0.017511


In [10]:
# Inspect the column dtypes of the county/ZIP crosswalk.
fips_zip.dtypes

county         int64
zip            int64
res_ratio    float64
bus_ratio    float64
oth_ratio    float64
tot_ratio    float64
dtype: object

In [17]:
# Reduce the crosswalk to its two key columns, then left-join it onto the
# county demographics (fips -> county). Each county row is repeated once per
# matching ZIP, carrying the same county-level totals.
fips_zip = fips_zip.loc[:, ['county', 'zip']]
demo_df = pd.merge(demo_df, fips_zip, how='left', left_on='fips', right_on='county')

## ZIP/ZCTA Crosswalk

In [None]:
# Load the ZIP-to-ZCTA crosswalk workbook from the local data folder.
# Forward slashes keep the relative path valid on non-Windows systems too.
zip_zcta = pd.read_excel("data/Zip_to_zcta_crosswalk_2020.xlsx")
# Preview the first rows instead of rendering the whole frame.
zip_zcta.head()

In [None]:
# Frequency of each ZIP_TYPE value in the crosswalk.
zip_zcta["ZIP_TYPE"].value_counts()

In [None]:
# Frequency of each zip_join_type value in the crosswalk.
zip_zcta["zip_join_type"].value_counts()

In [None]:
# Number of distinct ZCTA values present in the crosswalk.
len(zip_zcta["ZCTA"].unique())

In [None]:
# Inspect the column dtypes of the ZIP/ZCTA crosswalk.
zip_zcta.dtypes

In [None]:
# Remove ZIPs flagged 'No ZCTA', then cast the remaining ZCTA codes to int64.
# The explicit .copy() detaches the filtered frame from the original, so the
# column assignment below writes to an independent object and cannot trigger
# pandas' SettingWithCopyWarning.
zip_zcta = zip_zcta[zip_zcta["ZCTA"] != 'No ZCTA'].copy()
zip_zcta["ZCTA"] = zip_zcta["ZCTA"].astype(np.int64)

## Demographics Table

In [None]:
# Load the ZCTA-level ACS demographics extract from the local data folder.
# Forward slashes keep the relative path valid on non-Windows systems too.
# NOTE(review): this rebinds `demo_df`, replacing whatever frame earlier cells
# stored under that name; confirm which pipeline (county or ZCTA) is intended.
demo_df = pd.read_csv('data/acs_zcta5_2017_5yr.csv')
demo_df.head()

In [7]:
# Inspect the column dtypes of the demographics frame.
demo_df.dtypes

fips          object
total_pop    float64
black_pop    float64
dtype: object

In [8]:
# Per-column flag: does the demographics frame contain any missing value?
demo_df.isna().any(axis="index")

fips         False
total_pop    False
black_pop    False
dtype: bool

In [None]:
# Store the Black population counts as 64-bit integers.
demo_df["black_pop"] = demo_df["black_pop"].astype(np.int64)

In [None]:
# Summary statistics for the numeric columns of the demographics frame.
demo_df.describe()

## Join zip_zcta and demo_df

In [None]:
# Left-join the ZIP/ZCTA crosswalk onto the demographics (zcta5 -> ZCTA),
# then discard the right-hand key column, which is redundant after the join.
demo_zip_df = pd.merge(
    demo_df, zip_zcta, how='left', left_on='zcta5', right_on='ZCTA'
)
demo_zip_df = demo_zip_df.drop(columns=['ZCTA'])

demo_zip_df.head()

In [None]:
# Inspect the column dtypes of the joined demographics/crosswalk frame.
demo_zip_df.dtypes

In [None]:
# Per-column flag: does the joined frame contain any missing value?
demo_zip_df.isna().any(axis="index")

In [None]:
# Count the rows that did not receive a ZIP code from the crosswalk join.
demo_zip_df["ZIP_CODE"].isna().sum()

In [None]:
# Discard every row with a missing value in any column, then store the
# ZIP codes as 64-bit integers.
demo_zip_df = demo_zip_df.dropna(axis=0, how='any')
demo_zip_df["ZIP_CODE"] = demo_zip_df["ZIP_CODE"].astype(np.int64)

In [None]:
# Number of distinct ZIP codes left in the joined frame.
len(demo_zip_df["ZIP_CODE"].unique())

In [None]:
# Number of distinct ZCTAs left in the joined frame.
len(demo_zip_df["zcta5"].unique())

## Vaccine Hesitancy Table

In [18]:
# Load the ZIP-level vaccine hesitancy predictions from the local data folder.
# Forward slashes keep the relative path valid on non-Windows systems too.
vax_df = pd.read_csv("data/vaccine_hesitancy_by_zip.csv")
vax_df.head()

Unnamed: 0,week,start_date,end_date,zip_code,vaccine_measure_id,final_zip_pred,state_name,county_name,vaccine_measure_name,definition
0,26,6/25/2021,7/1/2021,10001,1,0.043939,New York,New York County,high_vaccine_potential,yes probably will and no probably wont respond...
1,26,6/25/2021,7/1/2021,10002,1,0.062324,New York,New York County,high_vaccine_potential,yes probably will and no probably wont respond...
2,26,6/25/2021,7/1/2021,10003,1,0.024404,New York,New York County,high_vaccine_potential,yes probably will and no probably wont respond...
3,26,6/25/2021,7/1/2021,10004,1,0.036839,New York,New York County,high_vaccine_potential,yes probably will and no probably wont respond...
4,26,6/25/2021,7/1/2021,10005,1,0.03694,New York,New York County,high_vaccine_potential,yes probably will and no probably wont respond...


In [None]:
# Exclude the all_priority results (vaccine_measure_id 2); this choice may be
# revisited later.
vax_df = vax_df.loc[vax_df["vaccine_measure_id"] != 2]

In [20]:
# Drop the reporting-period and measure-description columns, leaving the ZIP,
# the prediction, and the state/county names.
vax_df = vax_df.drop(
    labels=[
        'week',
        'start_date',
        'end_date',
        'vaccine_measure_id',
        'vaccine_measure_name',
        'definition',
    ],
    axis=1,
)
vax_df.head()

Unnamed: 0,zip_code,final_zip_pred,state_name,county_name
0,10001,0.043939,New York,New York County
1,10002,0.062324,New York,New York County
2,10003,0.024404,New York,New York County
3,10004,0.036839,New York,New York County
4,10005,0.03694,New York,New York County


In [21]:
# Inspect the column dtypes of the trimmed hesitancy frame.
vax_df.dtypes

zip_code            int64
final_zip_pred    float64
state_name         object
county_name        object
dtype: object

In [None]:
# Per-column flag: does the hesitancy frame contain any missing value?
vax_df.isna().any(axis="index")

In [None]:
# Number of distinct ZIP codes covered by the hesitancy data.
len(vax_df["zip_code"].unique())

In [None]:
# Distribution summary of the ZIP-level prediction values.
vax_df["final_zip_pred"].describe()

## Join the hesitancy and demographic tables

In [22]:
# Preview the demographics frame before joining it to the hesitancy data.
demo_df.head()

Unnamed: 0,fips,total_pop,black_pop,county,zip
0,35039,39307.0,164.0,35039,87527
1,35039,39307.0,164.0,35039,87046
2,35039,39307.0,164.0,35039,87520
3,35039,39307.0,164.0,35039,87017
4,35039,39307.0,164.0,35039,87522


In [29]:
# Left-join the hesitancy predictions onto the demographics (zip -> zip_code).
# Demographic rows with no matching ZIP keep NaN in the hesitancy columns.
# NOTE(review): a ZIP that appears on several hesitancy rows is repeated once
# per match, so the result can hold multiple rows per ZIP - confirm intended.
df = pd.merge(demo_df, vax_df, how='left', left_on='zip', right_on='zip_code')
df.head()

Unnamed: 0,fips,total_pop,black_pop,county,zip,zip_code,final_zip_pred,state_name,county_name
0,35039,39307.0,164.0,35039,87527,,,,
1,35039,39307.0,164.0,35039,87046,87046.0,0.037686,New Mexico,Rio Arriba County
2,35039,39307.0,164.0,35039,87046,87046.0,0.037686,New Mexico,Sandoval County
3,35039,39307.0,164.0,35039,87046,87046.0,0.08875,New Mexico,Rio Arriba County
4,35039,39307.0,164.0,35039,87046,87046.0,0.08875,New Mexico,Sandoval County


In [33]:
# Inspect the column dtypes of the joined demographics/hesitancy frame.
df.dtypes

fips                int64
total_pop         float64
black_pop         float64
county              int64
zip                 int64
zip_code          float64
final_zip_pred    float64
state_name         object
county_name        object
dtype: object

In [30]:
# Per-column flag: does the joined frame contain any missing value?
df.isna().any(axis="index")

fips              False
total_pop         False
black_pop         False
county            False
zip               False
zip_code           True
final_zip_pred     True
state_name         True
county_name        True
dtype: bool

In [32]:
# Keep only rows that are complete in every column; rows left unmatched by the
# left join (NaN hesitancy fields) are removed here.
df = df.dropna(axis=0, how='any')

In [39]:
# Store both population counts as 64-bit integers, then drop the `county`
# column and preview the result.
df["total_pop"] = df["total_pop"].astype(np.int64)
df["black_pop"] = df["black_pop"].astype(np.int64)
df = df.drop(labels=['county'], axis=1)
df.head()

Unnamed: 0,fips,total_pop,black_pop,zip,final_zip_pred,state_name,county_name
1,35039,39307,164,87046,0.037686,New Mexico,Rio Arriba County
2,35039,39307,164,87046,0.037686,New Mexico,Sandoval County
3,35039,39307,164,87046,0.08875,New Mexico,Rio Arriba County
4,35039,39307,164,87046,0.08875,New Mexico,Sandoval County
7,35039,39307,164,87522,0.063541,New Mexico,Rio Arriba County


In [38]:
# Number of distinct ZIP codes in the joined frame.
len(df["zip"].unique())

32858

In [41]:
# Persist the county-keyed demographics/hesitancy table without the index.
# Forward slashes keep the relative path valid on non-Windows systems too.
df.to_csv('data/demo_hesitancy_by_fips.csv', index=False)

## Exploration and cleaning

In [None]:
# Inspect the column dtypes before cleaning.
df.dtypes

In [None]:
# Per-column flag: does the frame contain any missing value?
df.isna().any(axis="index")

In [None]:
# Count the rows whose black_pop value is missing.
df["black_pop"].isna().sum()

In [None]:
# Collect every row that has a missing value in at least one column and show it.
df_na = df.loc[df.isna().any(axis="columns")]
df_na

In [None]:
# The rows with missing values were checked against other population data and
# all turned out to be 0, so they are dropped; the count confirms none remain.
df = df.dropna(axis=0, how='any')
df["black_pop"].isna().sum()

In [None]:
# Rows reporting a Black population of 0 are removed as well; show how many
# rows are left.
df = df.loc[df["black_pop"] != 0]
df.shape[0]

In [None]:
# Show every row whose zip_code occurs more than once (all occurrences are
# flagged), ordered by zip_code so duplicates sit next to each other.
df.loc[df.duplicated(subset=['zip_code'], keep=False)].sort_values(by='zip_code')

In [None]:
# Rows sharing a zip_code appeared to carry the same final_zip_pred value, so
# only the first occurrence of each zip_code is kept; show the new row count.
df = df.drop_duplicates(subset='zip_code', keep='first')
len(df)

In [None]:
# Distribution summary of the Black population counts.
df["black_pop"].describe()

In [None]:
# Distribution summary of the ZIP-level prediction values.
df["final_zip_pred"].describe()

In [None]:
# Sanity check: the total should come out below 328,200,000.
# NOTE(review): if `df` carries county totals repeated on each ZIP row, this
# sum counts the same people several times - confirm which frame is in scope.
df["total_pop"].sum()

In [None]:
# Sanity check: expected to be roughly 12-13% of the actual population
# (328.2 million).
df["black_pop"].sum()

In [None]:
# Number of distinct ZIP codes remaining after cleaning.
len(df["zip_code"].unique())

In [None]:
# Share of each row's total population that is Black (a fraction, not x100).
df["percent_black"] = df["black_pop"].div(df["total_pop"])

In [None]:
# Persist the cleaned ZIP-level demographics/hesitancy table without the index.
# Forward slashes keep the relative path valid on non-Windows systems too.
df.to_csv('data/demo_hesitancy_by_zip.csv', index=False)