## Data Exploration
To interact with this notebook, run `jupyter notebook viz_prod.ipynb` from the command line.

In [1]:
import pandas as pd
import build_db
from core.data.socrata import soda_data, socrata_api_requests
from core.data import dbclient, daily_case_data_by_zip
from core.data.groundtruth import process_ground_truth_data

In [2]:
# Instantiate the project's DB client, then ask SQLite's schema catalogue
# (sqlite_master) for the name of every table and print the result.
db = dbclient.DBClient()
table_listing_sql = "SELECT name FROM sqlite_master WHERE type='table';"
db.cursor.execute(table_listing_sql)
table_names = db.cursor.fetchall()
print(table_names)

[('VACCINATIONS',), ('DAILY_COVID_CASE_DATA',), ('DAILY_FOOT_TRAFFIC_DATA',), ('TRAFFIC_CRASH_DATA',), ('DEMOGRAPHICS',)]


### Daily Case Data

In [3]:
# Read every row of the table named by build_db.CASE_TBL into a DataFrame,
# echoing the SQL first so the reader can see exactly what was executed.
query = "select * from {}".format(build_db.CASE_TBL)
print("query =", query)
covid_case_df = pd.read_sql_query(sql=query, con=db.conn)
# Summary statistics across all columns (numeric and non-numeric).
covid_case_df.describe(include="all")

query = select * from DAILY_COVID_CASE_DATA


Unnamed: 0,index,STD_DATE,ZIPCODE,confirmed_cases,confirmed_cases_change,total_tested,total_tested_change,AVG7DAY_confirmed_cases,AVG7DAY_total_tested
count,324519.0,324519,324416.0,324519.0,324519.0,324519.0,324519.0,315758.0,315758.0
unique,,317,1445.0,6760.0,194.0,38665.0,1755.0,,
top,,2021-03-07 00:00:00,60630.0,6.0,0.0,147.0,0.0,,
freq,,1429,317.0,8950.0,144271.0,334.0,27164.0,,
mean,162259.0,,,,,,,469.053303,6228.459485
std,93680.710341,,,,,,,981.12657,17667.067157
min,0.0,,,,,,,6.0,10.428571
25%,81129.5,,,,,,,25.714286,383.142857
50%,162259.0,,,,,,,98.0,1219.0
75%,243388.5,,,,,,,439.428571,5647.571429


### Vaccination Data

In [4]:
# Read every row of the table named by build_db.VACC_TBL into a DataFrame,
# echoing the SQL first so the reader can see exactly what was executed.
query = "select * from {}".format(build_db.VACC_TBL)
print("query =", query)
vacc_df = pd.read_sql_query(sql=query, con=db.conn)
# Summary statistics across all columns (numeric and non-numeric).
vacc_df.describe(include="all")

query = select * from VACCINATIONS


Unnamed: 0,index,ZIPCODE,STD_DATE,total_doses_daily,total_doses_cumulative,vaccine_series_completed_daily,vaccine_series_completed_percent_population,population,AVG7DAY_total_doses_daily,AVG7DAY_vaccine_series_completed_daily
count,5000.0,4917.0,5000,5000.0,5000.0,5000.0,5000.0,5000.0,4563.0,4563.0
unique,,59.0,85,,,,,,,
top,,60640.0,2021-03-05 00:00:00,,,,,,,
freq,,85.0,60,,,,,,,
mean,2499.5,,,129.7286,3883.3802,45.2486,0.027418,46186.8794,133.371998,46.083905
std,1443.520003,,,144.694755,4568.824501,65.568638,0.039322,26539.641677,120.48442,54.03017
min,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1249.75,,,20.0,459.0,0.0,0.0,28569.0,36.5,2.428571
50%,2499.5,,,81.0,1960.5,17.0,0.011,46591.0,100.285714,25.857143
75%,3749.25,,,195.0,5978.0,63.0,0.04,67711.0,202.714286,68.928571


### Foot Traffic Data

In [5]:
# Read every row of the table named by build_db.FOOT_TRAFF_TBL into a
# DataFrame, echoing the SQL first so the reader can see what was executed.
query = "select * from {}".format(build_db.FOOT_TRAFF_TBL)
print("query =", query)
foot_traffic_df = pd.read_sql_query(sql=query, con=db.conn)
# Summary statistics across all columns (numeric and non-numeric).
foot_traffic_df.describe(include="all")

query = select * from DAILY_FOOT_TRAFFIC_DATA


Unnamed: 0,index,STD_DATE,ZIPCODE,AIRPORTS_TRANSIT_CENTERS,BARS,BEAUTY_WELLNESS,FITNESS_CENTERS,GROCERY,MASS_MERCH,MEDICAL_CENTERS_HOSPITALS,...,RESTAURANT,RETAIL,SCHOOLS_LIBRARIES,SHOPPING_CENTERS_MALLS,TOURIST_ATTRACTIONS,AVG7DAY_BARS,AVG7DAY_GROCERY,AVG7DAY_RESTAURANT,AVG7DAY_PARKS_BEACHES,AVG7DAY_SCHOOLS_LIBRARIES
count,24360.0,24360,24360.0,15960.0,23940.0,24360.0,21000.0,15960.0,15540.0,23940.0,...,23520.0,18480.0,23520.0,21000.0,18480.0,23598.0,15732.0,23184.0,21528.0,23184.0
unique,,420,58.0,,,,,,,,...,,,,,,,,,,
top,,2020-04-01 00:00:00,60661.0,,,,,,,,...,,,,,,,,,,
freq,,58,420.0,,,,,,,,...,,,,,,,,,,
mean,12179.5,,,50.05589,55.199165,64.894048,57.042762,84.678133,61.462162,62.712114,...,91.944005,59.928842,53.697279,65.139333,57.546158,54.925835,84.644355,91.696114,78.781082,53.345349
std,7032.270615,,,57.201082,38.19965,28.01634,50.879086,33.032386,36.647893,29.982223,...,258.186658,40.681179,40.434988,28.709798,50.413012,32.797114,29.828784,245.813491,97.942508,33.411937
min,0.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.428571,0.0
25%,6089.75,,,20.0,30.0,48.0,31.0,63.0,40.0,42.0,...,47.0,30.0,28.0,46.0,23.0,32.714286,64.571429,49.428571,44.857143,30.285714
50%,12179.5,,,33.0,47.0,63.0,45.0,81.0,63.0,59.0,...,68.0,57.0,48.0,64.0,50.0,47.857143,82.428571,69.142857,66.571429,48.428571
75%,18269.25,,,65.0,72.0,81.0,69.0,102.0,84.0,77.0,...,90.0,83.0,68.0,81.0,82.0,71.857143,102.142857,90.285714,91.571429,67.464286


### Traffic Crash Data

In [6]:
# Read every row of the table named by build_db.CRASHES_TBL into a DataFrame,
# echoing the SQL first so the reader can see exactly what was executed.
query = "select * from {}".format(build_db.CRASHES_TBL)
print("query =", query)
crashes_df = pd.read_sql_query(sql=query, con=db.conn)
# Summary statistics across all columns (numeric and non-numeric).
crashes_df.describe(include="all")

query = select * from TRAFFIC_CRASH_DATA


Unnamed: 0,index,STD_DATE,ZIPCODE,crash_count,AVG7DAY_crash_count
count,43049.0,43049,42381.0,43049.0,41979.0
unique,,796,70.0,,
top,,2019-07-22 00:00:00,60619.0,,
freq,,62,796.0,,
mean,21524.0,,,5.260401,5.317969
std,12427.320206,,,3.597711,2.777847
min,0.0,,,1.0,1.0
25%,10762.0,,,2.0,3.0
50%,21524.0,,,5.0,5.0
75%,32286.0,,,7.0,7.285714


### Demographics

In [7]:
# Read every row of the table named by build_db.CENSUS_TBL into a DataFrame,
# echoing the SQL first so the reader can see exactly what was executed.
query = "select * from {}".format(build_db.CENSUS_TBL)
print("query =", query)
census_df = pd.read_sql_query(sql=query, con=db.conn)
# Summary statistics across all columns (numeric and non-numeric).
census_df.describe(include="all")

query = select * from DEMOGRAPHICS


Unnamed: 0,index,zcta,hhold_size,fam_size,unemploy_rate,median_income,pct_below_poverty_lvl,median_age,pct_65_or_older,pct_hispanic,...,pct_white,pct_pacific_islander,pct_american_indian,pct_other_race,pct_high_school_grad,pct_hholds_w_computer,pct_hholds_w_internet,pct_w_health_insur,state,ZIPCODE
count,58.0,58,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,...,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0
unique,,58,,,,,,,,,...,,,,,,,,,,58.0
top,,ZCTA5 60612,,,,,,,,,...,,,,,,,,,,60620.0
freq,,1,,,,,,,,,...,,,,,,,,,,1.0
mean,28.5,,2.416724,3.222759,5.306897,69298.913793,13.234483,35.377586,12.405172,21.343103,...,38.406897,0.012069,0.2,0.210345,20.458621,88.603448,79.363793,91.806897,17.0,
std,16.886879,,0.545085,0.527662,3.301986,35812.102415,9.850827,3.657678,4.305705,21.570831,...,27.44321,0.032861,0.861659,0.213319,11.979237,7.252391,9.459622,4.489983,0.0,
min,0.0,,1.49,2.14,0.2,22158.0,0.0,30.2,0.5,1.3,...,1.0,0.0,0.0,0.0,0.0,69.5,56.6,79.8,17.0,
25%,14.25,,1.965,2.8725,2.65,41563.5,5.0,32.3,9.9,5.725,...,7.9,0.0,0.0,0.1,8.425,83.45,73.75,88.175,17.0,
50%,28.5,,2.5,3.355,4.2,62631.5,10.35,34.5,11.85,11.5,...,44.65,0.0,0.1,0.1,21.7,90.1,79.45,91.7,17.0,
75%,42.75,,2.7875,3.6075,7.425,94877.0,19.2,37.95,15.35,38.1,...,62.075,0.0,0.1,0.3,30.15,93.85,87.4,95.675,17.0,
