## Import Library

In [1]:
import os
import pandas as pd
from datetime import date


## Set up Environment Flag

In [2]:
using_Google_colab = False
using_Anaconda_on_Mac_or_Linux = True
using_Anaconda_on_windows = False

if using_Google_colab:
    dir_input = "/content/drive/MyDrive/COVID_Project/input"
    dir_output = "/content/drive/MyDrive/COVID_Project/output"
if using_Anaconda_on_Mac_or_Linux:
    dir_input = "../input"
    dir_output = "../output"
if using_Anaconda_on_windows:
    dir_input = r"..\input"
    dir_output = r"..\output" 

## Connect to Google Drive

In [3]:
if using_Google_colab:
    from google.colab import drive
    drive.mount('/content/drive')

Data Prep for USA_Facts confirmed cases at county level

Read file

In [4]:
df_confirmed_cases = pd.read_csv(os.path.join(dir_input, 
                                              'USA_Facts', 
                                              'covid_confirmed_usafacts.csv'))
df_confirmed_cases = df_confirmed_cases.astype({'countyFIPS': str}).astype({'StateFIPS': str})
df_confirmed_cases

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2022-01-18,2022-01-19,2022-01-20,2022-01-21,2022-01-22,2022-01-23,2022-01-24,2022-01-25,2022-01-26,2022-01-27
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,12738,12833,12928,13019,13019,13019,13251,13251,13251,13251
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,47143,47662,48338,49168,49168,49168,50313,50313,50313,50313
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,4741,4800,4843,4902,4902,4902,5054,5054,5054,5054
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,5385,5486,5565,5663,5663,5663,5795,5795,5795,5795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,56037,Sweetwater County,WY,56,0,0,0,0,0,0,...,9082,9184,9241,9449,9449,9449,9609,9712,9810,10007
3189,56039,Teton County,WY,56,0,0,0,0,0,0,...,8531,8638,8741,8814,8814,8814,8960,9049,9121,9195
3190,56041,Uinta County,WY,56,0,0,0,0,0,0,...,4660,4751,4827,4927,4927,4927,5034,5081,5167,5222
3191,56043,Washakie County,WY,56,0,0,0,0,0,0,...,1994,2002,2023,2025,2025,2025,2041,2066,2093,2130


Now I need to transpose so all dates are in rows


In [5]:
df_confirmed_cases_by_county_by_date = df_confirmed_cases.melt(id_vars=['County Name',
                                                                        'countyFIPS', 
                                                                        'State', 
                                                                        'StateFIPS'], 
                                                               var_name='Date', 
                                                               value_name='Total Cases')
df_confirmed_cases_by_county_by_date

Unnamed: 0,County Name,countyFIPS,State,StateFIPS,Date,Total Cases
0,Statewide Unallocated,0,AL,1,2020-01-22,0
1,Autauga County,1001,AL,1,2020-01-22,0
2,Baldwin County,1003,AL,1,2020-01-22,0
3,Barbour County,1005,AL,1,2020-01-22,0
4,Bibb County,1007,AL,1,2020-01-22,0
...,...,...,...,...,...,...
2353236,Sweetwater County,56037,WY,56,2022-01-27,10007
2353237,Teton County,56039,WY,56,2022-01-27,9195
2353238,Uinta County,56041,WY,56,2022-01-27,5222
2353239,Washakie County,56043,WY,56,2022-01-27,2130


In [6]:
df_confirmed_cases_by_county_by_date = df_confirmed_cases_by_county_by_date.astype({'Date': 'datetime64[ns]'})
df_sorted_confirmed_cases = df_confirmed_cases_by_county_by_date.sort_values(by=['countyFIPS','Date'])
df_sorted_confirmed_cases

Unnamed: 0,County Name,countyFIPS,State,StateFIPS,Date,Total Cases
0,Statewide Unallocated,0,AL,1,2020-01-22,0
68,Statewide Unallocated,0,AK,2,2020-01-22,0
98,Statewide Unallocated,0,AZ,4,2020-01-22,0
114,Statewide Unallocated,0,AR,5,2020-01-22,0
190,Statewide Unallocated,0,CA,6,2020-01-22,0
...,...,...,...,...,...,...
2337598,Windham County,9015,CT,9,2022-01-23,23067
2340791,Windham County,9015,CT,9,2022-01-24,23620
2343984,Windham County,9015,CT,9,2022-01-25,23811
2347177,Windham County,9015,CT,9,2022-01-26,23984


Now shift by one to get previous day cases and compute incremental cases

In [7]:
df_sorted_confirmed_cases['Inc_Cases'] = df_sorted_confirmed_cases.groupby('countyFIPS')['Total Cases'].apply(
    lambda x: x - x.shift(1))
df_sorted_confirmed_cases

Unnamed: 0,County Name,countyFIPS,State,StateFIPS,Date,Total Cases,Inc_Cases
0,Statewide Unallocated,0,AL,1,2020-01-22,0,
68,Statewide Unallocated,0,AK,2,2020-01-22,0,0.0
98,Statewide Unallocated,0,AZ,4,2020-01-22,0,0.0
114,Statewide Unallocated,0,AR,5,2020-01-22,0,0.0
190,Statewide Unallocated,0,CA,6,2020-01-22,0,0.0
...,...,...,...,...,...,...,...
2337598,Windham County,9015,CT,9,2022-01-23,23067,0.0
2340791,Windham County,9015,CT,9,2022-01-24,23620,553.0
2343984,Windham County,9015,CT,9,2022-01-25,23811,191.0
2347177,Windham County,9015,CT,9,2022-01-26,23984,173.0


Now compute 7 days rolling average

In [8]:
df_sorted_confirmed_cases['cases_moving_avg'] = df_sorted_confirmed_cases.groupby('countyFIPS')['Inc_Cases'].apply(
    lambda x: (x + x.shift(1) + x.shift(2) + x.shift(3) + x.shift(4) + x.shift(5) + x.shift(6))/7)
df_sorted_confirmed_cases

Unnamed: 0,County Name,countyFIPS,State,StateFIPS,Date,Total Cases,Inc_Cases,cases_moving_avg
0,Statewide Unallocated,0,AL,1,2020-01-22,0,,
68,Statewide Unallocated,0,AK,2,2020-01-22,0,0.0,
98,Statewide Unallocated,0,AZ,4,2020-01-22,0,0.0,
114,Statewide Unallocated,0,AR,5,2020-01-22,0,0.0,
190,Statewide Unallocated,0,CA,6,2020-01-22,0,0.0,
...,...,...,...,...,...,...,...,...
2337598,Windham County,9015,CT,9,2022-01-23,23067,0.0,201.428571
2340791,Windham County,9015,CT,9,2022-01-24,23620,553.0,178.000000
2343984,Windham County,9015,CT,9,2022-01-25,23811,191.0,186.714286
2347177,Windham County,9015,CT,9,2022-01-26,23984,173.0,184.571429


In [9]:
def direction_of_spread(x):
    if x['Inc Cases'] > x['moving_avg']: 
        return 'increasing'
    else: 
        return'decreasing'

In [10]:
df_sorted_confirmed_cases['difference'] = df_sorted_confirmed_cases['Inc_Cases'] - df_sorted_confirmed_cases[
    'cases_moving_avg']
df_sorted_confirmed_cases['direction']=df_sorted_confirmed_cases['difference'].apply(
    lambda x: 'increasing' if x>0 else 'not increasing')
df_oc_confirmed_cases = df_sorted_confirmed_cases[df_sorted_confirmed_cases['countyFIPS'] == '6059']

In [11]:
df_sorted_confirmed_cases.to_csv(os.path.join(dir_output, 'confirmed_cases_by_county.csv'))

In [12]:
df_oc_confirmed_cases.to_csv(os.path.join(dir_output, 'confirmed_cases_orange_county_ca.csv'))

Data prep for COVID deaths by County

Read file

In [13]:
df_covid_deaths = pd.read_csv(os.path.join(dir_input, "USA_Facts", "covid_deaths_usafacts.csv"))
df_covid_deaths = df_covid_deaths.astype({'countyFIPS': str}).astype({'StateFIPS': str})
df_covid_deaths

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2022-01-18,2022-01-19,2022-01-20,2022-01-21,2022-01-22,2022-01-23,2022-01-24,2022-01-25,2022-01-26,2022-01-27
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,163,163,163,163,163,163,163,163,163,163
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,605,605,607,608,608,608,608,608,608,608
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,83,83,83,83,83,83,83,83,83,83
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,95,95,95,95,95,95,95,95,95,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,56037,Sweetwater County,WY,56,0,0,0,0,0,0,...,114,114,114,114,114,114,114,115,115,115
3189,56039,Teton County,WY,56,0,0,0,0,0,0,...,14,14,14,14,14,14,14,15,15,15
3190,56041,Uinta County,WY,56,0,0,0,0,0,0,...,34,34,34,34,34,34,34,34,34,34
3191,56043,Washakie County,WY,56,0,0,0,0,0,0,...,41,41,41,41,41,41,41,41,41,41


Now group the data by State

Now I need to transpose so all dates are in rows

In [14]:
df_covid_deaths_by_county_by_date = df_covid_deaths.melt(id_vars=['State',
                                                                  'StateFIPS', 
                                                                  'County Name', 
                                                                  'countyFIPS'], 
                                                         var_name='Date', 
                                                         value_name='Total Deaths')
df_covid_deaths_by_county_by_date

Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Deaths
0,AL,1,Statewide Unallocated,0,2020-01-22,0
1,AL,1,Autauga County,1001,2020-01-22,0
2,AL,1,Baldwin County,1003,2020-01-22,0
3,AL,1,Barbour County,1005,2020-01-22,0
4,AL,1,Bibb County,1007,2020-01-22,0
...,...,...,...,...,...,...
2353236,WY,56,Sweetwater County,56037,2022-01-27,115
2353237,WY,56,Teton County,56039,2022-01-27,15
2353238,WY,56,Uinta County,56041,2022-01-27,34
2353239,WY,56,Washakie County,56043,2022-01-27,41


In [15]:
df_covid_deaths_by_county_by_date = df_covid_deaths_by_county_by_date.astype({'Date': 'datetime64[ns]'})
df_sorted_covid_deaths = df_covid_deaths_by_county_by_date.sort_values(by=['countyFIPS', 'Date'])
df_sorted_covid_deaths

Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Deaths
0,AL,1,Statewide Unallocated,0,2020-01-22,0
68,AK,2,Statewide Unallocated,0,2020-01-22,0
98,AZ,4,Statewide Unallocated,0,2020-01-22,0
114,AR,5,Statewide Unallocated,0,2020-01-22,0
190,CA,6,Statewide Unallocated,0,2020-01-22,0
...,...,...,...,...,...,...
2337598,CT,9,Windham County,9015,2022-01-23,265
2340791,CT,9,Windham County,9015,2022-01-24,267
2343984,CT,9,Windham County,9015,2022-01-25,268
2347177,CT,9,Windham County,9015,2022-01-26,272


Now shift by one to get previous day deaths and compute incremental deaths

In [16]:
df_sorted_covid_deaths['Inc Deaths'] = df_sorted_covid_deaths.groupby('countyFIPS')['Total Deaths'].apply(
    lambda x: x - x.shift(1))
df_sorted_covid_deaths

Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Deaths,Inc Deaths
0,AL,1,Statewide Unallocated,0,2020-01-22,0,
68,AK,2,Statewide Unallocated,0,2020-01-22,0,0.0
98,AZ,4,Statewide Unallocated,0,2020-01-22,0,0.0
114,AR,5,Statewide Unallocated,0,2020-01-22,0,0.0
190,CA,6,Statewide Unallocated,0,2020-01-22,0,0.0
...,...,...,...,...,...,...,...
2337598,CT,9,Windham County,9015,2022-01-23,265,0.0
2340791,CT,9,Windham County,9015,2022-01-24,267,2.0
2343984,CT,9,Windham County,9015,2022-01-25,268,1.0
2347177,CT,9,Windham County,9015,2022-01-26,272,4.0


Now compute 7 days rolling average

In [17]:
df_sorted_covid_deaths['death moving_avg'] = df_sorted_covid_deaths.groupby('countyFIPS')['Inc Deaths'].apply(
    lambda x: (x + x.shift(1) + x.shift(2) + x.shift(3) + x.shift(4) + x.shift(5) + x.shift(6))/7)
df_sorted_covid_deaths

Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Deaths,Inc Deaths,death moving_avg
0,AL,1,Statewide Unallocated,0,2020-01-22,0,,
68,AK,2,Statewide Unallocated,0,2020-01-22,0,0.0,
98,AZ,4,Statewide Unallocated,0,2020-01-22,0,0.0,
114,AR,5,Statewide Unallocated,0,2020-01-22,0,0.0,
190,CA,6,Statewide Unallocated,0,2020-01-22,0,0.0,
...,...,...,...,...,...,...,...,...
2337598,CT,9,Windham County,9015,2022-01-23,265,0.0,1.000000
2340791,CT,9,Windham County,9015,2022-01-24,267,2.0,1.285714
2343984,CT,9,Windham County,9015,2022-01-25,268,1.0,1.428571
2347177,CT,9,Windham County,9015,2022-01-26,272,4.0,1.714286


In [18]:
df_sorted_covid_deaths.to_csv(os.path.join(dir_output, 'covid_deaths_by_county.csv'))

Create partial analytics_base_table with confirmed cases and deaths


In [19]:
df_partial_abt_by_county = pd.merge(df_sorted_confirmed_cases, df_sorted_covid_deaths, 
                                    on=['countyFIPS', 'Date'], 
                                    suffixes=('', '_DROP'), 
                                    how='inner').filter(regex='^(?!.*_DROP)')
df_partial_abt_by_county

Unnamed: 0,County Name,countyFIPS,State,StateFIPS,Date,Total Cases,Inc_Cases,cases_moving_avg,difference,direction,Total Deaths,Inc Deaths,death moving_avg
0,Statewide Unallocated,0,AL,1,2020-01-22,0,,,,not increasing,0,,
1,Statewide Unallocated,0,AL,1,2020-01-22,0,,,,not increasing,0,0.0,
2,Statewide Unallocated,0,AL,1,2020-01-22,0,,,,not increasing,0,0.0,
3,Statewide Unallocated,0,AL,1,2020-01-22,0,,,,not increasing,0,0.0,
4,Statewide Unallocated,0,AL,1,2020-01-22,0,,,,not increasing,0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4232586,Windham County,9015,CT,9,2022-01-23,23067,0.0,201.428571,-201.428571,not increasing,265,0.0,1.000000
4232587,Windham County,9015,CT,9,2022-01-24,23620,553.0,178.000000,375.000000,increasing,267,2.0,1.285714
4232588,Windham County,9015,CT,9,2022-01-25,23811,191.0,186.714286,4.285714,increasing,268,1.0,1.428571
4232589,Windham County,9015,CT,9,2022-01-26,23984,173.0,184.571429,-11.571429,not increasing,272,4.0,1.714286


In [20]:
df_partial_abt_by_county[df_partial_abt_by_county['State'] == "CA"]['County Name'].unique()

array(['Statewide Unallocated', 'Alameda County ', 'Alpine County ',
       'Amador County ', 'Butte County ', 'Calaveras County ',
       'Colusa County ', 'Contra Costa County ', 'Del Norte County ',
       'El Dorado County ', 'Fresno County ', 'Glenn County ',
       'Humboldt County ', 'Imperial County ', 'Inyo County ',
       'Kern County ', 'Kings County ', 'Lake County ', 'Lassen County ',
       'Los Angeles County ', 'Madera County ', 'Marin County ',
       'Mariposa County ', 'Mendocino County ', 'Merced County ',
       'Modoc County ', 'Mono County ', 'Monterey County ',
       'Napa County ', 'Nevada County ', 'Orange County ',
       'Placer County ', 'Plumas County ', 'Riverside County ',
       'Sacramento County ', 'San Benito County ',
       'San Bernardino County ', 'San Diego County ',
       'City and County of San Francisco', 'San Joaquin County ',
       'San Luis Obispo County ', 'San Mateo County ',
       'Santa Barbara County ', 'Santa Clara County ',
   

In [21]:
df_partial_abt_by_county[df_partial_abt_by_county['County Name'] == 'Los Angeles County '].count()

County Name         737
countyFIPS          737
State               737
StateFIPS           737
Date                737
Total Cases         737
Inc_Cases           736
cases_moving_avg    730
difference          730
direction           737
Total Deaths        737
Inc Deaths          736
death moving_avg    730
dtype: int64

In [22]:
df_partial_abt_by_county[df_partial_abt_by_county['County Name'] == 'Los Angeles County '].Date.min()

Timestamp('2020-01-22 00:00:00')

In [23]:
df_partial_abt_by_county[df_partial_abt_by_county['County Name'] == 'Los Angeles County '].Date.max()

Timestamp('2022-01-27 00:00:00')