# Updating county_daily_data Table

In [1]:
# Install Dependencies
import pandas as pd

# EXTRACT

#### All data is in folder named 'Potential Data Sources'
* Downloaded NYT Cases Deaths files from https://github.com/nytimes/covid-19-data and placed in folder named 'NYT- Cty Cases Deaths Thru Oct2'
* Downloaded mobility data from https://www.google.com/covid19/mobility/ and placed in folder named 'Google US Mobility'

## Import .csv Files

In [2]:
#  datasets filepaths
NYT_file="../Potential Data Sources/NYT- Cty Cases Deaths Thru Oct2/us-counties_10_19.csv"
Mob_file="../Potential Data Sources/Google US Mobility/2020_US_Region_Mobility_Report_10_19.csv"

In [3]:
# read datasets
NYT=pd.read_csv(NYT_file, parse_dates=['date'])
Mob=pd.read_csv(Mob_file, parse_dates=['date'])

## Pre-Process Data

### NYT Case and Death Data Preprocessing

In [4]:
NYT.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [5]:
# limit dataset to only latest data
newNYT = NYT.loc[NYT['date']>'2020-10-03']
newNYT.head()

Unnamed: 0,date,county,state,fips,cases,deaths
596025,2020-10-04,Autauga,Alabama,1001.0,1828,27
596026,2020-10-04,Baldwin,Alabama,1003.0,6073,53
596027,2020-10-04,Barbour,Alabama,1005.0,921,7
596028,2020-10-04,Bibb,Alabama,1007.0,686,10
596029,2020-10-04,Blount,Alabama,1009.0,1656,15


In [6]:
# limit dataset to texas
Texas=newNYT.loc[newNYT['state']=='Texas']
Texas=Texas.dropna(how='any', axis=0)
Texas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3786 entries, 598644 to 644305
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3786 non-null   datetime64[ns]
 1   county  3786 non-null   object        
 2   state   3786 non-null   object        
 3   fips    3786 non-null   float64       
 4   cases   3786 non-null   int64         
 5   deaths  3786 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 207.0+ KB


In [7]:
# drop county and state column
Texas2=Texas.drop(['county', 'state'], axis=1)
Texas2.head()

Unnamed: 0,date,fips,cases,deaths
598644,2020-10-04,48001.0,2821,36
598645,2020-10-04,48003.0,457,10
598646,2020-10-04,48005.0,2601,83
598647,2020-10-04,48007.0,335,18
598648,2020-10-04,48009.0,88,1


In [8]:
# convert fips to integer
Texas2['fips'] = Texas2['fips'].astype(int)

# Rename the column headers
Texas3 = Texas2.rename(columns={'fips': 'fips_code'})

Texas3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3786 entries, 598644 to 644305
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       3786 non-null   datetime64[ns]
 1   fips_code  3786 non-null   int32         
 2   cases      3786 non-null   int64         
 3   deaths     3786 non-null   int64         
dtypes: datetime64[ns](1), int32(1), int64(2)
memory usage: 133.1 KB


In [9]:
Texas3.tail()

Unnamed: 0,date,fips_code,cases,deaths
644301,2020-10-18,48499,590,34
644302,2020-10-18,48501,284,5
644303,2020-10-18,48503,470,6
644304,2020-10-18,48505,342,6
644305,2020-10-18,48507,488,18


### Google Mobility Data Pre-Processing

In [10]:
# read dataset
Mob.tail()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
613716,US,United States,Wyoming,Weston County,,,56045.0,2020-10-12,,,,,-18.0,
613717,US,United States,Wyoming,Weston County,,,56045.0,2020-10-13,,,,,-13.0,
613718,US,United States,Wyoming,Weston County,,,56045.0,2020-10-14,,,,,-16.0,
613719,US,United States,Wyoming,Weston County,,,56045.0,2020-10-15,,,,,-10.0,
613720,US,United States,Wyoming,Weston County,,,56045.0,2020-10-16,,,,,-9.0,


In [11]:
# filter to texas
TX_mob=Mob.loc[(Mob['sub_region_1']=='Texas')& (Mob['date'] > '2020-10-03')]

# drop unnecessary columns and rename columns to standardize with NYT data
TX_mob=TX_mob.drop(['country_region_code','country_region','metro_area','iso_3166_2_code','sub_region_1', 'sub_region_2'], axis=1)

# rename columns
TX_mob.rename(columns={'census_fips_code':'fips_code'}, inplace=True)

TX_mob.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2522 entries, 492609 to 537697
Data columns (total 8 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   fips_code                                           2509 non-null   float64       
 1   date                                                2522 non-null   datetime64[ns]
 2   retail_and_recreation_percent_change_from_baseline  1516 non-null   float64       
 3   grocery_and_pharmacy_percent_change_from_baseline   1291 non-null   float64       
 4   parks_percent_change_from_baseline                  614 non-null    float64       
 5   transit_stations_percent_change_from_baseline       1204 non-null   float64       
 6   workplaces_percent_change_from_baseline             2390 non-null   float64       
 7   residential_percent_change_from_baseline            1366 non-null   float64       
dtypes:

In [12]:
# Drop rows where FIPS is NA
TX_mob2 = TX_mob.dropna(subset=['fips_code'])

# convert fips to integer
TX_mob2['fips_code'] = TX_mob2['fips_code'].astype(int)

TX_mob2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2509 entries, 492848 to 537697
Data columns (total 8 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   fips_code                                           2509 non-null   int32         
 1   date                                                2509 non-null   datetime64[ns]
 2   retail_and_recreation_percent_change_from_baseline  1503 non-null   float64       
 3   grocery_and_pharmacy_percent_change_from_baseline   1278 non-null   float64       
 4   parks_percent_change_from_baseline                  601 non-null    float64       
 5   transit_stations_percent_change_from_baseline       1191 non-null   float64       
 6   workplaces_percent_change_from_baseline             2377 non-null   float64       
 7   residential_percent_change_from_baseline            1353 non-null   float64       
dtypes:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TX_mob2['fips_code'] = TX_mob2['fips_code'].astype(int)


In [13]:
TX_mob2.head()

Unnamed: 0,fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
492848,48001,2020-10-04,-11.0,-1.0,,,-13.0,0.0
492849,48001,2020-10-05,-9.0,-6.0,,,-19.0,1.0
492850,48001,2020-10-06,-8.0,-6.0,,,-18.0,1.0
492851,48001,2020-10-07,-4.0,-1.0,,,-15.0,2.0
492852,48001,2020-10-08,-5.0,-3.0,,,-15.0,1.0


# Join Data Frames

## Join all the case and death rates with mobility by county

In [14]:
# Join mobility data with cumulative case data:
TX_cases_deaths_mob=Texas3.set_index(['fips_code','date']).join(TX_mob2.set_index(['fips_code','date']), how='outer')
TX_cases_deaths_mob=TX_cases_deaths_mob.reset_index()
TX_cases_deaths_mob.head()

Unnamed: 0,fips_code,date,cases,deaths,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,48001,2020-10-04,2821,36,-11.0,-1.0,,,-13.0,0.0
1,48001,2020-10-05,2859,36,-9.0,-6.0,,,-19.0,1.0
2,48001,2020-10-06,2836,37,-8.0,-6.0,,,-18.0,1.0
3,48001,2020-10-07,2849,37,-4.0,-1.0,,,-15.0,2.0
4,48001,2020-10-08,2858,37,-5.0,-3.0,,,-15.0,1.0


In [15]:
TX_cases_deaths_mob.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3786 entries, 0 to 3785
Data columns (total 10 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   fips_code                                           3786 non-null   int64         
 1   date                                                3786 non-null   datetime64[ns]
 2   cases                                               3786 non-null   int64         
 3   deaths                                              3786 non-null   int64         
 4   retail_and_recreation_percent_change_from_baseline  1503 non-null   float64       
 5   grocery_and_pharmacy_percent_change_from_baseline   1278 non-null   float64       
 6   parks_percent_change_from_baseline                  601 non-null    float64       
 7   transit_stations_percent_change_from_baseline       1191 non-null   float64       
 8   workplac

In [16]:
# # Backfill all case and death data with 0's for dates prior to first case
# TX_cases_deaths_mob[['cases', 'deaths']] = TX_cases_deaths_mob[['cases','deaths']].fillna(value=0)

# TX_cases_deaths_mob.info()

## Export to CSV

In [17]:
TX_cases_deaths_mob.to_csv('Texas_Cases_Deaths_Mobility_by_County_Updated.csv', index=False)