# Forming county_daily_data Table

In [1]:
# Install Dependencies
import pandas as pd
from sqlalchemy import create_engine
from secret import username, password

## Import .csv Files

In [2]:
#  datasets filepaths
NYT_file="../Potential Data Sources/NYT- Cty Cases Deaths Thru Oct2/us-counties.csv"
Mob_file="../Potential Data Sources/Google US Mobility/2020_US_Region_Mobility_Report.csv"

In [3]:
# read datasets
NYT=pd.read_csv(NYT_file, parse_dates=['date'])
Mob=pd.read_csv(Mob_file, parse_dates=['date'])

## Pre-Process Data

### NYT Case and Death Data Preprocessing

In [4]:
NYT.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [5]:
# limit dataset to texas
Texas=NYT.loc[NYT['state']=='Texas']
Texas=Texas.dropna(how='any', axis=0)
Texas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44448 entries, 143 to 595652
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    44448 non-null  datetime64[ns]
 1   county  44448 non-null  object        
 2   state   44448 non-null  object        
 3   fips    44448 non-null  float64       
 4   cases   44448 non-null  int64         
 5   deaths  44448 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 2.4+ MB


In [6]:
# drop county and state column
Texas2=Texas.drop(['county', 'state'], axis=1)
Texas2.head()

Unnamed: 0,date,fips,cases,deaths
143,2020-02-12,48029.0,1,0
154,2020-02-13,48029.0,2,0
165,2020-02-14,48029.0,2,0
176,2020-02-15,48029.0,2,0
187,2020-02-16,48029.0,2,0


In [7]:
# convert fips to integer
Texas2['fips'] = Texas2['fips'].astype(int)

# Rename the column headers
Texas3 = Texas2.rename(columns={'fips': 'FIPS'})

Texas3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44448 entries, 143 to 595652
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    44448 non-null  datetime64[ns]
 1   FIPS    44448 non-null  int32         
 2   cases   44448 non-null  int64         
 3   deaths  44448 non-null  int64         
dtypes: datetime64[ns](1), int32(1), int64(2)
memory usage: 1.5 MB


In [8]:
Texas3.tail()

Unnamed: 0,date,FIPS,cases,deaths
595648,2020-10-03,48499,540,26
595649,2020-10-03,48501,250,5
595650,2020-10-03,48503,346,6
595651,2020-10-03,48505,335,6
595652,2020-10-03,48507,409,16


### Google Mobility Data Pre-Processing

In [9]:
# read dataset
Mob.tail()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
581567,US,United States,Wyoming,Weston County,,,56045.0,2020-09-28,,,,,-22.0,
581568,US,United States,Wyoming,Weston County,,,56045.0,2020-09-29,,,,,-15.0,
581569,US,United States,Wyoming,Weston County,,,56045.0,2020-09-30,,,,,-19.0,
581570,US,United States,Wyoming,Weston County,,,56045.0,2020-10-01,,,,,-14.0,
581571,US,United States,Wyoming,Weston County,,,56045.0,2020-10-02,,,,,-12.0,


In [10]:
# filter to texas
TX_mob=Mob.loc[(Mob['sub_region_1']=='Texas')& (Mob['date']< '2020-10-04')]

# drop unnecessary columns and rename columns to standardize with NYT data
TX_mob=TX_mob.drop(['country_region_code','country_region','metro_area','iso_3166_2_code','sub_region_1', 'sub_region_2'], axis=1)

# rename columns
TX_mob.rename(columns={'census_fips_code':'FIPS'}, inplace=True)

TX_mob.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42799 entries, 466573 to 509530
Data columns (total 8 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   FIPS                                                42567 non-null  float64       
 1   date                                                42799 non-null  datetime64[ns]
 2   retail_and_recreation_percent_change_from_baseline  27907 non-null  float64       
 3   grocery_and_pharmacy_percent_change_from_baseline   26183 non-null  float64       
 4   parks_percent_change_from_baseline                  10198 non-null  float64       
 5   transit_stations_percent_change_from_baseline       19784 non-null  float64       
 6   workplaces_percent_change_from_baseline             39138 non-null  float64       
 7   residential_percent_change_from_baseline            18633 non-null  float64       
dtypes

In [11]:
# Drop rows where FIPS is NA
TX_mob2 = TX_mob.dropna(subset=['FIPS'])

# convert fips to integer
TX_mob2['FIPS'] = TX_mob2['FIPS'].astype(int)

TX_mob2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42567 entries, 466806 to 509530
Data columns (total 8 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   FIPS                                                42567 non-null  int32         
 1   date                                                42567 non-null  datetime64[ns]
 2   retail_and_recreation_percent_change_from_baseline  27675 non-null  float64       
 3   grocery_and_pharmacy_percent_change_from_baseline   25951 non-null  float64       
 4   parks_percent_change_from_baseline                  9966 non-null   float64       
 5   transit_stations_percent_change_from_baseline       19552 non-null  float64       
 6   workplaces_percent_change_from_baseline             38906 non-null  float64       
 7   residential_percent_change_from_baseline            18401 non-null  float64       
dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TX_mob2['FIPS'] = TX_mob2['FIPS'].astype(int)


In [12]:
TX_mob2.head()

Unnamed: 0,FIPS,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
466806,48001,2020-02-15,3.0,1.0,,,-1.0,
466807,48001,2020-02-16,7.0,10.0,,,-4.0,
466808,48001,2020-02-17,1.0,-3.0,,,-13.0,3.0
466809,48001,2020-02-18,2.0,-4.0,,,0.0,0.0
466810,48001,2020-02-19,-5.0,-9.0,,,1.0,2.0


# Join Data Frames

## Join all the case and death rates with mobility by county

In [13]:
# Join mobility data with cumulative case data:
TX_cases_deaths_mob=Texas3.set_index(['FIPS','date']).join(TX_mob2.set_index(['FIPS','date']), how='outer')
TX_cases_deaths_mob=TX_cases_deaths_mob.reset_index()
TX_cases_deaths_mob.head()

Unnamed: 0,FIPS,date,cases,deaths,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,48001,2020-02-15,,,3.0,1.0,,,-1.0,
1,48001,2020-02-16,,,7.0,10.0,,,-4.0,
2,48001,2020-02-17,,,1.0,-3.0,,,-13.0,3.0
3,48001,2020-02-18,,,2.0,-4.0,,,0.0,0.0
4,48001,2020-02-19,,,-5.0,-9.0,,,1.0,2.0


In [14]:
TX_cases_deaths_mob.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54009 entries, 0 to 54008
Data columns (total 10 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   FIPS                                                54009 non-null  int64         
 1   date                                                54009 non-null  datetime64[ns]
 2   cases                                               44448 non-null  float64       
 3   deaths                                              44448 non-null  float64       
 4   retail_and_recreation_percent_change_from_baseline  27675 non-null  float64       
 5   grocery_and_pharmacy_percent_change_from_baseline   25951 non-null  float64       
 6   parks_percent_change_from_baseline                  9966 non-null   float64       
 7   transit_stations_percent_change_from_baseline       19552 non-null  float64       
 8   workpl

In [15]:
# Backfill all case and death data with 0's for dates prior to first case
TX_cases_deaths_mob[['cases', 'deaths']] = TX_cases_deaths_mob[['cases','deaths']].fillna(value=0)

TX_cases_deaths_mob.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54009 entries, 0 to 54008
Data columns (total 10 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   FIPS                                                54009 non-null  int64         
 1   date                                                54009 non-null  datetime64[ns]
 2   cases                                               54009 non-null  float64       
 3   deaths                                              54009 non-null  float64       
 4   retail_and_recreation_percent_change_from_baseline  27675 non-null  float64       
 5   grocery_and_pharmacy_percent_change_from_baseline   25951 non-null  float64       
 6   parks_percent_change_from_baseline                  9966 non-null   float64       
 7   transit_stations_percent_change_from_baseline       19552 non-null  float64       
 8   workpl

## Export to CSV

In [16]:
TX_cases_deaths_mob.to_csv('Texas_Cases_Deaths_Mobility_by_County.csv', index=False)

In [17]:
connection_string = f'{username}:{password}@localhost:5432/Covid_19'
engine = create_engine(f'postgresql://{connection_string}')

In [18]:
# Confirm tables
engine.table_names()

['county_demographics', 'county_daily_data']

In [19]:
TX_cases_deaths_mob.to_sql(name='county_daily_data', con=engine, if_exists='append', index=False)