In [1]:
# Import all required packages
import pandas as pd

In [32]:
# Download the NYT county-level COVID-19 time series directly from GitHub.
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
covid = pd.read_csv(
    url,
    dtype={'fips': str},      # read FIPS as text rather than as a number
    parse_dates=['date'],     # convert the date column to datetime on load
    cache_dates=True,
    compression=None,
    sep=',',
)

In [33]:
# Peek at the first 5 rows to confirm the download parsed as expected
covid.head(n=5)

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0.0
3,2020-01-24,Cook,Illinois,17031,1,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0.0


In [34]:
# Earliest reporting date in the COVID data; used to decide whether a different
# vintage of Census population estimates is needed.
# Series.min() is vectorized, whereas the builtin min() walks every row in Python.
covid['date'].min()

Timestamp('2020-01-21 00:00:00')

In [35]:
# Inspect column dtypes and non-null counts.
# Both `fips` and `deaths` contain missing values; `deaths` is parsed as float
# rather than an integer type, presumably because of those nulls.
# TODO: both columns need to be cleaned up before analysis.
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1280791 entries, 0 to 1280790
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   date    1280791 non-null  datetime64[ns]
 1   county  1280791 non-null  object        
 2   state   1280791 non-null  object        
 3   fips    1269067 non-null  object        
 4   cases   1280791 non-null  int64         
 5   deaths  1252514 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 58.6+ MB


In [41]:
# Treat missing counts as 0 - there does not appear to be documentation that
# suggests another interpretation. `cases` is included alongside `deaths` so
# both count columns end up with the same nullable integer dtype.
# Chained fillna/astype returns a new frame instead of mutating in place, and
# casts both columns in a single call rather than looping over them.
int_cols = ['cases', 'deaths']
covid = (
    covid
    .fillna({col: 0 for col in int_cols})
    .astype({col: 'Int64' for col in int_cols})
)
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1280791 entries, 0 to 1280790
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   date    1280791 non-null  datetime64[ns]
 1   county  1280791 non-null  object        
 2   state   1280791 non-null  object        
 3   fips    1269067 non-null  object        
 4   cases   1280791 non-null  Int64         
 5   deaths  1280791 non-null  Int64         
dtypes: Int64(2), datetime64[ns](1), object(3)
memory usage: 61.1+ MB


In [52]:
# Build a combined "county,state" label, then list every distinct label
# whose row has no FIPS code.
covid['county,state'] = covid['county'].str.cat(covid['state'], sep=',')
covid.loc[covid['fips'].isna(), 'county,state'].unique()

array(['New York City,New York', 'Unknown,Rhode Island',
       'Unknown,New Jersey', 'Unknown,Puerto Rico',
       'Unknown,Virgin Islands', 'Unknown,Guam', 'Unknown,Maine',
       'Unknown,Massachusetts', 'Unknown,Louisiana', 'Unknown,Kentucky',
       'Unknown,Nevada', 'Unknown,Tennessee', 'Unknown,Arkansas',
       'Unknown,Georgia', 'Kansas City,Missouri', 'Unknown,Missouri',
       'Unknown,Minnesota', 'Unknown,California', 'Unknown,Colorado',
       'Unknown,Florida', 'Unknown,Hawaii', 'Unknown,Illinois',
       'Unknown,Vermont', 'Unknown,Arizona', 'Unknown,Michigan',
       'Unknown,Texas', 'Unknown,Virginia', 'Unknown,Washington',
       'Unknown,Utah', 'Unknown,Idaho', 'Unknown,Mississippi',
       'Unknown,Northern Mariana Islands', 'Unknown,New York',
       'Unknown,Connecticut', 'Unknown,Nebraska', 'Unknown,Montana',
       'Unknown,Pennsylvania', 'Unknown,Indiana', 'Unknown,Iowa',
       'Unknown,Maryland', 'Unknown,Oklahoma', 'Unknown,West Virginia',
       'Unknown,Al

In [55]:
# Rows whose county is 'Unknown' cannot be attributed to a real county, so they
# will have to be excluded from the final data set. List the labels that lack a
# FIPS code but are NOT 'Unknown': New York City, Kansas City and Joplin can be
# mapped to the correct county and should be kept in the final result.
covid.loc[covid['fips'].isna() & covid['county'].ne('Unknown'), 'county,state'].unique()


array(['New York City,New York', 'Kansas City,Missouri',
       'Joplin,Missouri'], dtype=object)

In [5]:
# Load the Census population estimates from a local file (path is relative
# to the notebook's working directory).
filepath = '../data/co-est2019-alldata.csv'
census = pd.read_csv(
    filepath,
    compression=None,
    sep=',',
)

In [6]:
# Peek at the first 5 rows to confirm the Census file parsed as expected
census.head(n=5)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,...,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,...,24.017829,16.64187,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952


In [11]:
# List every column name so the few needed for the analysis can be picked out;
# the file is then re-read with only those columns to save memory and time.
census.columns.to_numpy()

array(['SUMLEV', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'STNAME',
       'CTYNAME', 'CENSUS2010POP', 'ESTIMATESBASE2010', 'POPESTIMATE2010',
       'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013',
       'POPESTIMATE2014', 'POPESTIMATE2015', 'POPESTIMATE2016',
       'POPESTIMATE2017', 'POPESTIMATE2018', 'POPESTIMATE2019',
       'NPOPCHG_2010', 'NPOPCHG_2011', 'NPOPCHG_2012', 'NPOPCHG_2013',
       'NPOPCHG_2014', 'NPOPCHG_2015', 'NPOPCHG_2016', 'NPOPCHG_2017',
       'NPOPCHG_2018', 'NPOPCHG_2019', 'BIRTHS2010', 'BIRTHS2011',
       'BIRTHS2012', 'BIRTHS2013', 'BIRTHS2014', 'BIRTHS2015',
       'BIRTHS2016', 'BIRTHS2017', 'BIRTHS2018', 'BIRTHS2019',
       'DEATHS2010', 'DEATHS2011', 'DEATHS2012', 'DEATHS2013',
       'DEATHS2014', 'DEATHS2015', 'DEATHS2016', 'DEATHS2017',
       'DEATHS2018', 'DEATHS2019', 'NATURALINC2010', 'NATURALINC2011',
       'NATURALINC2012', 'NATURALINC2013', 'NATURALINC2014',
       'NATURALINC2015', 'NATURALINC2016', 'NATURALINC2017',
       'NAT

In [14]:
# Read the Census file again, keeping only the identifier columns, the 2010
# baselines and the yearly population estimates for 2010 through 2019.
columns = (
    ['REGION', 'DIVISION', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME',
     'CENSUS2010POP', 'ESTIMATESBASE2010']
    + [f'POPESTIMATE{year}' for year in range(2010, 2020)]
)
census = pd.read_csv(
    filepath,
    usecols=columns,
    compression=None,
    sep=',',
)

In [15]:
# Peek at the first 5 rows to confirm only the selected columns were loaded
census.head(n=5)

Unnamed: 0,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019
0,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,4799069,4815588,4830081,4841799,4852347,4863525,4874486,4887681,4903185
1,3,6,1,1,Alabama,Autauga County,54571,54597,54773,55227,54954,54727,54893,54864,55243,55390,55533,55869
2,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,186558,190145,194885,199183,202939,207601,212521,217855,223234
3,3,6,1,5,Alabama,Barbour County,27457,27455,27327,27341,27169,26937,26755,26283,25806,25157,24872,24686
4,3,6,1,7,Alabama,Bibb County,22915,22915,22870,22745,22667,22521,22553,22566,22586,22550,22367,22394


In [16]:
# Inspect dtypes and non-null counts of the trimmed Census frame
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3193 entries, 0 to 3192
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   REGION             3193 non-null   int64 
 1   DIVISION           3193 non-null   int64 
 2   STATE              3193 non-null   int64 
 3   COUNTY             3193 non-null   int64 
 4   STNAME             3193 non-null   object
 5   CTYNAME            3193 non-null   object
 6   CENSUS2010POP      3193 non-null   int64 
 7   ESTIMATESBASE2010  3193 non-null   int64 
 8   POPESTIMATE2010    3193 non-null   int64 
 9   POPESTIMATE2011    3193 non-null   int64 
 10  POPESTIMATE2012    3193 non-null   int64 
 11  POPESTIMATE2013    3193 non-null   int64 
 12  POPESTIMATE2014    3193 non-null   int64 
 13  POPESTIMATE2015    3193 non-null   int64 
 14  POPESTIMATE2016    3193 non-null   int64 
 15  POPESTIMATE2017    3193 non-null   int64 
 16  POPESTIMATE2018    3193 non-null   int64 


In [59]:
# Look up the FIPS code the COVID data uses for one known county (Autauga,
# Alabama) so it can be compared with the STATE/COUNTY split in the Census data.
covid.loc[covid['county'].eq('Autauga') & covid['state'].eq('Alabama'), 'fips'].unique()

array(['01001'], dtype=object)

In [61]:
# Same county in the Census data, to see its STATE and COUNTY codes
census.loc[census['CTYNAME'].eq('Autauga County')]

Unnamed: 0,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019
1,3,6,1,1,Alabama,Autauga County,54571,54597,54773,55227,54954,54727,54893,54864,55243,55390,55533,55869


In [65]:
# Derive a 5-character FIPS key for the Census rows:
# STATE zero-padded to 2 digits followed by COUNTY zero-padded to 3 digits.
census['FIPS'] = (
    census['STATE'].astype(str).str.zfill(2)
    .str.cat(census['COUNTY'].astype(str).str.zfill(3))
)

In [66]:
# Spot-check the first few generated FIPS values
census['FIPS'].head(n=5)

0    01000
1    01001
2    01003
3    01005
4    01007
Name: FIPS, dtype: object

In [73]:
# Sanity check: a given FIPS code should name the same county/state in both
# data sets. First, the COVID side for FIPS 49051.
covid.loc[covid['fips'] == '49051', 'county,state'].unique()

array(['Wasatch,Utah'], dtype=object)

In [74]:
# Census side of the same check: state and county name for FIPS 49051
census.loc[census['FIPS'] == '49051', ['STNAME', 'CTYNAME']]

Unnamed: 0,STNAME,CTYNAME
2847,Utah,Wasatch County
