In [37]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

In [38]:
# store CSV into DataFrame
csv_file = "resources/county_level_confirmed_cases.csv"
cases_df = pd.read_csv(csv_file)
cases_df.head()

Unnamed: 0,last_update,location_type,state,county_name,county_name_long,fips_code,lat,lon,nchs_urbanization,total_population,confirmed,confirmed_per_100000,deaths,deaths_per_100000,location
0,2020-09-04 00:28:22 UTC,county,Alabama,Autauga,"Autauga, Alabama, US",1001.0,32.539527,-86.644082,Medium metro,55200.0,1349,2443.84,23,41.67,POINT(-86.64408227 32.53952745)
1,2020-09-04 00:28:22 UTC,county,Alabama,Baldwin,"Baldwin, Alabama, US",1003.0,30.72775,-87.722071,Small metro,208107.0,4495,2159.95,40,19.22,POINT(-87.72207058 30.72774991)
2,2020-09-04 00:28:22 UTC,county,Alabama,Barbour,"Barbour, Alabama, US",1005.0,31.868263,-85.387129,Non-core,25782.0,614,2381.51,7,27.15,POINT(-85.3871286 31.868263)
3,2020-09-04 00:28:22 UTC,county,Alabama,Bibb,"Bibb, Alabama, US",1007.0,32.996421,-87.125115,Large fringe metro,22527.0,542,2406.0,6,26.63,POINT(-87.1251146 32.99642064)
4,2020-09-04 00:28:22 UTC,county,Alabama,Blount,"Blount, Alabama, US",1009.0,33.982109,-86.567906,Large fringe metro,57645.0,1037,1798.94,11,19.08,POINT(-86.56790593 33.98210918)


In [39]:
# create new data with select columns
new_cases_df = cases_df[['last_update','fips_code', 'county_name', 'state', 'total_population', 'confirmed', 'confirmed_per_100000', 'deaths', 'deaths_per_100000']].copy()
new_cases_df.head()

Unnamed: 0,last_update,fips_code,county_name,state,total_population,confirmed,confirmed_per_100000,deaths,deaths_per_100000
0,2020-09-04 00:28:22 UTC,1001.0,Autauga,Alabama,55200.0,1349,2443.84,23,41.67
1,2020-09-04 00:28:22 UTC,1003.0,Baldwin,Alabama,208107.0,4495,2159.95,40,19.22
2,2020-09-04 00:28:22 UTC,1005.0,Barbour,Alabama,25782.0,614,2381.51,7,27.15
3,2020-09-04 00:28:22 UTC,1007.0,Bibb,Alabama,22527.0,542,2406.0,6,26.63
4,2020-09-04 00:28:22 UTC,1009.0,Blount,Alabama,57645.0,1037,1798.94,11,19.08


In [40]:
# identify incomplete rows
new_cases_df.count()

last_update             3264
fips_code               3254
county_name             3264
state                   3264
total_population        3192
confirmed               3264
confirmed_per_100000    3192
deaths                  3264
deaths_per_100000       3192
dtype: int64

In [41]:
# drop rows without fips_code and verify dropped rows
new_cases_df = new_cases_df.dropna(subset=['fips_code'])
new_cases_df.count()

last_update             3254
fips_code               3254
county_name             3254
state                   3254
total_population        3192
confirmed               3254
confirmed_per_100000    3192
deaths                  3254
deaths_per_100000       3192
dtype: int64

In [42]:
# check data types
new_cases_df.dtypes

last_update              object
fips_code               float64
county_name              object
state                    object
total_population        float64
confirmed                 int64
confirmed_per_100000    float64
deaths                    int64
deaths_per_100000       float64
dtype: object

In [43]:
# change fips_code dtype to int64
new_cases_df['fips_code'].astype('int')

0        1001
1        1003
2        1005
3        1007
4        1009
        ...  
3249    90051
3250    90053
3251    90054
3252    90055
3253    90056
Name: fips_code, Length: 3254, dtype: int32

In [36]:
# check data types
new_cases_df.dtypes

last_update              object
fips_code               float64
county_name              object
state                    object
total_population        float64
confirmed                 int64
confirmed_per_100000    float64
deaths                    int64
deaths_per_100000       float64
dtype: object

In [48]:
# add leading zeros to the fips_code
new_cases_df['fips_code']=new_cases_df['fips_code'].apply(lambda x: '{0:0>5}'.format(x))
new_cases_df.head()

Unnamed: 0,last_update,fips_code,county_name,state,total_population,confirmed,confirmed_per_100000,deaths,deaths_per_100000
0,2020-09-04 00:28:22 UTC,1001.0,Autauga,Alabama,55200.0,1349,2443.84,23,41.67
1,2020-09-04 00:28:22 UTC,1003.0,Baldwin,Alabama,208107.0,4495,2159.95,40,19.22
2,2020-09-04 00:28:22 UTC,1005.0,Barbour,Alabama,25782.0,614,2381.51,7,27.15
3,2020-09-04 00:28:22 UTC,1007.0,Bibb,Alabama,22527.0,542,2406.0,6,26.63
4,2020-09-04 00:28:22 UTC,1009.0,Blount,Alabama,57645.0,1037,1798.94,11,19.08
