# Extract and Clean Times COVID-19 Data New York

**Instructions:**

1. Provide the file path to the source data, the 

In [1]:
import pandas as pd
import geopandas
import matplotlib.pyplot as plt

In [2]:
# New York Counties
fips_ny = ['36061','36047','36005','36085','36081']

# File path the New York Times us-counties.csv file
source = '/Volumes/Data_01/nyt/us-counties.csv'

# Target file location where we will save new york data
target_ny = '/Users/justinsnider/nyu-big-data/project/clean-data/ny/ny-covid.csv'

# LA Counties
fips_la = ['06037','06059']

# Target file location where we will save Los Angeles data
target_la = '/Users/justinsnider/nyu-big-data/project/clean-data/la/la-covid.csv'

# Chicago Counties
fips_ch = ['17031','17037','17043','17063','17091','17089','17093','17111','17197','17097','18073','18089','18111','18127','55059']

# Target file location where we will save Chicago data
target_ch = '/Users/justinsnider/nyu-big-data/project/clean-data/ch/ch-covid.csv'


In [3]:
def filter_fips(df, fips):
    return df[df['cbg'].astype(str).str[:5].isin(fips)]

## Extract and Clean New York

In [13]:
df = pd.read_csv(source, dtype={'fips': object})
df

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0.0
3,2020-01-24,Cook,Illinois,17031,1,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0.0
...,...,...,...,...,...,...
1134652,2021-03-18,Sweetwater,Wyoming,56037,3912,36.0
1134653,2021-03-18,Teton,Wyoming,56039,3495,9.0
1134654,2021-03-18,Uinta,Wyoming,56041,2101,12.0
1134655,2021-03-18,Washakie,Wyoming,56043,888,26.0


In [14]:
df = df.rename(columns={'fips':'cbg'})
df

Unnamed: 0,date,county,state,cbg,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0.0
3,2020-01-24,Cook,Illinois,17031,1,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0.0
...,...,...,...,...,...,...
1134652,2021-03-18,Sweetwater,Wyoming,56037,3912,36.0
1134653,2021-03-18,Teton,Wyoming,56039,3495,9.0
1134654,2021-03-18,Uinta,Wyoming,56041,2101,12.0
1134655,2021-03-18,Washakie,Wyoming,56043,888,26.0


In [15]:
# Filter out just New York City
# NOTE: We will not be using CBG for this dataset.
df = df[df['county'].astype(str).isin(['New York City'])]
df

Unnamed: 0,date,county,state,cbg,cases,deaths
416,2020-03-01,New York City,New York,,1,0.0
448,2020-03-02,New York City,New York,,1,0.0
482,2020-03-03,New York City,New York,,2,0.0
518,2020-03-04,New York City,New York,,2,0.0
565,2020-03-05,New York City,New York,,4,0.0
...,...,...,...,...,...,...
1120297,2021-03-14,New York City,New York,,777418,30258.0
1123543,2021-03-15,New York City,New York,,781734,30303.0
1126789,2021-03-16,New York City,New York,,785307,30366.0
1130036,2021-03-17,New York City,New York,,788302,30406.0


In [16]:
# Save NYC Data
df.to_csv(target_ny)

In [17]:
if len(df.groupby(['date','county'])) == len(df):
    print('Everything is good. Every unique date, county combo has a unique row!')
else:
    print('We have a problem, not every unique date, county combo has a unique row.')

Everything is good. Every unique date, county combo has a unique row!


In [18]:
for col in ['date','cases','deaths']:
    print('\n')
    print(col)
    print('Min: {}\nMax: {}'.format(df[col].min(), df[col].max()))



date
Min: 2020-03-01
Max: 2021-03-18


cases
Min: 1
Max: 790069


deaths
Min: 0.0
Max: 30471.0


In [19]:
# Print all Null and NaN values
cols = ['date', 'county', 'state', 'cases', 'deaths']
print('We have {} null values.'.format(len(df[df[cols].isna().any(axis=1)])))

We have 0 null values.


In [20]:
pd.read_csv(target_ny, dtype={'cbg': object})

Unnamed: 0.1,Unnamed: 0,date,county,state,cbg,cases,deaths
0,416,2020-03-01,New York City,New York,,1,0.0
1,448,2020-03-02,New York City,New York,,1,0.0
2,482,2020-03-03,New York City,New York,,2,0.0
3,518,2020-03-04,New York City,New York,,2,0.0
4,565,2020-03-05,New York City,New York,,4,0.0
...,...,...,...,...,...,...,...
378,1120297,2021-03-14,New York City,New York,,777418,30258.0
379,1123543,2021-03-15,New York City,New York,,781734,30303.0
380,1126789,2021-03-16,New York City,New York,,785307,30366.0
381,1130036,2021-03-17,New York City,New York,,788302,30406.0


## Extract and Clean Los Angeles

In [21]:
df = pd.read_csv(source, dtype={'fips': object})
df

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0.0
3,2020-01-24,Cook,Illinois,17031,1,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0.0
...,...,...,...,...,...,...
1134652,2021-03-18,Sweetwater,Wyoming,56037,3912,36.0
1134653,2021-03-18,Teton,Wyoming,56039,3495,9.0
1134654,2021-03-18,Uinta,Wyoming,56041,2101,12.0
1134655,2021-03-18,Washakie,Wyoming,56043,888,26.0


In [22]:
df = df.rename(columns={'fips':'cbg'})
df

Unnamed: 0,date,county,state,cbg,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0.0
3,2020-01-24,Cook,Illinois,17031,1,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0.0
...,...,...,...,...,...,...
1134652,2021-03-18,Sweetwater,Wyoming,56037,3912,36.0
1134653,2021-03-18,Teton,Wyoming,56039,3495,9.0
1134654,2021-03-18,Uinta,Wyoming,56041,2101,12.0
1134655,2021-03-18,Washakie,Wyoming,56043,888,26.0


In [23]:
df = df[df['cbg'].astype(str).str[:5].isin(fips_la)]
df

Unnamed: 0,date,county,state,cbg,cases,deaths
5,2020-01-25,Orange,California,06059,1,0.0
9,2020-01-26,Los Angeles,California,06037,1,0.0
10,2020-01-26,Orange,California,06059,1,0.0
14,2020-01-27,Los Angeles,California,06037,1,0.0
15,2020-01-27,Orange,California,06059,1,0.0
...,...,...,...,...,...,...
1125131,2021-03-16,Orange,California,06059,264033,4486.0
1128366,2021-03-17,Los Angeles,California,06037,1212389,22580.0
1128377,2021-03-17,Orange,California,06059,264216,4511.0
1131614,2021-03-18,Los Angeles,California,06037,1213242,22664.0


In [24]:
# Save LA data
df.to_csv(target_la)

In [25]:
if len(df.groupby(['date','county'])) == len(df):
    print('Everything is good. Every unique date, county combo has a unique row!')
else:
    print('We have a problem, not every unique date, county combo has a unique row.')

Everything is good. Every unique date, county combo has a unique row!


In [26]:
for col in ['date','cases','deaths']:
    print('\n')
    print(col)
    print('Min: {}\nMax: {}'.format(df[col].min(), df[col].max()))



date
Min: 2020-01-25
Max: 2021-03-18


cases
Min: 1
Max: 1213242


deaths
Min: 0.0
Max: 22664.0


In [27]:
# Print all Null and NaN values
print('We have {} null values.'.format(len(df[df.isna().any(axis=1)])))

We have 0 null values.


In [28]:
pd.read_csv(target_la, dtype={'cbg': object})

Unnamed: 0.1,Unnamed: 0,date,county,state,cbg,cases,deaths
0,5,2020-01-25,Orange,California,06059,1,0.0
1,9,2020-01-26,Los Angeles,California,06037,1,0.0
2,10,2020-01-26,Orange,California,06059,1,0.0
3,14,2020-01-27,Los Angeles,California,06037,1,0.0
4,15,2020-01-27,Orange,California,06059,1,0.0
...,...,...,...,...,...,...,...
832,1125131,2021-03-16,Orange,California,06059,264033,4486.0
833,1128366,2021-03-17,Los Angeles,California,06037,1212389,22580.0
834,1128377,2021-03-17,Orange,California,06059,264216,4511.0
835,1131614,2021-03-18,Los Angeles,California,06037,1213242,22664.0


## Extract and Clean Chicago

In [29]:
df = pd.read_csv(source, dtype={'fips': object})
df

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0.0
3,2020-01-24,Cook,Illinois,17031,1,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0.0
...,...,...,...,...,...,...
1134652,2021-03-18,Sweetwater,Wyoming,56037,3912,36.0
1134653,2021-03-18,Teton,Wyoming,56039,3495,9.0
1134654,2021-03-18,Uinta,Wyoming,56041,2101,12.0
1134655,2021-03-18,Washakie,Wyoming,56043,888,26.0


In [30]:
df = df.rename(columns={'fips':'cbg'})
df

Unnamed: 0,date,county,state,cbg,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0.0
3,2020-01-24,Cook,Illinois,17031,1,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0.0
...,...,...,...,...,...,...
1134652,2021-03-18,Sweetwater,Wyoming,56037,3912,36.0
1134653,2021-03-18,Teton,Wyoming,56039,3495,9.0
1134654,2021-03-18,Uinta,Wyoming,56041,2101,12.0
1134655,2021-03-18,Washakie,Wyoming,56043,888,26.0


In [31]:
df = df[df['cbg'].astype(str).str[:5].isin(fips_ch)]
df

Unnamed: 0,date,county,state,cbg,cases,deaths
3,2020-01-24,Cook,Illinois,17031,1,0.0
6,2020-01-25,Cook,Illinois,17031,1,0.0
11,2020-01-26,Cook,Illinois,17031,1,0.0
16,2020-01-27,Cook,Illinois,17031,1,0.0
21,2020-01-28,Cook,Illinois,17031,1,0.0
...,...,...,...,...,...,...
1132149,2021-03-18,Jasper,Indiana,18073,3252,48.0
1132158,2021-03-18,Lake,Indiana,18089,49345,943.0
1132168,2021-03-18,Newton,Indiana,18111,984,35.0
1132176,2021-03-18,Porter,Indiana,18127,16330,296.0


In [32]:
# Save Chicago Data
df.to_csv(target_ch)

In [33]:
pd.read_csv(target_ch, dtype={'cbg': object})

Unnamed: 0.1,Unnamed: 0,date,county,state,cbg,cases,deaths
0,3,2020-01-24,Cook,Illinois,17031,1,0.0
1,6,2020-01-25,Cook,Illinois,17031,1,0.0
2,11,2020-01-26,Cook,Illinois,17031,1,0.0
3,16,2020-01-27,Cook,Illinois,17031,1,0.0
4,21,2020-01-28,Cook,Illinois,17031,1,0.0
...,...,...,...,...,...,...,...
5538,1132149,2021-03-18,Jasper,Indiana,18073,3252,48.0
5539,1132158,2021-03-18,Lake,Indiana,18089,49345,943.0
5540,1132168,2021-03-18,Newton,Indiana,18111,984,35.0
5541,1132176,2021-03-18,Porter,Indiana,18127,16330,296.0


In [34]:
if len(df.groupby(['date','cbg'])) == len(df):
    print('Everything is good. Every unique date, county combo has a unique row!')
else:
    print('We have a problem, not every unique date, county combo has a unique row.')

Everything is good. Every unique date, county combo has a unique row!


In [35]:
for col in ['date','cases','deaths']:
    print('\n')
    print(col)
    print('Min: {}\nMax: {}'.format(df[col].min(), df[col].max()))



date
Min: 2020-01-24
Max: 2021-03-18


cases
Min: 1
Max: 485771


deaths
Min: 0.0
Max: 10099.0


In [36]:
# Print all Null and NaN values
print('We have {} null values.'.format(len(df[df.isna().any(axis=1)])))

We have 0 null values.
