In [1]:
import glob
import gzip
import tqdm.notebook as tq
import json
import numpy as np

# Reading and combining the raw input files into a single frame `df`.
import os
import pandas as pd

ZIPFILES='*.csv'
# The final cell of this notebook writes "covid_data_clean.csv" into the same
# directory that ZIPFILES globs, so on a second run the previous output would
# be picked up as if it were raw input. Exclude it explicitly.
OUTPUT_CSV = "covid_data_clean.csv"
filelist = sorted(
    path for path in glob.glob(ZIPFILES)
    if os.path.basename(path) != OUTPUT_CSV
)
if not filelist:
    # Fail here with a clear message instead of a KeyError on 'fips' below.
    raise FileNotFoundError(f"no input files match {ZIPFILES!r}")

# Read every file first and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
frames = [pd.read_csv(gzfile) for gzfile in tq.tqdm(filelist)]
df = pd.concat(frames, ignore_index=True)

# Cleaning: keep only rows that carry a county FIPS code, and treat missing
# case/death counts as 0. `.copy()` makes the filtered frame independent so
# the column assignments below do not write into a slice of the original.
df = df[df['fips'].notna()].copy()
df['cases'] = df['cases'].fillna(0)
df['deaths'] = df['deaths'].fillna(0)

  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
## FIPS: drop the fractional part, render as text, then left-pad with zeros to
## five characters so 4-digit codes regain their leading 0 ("1234" -> "01234").
fips_text = df["fips"].astype("int64").astype(str)
df["fips"] = fips_text.str.rjust(5, fillchar="0")

## Dates: parse the ISO "YYYY-MM-DD" strings and order the rows chronologically.
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df = df.sort_values(by="date")

## Counts: store cases and deaths as whole numbers.
for count_col in ("cases", "deaths"):
    df[count_col] = df[count_col].astype("int64")

In [3]:
# Drop any rows whose counts are missing (a count cannot be NA).
df = df[df['deaths'].notna()]
df = df[df['cases'].notna()]

# Distinct county codes present in the data (kept as a notebook-level name).
list_of_fips = list(np.unique(np.array(df.fips)))

# Daily COVID incidence (new cases / new deaths per day).
# 'cases' and 'deaths' in `df` are differenced here per county: each day's
# value minus the previous day's value for the same FIPS code. The first
# observed day of a county has no predecessor, so it keeps its own count.
#
# The increments are computed with a grouped diff so that every value stays
# on the row (date) it belongs to: the first day's count sits on the first
# date and each later difference sits on the later of the two dates compared.
df_new = df.sort_values(by=['fips', 'date']).reset_index(drop=True)
for count_col in ('cases', 'deaths'):
    cumulative = df_new[count_col].astype('int64')
    increments = cumulative.groupby(df_new['fips']).diff()
    # diff() leaves NaN on each county's first row; fill it with that row's
    # own count, then go back to integers.
    df_new[count_col] = increments.fillna(cumulative).astype('int64')

# Reduce each date to its calendar month, ready for monthly aggregation.
df_new['date'] = df_new['date'].dt.to_period('M')


  0%|          | 0/3220 [00:00<?, ?it/s]

In [5]:
# Monthly COVID incidence per county: sum the daily new cases and new deaths
# within each (month, county, state, fips) group.
group_keys = ['date', 'county', 'state', 'fips']
aggregated = df_new.groupby(group_keys, as_index=False).agg(
    monthly_cases=('cases', 'sum'),
    monthly_deaths=('deaths', 'sum'),
)
aggregated = aggregated.reset_index(drop=True)

# A monthly total below zero is replaced by 0.
for total_col in ('monthly_cases', 'monthly_deaths'):
    aggregated[total_col] = aggregated[total_col].clip(lower=0)

In [6]:
# Write the monthly per-county table to disk, without the row index column.
aggregated.to_csv(path_or_buf="covid_data_clean.csv", index=False)