In [1]:
import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [3]:
# grab filepaths for every csv file
datapath = 'data/csse_covid_19_daily_reports_us/*.csv'
datafiles = np.array(glob.glob(datapath))
print(datafiles[138:142])

['data/csse_covid_19_daily_reports_us\\08-28-2020.csv'
 'data/csse_covid_19_daily_reports_us\\08-29-2020.csv'
 'data/csse_covid_19_daily_reports_us\\08-30-2020.csv'
 'data/csse_covid_19_daily_reports_us\\08-31-2020.csv']


In [4]:
# Build the master frame: every daily report stacked row-wise, restricted to
# the 50 states, with identifier columns removed and timestamps parsed.

# Entries in Province_State that are not one of the 50 states (territories,
# D.C., cruise ships, and the aggregate 'Recovered' row).
non_state_rows = ['Diamond Princess', 'District of Columbia', 'Grand Princess', 'Guam', 'Puerto Rico', 'American Samoa', 'Northern Mariana Islands', 'Recovered', 'Virgin Islands']
# Columns removed from the master frame.
dropped_columns = ['Country_Region', 'FIPS', 'UID', 'ISO3']

# Fail with a clear message instead of an opaque error when the glob matched nothing.
if len(datafiles) == 0:
    raise FileNotFoundError('no daily report CSV files were found; check the data path')

# Read every file first and concatenate once. DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop copies all rows on each pass.
daily_frames = [pd.read_csv(path) for path in datafiles]

data = (
    pd.concat(daily_frames, ignore_index=True)
    # only keep 50 states
    .loc[lambda frame: ~frame['Province_State'].isin(non_state_rows)]
    .drop(columns=dropped_columns)
    .astype({'Last_Update': 'datetime64[ns]'})
    # Renumber rows 0..n-1 after the filter so label slices are contiguous.
    .reset_index(drop=True)
)
data.loc[48:52]

Unnamed: 0,Province_State,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
48,Wisconsin,2020-04-12 23:18:15,44.2685,-89.6165,3341,144,,3197.0,64.565739,39257.0,974.0,4.310087,758.652263,29.152948
49,Wyoming,2020-04-12 23:18:15,42.756,-107.3025,270,0,137.0,270.0,54.299735,5459.0,40.0,0.0,1097.860188,14.814815
50,Alabama,2020-04-13 23:07:54,32.3182,-86.9023,3734,99,,3635.0,79.634933,29182.0,457.0,2.651312,622.363852,12.238886
51,Alaska,2020-04-13 23:07:54,61.3707,-152.4044,277,8,85.0,269.0,46.340521,7830.0,32.0,2.888087,1309.914362,11.552347
52,Arizona,2020-04-13 23:07:54,33.7298,-111.4312,3705,122,,3583.0,50.901828,43347.0,525.0,3.292848,595.530778,14.17004


In [5]:
# One-hot encode the state name and append the indicator columns on the right.
# Province_State itself stays in the frame.
state_indicators = pd.get_dummies(data['Province_State'])
data = pd.concat([data, state_indicators], axis=1)

# Sorted, de-duplicated state labels.
statenames = np.unique(data['Province_State'])

# Snapshot the column labels and the two non-numeric columns for later cells.
names = data.columns.tolist()
states = data['Province_State']
timestamps = data['Last_Update']

data.head()

Unnamed: 0,Province_State,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,Alabama,2020-04-12 23:18:15,32.3182,-86.9023,3563,93,,3470.0,75.98802,21583.0,...,0,0,0,0,0,0,0,0,0,0
1,Alaska,2020-04-12 23:18:15,61.3707,-152.4044,272,8,66.0,264.0,45.504049,8038.0,...,0,0,0,0,0,0,0,0,0,0
2,Arizona,2020-04-12 23:18:15,33.7298,-111.4312,3542,115,,3427.0,48.662422,42109.0,...,0,0,0,0,0,0,0,0,0,0
3,Arkansas,2020-04-12 23:18:15,34.9697,-92.3731,1280,27,367.0,1253.0,49.439423,19722.0,...,0,0,0,0,0,0,0,0,0,0
4,California,2020-04-12 23:18:15,36.1162,-119.6816,22795,640,,22155.0,58.137726,190328.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Dimensions (rows, columns) of the frame at this point in the notebook.
data.shape

(9650, 64)

In [7]:
# Fill missing numeric values with KNN imputation: each gap is replaced by the
# unweighted mean of that column over the 10 nearest rows.
imputer = KNNImputer(n_neighbors=10, weights="uniform")

# The imputer only accepts numeric input, so set the label/timestamp columns
# aside and re-attach them afterwards.
id_columns = ['Province_State', 'Last_Update']
features = data.drop(columns=id_columns)

# Take column labels and the row index from the frame actually passed to the
# imputer, rather than from a positional slice of a column list captured in an
# earlier cell. This keeps labels and row alignment correct even if the column
# order or index changes upstream.
imputed = pd.DataFrame(
    imputer.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

data = pd.concat([data[id_columns], imputed], axis=1)
data.head()

Unnamed: 0,Province_State,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,Alabama,2020-04-12 23:18:15,32.3182,-86.9023,3563.0,93.0,780.3,3470.0,75.98802,21583.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alaska,2020-04-12 23:18:15,61.3707,-152.4044,272.0,8.0,66.0,264.0,45.504049,8038.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arizona,2020-04-12 23:18:15,33.7298,-111.4312,3542.0,115.0,1378.2,3427.0,48.662422,42109.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arkansas,2020-04-12 23:18:15,34.9697,-92.3731,1280.0,27.0,367.0,1253.0,49.439423,19722.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,California,2020-04-12 23:18:15,36.1162,-119.6816,22795.0,640.0,4283.8,22155.0,58.137726,190328.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
