In [1]:
import os
import re
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
# Root of the COVID-19 data checkout, given relative to the notebook's working directory.
# The path is assembled from components so it resolves on any OS, not only where "\\" is the separator.
covid_repo_path = os.path.join("..", "..", "COVID-19")
# Folder that holds the per-day report files read by the loading cell.
db_source = os.path.join(covid_repo_path, "csse_covid_19_data", "csse_covid_19_daily_reports")
# Counts every directory entry, so non-CSV files are included in this number.
print(f"list of files: {len(os.listdir(db_source))}")

list of files: 51


In [3]:
# Read every "<MM-DD-YYYY>.csv" daily report in db_source and stack them into one frame.
# Each row is tagged with the day taken from its file name, as text ('date_str') and as a
# timestamp ('date'). Frames are collected in a list and concatenated once: growing a
# DataFrame inside the loop copies all accumulated rows on every iteration.
daily_frames = []
# sorted() makes the row order independent of the order the OS lists the directory in.
for file in tqdm(sorted(os.listdir(db_source))):
    crt_date, crt_ext = os.path.splitext(file)
    if crt_ext != ".csv":
        # Not a daily report (the directory also contains non-CSV entries).
        continue
    try:
        # Validate the name before reading so a stray CSV cannot end up with a bogus date.
        datetime.strptime(crt_date, "%m-%d-%Y")
    except ValueError:
        print(f"skipping {file}: file name is not a MM-DD-YYYY date")
        continue
    try:
        crt_date_df = pd.read_csv(os.path.join(db_source, file))
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Stay best-effort, but report the file instead of dropping it silently.
        print(f"skipping {file}: {err}")
        continue
    crt_date_df['date_str'] = crt_date
    crt_date_df['date'] = pd.to_datetime(crt_date_df['date_str'], format="%m-%d-%Y")
    daily_frames.append(crt_date_df)

# sort=True keeps the alphabetical column order and states it explicitly. The per-file row
# index is kept as-is (no ignore_index), so index labels repeat across days.
data_df = pd.concat(daily_frames, sort=True) if daily_frames else pd.DataFrame()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
100%|██████████| 51/51 [00:00<00:00, 165.87it/s]


In [4]:
# Headline figures for the combined table: size, covered days, distinct places, and totals.
n_rows, n_cols = data_df.shape
day_labels = data_df['date_str']
print(f"Data: rows: {n_rows}, cols: {n_cols}")
# min/max compare the day labels as text, not as dates.
print(f"Days: {day_labels.nunique()} ({day_labels.min()} : {day_labels.max()})")
print(f"Country/Region: {data_df['Country/Region'].nunique()}")
print(f"Province/State: {data_df['Province/State'].nunique()}")

# Each total is the per-province maximum of the column, summed over provinces.
# NOTE(review): groupby leaves out rows whose 'Province/State' is missing, so places
# reported without a province do not contribute to these totals - confirm this is intended.
province_key = ['Province/State']
confirmed_peak = data_df.groupby(province_key)['Confirmed'].max()
print(f"Confirmed all: {sum(confirmed_peak)}")

# Recovered and Deaths are restricted to rows where the value is present before grouping.
recovered_rows = data_df[data_df['Recovered'].notna()]
recovered_peak = recovered_rows.groupby(province_key)['Recovered'].max()
print(f"Recovered all: {sum(recovered_peak)}")

deaths_rows = data_df[data_df['Deaths'].notna()]
deaths_peak = deaths_rows.groupby(province_key)['Deaths'].max()
print(f"Deaths all: {sum(deaths_peak)}")

Data: rows: 4719, cols: 10
Days: 49 (01-22-2020 : 03-10-2020)
Country/Region: 130
Province/State: 243
Confirmed all: 83594.0
Recovered all: 60293.0
Deaths all: 3202.0


In [5]:
# Show the first five rows of the combined table as a quick check of the columns.
data_df.head()

Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date,date_str
0,1.0,Mainland China,,1/22/2020 17:00,,,Anhui,,2020-01-22,01-22-2020
1,14.0,Mainland China,,1/22/2020 17:00,,,Beijing,,2020-01-22,01-22-2020
2,6.0,Mainland China,,1/22/2020 17:00,,,Chongqing,,2020-01-22,01-22-2020
3,1.0,Mainland China,,1/22/2020 17:00,,,Fujian,,2020-01-22,01-22-2020
4,,Mainland China,,1/22/2020 17:00,,,Gansu,,2020-01-22,01-22-2020


In [6]:
# Show the last five rows of the combined table.
data_df.tail()

Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date,date_str
201,0.0,US,0.0,2020-03-10T02:33:04,34.8405,-106.2485,New Mexico,0.0,2020-03-10,03-10-2020
202,0.0,US,0.0,2020-03-10T02:33:04,47.5289,-99.784,North Dakota,0.0,2020-03-10,03-10-2020
203,0.0,US,0.0,2020-03-10T02:33:04,44.2998,-99.4388,South Dakota,0.0,2020-03-10,03-10-2020
204,0.0,US,0.0,2020-03-10T02:33:04,38.4912,-80.9545,West Virginia,0.0,2020-03-10,03-10-2020
205,0.0,US,0.0,2020-03-10T02:33:04,42.756,-107.3025,Wyoming,0.0,2020-03-10,03-10-2020


In [7]:
# Fill missing coordinates with the median of the known coordinates of the same
# 'Province/State'. Rows with no province, and provinces with no known coordinate at all,
# are left as NaN.
# NOTE(review): provinces are matched by name only, without 'Country/Region' - confirm that
# no province name is shared between countries.
for coord in ('Latitude', 'Longitude'):
    # transform() returns one value per row: the median of that row's province, NaNs ignored.
    province_median = data_df.groupby('Province/State')[coord].transform('median')
    # Combine by position on the raw arrays: the frame's index labels repeat across days,
    # so label-based alignment would be ambiguous.
    data_df[coord] = np.where(data_df[coord].isna(), province_median.values, data_df[coord].values)

In [8]:
# Fill the coordinates that are still missing with the median of the known coordinates of
# the same 'Country/Region'. Countries with no known coordinate at all are left as NaN.
for coord in ('Latitude', 'Longitude'):
    # transform() returns one value per row: the median of that row's country, NaNs ignored.
    country_median = data_df.groupby('Country/Region')[coord].transform('median')
    # Combine by position on the raw arrays: the frame's index labels repeat across days,
    # so label-based alignment would be ambiguous.
    data_df[coord] = np.where(data_df[coord].isna(), country_median.values, data_df[coord].values)

In [9]:
# Rows that still have no latitude after both median fills.
data_df[data_df['Latitude'].isna()]

Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date,date_str
48,1.0,Ivory Coast,,1/27/20 23:59,,,,,2020-01-27,01-27-2020
78,1.0,Azerbaijan,0.0,2020-02-28T15:03:26,,,,0.0,2020-02-28,02-28-2020
99,1.0,North Ireland,0.0,2020-02-28T05:43:02,,,,0.0,2020-02-28,02-28-2020


In [10]:
# Rows that still have no longitude after both median fills.
data_df[data_df['Longitude'].isna()]

Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date,date_str
48,1.0,Ivory Coast,,1/27/20 23:59,,,,,2020-01-27,01-27-2020
78,1.0,Azerbaijan,0.0,2020-02-28T15:03:26,,,,0.0,2020-02-28,02-28-2020
99,1.0,North Ireland,0.0,2020-02-28T05:43:02,,,,0.0,2020-02-28,02-28-2020


In [11]:
# Write the combined table to a CSV in the working directory; index=False leaves the
# row index out of the file.
data_df.to_csv("covid-19-all.csv", index=False)