In [35]:
import os
import re
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd

In [36]:
# Location of the local COVID-19 repository checkout, relative to this notebook.
# The paths are assembled from components so they work on any OS, not only
# with Windows "\\" separators.
covid_repo_path = os.path.join("..", "..", "COVID-19")
# Folder containing the daily report files that the next cells load.
db_source = os.path.join(covid_repo_path, "csse_covid_19_data", "csse_covid_19_daily_reports")
print(f"list of files: {len(os.listdir(db_source))}")

list of files: 52


In [37]:
# Load every daily report (a CSV whose file name is a MM-DD-YYYY date) and
# stack the reports into a single frame, tagging each row with its report date.
daily_frames = []
for file in tqdm(os.listdir(db_source)):
    crt_date, crt_ext = os.path.splitext(file)
    if crt_ext != ".csv":
        # Not a report (e.g. README or hidden files).
        continue
    try:
        report_date = datetime.strptime(crt_date, "%m-%d-%Y")
    except ValueError:
        # A CSV whose name is not a date is not a daily report; skip it.
        continue
    try:
        crt_date_df = pd.read_csv(os.path.join(db_source, file))
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Keep the load best-effort, but say which file was dropped and why
        # instead of silently swallowing every exception.
        print(f"skipping {file}: {err}")
        continue
    crt_date_df['date_str'] = crt_date
    crt_date_df['Date'] = report_date
    daily_frames.append(crt_date_df)

# Concatenate once: DataFrame.append was removed in pandas 2.0 and re-copies
# the whole frame on every iteration. Each report keeps its own row index,
# and sort=True yields the alphabetical column order seen in the recorded
# outputs when the reports do not all share the same columns.
data_df = pd.concat(daily_frames, sort=True) if daily_frames else pd.DataFrame()

100%|██████████| 52/52 [00:00<00:00, 72.85it/s]


In [38]:
# Headline figures for the combined frame.
n_rows, n_cols = data_df.shape
day_labels = data_df.date_str
print(f"Data: rows: {n_rows}, cols: {n_cols}")
print(f"Days: {day_labels.nunique()} ({day_labels.min()} : {day_labels.max()})")
print(f"Country/Region: {data_df['Country/Region'].nunique()}")
print(f"Province/State: {data_df['Province/State'].nunique()}")

# Totals are the sum, over provinces, of the highest value each province ever reported.
# NOTE(review): groupby drops rows whose Province/State is NaN, so rows that
# carry no province do not contribute to these totals.
by_province = ['Province/State']
confirmed_peak = data_df.groupby(by_province)['Confirmed'].max()
recovered_peak = data_df.loc[data_df.Recovered.notna()].groupby(by_province)['Recovered'].max()
deaths_peak = data_df.loc[data_df.Deaths.notna()].groupby(by_province)['Deaths'].max()
print(f"Confirmed all: {sum(confirmed_peak)}")
print(f"Recovered all: {sum(recovered_peak)}")
print(f"Deaths all: {sum(deaths_peak)}")

Data: rows: 4935, cols: 10
Days: 50 (01-22-2020 : 03-11-2020)
Country/Region: 143
Province/State: 252
Confirmed all: 87837.0
Recovered all: 62113.0
Deaths all: 3295.0


In [39]:
# Show the first rows of the combined frame.
data_df.head()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
0,1.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Anhui,,01-22-2020
1,14.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Beijing,,01-22-2020
2,6.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Chongqing,,01-22-2020
3,1.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Fujian,,01-22-2020
4,,Mainland China,2020-01-22,,1/22/2020 17:00,,,Gansu,,01-22-2020


In [40]:
# Show the last rows of the combined frame.
data_df.tail()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
211,0.0,US,2020-03-11,0.0,2020-03-10T02:33:04,32.7416,-89.6787,Mississippi,0.0,03-11-2020
212,0.0,US,2020-03-11,0.0,2020-03-10T02:33:04,47.5289,-99.784,North Dakota,0.0,03-11-2020
213,0.0,US,2020-03-11,0.0,2020-03-10T02:33:04,38.4912,-80.9545,West Virginia,0.0,03-11-2020
214,0.0,US,2020-03-11,0.0,2020-03-10T02:33:04,42.756,-107.3025,Wyoming,0.0,03-11-2020
215,0.0,occupied Palestinian territory,2020-03-11,0.0,2020-03-11T20:53:02,31.9522,35.2332,,0.0,03-11-2020


In [41]:
# Fill missing coordinates with the median of the known coordinates of the
# same Province/State. One grouped transform per coordinate replaces a Python
# loop that re-scanned the whole frame several times for every province.
for coord in ('Latitude', 'Longitude'):
    # Median of the non-missing values per province, broadcast back to every
    # row; rows without a province (or provinces with no known value) get NaN.
    province_median = data_df.groupby('Province/State')[coord].transform('median')
    missing = data_df[coord].isna().to_numpy()
    # Combine positionally (plain arrays) because the frame's row index
    # contains repeated labels, one run per daily report.
    data_df[coord] = np.where(missing, province_median.to_numpy(), data_df[coord].to_numpy())

In [42]:
# Fill the coordinates that are still missing with the median of the known
# coordinates of the same Country/Region. One grouped transform per coordinate
# replaces a Python loop that re-scanned the whole frame for every country.
for coord in ('Latitude', 'Longitude'):
    # Median of the non-missing values per country, broadcast back to every
    # row; countries with no known value at all yield NaN and stay unfilled.
    country_median = data_df.groupby('Country/Region')[coord].transform('median')
    missing = data_df[coord].isna().to_numpy()
    # Combine positionally (plain arrays) because the frame's row index
    # contains repeated labels, one run per daily report.
    data_df[coord] = np.where(missing, country_median.to_numpy(), data_df[coord].to_numpy())

In [43]:
# List the rows whose Latitude is still missing.
data_df.loc[data_df.Latitude.isna()]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
48,1.0,Ivory Coast,2020-01-27,,1/27/20 23:59,,,,,01-27-2020
78,1.0,Azerbaijan,2020-02-28,0.0,2020-02-28T15:03:26,,,,0.0,02-28-2020
99,1.0,North Ireland,2020-02-28,0.0,2020-02-28T05:43:02,,,,0.0,02-28-2020


In [44]:
# List the rows whose Longitude is still missing.
data_df.loc[data_df.Longitude.isna()]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
48,1.0,Ivory Coast,2020-01-27,,1/27/20 23:59,,,,,01-27-2020
78,1.0,Azerbaijan,2020-02-28,0.0,2020-02-28T15:03:26,,,,0.0,02-28-2020
99,1.0,North Ireland,2020-02-28,0.0,2020-02-28T05:43:02,,,,0.0,02-28-2020


## Check for duplicate country names

In [45]:
# Show the rows filed under either spelling of Iran.
data_df.loc[data_df['Country/Region'].isin(['Iran', 'Iran (Islamic Republic of)'])]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
53,2.0,Iran,2020-02-19,2.0,2020-02-19T23:43:02,32.0,53.0,,0.0,02-19-2020
46,5.0,Iran,2020-02-20,2.0,2020-02-20T17:33:02,32.0,53.0,,0.0,02-20-2020
38,18.0,Iran,2020-02-21,4.0,2020-02-21T18:53:02,32.0,53.0,,0.0,02-21-2020
36,28.0,Iran,2020-02-22,5.0,2020-02-22T10:03:05,32.0,53.0,,0.0,02-22-2020
35,43.0,Iran,2020-02-23,8.0,2020-02-23T15:13:15,32.0,53.0,,0.0,02-23-2020
35,61.0,Iran,2020-02-24,12.0,2020-02-24T11:13:10,32.0,53.0,,0.0,02-24-2020
28,95.0,Iran,2020-02-25,16.0,2020-02-25T14:53:03,32.0,53.0,,0.0,02-25-2020
25,139.0,Iran,2020-02-26,19.0,2020-02-26T23:43:03,32.0,53.0,,49.0,02-26-2020
20,245.0,Iran,2020-02-27,26.0,2020-02-27T12:03:04,32.0,53.0,,49.0,02-27-2020
16,388.0,Iran,2020-02-28,34.0,2020-02-28T15:43:03,32.0,53.0,,73.0,02-28-2020


Unify the two spellings of Iran under a single name

In [46]:
# Collapse the long-form spelling into the short one so Iran is filed under a single name.
data_df['Country/Region'] = data_df['Country/Region'].replace('Iran (Islamic Republic of)', 'Iran')

In [47]:
# Same filter as before the renaming: show the rows filed under either spelling of Iran.
data_df.loc[data_df['Country/Region'].isin(['Iran', 'Iran (Islamic Republic of)'])]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
53,2.0,Iran,2020-02-19,2.0,2020-02-19T23:43:02,32.0,53.0,,0.0,02-19-2020
46,5.0,Iran,2020-02-20,2.0,2020-02-20T17:33:02,32.0,53.0,,0.0,02-20-2020
38,18.0,Iran,2020-02-21,4.0,2020-02-21T18:53:02,32.0,53.0,,0.0,02-21-2020
36,28.0,Iran,2020-02-22,5.0,2020-02-22T10:03:05,32.0,53.0,,0.0,02-22-2020
35,43.0,Iran,2020-02-23,8.0,2020-02-23T15:13:15,32.0,53.0,,0.0,02-23-2020
35,61.0,Iran,2020-02-24,12.0,2020-02-24T11:13:10,32.0,53.0,,0.0,02-24-2020
28,95.0,Iran,2020-02-25,16.0,2020-02-25T14:53:03,32.0,53.0,,0.0,02-25-2020
25,139.0,Iran,2020-02-26,19.0,2020-02-26T23:43:03,32.0,53.0,,49.0,02-26-2020
20,245.0,Iran,2020-02-27,26.0,2020-02-27T12:03:04,32.0,53.0,,49.0,02-27-2020
16,388.0,Iran,2020-02-28,34.0,2020-02-28T15:43:03,32.0,53.0,,73.0,02-28-2020


## Fix Lat/Long where missing

In [48]:
# Show the rows whose country name is exactly 'Azerbaijan'.
data_df.loc[data_df['Country/Region']=='Azerbaijan']

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
73,3.0,Azerbaijan,2020-03-01,0.0,2020-03-01T02:43:03,40.1431,47.5769,,0.0,03-01-2020
81,3.0,Azerbaijan,2020-03-02,0.0,2020-03-01T02:43:03,40.1431,47.5769,,0.0,03-02-2020
83,3.0,Azerbaijan,2020-03-03,0.0,2020-03-01T02:43:03,40.1431,47.5769,,0.0,03-03-2020
91,3.0,Azerbaijan,2020-03-04,0.0,2020-03-01T02:43:03,40.1431,47.5769,,0.0,03-04-2020
82,6.0,Azerbaijan,2020-03-05,0.0,2020-03-05T13:53:03,40.1431,47.5769,,0.0,03-05-2020
92,6.0,Azerbaijan,2020-03-06,0.0,2020-03-05T13:53:03,40.1431,47.5769,,0.0,03-06-2020
89,9.0,Azerbaijan,2020-03-07,0.0,2020-03-07T02:13:09,40.1431,47.5769,,0.0,03-07-2020
99,9.0,Azerbaijan,2020-03-08,0.0,2020-03-07T02:13:09,40.1431,47.5769,,0.0,03-08-2020
100,9.0,Azerbaijan,2020-03-09,0.0,2020-03-07T02:13:09,40.1431,47.5769,,0.0,03-09-2020
104,11.0,Azerbaijan,2020-03-10,0.0,2020-03-10T16:13:27,40.1431,47.5769,,0.0,03-10-2020


In [49]:
# NOTE(review): one Azerbaijan row (02-28) still has NaN coordinates after the
# country-level fill, while the rows matched by == 'Azerbaijan' all have them.
# That suggests its name does not compare equal, presumably because of stray
# whitespace -- confirm against the raw 02-28-2020 report. Stripping the names
# and re-running the country-level fill is harmless if that guess is wrong.
data_df['Country/Region'] = data_df['Country/Region'].str.strip()
for coord in ('Latitude', 'Longitude'):
    country_median = data_df.groupby('Country/Region')[coord].transform('median')
    missing = data_df[coord].isna().to_numpy()
    # Positional combine: the frame's row index contains repeated labels.
    data_df[coord] = np.where(missing, country_median.to_numpy(), data_df[coord].to_numpy())

# Countries given explicit coordinates as (latitude, longitude).
# Both lie west of the prime meridian, so their longitudes must be negative;
# the positive values used previously placed them east of Greenwich.
manual_coords = {
    'Ivory Coast': (7.54, -5.54),
    'North Ireland': (54.7877, -6.4923),
}
for country, (latitude, longitude) in manual_coords.items():
    is_country = data_df['Country/Region'] == country
    data_df.loc[is_country, 'Latitude'] = latitude
    data_df.loc[is_country, 'Longitude'] = longitude

In [50]:
# Keep only the columns that go into the exported file, in this order.
output_columns = [
    'Country/Region',
    'Province/State',
    'Latitude',
    'Longitude',
    'Confirmed',
    'Recovered',
    'Deaths',
    'Date',
]
data_df = data_df.loc[:, output_columns]

In [51]:
# Show the first rows of the frame after the column selection.
data_df.head()

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
0,Mainland China,Anhui,31.8257,117.2264,1.0,,,2020-01-22
1,Mainland China,Beijing,40.1824,116.4142,14.0,,,2020-01-22
2,Mainland China,Chongqing,30.0572,107.874,6.0,,,2020-01-22
3,Mainland China,Fujian,26.0789,117.9874,1.0,,,2020-01-22
4,Mainland China,Gansu,36.0611,103.8343,,,,2020-01-22


In [52]:
# Write the frame to covid-19-all.csv in the working directory, without the row index.
data_df.to_csv("covid-19-all.csv", index=False)