In [1]:
import os
import re
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
covid_repo_path = "..\\..\\COVID-19"
db_source = os.path.join(covid_repo_path, "csse_covid_19_data\\csse_covid_19_daily_reports")
print(f"list of files: {len(os.listdir(db_source))}")

list of files: 55


In [3]:
data_df = pd.DataFrame()
for file in tqdm(os.listdir(db_source)):
    try:
        crt_date, crt_ext = crt_file = file.split(".")
        if(crt_ext == "csv"):
            crt_date_df = pd.read_csv(os.path.join(db_source, file))
            crt_date_df['date_str'] = crt_date
            crt_date_df['Date'] = crt_date_df['date_str'].apply(lambda x: datetime.strptime(x, "%m-%d-%Y"))
            data_df = data_df.append(crt_date_df)
    except:
        pass

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
100%|██████████| 55/55 [00:00<00:00, 61.20it/s]


In [4]:
print(f"Data: rows: {data_df.shape[0]}, cols: {data_df.shape[1]}")
print(f"Days: {data_df.date_str.nunique()} ({data_df.date_str.min()} : {data_df.date_str.max()})")
print(f"Country/Region: {data_df['Country/Region'].nunique()}")
print(f"Province/State: {data_df['Province/State'].nunique()}")
print(f"Confirmed all: {sum(data_df.groupby(['Province/State'])['Confirmed'].max())}")
print(f"Recovered all: {sum(data_df.loc[~data_df.Recovered.isna()].groupby(['Province/State'])['Recovered'].max())}")
print(f"Deaths all: {sum(data_df.loc[~data_df.Deaths.isna()].groupby(['Province/State'])['Deaths'].max())}")

Data: rows: 5632, cols: 10
Days: 53 (01-22-2020 : 03-14-2020)
Country/Region: 171
Province/State: 261
Confirmed all: 93277.0
Recovered all: 66153.0
Deaths all: 3410.0


In [5]:
data_df.head()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
0,1.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Anhui,,01-22-2020
1,14.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Beijing,,01-22-2020
2,6.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Chongqing,,01-22-2020
3,1.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Fujian,,01-22-2020
4,,Mainland China,2020-01-22,,1/22/2020 17:00,,,Gansu,,01-22-2020


In [6]:
data_df.tail()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
244,1.0,US,2020-03-14,0.0,2020-03-14T16:15:18,18.3358,-64.8963,"Virgin Islands, U.S.",0.0,03-14-2020
245,1.0,United Kingdom,2020-03-14,0.0,2020-03-14T16:33:03,36.1408,-5.3536,Gibraltar,1.0,03-14-2020
246,0.0,Australia,2020-03-14,0.0,2020-03-14T02:33:04,35.4437,139.638,From Diamond Princess,0.0,03-14-2020
247,0.0,US,2020-03-14,0.0,2020-03-10T02:33:04,38.4912,-80.9545,West Virginia,0.0,03-14-2020
248,0.0,occupied Palestinian territory,2020-03-14,0.0,2020-03-11T20:53:02,31.9522,35.2332,,0.0,03-14-2020


In [7]:
province_state = data_df['Province/State'].unique()

for ps in province_state:

    data_df.loc[(data_df['Province/State']==ps) & (data_df['Latitude'].isna()), 'Latitude'] =\
                data_df.loc[(~data_df['Latitude'].isna()) & \
                            (data_df['Province/State']==ps), 'Latitude'].median()
    
    data_df.loc[(data_df['Province/State']==ps) & (data_df['Longitude'].isna()), 'Longitude'] =\
            data_df.loc[(~data_df['Longitude'].isna()) & \
                        (data_df['Province/State']==ps), 'Longitude'].median() 

In [8]:
country_region = data_df['Country/Region'].unique()

for cr in country_region:

    data_df.loc[(data_df['Country/Region']==cr) & (data_df['Latitude'].isna()), 'Latitude'] =\
                data_df.loc[(~data_df['Latitude'].isna()) & \
                            (data_df['Country/Region']==cr), 'Latitude'].median()
    
    data_df.loc[(data_df['Country/Region']==cr) & (data_df['Longitude'].isna()), 'Longitude'] =\
            data_df.loc[(~data_df['Longitude'].isna()) & \
                        (data_df['Country/Region']==cr), 'Longitude'].median() 

In [9]:
data_df.loc[data_df.Latitude.isna()]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
48,1.0,Ivory Coast,2020-01-27,,1/27/20 23:59,,,,,01-27-2020
78,1.0,Azerbaijan,2020-02-28,0.0,2020-02-28T15:03:26,,,,0.0,02-28-2020
99,1.0,North Ireland,2020-02-28,0.0,2020-02-28T05:43:02,,,,0.0,02-28-2020


In [10]:
data_df.loc[data_df.Longitude.isna()]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
48,1.0,Ivory Coast,2020-01-27,,1/27/20 23:59,,,,,01-27-2020
78,1.0,Azerbaijan,2020-02-28,0.0,2020-02-28T15:03:26,,,,0.0,02-28-2020
99,1.0,North Ireland,2020-02-28,0.0,2020-02-28T05:43:02,,,,0.0,02-28-2020


## Check countries duplicates

In [11]:
data_df.loc[data_df['Country/Region'].isin(['Iran', 'Iran (Islamic Republic of)'])]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
53,2.0,Iran,2020-02-19,2.0,2020-02-19T23:43:02,32.0,53.0,,0.0,02-19-2020
46,5.0,Iran,2020-02-20,2.0,2020-02-20T17:33:02,32.0,53.0,,0.0,02-20-2020
38,18.0,Iran,2020-02-21,4.0,2020-02-21T18:53:02,32.0,53.0,,0.0,02-21-2020
36,28.0,Iran,2020-02-22,5.0,2020-02-22T10:03:05,32.0,53.0,,0.0,02-22-2020
35,43.0,Iran,2020-02-23,8.0,2020-02-23T15:13:15,32.0,53.0,,0.0,02-23-2020
35,61.0,Iran,2020-02-24,12.0,2020-02-24T11:13:10,32.0,53.0,,0.0,02-24-2020
28,95.0,Iran,2020-02-25,16.0,2020-02-25T14:53:03,32.0,53.0,,0.0,02-25-2020
25,139.0,Iran,2020-02-26,19.0,2020-02-26T23:43:03,32.0,53.0,,49.0,02-26-2020
20,245.0,Iran,2020-02-27,26.0,2020-02-27T12:03:04,32.0,53.0,,49.0,02-27-2020
16,388.0,Iran,2020-02-28,34.0,2020-02-28T15:43:03,32.0,53.0,,73.0,02-28-2020


Unify Iran

In [12]:
data_df.loc[data_df['Country/Region']=='Iran (Islamic Republic of)', 'Country/Region'] = 'Iran'

In [13]:
data_df.loc[data_df['Country/Region'].isin(['Iran (Islamic Republic of)'])]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str


### Unify Check Republic, Macau, South Korea, Hong Kong variations

In [14]:
data_df.loc[data_df['Country/Region']=='Czechia', 'Country/Region'] = 'Czech Republic'
data_df.loc[data_df['Country/Region']=='Hong Kong SAR', 'Country/Region'] = 'Hong Kong'
data_df.loc[data_df['Country/Region']=='Macao SAR', 'Country/Region'] = 'Macau'
data_df.loc[data_df['Country/Region']=='Korea, South', 'Country/Region'] = 'South Korea'
data_df.loc[data_df['Country/Region']=='Republic of Korea', 'Country/Region'] = 'South Korea'
data_df.loc[data_df['Country/Region']=='Holy See', 'Country/Region'] = 'Vatican City'
data_df.loc[data_df['Country/Region']=='Taiwan*', 'Country/Region'] = 'Taiwan'
data_df.loc[data_df['Country/Region']==' Azerbaijan', 'Country/Region'] = 'Azerbaijan'

## Fix Lat/Long where missing

In [15]:
data_df.loc[data_df['Country/Region']=='Azerbaijan']

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
78,1.0,Azerbaijan,2020-02-28,0.0,2020-02-28T15:03:26,,,,0.0,02-28-2020
73,3.0,Azerbaijan,2020-03-01,0.0,2020-03-01T02:43:03,40.1431,47.5769,,0.0,03-01-2020
81,3.0,Azerbaijan,2020-03-02,0.0,2020-03-01T02:43:03,40.1431,47.5769,,0.0,03-02-2020
83,3.0,Azerbaijan,2020-03-03,0.0,2020-03-01T02:43:03,40.1431,47.5769,,0.0,03-03-2020
91,3.0,Azerbaijan,2020-03-04,0.0,2020-03-01T02:43:03,40.1431,47.5769,,0.0,03-04-2020
82,6.0,Azerbaijan,2020-03-05,0.0,2020-03-05T13:53:03,40.1431,47.5769,,0.0,03-05-2020
92,6.0,Azerbaijan,2020-03-06,0.0,2020-03-05T13:53:03,40.1431,47.5769,,0.0,03-06-2020
89,9.0,Azerbaijan,2020-03-07,0.0,2020-03-07T02:13:09,40.1431,47.5769,,0.0,03-07-2020
99,9.0,Azerbaijan,2020-03-08,0.0,2020-03-07T02:13:09,40.1431,47.5769,,0.0,03-08-2020
100,9.0,Azerbaijan,2020-03-09,0.0,2020-03-07T02:13:09,40.1431,47.5769,,0.0,03-09-2020


In [16]:
data_df.loc[data_df['Country/Region']=='Ivory Coast', 'Longitude'] = 5.54
data_df.loc[data_df['Country/Region']=='Ivory Coast', 'Latitude'] = 7.54
data_df.loc[data_df['Country/Region']=='North Ireland', 'Longitude'] = 6.4923
data_df.loc[data_df['Country/Region']=='North Ireland', 'Latitude'] = 54.7877

In [17]:
data_df = data_df[['Country/Region', 'Province/State', 'Latitude', 'Longitude', 'Confirmed', 'Recovered', 'Deaths', 'Date']]

In [18]:
data_df.head()

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
0,Mainland China,Anhui,31.8257,117.2264,1.0,,,2020-01-22
1,Mainland China,Beijing,40.1824,116.4142,14.0,,,2020-01-22
2,Mainland China,Chongqing,30.0572,107.874,6.0,,,2020-01-22
3,Mainland China,Fujian,26.0789,117.9874,1.0,,,2020-01-22
4,Mainland China,Gansu,36.0611,103.8343,,,,2020-01-22


## Check European Countries on 2020-03-12

Italy, Germany, France, Spain & other European Countries did not reported the data last days. Same data appears as for the previous day. We will fix using https://github.com/CSSEGISandData/COVID-19/issues/599 information, as following:

* Italy: 15113 confirmed cases, 1016 death and 1258 recovered (http://www.salute.gov.it/imgs/C_17_pagineAree_5351_8_file.pdf)  

* Spain: 2950 confirmed cases, 84 death
(https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/alertasActual/nCov-China/documentos/Actualizacion_42_COVID-19.pdf)  

* France: 2876 confirmed cases, 61 death (https://www.santepubliquefrance.fr/maladies-et-traumatismes/maladies-et-infections-respiratoires/infection-a-coronavirus/articles/infection-au-nouveau-coronavirus-sars-cov-2-covid-19-france-et-monde)  

* Switzerland: 815 confirmed cases, 4 death (https://www.bag.admin.ch/dam/bag/fr/dokumente/mt/k-und-i/aktuelle-ausbrueche-pandemien/2019-nCoV/covid-19-lagebericht.pdf.download.pdf/COVID-19_Situation_epidemiologique_en_Suisse.pdf)

* Germany : 2.369 confirmed, 5 deaths (https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html)

In [21]:
data_df[(data_df['Country/Region'].isin(['Italy', 'Spain', 'France', 'Switzerland', 'Germany', 'Netherlands'])) & (data_df['Date'].isin(['2020-03-11','2020-03-12', '2020-03-13']))]

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
1,Italy,,43.0,12.0,12462.0,1045.0,827.0,2020-03-11
4,France,France,46.2276,2.2137,2281.0,12.0,48.0,2020-03-11
5,Spain,,40.0,-4.0,2277.0,183.0,54.0,2020-03-11
6,Germany,,51.0,9.0,1908.0,25.0,3.0,2020-03-11
15,Switzerland,,46.8182,8.2275,652.0,4.0,4.0,2020-03-11
21,Netherlands,,52.1326,5.2913,503.0,0.0,5.0,2020-03-11
170,France,St Martin,18.0708,-63.0501,2.0,0.0,0.0,2020-03-11
186,France,Saint Barthelemy,17.9,-62.8333,1.0,0.0,0.0,2020-03-11
1,Italy,,43.0,12.0,12462.0,1045.0,827.0,2020-03-12
4,France,France,46.2276,2.2137,2281.0,12.0,48.0,2020-03-12


In [23]:
def fix_data_for_country_date(country, date, confirmed, recovered, deaths):
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Date']==date), 'Confirmed'] = confirmed
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Date']==date), 'Recovered'] = recovered
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Date']==date), 'Deaths'] = deaths

In [24]:
fix_data_for_country_date('Italy', '2020-03-12', 15113, 1258, 1016)
fix_data_for_country_date('Spain', '2020-03-12', 2950, 183, 84)
fix_data_for_country_date('France', '2020-03-12', 2896, 12, 61)
fix_data_for_country_date('Switzerland', '2020-03-12', 815, 4, 4)
fix_data_for_country_date('Germany', '2020-03-12', 2369, 25, 5)
fix_data_for_country_date('Netherlands', '2020-03-12', 614, 25, 5)

In [25]:
data_df[(data_df['Country/Region'].isin(['Italy', 'Spain', 'France', 'Switzerland', 'Germany'])) & (data_df['Date'].isin(['2020-03-11','2020-03-12']))]

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
1,Italy,,43.0,12.0,12462.0,1045.0,827.0,2020-03-11
4,France,France,46.2276,2.2137,2281.0,12.0,48.0,2020-03-11
5,Spain,,40.0,-4.0,2277.0,183.0,54.0,2020-03-11
6,Germany,,51.0,9.0,1908.0,25.0,3.0,2020-03-11
15,Switzerland,,46.8182,8.2275,652.0,4.0,4.0,2020-03-11
170,France,St Martin,18.0708,-63.0501,2.0,0.0,0.0,2020-03-11
186,France,Saint Barthelemy,17.9,-62.8333,1.0,0.0,0.0,2020-03-11
1,Italy,,43.0,12.0,15113.0,1258.0,1016.0,2020-03-12
4,France,France,46.2276,2.2137,2896.0,12.0,61.0,2020-03-12
5,Spain,,40.0,-4.0,2950.0,183.0,84.0,2020-03-12


In [26]:
data_df.to_csv("covid-19-all.csv", index=False)