In [1]:
import os
import re
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
covid_repo_path = "..\\..\\COVID-19"
db_source = os.path.join(covid_repo_path, "csse_covid_19_data\\csse_covid_19_daily_reports")
print(f"list of files: {len(os.listdir(db_source))}")

list of files: 63


In [3]:
data_df = pd.DataFrame()
for file in tqdm(os.listdir(db_source)):
    try:
        crt_date, crt_ext = crt_file = file.split(".")
        if(crt_ext == "csv"):
            crt_date_df = pd.read_csv(os.path.join(db_source, file))
            crt_date_df['date_str'] = crt_date
            crt_date_df['Date'] = crt_date_df['date_str'].apply(lambda x: datetime.strptime(x, "%m-%d-%Y"))
            data_df = data_df.append(crt_date_df)
    except:
        pass

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
100%|██████████| 63/63 [00:01<00:00, 59.84it/s]


In [4]:
print(f"Data: rows: {data_df.shape[0]}, cols: {data_df.shape[1]}")
print(f"Days: {data_df.date_str.nunique()} ({data_df.date_str.min()} : {data_df.date_str.max()})")
print(f"Country/Region: {data_df['Country/Region'].nunique()}")
print(f"Province/State: {data_df['Province/State'].nunique()}")
print(f"Confirmed all  (Province/State): {sum(data_df.groupby(['Province/State'])['Confirmed'].max())}")
print(f"Confirmed all (Country/Region): {sum(data_df.groupby(['Country/Region'])['Confirmed'].max())}")
print(f"Recovered all (Province/State): {sum(data_df.loc[~data_df.Recovered.isna()].groupby(['Province/State'])['Recovered'].max())}")
print(f"Recovered all (Country/Region): {sum(data_df.loc[~data_df.Recovered.isna()].groupby(['Country/Region'])['Recovered'].max())}")      
print(f"Deaths all (Province/State): {sum(data_df.loc[~data_df.Deaths.isna()].groupby(['Province/State'])['Deaths'].max())}")
print(f"Deaths all (Country/Region): {sum(data_df.loc[~data_df.Deaths.isna()].groupby(['Country/Region'])['Deaths'].max())}")

Data: rows: 7926, cols: 10
Days: 61 (01-22-2020 : 03-22-2020)
Country/Region: 215
Province/State: 283
Confirmed all  (Province/State): 147668.0
Confirmed all (Country/Region): 395258.0
Recovered all (Province/State): 75343.0
Recovered all (Country/Region): 136060.0
Deaths all (Province/State): 4906.0
Deaths all (Country/Region): 17649.0


In [5]:
data_df.head()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
0,1.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Anhui,,01-22-2020
1,14.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Beijing,,01-22-2020
2,6.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Chongqing,,01-22-2020
3,1.0,Mainland China,2020-01-22,,1/22/2020 17:00,,,Fujian,,01-22-2020
4,,Mainland China,2020-01-22,,1/22/2020 17:00,,,Gansu,,01-22-2020


In [6]:
data_df.tail()

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
304,0.0,Jersey,2020-03-22,0.0,2020-03-17T18:33:03,49.19,-2.11,,0.0,03-22-2020
305,0.0,Puerto Rico,2020-03-22,1.0,2020-03-22T22:43:02,18.2,-66.5,,0.0,03-22-2020
306,0.0,Republic of the Congo,2020-03-22,0.0,2020-03-17T21:33:03,-1.44,15.556,,0.0,03-22-2020
307,0.0,The Bahamas,2020-03-22,0.0,2020-03-19T12:13:38,24.25,-76.0,,0.0,03-22-2020
308,0.0,The Gambia,2020-03-22,0.0,2020-03-18T14:13:56,13.4667,-16.6,,0.0,03-22-2020


In [7]:
province_state = data_df['Province/State'].unique()

for ps in province_state:

    data_df.loc[(data_df['Province/State']==ps) & (data_df['Latitude'].isna()), 'Latitude'] =\
                data_df.loc[(~data_df['Latitude'].isna()) & \
                            (data_df['Province/State']==ps), 'Latitude'].median()
    
    data_df.loc[(data_df['Province/State']==ps) & (data_df['Longitude'].isna()), 'Longitude'] =\
            data_df.loc[(~data_df['Longitude'].isna()) & \
                        (data_df['Province/State']==ps), 'Longitude'].median() 

In [8]:
country_region = data_df['Country/Region'].unique()

for cr in country_region:

    data_df.loc[(data_df['Country/Region']==cr) & (data_df['Latitude'].isna()), 'Latitude'] =\
                data_df.loc[(~data_df['Latitude'].isna()) & \
                            (data_df['Country/Region']==cr), 'Latitude'].median()
    
    data_df.loc[(data_df['Country/Region']==cr) & (data_df['Longitude'].isna()), 'Longitude'] =\
            data_df.loc[(~data_df['Longitude'].isna()) & \
                        (data_df['Country/Region']==cr), 'Longitude'].median() 

In [9]:
data_df.loc[data_df.Latitude.isna()]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
48,1.0,Ivory Coast,2020-01-27,,1/27/20 23:59,,,,,01-27-2020
78,1.0,Azerbaijan,2020-02-28,0.0,2020-02-28T15:03:26,,,,0.0,02-28-2020
99,1.0,North Ireland,2020-02-28,0.0,2020-02-28T05:43:02,,,,0.0,02-28-2020


In [10]:
data_df.loc[data_df.Longitude.isna()]

Unnamed: 0,Confirmed,Country/Region,Date,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date_str
48,1.0,Ivory Coast,2020-01-27,,1/27/20 23:59,,,,,01-27-2020
78,1.0,Azerbaijan,2020-02-28,0.0,2020-02-28T15:03:26,,,,0.0,02-28-2020
99,1.0,North Ireland,2020-02-28,0.0,2020-02-28T05:43:02,,,,0.0,02-28-2020


## Unify countries duplicates

In [11]:
countries = list(data_df['Country/Region'].unique())
countries.sort()
print(countries)

[' Azerbaijan', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Channel Islands', 'Chile', 'China', 'Colombia', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cruise Ship', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'East Timor', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'Gabon', 'Gambia, The', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Gr

In [12]:
data_df.loc[data_df['Country/Region']==' Azerbaijan', 'Country/Region'] = 'Azerbaijan'
data_df.loc[data_df['Country/Region']=='Czechia', 'Country/Region'] = 'Czech Republic'
data_df.loc[data_df['Country/Region']=="Cote d'Ivoire", 'Country/Region'] = 'Ivory Coast'
data_df.loc[data_df['Country/Region']=='Iran (Islamic Republic of)', 'Country/Region'] = 'Iran'
data_df.loc[data_df['Country/Region']=='Hong Kong SAR', 'Country/Region'] = 'Hong Kong'
data_df.loc[data_df['Country/Region']=='Holy See', 'Country/Region'] = 'Vatican City'
data_df.loc[data_df['Country/Region']=='Macao SAR', 'Country/Region'] = 'Macau'
data_df.loc[data_df['Country/Region']=='Mainland China', 'Country/Region'] = 'China'
data_df.loc[data_df['Country/Region']=='Republic of Ireland', 'Country/Region'] = 'Ireland'
data_df.loc[data_df['Country/Region']=='Korea, South', 'Country/Region'] = 'South Korea'
data_df.loc[data_df['Country/Region']=='Republic of Ireland', 'Country/Region'] = 'Ireland'
data_df.loc[data_df['Country/Region']=='Republic of Korea', 'Country/Region'] = 'South Korea'
data_df.loc[data_df['Country/Region']=='Republic of Moldova', 'Country/Region'] = 'Moldova'
data_df.loc[data_df['Country/Region']=='Republic of the Congo', 'Country/Region'] = 'Congo (Brazzaville)'
data_df.loc[data_df['Country/Region']=='Taiwan*', 'Country/Region'] = 'Taiwan'
data_df.loc[data_df['Country/Region']=='The Gambia', 'Country/Region'] = 'Gambia'
data_df.loc[data_df['Country/Region']=='Gambia, The', 'Country/Region'] = 'Gambia'
data_df.loc[data_df['Country/Region']=='UK', 'Country/Region'] = 'United Kingdom'
data_df.loc[data_df['Country/Region']=='Viet Nam', 'Country/Region'] = 'Vietnam'

## Fix Lat/Long where missing

In [13]:
data_df.loc[data_df['Country/Region']=='Ivory Coast', 'Longitude'] = 5.54
data_df.loc[data_df['Country/Region']=='Ivory Coast', 'Latitude'] = 7.54
data_df.loc[data_df['Country/Region']=='North Ireland', 'Longitude'] = 6.4923
data_df.loc[data_df['Country/Region']=='North Ireland', 'Latitude'] = 54.7877

In [14]:
data_df = data_df[['Country/Region', 'Province/State', 'Latitude', 'Longitude', 'Confirmed', 'Recovered', 'Deaths', 'Date']]

In [15]:
data_df.head()

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
0,China,Anhui,31.8257,117.2264,1.0,,,2020-01-22
1,China,Beijing,40.1824,116.4142,14.0,,,2020-01-22
2,China,Chongqing,30.0572,107.874,6.0,,,2020-01-22
3,China,Fujian,26.0789,117.9874,1.0,,,2020-01-22
4,China,Gansu,36.0611,103.8343,,,,2020-01-22


## Check European Countries on 2020-03-12

Italy, Germany, France, Spain & other European Countries did not reported the data last days. Same data appears as for the previous day. We will fix using https://github.com/CSSEGISandData/COVID-19/issues/599 information, as following:

* Italy: 15113 confirmed cases, 1016 death and 1258 recovered (http://www.salute.gov.it/imgs/C_17_pagineAree_5351_8_file.pdf)  

* Spain: 2950 confirmed cases, 84 death
(https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/alertasActual/nCov-China/documentos/Actualizacion_42_COVID-19.pdf)  

* France: 2876 confirmed cases, 61 death (https://www.santepubliquefrance.fr/maladies-et-traumatismes/maladies-et-infections-respiratoires/infection-a-coronavirus/articles/infection-au-nouveau-coronavirus-sars-cov-2-covid-19-france-et-monde)  

* Switzerland: 815 confirmed cases, 4 death (https://www.bag.admin.ch/dam/bag/fr/dokumente/mt/k-und-i/aktuelle-ausbrueche-pandemien/2019-nCoV/covid-19-lagebericht.pdf.download.pdf/COVID-19_Situation_epidemiologique_en_Suisse.pdf)

* Germany : 2.369 confirmed, 5 deaths (https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html)  

* All EU countries: https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide  


In [16]:
data_df[(data_df['Country/Region'].isin(['Italy', 'Spain', 'France', 'Switzerland', 'Germany', 'Netherlands'])) & (data_df['Date'].isin(['2020-03-11','2020-03-12', '2020-03-13']))]

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
1,Italy,,43.0,12.0,12462.0,1045.0,827.0,2020-03-11
4,France,France,46.2276,2.2137,2281.0,12.0,48.0,2020-03-11
5,Spain,,40.0,-4.0,2277.0,183.0,54.0,2020-03-11
6,Germany,,51.0,9.0,1908.0,25.0,3.0,2020-03-11
15,Switzerland,,46.8182,8.2275,652.0,4.0,4.0,2020-03-11
21,Netherlands,,52.1326,5.2913,503.0,0.0,5.0,2020-03-11
170,France,St Martin,18.0708,-63.0501,2.0,0.0,0.0,2020-03-11
186,France,Saint Barthelemy,17.9,-62.8333,1.0,0.0,0.0,2020-03-11
1,Italy,,43.0,12.0,12462.0,1045.0,827.0,2020-03-12
4,France,France,46.2276,2.2137,2281.0,12.0,48.0,2020-03-12


In [17]:
def fix_data_for_country_date(country, date, confirmed, recovered, deaths):
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Date']==date), 'Confirmed'] = confirmed
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Date']==date), 'Recovered'] = recovered
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Date']==date), 'Deaths'] = deaths

In [18]:
def fix_data_for_france_date(country, date, confirmed, recovered, deaths):
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Province/State']==country) & (data_df['Date']==date), 'Confirmed'] = confirmed
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Province/State']==country)  & (data_df['Date']==date), 'Recovered'] = recovered
    data_df.loc[(data_df['Country/Region']==country) & (data_df['Province/State']==country)  & (data_df['Date']==date), 'Deaths'] = deaths

In [19]:
fix_data_for_country_date('Italy', '2020-03-12', 15113, 1258, 1016)
fix_data_for_country_date('Spain', '2020-03-12', 2950, 183, 84)
fix_data_for_france_date('France', '2020-03-12', 2896, 12, 61)
fix_data_for_country_date('Switzerland', '2020-03-12', 815, 4, 4)
fix_data_for_country_date('Germany', '2020-03-12', 2369, 25, 5)
fix_data_for_country_date('Netherlands', '2020-03-12', 614, 25, 5)

In [20]:
data_df[(data_df['Country/Region'].isin(['Italy', 'Spain', 'France', 'Switzerland', 'Germany', 'Netherlands'])) & (data_df['Date'].isin(['2020-03-11','2020-03-12', '2020-03-13']))]

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
1,Italy,,43.0,12.0,12462.0,1045.0,827.0,2020-03-11
4,France,France,46.2276,2.2137,2281.0,12.0,48.0,2020-03-11
5,Spain,,40.0,-4.0,2277.0,183.0,54.0,2020-03-11
6,Germany,,51.0,9.0,1908.0,25.0,3.0,2020-03-11
15,Switzerland,,46.8182,8.2275,652.0,4.0,4.0,2020-03-11
21,Netherlands,,52.1326,5.2913,503.0,0.0,5.0,2020-03-11
170,France,St Martin,18.0708,-63.0501,2.0,0.0,0.0,2020-03-11
186,France,Saint Barthelemy,17.9,-62.8333,1.0,0.0,0.0,2020-03-11
1,Italy,,43.0,12.0,15113.0,1258.0,1016.0,2020-03-12
4,France,France,46.2276,2.2137,2896.0,12.0,61.0,2020-03-12


In [21]:
data_df[(data_df['Country/Region'].isin(['France'])) & (data_df['Date'].isin(['2020-03-11','2020-03-12', '2020-03-13']))]

Unnamed: 0,Country/Region,Province/State,Latitude,Longitude,Confirmed,Recovered,Deaths,Date
4,France,France,46.2276,2.2137,2281.0,12.0,48.0,2020-03-11
170,France,St Martin,18.0708,-63.0501,2.0,0.0,0.0,2020-03-11
186,France,Saint Barthelemy,17.9,-62.8333,1.0,0.0,0.0,2020-03-11
4,France,France,46.2276,2.2137,2896.0,12.0,61.0,2020-03-12
173,France,St Martin,18.0708,-63.0501,2.0,0.0,0.0,2020-03-12
191,France,Saint Barthelemy,17.9,-62.8333,1.0,0.0,0.0,2020-03-12
79,France,France,46.2276,2.2137,3661.0,12.0,79.0,2020-03-13
99,France,French Polynesia,-17.6797,149.4068,3.0,0.0,0.0,2020-03-13
132,France,Saint Barthelemy,17.9,-62.8333,1.0,0.0,0.0,2020-03-13
133,France,St Martin,18.0708,-63.0501,2.0,0.0,0.0,2020-03-13


In [22]:
#data_df[data_df['Country/Region'].isin(['Romania'])]

In [23]:
data_df.to_csv("covid-19-all.csv", index=False)