# Exploration des données

In [90]:
import os
import pandas as pd
from datetime import date, timedelta


In [91]:
# Root URL of the daily report files; {} is filled in with the day label
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{}.csv'

# Date range for which daily files are available
START_DATE = date(year=2020, month=1, day=22)
END_DATE = date(year=2020, month=3, day=5)

# Directory where the raw files are saved, and directory for processed tables
RAWFILES_DIR = '../data/raw/'
PROCESSED_DIR = '../data/processed/'

## Boucle de récupération des fichiers

In [92]:
# Download every daily report from START_DATE to END_DATE (both included)
# and save a local copy of each one in RAWFILES_DIR.

# to_csv does not create missing directories, so make sure the target
# directory exists (e.g. on a fresh checkout).
os.makedirs(RAWFILES_DIR, exist_ok=True)

delta = END_DATE - START_DATE       # as timedelta

for i in range(delta.days + 1):
    day = START_DATE + timedelta(days=i)
    # The remote files are addressed by a MM-DD-YYYY label.
    day_label = day.strftime("%m-%d-%Y")
    # Parse 'Last Update' as datetimes before saving the local copy.
    virus_df = pd.read_csv(BASE_URL.format(day_label), sep=',', parse_dates=['Last Update'])
    virus_df.to_csv(os.path.join(RAWFILES_DIR, day_label + '.csv'), index=False)

In [93]:
# Preview the first rows of one saved daily file.
# os.path.join (as used by the other cells) keeps the path valid whether or
# not RAWFILES_DIR ends with a path separator.
pd.read_csv(os.path.join(RAWFILES_DIR, '01-28-2020.csv'), sep=',').head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Hubei,Mainland China,2020-01-28 23:00:00,3554,125.0,80.0
1,Guangdong,Mainland China,2020-01-28 23:00:00,207,,4.0
2,Zhejiang,Mainland China,2020-01-28 23:00:00,173,,3.0
3,Henan,Mainland China,2020-01-28 23:00:00,168,1.0,
4,Hunan,Mainland China,2020-01-28 23:00:00,143,,


## Constitution de la table de référence lat/long

In [94]:
import glob


In [95]:

# Collect the raw daily files that already carry both coordinate columns.
df_list = []

# glob returns paths in arbitrary order; sorting them makes the row kept by
# drop_duplicates below (the first occurrence) the same on every run.
for file in sorted(glob.glob(os.path.join(RAWFILES_DIR, '*.csv'))):
    virus_df = pd.read_csv(file, sep=',')
    if 'Latitude' in virus_df.columns and 'Longitude' in virus_df.columns:
        df_list.append(virus_df)

all_df = pd.concat(df_list)

# to_csv does not create missing directories, so make sure the output
# directory exists before writing.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Build the latitude/longitude reference table: one row per
# (Province/State, Country/Region) pair, sorted by country then province.
(all_df[['Province/State', 'Country/Region', 'Latitude', 'Longitude']]
 .drop_duplicates(subset=['Province/State', 'Country/Region'])
 .sort_values(by=['Country/Region', 'Province/State'])
 .to_csv(os.path.join(PROCESSED_DIR, 'lat_long_table.csv'), index=False)
)

## Construction d'une table unique

In [96]:
# Accepted dtypes for each column that is checked when the files are combined.
data_catalog = {
    'Last Update': ['<M8[ns]'],
    # Count columns may be read as floats or as integers.
    **{count_col: ['float', 'int64'] for count_col in ('Confirmed', 'Deaths', 'Recovered')},
    # Coordinate columns.
    **{coord_col: ['float64'] for coord_col in ('Latitude', 'Longitude')},
}

In [97]:
# Build a single table from all raw daily files: add coordinates to the files
# that have none, check the column dtypes, and tag each row with its source file.
df_list = []
latlong_df = pd.read_csv(os.path.join(PROCESSED_DIR, 'lat_long_table.csv'))

# Read the downloaded files in sorted order: glob returns paths in arbitrary
# order, which would make the row order of the saved table vary between runs.
for file in sorted(glob.glob(os.path.join(RAWFILES_DIR, '*.csv'))):
    virus_df = pd.read_csv(file, sep=',', parse_dates=['Last Update'])
    # Files with neither coordinate column get them from the reference table.
    if not ('Latitude' in virus_df.columns or 'Longitude' in virus_df.columns):
        virus_df = virus_df.merge(latlong_df, on=['Province/State','Country/Region'], how='left')

    # Every catalogued column must have one of its accepted dtypes.
    for field, types in data_catalog.items():
        assert virus_df[field].dtypes in types, f"Bad for {field} in {file}"

    # Keep track of the file each row came from.
    df_list.append(virus_df.assign(source=os.path.basename(file)))


all_df = pd.concat(df_list)

# Save the final table
all_df.to_csv(os.path.join(PROCESSED_DIR, 'all_data.csv'), index=False)


In [98]:
# Show the dtype of each column of the combined table.
all_df.dtypes

Province/State            object
Country/Region            object
Last Update       datetime64[ns]
Confirmed                float64
Deaths                   float64
Recovered                float64
Latitude                 float64
Longitude                float64
source                    object
dtype: object

In [99]:
# Show the (rows, columns) size of the combined table.
all_df.shape

(3568, 9)