# Traitement de la source de données


In [29]:
import os
from datetime import date, timedelta

import pandas as pd

In [42]:
# URL template of the daily report files; the `{}` placeholder receives the
# day label (MM-DD-YYYY) of the report to fetch.
BASE_URL = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_daily_reports/{}.csv'
)

# First and last day for which a daily file is fetched
START_DATE = date(year=2020, month=1, day=22)
END_DATE = date(year=2020, month=3, day=21)

# Directory where the raw daily files are saved
RAWFILES_DIR = (
    '/Users/EnzoButhiot/Documents/AMSE/MAG3/S2/'
    'Projet_mignot/corona/data/raw'
)
# Directory where the derived tables are saved
PROCESSED_DIR = (
    '/Users/EnzoButhiot/Documents/AMSE/MAG3/S2/'
    'Projet_mignot/corona/data/processed'
)


# Name of the main consolidated file
ALL_DATA_FILE = 'all_data.csv'

# TODO: replace these constants with values read from env.yaml

## Boucle de récupération des fichiers

In [43]:
# Download every daily report from START_DATE to END_DATE (both included) and
# save a copy of each one in RAWFILES_DIR as <MM-DD-YYYY>.csv.

# DataFrame.to_csv does not create missing parent directories, so make sure
# the destination exists before the first write.
os.makedirs(RAWFILES_DIR, exist_ok=True)

delta = END_DATE - START_DATE       # as timedelta

for i in range(delta.days + 1):     # +1 so that END_DATE itself is fetched
    day = START_DATE + timedelta(days=i)
    day_label = day.strftime("%m-%d-%Y")    # naming scheme of the remote files
    print(day_label)                        # progress trace, one line per day
    # 'Last Update' is parsed as datetimes on read, so the saved copy contains
    # pandas' rendering of those timestamps rather than the upstream text.
    virus_df = pd.read_csv(BASE_URL.format(day_label), sep=',', parse_dates=['Last Update'])
    virus_df.to_csv(os.path.join(RAWFILES_DIR, day_label + '.csv'), index=False)

01-22-2020
01-23-2020
01-24-2020
01-25-2020
01-26-2020
01-27-2020
01-28-2020
01-29-2020
01-30-2020
01-31-2020
02-01-2020
02-02-2020
02-03-2020
02-04-2020
02-05-2020
02-06-2020
02-07-2020
02-08-2020
02-09-2020
02-10-2020
02-11-2020
02-12-2020
02-13-2020
02-14-2020
02-15-2020
02-16-2020
02-17-2020
02-18-2020
02-19-2020
02-20-2020
02-21-2020
02-22-2020
02-23-2020
02-24-2020
02-25-2020
02-26-2020
02-27-2020
02-28-2020
02-29-2020
03-01-2020
03-02-2020
03-03-2020
03-04-2020
03-05-2020
03-06-2020
03-07-2020
03-08-2020
03-09-2020
03-10-2020
03-11-2020
03-12-2020
03-13-2020
03-14-2020
03-15-2020
03-16-2020
03-17-2020
03-18-2020
03-19-2020
03-20-2020
03-21-2020


## Constitution de la table de références lat / long

In [32]:
import glob

# Build the latitude/longitude reference table from the downloaded files and
# save it as lat_long_table.csv in PROCESSED_DIR.
df_list = []

# Read the downloaded files and keep those that carry both coordinate columns.
# glob.glob returns paths in arbitrary order; sorting them makes the row kept
# by drop_duplicates below (the first occurrence) the same on every run.
for file in sorted(glob.glob(os.path.join(RAWFILES_DIR, '*.csv'))):
    virus_df = pd.read_csv(file, sep=',')
    if 'Latitude' in virus_df.columns and 'Longitude' in virus_df.columns:
        df_list.append(virus_df)

# pd.concat raises ValueError when no file provided coordinates
all_df = pd.concat(df_list)

# DataFrame.to_csv does not create missing parent directories
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Reference table: one row per (Province/State, Country/Region) pair, sorted
# by country then province
(all_df[['Province/State', 'Country/Region', 'Latitude', 'Longitude']]
 .drop_duplicates(subset=['Province/State', 'Country/Region'])
 .sort_values(by=['Country/Region', 'Province/State'])
 .to_csv(os.path.join(PROCESSED_DIR, 'lat_long_table.csv'), index=False)
)

## Construction d'une table unique

In [33]:
# Accepted dtype names for each column that every daily file must provide.
# The timestamp column must be a nanosecond datetime.
data_catalog = {'Last Update': ['<M8[ns]']}
# Count columns are accepted as either float64 or int64.
data_catalog.update(
    (column, ['float64', 'int64'])
    for column in ('Confirmed', 'Deaths', 'Recovered')
)
# Coordinates must be float64.
data_catalog.update(
    (column, ['float64'])
    for column in ('Latitude', 'Longitude')
)

In [39]:
# Build the single consolidated table from every downloaded daily file and
# save it in PROCESSED_DIR under the name given by ALL_DATA_FILE.
df_list = []
latlong_df = pd.read_csv(os.path.join(PROCESSED_DIR, 'lat_long_table.csv'))

# Read every downloaded file; files without coordinate columns get them from
# the reference table. glob.glob returns paths in arbitrary order, so the
# paths are sorted to make the concatenation order reproducible.
for file in sorted(glob.glob(os.path.join(RAWFILES_DIR, '*.csv'))):
    virus_df = pd.read_csv(file, sep=',', parse_dates=['Last Update'])
    if not ('Latitude' in virus_df.columns and 'Longitude' in virus_df.columns):
        # Left join: rows with no match in the reference table are kept,
        # with missing coordinates
        virus_df = virus_df.merge(latlong_df, on=['Province/State', 'Country/Region'], how='left')

    # Check each catalogued column against its accepted dtypes. An explicit
    # raise is used because assert statements are removed under `python -O`.
    for field, types in data_catalog.items():
        if virus_df[field].dtypes not in types:
            raise TypeError(f"Bad type for {field} in {file}")

    # Tag every row with the name of the file it comes from
    df_list.append(virus_df.assign(source=os.path.basename(file)))

all_df = pd.concat(df_list)
# Stable sort: rows sharing the same timestamp keep their concatenation order
all_df = all_df.sort_values(by=['Last Update'], kind='mergesort')

# Save the consolidated table
all_df.to_csv(os.path.join(PROCESSED_DIR, ALL_DATA_FILE), index=False)

In [38]:
# Show the consolidated table ordered by its 'Last Update' timestamp
all_df.sort_values(by='Last Update')

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,source
19,Liaoning,Mainland China,2020-01-22 17:00:00,2.0,,,41.2956,122.6085,01-22-2020.csv
21,Ningxia,Mainland China,2020-01-22 17:00:00,1.0,,,37.2692,106.1655,01-22-2020.csv
22,Qinghai,Mainland China,2020-01-22 17:00:00,,,,35.7452,95.9956,01-22-2020.csv
23,Shaanxi,Mainland China,2020-01-22 17:00:00,,,,35.1917,108.8701,01-22-2020.csv
24,Shandong,Mainland China,2020-01-22 17:00:00,2.0,,,36.3427,118.1498,01-22-2020.csv
...,...,...,...,...,...,...,...,...,...
67,Tennessee,US,2020-03-21 23:13:18,371.0,1.0,0.0,35.7478,-86.6923,03-21-2020.csv
119,Alabama,US,2020-03-21 23:13:18,131.0,0.0,0.0,32.3182,-86.9023,03-21-2020.csv
271,,Cape Verde,2020-03-21 23:43:02,1.0,0.0,0.0,15.1111,-23.6167,03-21-2020.csv
284,,Papua New Guinea,2020-03-21 23:43:02,1.0,0.0,0.0,-6.3150,143.9555,03-21-2020.csv
