# COVID Data Scraper and Parser

## Source data

The data is downloaded from https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports

## Parsed files

The parsed files are stored in `../data/covid/` with the following columns:
- `date`: date, format: YYYY-MM-DD
- `confirmed`: integer, accumulated number of infections.
- `deaths`: integer.
- `recovered`: integer.
- `active`: integer.

## Parsing rules

- Data is grouped by country, i.e. provinces/states are not detailed.
- If `active` column is missing in the source file, it is calculated as `active = confirmed - deaths - recovered`.
- If a country is missing in the source file, its data is filled with the last known data of the country.
- Data collected between `22-01-2020` and `30-05-2020` (both included).
- Target countries: `Russia`, `US` (United States) and `Spain`.

In [1]:
import requests
import os
import pandas as pd
import io
from datetime import timedelta, date
import re

In [2]:
def create_path(path):
    """Create the directory `path` if it does not already exist.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        path: Directory path to create.
    """
    # os.mkdir fails when a parent directory is missing (e.g. '../data/' for
    # '../data/covid/'); makedirs creates the whole chain. exist_ok=True
    # replaces the separate existence check, which was race-prone.
    os.makedirs(path, exist_ok=True)

In [3]:
# Output directory for the per-country CSV files; file names are appended
# directly to this string, so it must keep its trailing slash.
DATA_PATH = '../data/covid/'
# Prefix of the raw daily-report URLs; a report file name is appended to it.
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'

# Countries to extract. Each entry is matched against the start of the source
# country name and its lowercase form is used as the output file name.
TARGET_COUNTRIES = ['Russia', 'US', 'Spain']

# Make sure the output directory exists before anything is written to it.
create_path(DATA_PATH)

In [4]:
def get_remote_file(filename):
    """Download one daily report and parse it as CSV.

    Args:
        filename: Name of the report file; it is appended to ``BASE_URL``.

    Returns:
        A pandas DataFrame with the parsed content of the file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    url = BASE_URL + filename
    # A timeout keeps a stalled connection from hanging the whole run.
    r = requests.get(url, allow_redirects=False, timeout=30)
    # Fail loudly on a missing report instead of parsing the error page
    # as if it were CSV data.
    r.raise_for_status()
    c = pd.read_csv(io.StringIO(r.content.decode('utf-8')))
    return c

In [5]:
def normalize_file_content(df):
    """Bring a raw daily report into the common column layout.

    The source column names are mapped to lowercase names, missing values are
    replaced with 0, the `active` column is derived when the report does not
    provide it, and country names that start with one of the
    ``TARGET_COUNTRIES`` entries are replaced by that entry.

    Args:
        df: DataFrame with the raw content of one daily report.

    Returns:
        A new DataFrame with the columns `country`, `confirmed`, `deaths`,
        `recovered` and `active`.
    """
    col_mapping = {
        'Country_Region': 'country',
        'Country/Region': 'country',
        'Confirmed': 'confirmed',
        'Deaths': 'deaths',
        'Recovered': 'recovered',
        'Active': 'active'
    }

    df = df.rename(columns=col_mapping)
    df = df.fillna(0)

    if 'active' not in df.columns:
        df['active'] = df['confirmed'] - df['deaths'] - df['recovered']

    # Copy the column selection so the assignment below writes to an
    # independent frame rather than to a slice of the previous one.
    df = df[['country', 'confirmed', 'deaths', 'recovered', 'active']].copy()

    def canonical_name(name):
        # fillna(0) turns a missing country into the integer 0; str() keeps
        # re.match from raising TypeError on such rows.
        for country in TARGET_COUNTRIES:
            if re.match(country, str(name)):
                return country
        return name

    # Normalize country name (only target countries); the first matching
    # target wins and every other value is kept unchanged.
    df['country'] = df['country'].map(canonical_name)

    return df

In [6]:
def filter_target_countries(df):
    """Drop every row whose `country` is not one of ``TARGET_COUNTRIES``.

    Args:
        df: DataFrame with a `country` column.

    Returns:
        A DataFrame holding only the target-country rows, re-indexed from 0.
    """
    # Collect the index labels of all rows that belong to other countries.
    unwanted_labels = [
        label
        for label, name in df['country'].items()
        if name not in TARGET_COUNTRIES
    ]

    kept = df.drop(unwanted_labels)
    return kept.reset_index(drop=True)

In [7]:
def group_by_country(df):
    """Sum all rows that share the same `country` into a single row.

    Args:
        df: DataFrame with a `country` column.

    Returns:
        A DataFrame with one row per country; `country` stays a regular
        column and the remaining columns hold the per-country sums.
    """
    per_country = df.groupby(['country'], as_index=False)
    totals = per_country.sum()
    return totals

In [8]:
def init_country_data():
    """Build the initial "last known data" record for every target country.

    Returns:
        A dict keyed by country name. Each value is a dict with `confirmed`,
        `deaths`, `recovered` and `active` set to 0 and `country` set to the
        country name.
    """
    zero_counts = {'confirmed': 0, 'deaths': 0, 'recovered': 0, 'active': 0}
    # dict(...) makes an independent copy per country and adds its name.
    return {name: dict(zero_counts, country=name) for name in TARGET_COUNTRIES}

In [9]:
def init_df_per_country():
    """Create one empty timeline DataFrame per target country.

    Returns:
        A dict keyed by country name whose values are independent, empty
        DataFrames with the columns `date`, `confirmed`, `deaths`,
        `recovered` and `active`.
    """
    column_names = ['date', 'confirmed', 'deaths', 'recovered', 'active']
    blank = pd.DataFrame({name: [] for name in column_names})

    timelines = {}
    for country in TARGET_COUNTRIES:
        # Each country gets its own copy of the empty frame.
        timelines[country] = blank.copy()
    return timelines

In [10]:
def fill_missing_countries(df, last_country_data):
    """Make sure `df` has a row for every target country.

    A target country that is absent from `df` gets a row holding its last
    known figures. A country that is present updates the last known figures
    from its first matching row.

    Args:
        df: DataFrame with the columns `country`, `confirmed`, `deaths`,
            `recovered` and `active`.
        last_country_data: Dict keyed by country name with the most recent
            figures of each country. It is updated in place.

    Returns:
        A tuple ``(df, last_country_data)`` with the completed DataFrame and
        the updated last-known-data dict.
    """
    for country in TARGET_COUNTRIES:
        df_country = df.loc[df['country'] == country]

        if df_country.empty:
            # If missing, append the last known data. The country name is set
            # explicitly so the row can be found by name afterwards even if
            # the stored record does not carry it.
            carried = dict(last_country_data[country], country=country)
            df = pd.concat([df, pd.DataFrame([carried])], ignore_index=True)
        else:
            # If not missing, update the last known data. The record keeps
            # its 'country' key so it stays usable as a complete row later.
            current = df_country.iloc[0]
            last_country_data[country] = {
                'country': country,
                'confirmed': current['confirmed'],
                'deaths': current['deaths'],
                'recovered': current['recovered'],
                'active': current['active']
            }

    return df, last_country_data

In [11]:
# Date range to collect; both ends are included.
initial_date = date(2020, 1, 22)
last_date = date(2020, 5, 30)

last_country_data = init_country_data()
df_per_country = init_df_per_country()
# Daily records are collected in plain lists and turned into DataFrames once
# at the end, instead of re-creating a DataFrame for every appended row.
rows_per_country = {country: [] for country in TARGET_COUNTRIES}

loop_date = initial_date
# '<=' so that the report of `last_date` itself is processed as well.
while loop_date <= last_date:
    str_date = loop_date.strftime('%Y-%m-%d')
    print('Date: %s' % str_date)

    # Get file content (report files are named MM-DD-YYYY.csv)
    filename = loop_date.strftime('%m-%d-%Y') + '.csv'
    df = get_remote_file(filename)

    # Parsing rules
    df = normalize_file_content(df)
    df = filter_target_countries(df)
    df = group_by_country(df)
    df, last_country_data = fill_missing_countries(df, last_country_data)

    # Add data to the correct timeline
    for country in TARGET_COUNTRIES:
        # to_dict() gives an independent record, so adding the date does not
        # write into a row that still belongs to `df`.
        record = df.loc[df['country'] == country].iloc[0].to_dict()
        record['date'] = str_date
        rows_per_country[country].append(record)

    # Go to the next day
    loop_date = loop_date + timedelta(days=1)

# Build each timeline, keeping the initial column order and adding `country`.
for country, rows in rows_per_country.items():
    columns = list(df_per_country[country].columns) + ['country']
    df_per_country[country] = pd.DataFrame(rows, columns=columns)

print('Finished!')

Date: 2020-01-22
Date: 2020-01-23
Date: 2020-01-24
Date: 2020-01-25
Date: 2020-01-26
Date: 2020-01-27
Date: 2020-01-28
Date: 2020-01-29
Date: 2020-01-30
Date: 2020-01-31
Date: 2020-02-01
Date: 2020-02-02
Date: 2020-02-03
Date: 2020-02-04
Date: 2020-02-05
Date: 2020-02-06
Date: 2020-02-07
Date: 2020-02-08
Date: 2020-02-09
Date: 2020-02-10
Date: 2020-02-11
Date: 2020-02-12
Date: 2020-02-13
Date: 2020-02-14
Date: 2020-02-15
Date: 2020-02-16
Date: 2020-02-17
Date: 2020-02-18
Date: 2020-02-19
Date: 2020-02-20
Date: 2020-02-21
Date: 2020-02-22
Date: 2020-02-23
Date: 2020-02-24
Date: 2020-02-25
Date: 2020-02-26
Date: 2020-02-27
Date: 2020-02-28
Date: 2020-02-29
Date: 2020-03-01
Date: 2020-03-02
Date: 2020-03-03
Date: 2020-03-04
Date: 2020-03-05
Date: 2020-03-06
Date: 2020-03-07
Date: 2020-03-08
Date: 2020-03-09
Date: 2020-03-10
Date: 2020-03-11
Date: 2020-03-12
Date: 2020-03-13
Date: 2020-03-14
Date: 2020-03-15
Date: 2020-03-16
Date: 2020-03-17
Date: 2020-03-18
Date: 2020-03-19
Date: 2020-03-

In [14]:
# Write one CSV file per country, named after the lowercase country name and
# limited to the documented output columns (no index column).
for country, df in df_per_country.items():
    path = '%s%s.csv' % (DATA_PATH, country.lower())
    df.loc[:, ['date', 'confirmed', 'deaths', 'recovered', 'active']].to_csv(path, index=False)