In [1]:
# Author: Emanuele Zeppieri <emazep@gmail.com>
#
# This code is distributed under the terms and conditions
# from the MIT License (MIT).

In [2]:
import datetime as dt
from datetime import date, datetime, timedelta
import time
import os.path
import tabula

import pandas as pd
# Never truncate displayed DataFrames: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Directory (relative to this notebook) that holds the original ISS bulletin PDFs.
ORIGINAL_DOC_DIR = '../original_ISS_documents/bollettino_sorveglianza_integrata/'
# Common file-name prefix of the bulletins; the ETL loop ignores files that lack it.
FILE_PREFIX = 'Bollettino-sorveglianza-integrata-COVID-19_'
# Directory the resulting CSV files are written to.
EXPORT_DIR = '../data/'
# Bulletins that the scraping cells skip as unusable.
FILES_TO_EXCLUDE = ['Bollettino-sorveglianza-integrata-COVID-19_2020-03-09.pdf']

# Seconds to sleep after each scraped document, as a workaround for
# tabula (Java) timing issues.
WAIT = 1

In [4]:
# Age bands used as column-name suffixes, youngest first; the last one ('90-')
# is open-ended.
_AGE_BANDS = [
    '0-9', '10-19', '20-29', '30-39', '40-49',
    '50-59', '60-69', '70-79', '80-89', '90-'
]

# One column per age band for each (measure, sex) pair, in age-band order.
COLUMNS_CASES_MALE = [f'cases_male_{band}' for band in _AGE_BANDS]
COLUMNS_DEATHS_MALE = [f'deaths_male_{band}' for band in _AGE_BANDS]
COLUMNS_CASES_FEMALE = [f'cases_female_{band}' for band in _AGE_BANDS]
COLUMNS_DEATHS_FEMALE = [f'deaths_female_{band}' for band in _AGE_BANDS]

In [5]:
# Page number, per bulletin file name, handed to tabula.read_pdf(pages=...) by
# the scraping cells to locate the table to extract. Entries are listed newest
# first. The 2020-03-09 bulletin is listed here but is also in FILES_TO_EXCLUDE,
# so the scraping cells never read it.
TABLE_PAGES = {
    'Bollettino-sorveglianza-integrata-COVID-19_2021-04-28.pdf': 14,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-04-21.pdf': 14,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-04-14.pdf': 23,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-04-07.pdf': 23,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-03-31.pdf': 23,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-03-24.pdf': 23,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-03-17.pdf': 23,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-03-10.pdf': 24,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-02-17.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-02-10.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-02-03.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-01-27.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2021-01-13.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-12-22.pdf': 23,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-12-16.pdf': 23,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-11-25.pdf': 23,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-10-13.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-10-06.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-09-29.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-09-22.pdf': 20,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-09-15.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-09-08.pdf': 20,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-09-01.pdf': 20,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-08-25.pdf': 21,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-08-18.pdf': 19,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-08-11.pdf': 12,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-08-04.pdf': 10,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-07-28.pdf': 8,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-07-21.pdf': 7,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-07-14.pdf': 7,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-07-07.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-06-30.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-06-23.pdf': 7,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-06-16.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-06-09.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-06-03.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-05-26.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-05-20.pdf': 7,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-05-14.pdf': 7,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-05-07.pdf': 8,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-04-28.pdf': 8,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-04-23.pdf': 7,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-04-16.pdf': 7,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-04-09.pdf': 8,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-04-06.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-04-02.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-03-30.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-03-26.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-03-23.pdf': 6,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-03-16.pdf': 5,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-03-12.pdf': 5,
    'Bollettino-sorveglianza-integrata-COVID-19_2020-03-09.pdf': 4
}
# Page used for any bulletin that has no entry in TABLE_PAGES.
DEFAULT_TABLE_PAGE = 22

In [6]:
# Per-bulletin overrides for scraped tables with columns that could not be split.
# 'columns_to_split' maps a column position of the normalised table to a pair of
# ten-element lists. The scraping cells replace that whole column with the FIRST
# list; the second list is not read anywhere in this notebook.
# NOTE(review): the first lists look like death counts per age band and the
# second ones like the matching percentages - confirm against the PDF.
SPECIAL_CASES = {
    'Bollettino-sorveglianza-integrata-COVID-19_2021-03-03.pdf': {
        'columns_to_split': {
            3 : [
                [4, 6, 30, 113, 564, 2304, 6763, 16061, 21869, 6698],
                [40, 54.5, 58.8, 61.7, 69.6, 73.4, 73.9, 68.3, 54.3, 33.8]
            ],
            7 : [
                [6, 5, 21, 70, 246, 835, 2390, 7463, 18416, 13103],
                [60, 45.5, 41.2, 38.3, 30.4, 26.6, 26.1, 31.7, 45.7, 66.2]
            ]
        }
    }
}


In [7]:
# Utility: to be used with new docs of unknown size (to fill the corresponding ETL_SPECS entry).
# For each listed bulletin it prints the file name, the page read, the number of
# cells and the 'rows*columns' shape of the normalised table, then displays the
# last normalised table.
docs = ['Bollettino-sorveglianza-integrata-COVID-19_2021-04-21.pdf']

for file_name in docs:
    if file_name in FILES_TO_EXCLUDE:
        continue  # unusable ISS bulletin

    print(file_name)
    table_page = TABLE_PAGES.get(file_name, DEFAULT_TABLE_PAGE)
    print(table_page)

    raw_table = tabula.read_pdf(f'{ORIGINAL_DOC_DIR}{file_name}', pages=table_page)[0]
    raw_table = raw_table.dropna(how='all', axis='columns')

    # Split every cell on whitespace (presumably because tabula can return
    # several values in a single cell) and number the resulting columns 0..n-1.
    cells = pd.concat(
        [raw_table[label].astype(str).str.split(expand=True) for label in raw_table.columns],
        axis=1
    )
    cells.columns = range(cells.shape[1])

    # Drop spurious rows: keep the '0-9' row through the '>90' / '≥90' row.
    age_band = cells[0]
    first_row = cells.index[age_band == '0-9'][0]
    last_row = cells.index[age_band.isin(['>90', '≥90'])][0]
    cells = cells[first_row:last_row + 1].reset_index(drop=True)

    # Drop spurious columns (those left completely empty) and renumber.
    cells = cells.dropna(how='all', axis='columns')
    cells.columns = range(cells.shape[1])

    # Handle special cases (unsplit columns): take the hard-coded values
    # from SPECIAL_CASES instead of the scraped column.
    if file_name in SPECIAL_CASES:
        overrides = SPECIAL_CASES[file_name]['columns_to_split']
        df_pdf_norm = pd.DataFrame()
        for position in cells.columns:
            df_pdf_norm[position] = overrides[position][0] if position in overrides else cells[position]
        df_pdf_norm.columns = range(df_pdf_norm.shape[1])
    else:
        df_pdf_norm = cells

    print(df_pdf_norm.size)
    print(f'{len(df_pdf_norm)}*{len(df_pdf_norm.columns)}')
    print()

df_pdf_norm

Bollettino-sorveglianza-integrata-COVID-19_2021-04-21.pdf
14
160
10*16



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0-9,101.528,54,4.0,0,0%,94.484,48,6.0,0,0%,196.012,51,10.0,0,0%
1,10-19,187.636,99,7.0,0,0%,172.952,87,5.0,0,0%,360.588,93,12.0,0,0%
2,20-29,232.345,123,36.0,1,0%,226.031,114,21.0,0,0%,458.38,118,57.0,0,0%
3,30-39,234.447,124,133.0,2,"0,10%",245.102,124,78.0,2,0%,479.553,124,211.0,2,0%
4,40-49,294.689,156,697.0,11,"0,20%",325.276,164,298.0,6,"0,10%",619.966,16,995.0,9,"0,20%"
5,50-59,332.526,176,2.825,43,"0,80%",346.589,175,1.089,21,"0,30%",679.117,175,3.914,34,"0,60%"
6,60-69,224.671,119,8.345,127,"3,70%",206.789,104,3.081,61,"1,50%",431.461,111,11.426,98,"2,60%"
7,70-79,161.756,86,19.565,298,"12,10%",156.057,79,9.284,182,"5,90%",317.813,82,28.849,248,"9,10%"
8,80-89,100.11,53,26.093,398,"26,10%",144.184,73,21.852,43,"15,20%",244.301,63,47.945,412,"19,60%"
9,≥90,20.448,11,7.92,121,"38,70%",64.342,32,15.157,298,"23,60%",84.79,22,23.077,198,"27,20%"


In [8]:
# Column positions to read from the normalised scraped table, keyed by its
# 'rows*columns' shape. Each value is passed as keyword arguments to
# scraped_df_to_row().
ETL_SPECS = {
    '10*14': dict(
        col_cases_male=1, col_deaths_male=3,
        col_cases_female=5, col_deaths_female=7
    ),
    # Tables with 16, 17 and 18 columns share the same positions.
    **{
        f'10*{n_columns}': dict(
            col_cases_male=1, col_deaths_male=3,
            col_cases_female=6, col_deaths_female=8
        )
        for n_columns in (16, 17, 18)
    },
    '10*25': dict(
        col_cases_male=1, col_deaths_male=4,
        col_cases_female=9, col_deaths_female=12
    ),
}

In [9]:
def scraped_df_to_row(
    scraped_df,
    col_cases_male, col_deaths_male,
    col_cases_female, col_deaths_female
):
    """Flatten four columns of a scraped table into one list of integers.

    The columns are selected by position and concatenated in this order:
    male cases, male deaths, female cases, female deaths.

    Each cell is converted with ``str()``, every '.' and ',' is removed
    (thousands separators), every '-' is replaced by '0', and the result is
    parsed with ``int()``.

    Args:
        scraped_df: DataFrame holding the normalised scraped table.
        col_cases_male: position of the male cases column.
        col_deaths_male: position of the male deaths column.
        col_cases_female: position of the female cases column.
        col_deaths_female: position of the female deaths column.

    Returns:
        A list of ints, ``4 * len(scraped_df)`` long.

    Raises:
        IndexError: if a column position is out of range.
        ValueError: if a cleaned cell is not a valid integer literal.
    """
    # Drop '.' and ',' and turn '-' into '0' in a single pass.
    cleanup = str.maketrans({'.': None, ',': None, '-': '0'})

    # Collect all the cells first, so that a bad column position is reported
    # before any cell is converted.
    cells = []
    for position in (col_cases_male, col_deaths_male, col_cases_female, col_deaths_female):
        cells.extend(scraped_df.iloc[:, position].tolist())

    return [int(str(cell).translate(cleanup)) for cell in cells]

In [10]:
# ETL loop: scrape the table from every bulletin in ORIGINAL_DOC_DIR and store
# it as one row of `ds`, indexed by the bulletin date taken from the file name.
ds = pd.DataFrame(columns = COLUMNS_CASES_MALE + COLUMNS_DEATHS_MALE + COLUMNS_CASES_FEMALE + COLUMNS_DEATHS_FEMALE)

docs = sorted(os.listdir(ORIGINAL_DOC_DIR))

for file_name in docs:
    if (
        file_name in FILES_TO_EXCLUDE or
        not file_name.startswith(FILE_PREFIX) or
        not file_name.lower().endswith('.pdf')
    ):
        # Skip unusable ISS bulletins and spurious files. Files that share the
        # prefix but are not PDFs (partial downloads, backups...) are skipped
        # too: they can neither be scraped nor dated from their name.
        continue

    print('Scraping')
    print(file_name)

    file = ORIGINAL_DOC_DIR + file_name
    table_page = TABLE_PAGES.get(file_name, DEFAULT_TABLE_PAGE)
    df_pdf = tabula.read_pdf(file, pages=table_page)[0]
    df_pdf = df_pdf.dropna(how='all', axis='columns')

    # Split every cell on whitespace into separate columns. All the pieces are
    # concatenated in one go instead of growing the frame one column at a time.
    df_pdf_norm_tmp = pd.concat(
        [df_pdf[c].astype(str).str.split(expand=True) for c in df_pdf.columns],
        axis=1
    )
    df_pdf_norm_tmp.columns = range(len(df_pdf_norm_tmp.columns))

    # Drop spurious rows: keep the '0-9' row through the '>90' / '≥90' row.
    first_column = df_pdf_norm_tmp[0]
    idx1 = df_pdf_norm_tmp[first_column == '0-9'].index.values[0]
    idx2 = df_pdf_norm_tmp[(first_column == '>90') | (first_column == '≥90')].index.values[0]

    # Drop spurious columns (those left completely empty) and renumber.
    # Chained calls return new frames, so the row slice is never mutated in place.
    df_pdf_norm_tmp = (
        df_pdf_norm_tmp[idx1:idx2+1]
        .reset_index(drop=True)
        .dropna(how='all', axis='columns')
    )
    df_pdf_norm_tmp.columns = range(len(df_pdf_norm_tmp.columns))

    # Handle special cases (unsplit columns): take the hard-coded values
    # from SPECIAL_CASES instead of the scraped column.
    if file_name in SPECIAL_CASES:
        special_spec = SPECIAL_CASES[file_name]['columns_to_split']
        df_pdf_norm = pd.DataFrame()
        for c in df_pdf_norm_tmp.columns:
            if c in special_spec:
                df_pdf_norm[c] = special_spec[c][0]
            else:
                df_pdf_norm[c] = df_pdf_norm_tmp[c]
        df_pdf_norm.columns = range(len(df_pdf_norm.columns))
    else:
        df_pdf_norm = df_pdf_norm_tmp

    scraped_table_dim = str(len(df_pdf_norm)) + '*' + str(len(df_pdf_norm.columns))
    print(scraped_table_dim)

    # Fail with an actionable message when the table has a shape that no
    # ETL_SPECS entry describes (still a KeyError, as before).
    if scraped_table_dim not in ETL_SPECS:
        raise KeyError(
            f'{file_name}: unexpected scraped table shape {scraped_table_dim!r} '
            f'on page {table_page}; add it to ETL_SPECS or fix the page in TABLE_PAGES'
        )

    # Flatten the whole ISS table into a single row.
    # The bulletin date is what sits between the common prefix and '.pdf'.
    doc_date = datetime.fromisoformat(file_name[len(FILE_PREFIX):-len('.pdf')])
    ds.loc[doc_date] = scraped_df_to_row(df_pdf_norm, **ETL_SPECS[scraped_table_dim])

    print()
    # Give tabula's Java process some time before the next document.
    time.sleep(WAIT)

ds.tail()

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-03-12.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-03-16.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-03-23.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-03-26.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-03-30.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-04-02.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-04-06.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-04-09.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-04-16.pdf
10*17

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-04-23.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-04-28.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-05-07.pdf
10*16

Scraping
Bollettino-sorveglianza-integrata-COVID-19_2020-05-14.pdf
10*16

Scraping
Bollettino-sorveglianza-integ

Unnamed: 0,cases_male_0-9,cases_male_10-19,cases_male_20-29,cases_male_30-39,cases_male_40-49,cases_male_50-59,cases_male_60-69,cases_male_70-79,cases_male_80-89,cases_male_90-,deaths_male_0-9,deaths_male_10-19,deaths_male_20-29,deaths_male_30-39,deaths_male_40-49,deaths_male_50-59,deaths_male_60-69,deaths_male_70-79,deaths_male_80-89,deaths_male_90-,cases_female_0-9,cases_female_10-19,cases_female_20-29,cases_female_30-39,cases_female_40-49,cases_female_50-59,cases_female_60-69,cases_female_70-79,cases_female_80-89,cases_female_90-,deaths_female_0-9,deaths_female_10-19,deaths_female_20-29,deaths_female_30-39,deaths_female_40-49,deaths_female_50-59,deaths_female_60-69,deaths_female_70-79,deaths_female_80-89,deaths_female_90-
2021-03-31,89620,169342,212641,214295,269304,304629,206094,148933,94405,19322,4,8,32,129,638,2561,7566,17849,24239,7386,83182,155598,207159,224508,298767,318979,188279,142228,135929,61877,6,6,22,77,272,973,2725,8387,20356,14331
2021-04-07,93513,175889,220105,221722,279206,315457,213303,153943,96750,19819,4,7,33,131,666,2641,7838,18394,24901,7558,86907,161748,214036,231932,308701,329485,195237,147451,139369,62881,6,5,20,76,278,1003,2841,8670,20862,14623
2021-04-14,97371,181695,226352,228142,287344,324493,219247,158006,98569,20170,4,7,33,133,675,2731,8060,18921,25446,7735,90464,167253,220121,238619,317221,338450,201297,151982,141995,63696,6,5,22,77,287,1037,2956,8942,21297,14883
2021-04-21,101528,187636,232345,234447,294689,332526,224671,161756,100110,20448,4,7,36,133,697,2825,8345,19565,26093,7920,94484,172952,226031,245102,325276,346589,206789,156057,144184,64342,6,5,21,78,298,1089,3081,9284,21852,15157
2021-04-28,106302,193596,237720,240191,302049,339894,229667,164949,101328,20697,4,8,37,135,711,2900,8555,20000,26449,8060,98905,178360,231453,231453,333000,354206,211724,159369,146009,64943,6,6,22,78,305,1119,3173,9488,22175,15358


In [11]:
# Add the totals: one column per (measure, sex) pair, summing its age-band columns.
for total_column, age_band_columns in (
    ('cases_male_total', COLUMNS_CASES_MALE),
    ('deaths_male_total', COLUMNS_DEATHS_MALE),
    ('cases_female_total', COLUMNS_CASES_FEMALE),
    ('deaths_female_total', COLUMNS_DEATHS_FEMALE),
):
    ds[total_column] = ds[age_band_columns].sum(axis=1).astype(int)

In [12]:
# Check the transformed table (last five bulletins, totals included).
ds.tail(n=5)

Unnamed: 0,cases_male_0-9,cases_male_10-19,cases_male_20-29,cases_male_30-39,cases_male_40-49,cases_male_50-59,cases_male_60-69,cases_male_70-79,cases_male_80-89,cases_male_90-,deaths_male_0-9,deaths_male_10-19,deaths_male_20-29,deaths_male_30-39,deaths_male_40-49,deaths_male_50-59,deaths_male_60-69,deaths_male_70-79,deaths_male_80-89,deaths_male_90-,cases_female_0-9,cases_female_10-19,cases_female_20-29,cases_female_30-39,cases_female_40-49,cases_female_50-59,cases_female_60-69,cases_female_70-79,cases_female_80-89,cases_female_90-,deaths_female_0-9,deaths_female_10-19,deaths_female_20-29,deaths_female_30-39,deaths_female_40-49,deaths_female_50-59,deaths_female_60-69,deaths_female_70-79,deaths_female_80-89,deaths_female_90-,cases_male_total,deaths_male_total,cases_female_total,deaths_female_total
2021-03-31,89620,169342,212641,214295,269304,304629,206094,148933,94405,19322,4,8,32,129,638,2561,7566,17849,24239,7386,83182,155598,207159,224508,298767,318979,188279,142228,135929,61877,6,6,22,77,272,973,2725,8387,20356,14331,1728585,60412,1816506,47155
2021-04-07,93513,175889,220105,221722,279206,315457,213303,153943,96750,19819,4,7,33,131,666,2641,7838,18394,24901,7558,86907,161748,214036,231932,308701,329485,195237,147451,139369,62881,6,5,20,76,278,1003,2841,8670,20862,14623,1789707,62173,1877747,48384
2021-04-14,97371,181695,226352,228142,287344,324493,219247,158006,98569,20170,4,7,33,133,675,2731,8060,18921,25446,7735,90464,167253,220121,238619,317221,338450,201297,151982,141995,63696,6,5,22,77,287,1037,2956,8942,21297,14883,1841389,63745,1931098,49512
2021-04-21,101528,187636,232345,234447,294689,332526,224671,161756,100110,20448,4,7,36,133,697,2825,8345,19565,26093,7920,94484,172952,226031,245102,325276,346589,206789,156057,144184,64342,6,5,21,78,298,1089,3081,9284,21852,15157,1890156,65625,1981806,50871
2021-04-28,106302,193596,237720,240191,302049,339894,229667,164949,101328,20697,4,8,37,135,711,2900,8555,20000,26449,8060,98905,178360,231453,231453,333000,354206,211724,159369,146009,64943,6,6,22,78,305,1119,3173,9488,22175,15358,1936393,66859,2009422,51730


In [13]:
# Interpolate: put the bulletin rows on a daily calendar (the days in between
# become NaN) and fill the gaps, once linearly and once with a cubic curve.
# NOTE(review): in the 2021-04-28 row shown by the ETL output, cases_female_30-39
# equals cases_female_20-29 (231453) and is lower than the week before (245102);
# the cubic curve follows it downwards - verify that value against the PDF.
ds_daily = ds.astype(float).asfreq(freq='D')
ds_interp_linear = ds_daily.interpolate(method='linear')
ds_interp_cubic = ds_daily.interpolate(method='cubic')

ds_interp_cubic.tail(10)

Unnamed: 0,cases_male_0-9,cases_male_10-19,cases_male_20-29,cases_male_30-39,cases_male_40-49,cases_male_50-59,cases_male_60-69,cases_male_70-79,cases_male_80-89,cases_male_90-,deaths_male_0-9,deaths_male_10-19,deaths_male_20-29,deaths_male_30-39,deaths_male_40-49,deaths_male_50-59,deaths_male_60-69,deaths_male_70-79,deaths_male_80-89,deaths_male_90-,cases_female_0-9,cases_female_10-19,cases_female_20-29,cases_female_30-39,cases_female_40-49,cases_female_50-59,cases_female_60-69,cases_female_70-79,cases_female_80-89,cases_female_90-,deaths_female_0-9,deaths_female_10-19,deaths_female_20-29,deaths_female_30-39,deaths_female_40-49,deaths_female_50-59,deaths_female_60-69,deaths_female_70-79,deaths_female_80-89,deaths_female_90-,cases_male_total,deaths_male_total,cases_female_total,deaths_female_total
2021-04-19,100291.945961,185919.634123,230663.638264,232667.498956,292619.894849,330307.749034,223159.740284,160717.420002,99696.724288,20372.482076,4.0,6.975449,35.005951,133.050914,689.800053,2798.969649,8261.471917,19381.651758,25911.852119,7868.765344,93282.560216,171316.626015,224368.77692,244194.646167,323005.447037,344327.328197,205273.889607,154950.694344,143593.971977,64169.024436,6.000002,4.978767,21.502876,77.813133,294.959834,1073.977246,3045.978445,9187.218158,21693.879067,15080.151859,1876417.0,65091.543154,1968483.0,50486.459388
2021-04-20,100904.164377,186777.056566,231509.554013,233561.796722,293655.327593,331423.96139,223919.563525,161241.594069,99906.459757,20410.568672,4.0,6.980103,35.512017,133.013549,693.423974,2812.141636,8303.676179,19474.769373,26004.529293,7894.742878,93878.789938,172136.056324,225204.086466,244816.773194,324144.004406,345463.840345,206037.035001,155510.970317,143892.642197,64256.204869,6.000001,4.981873,21.245071,77.916664,296.510551,1081.632763,3063.736804,9236.578936,21774.532782,15119.120447,1883310.0,65362.789002,1975340.0,50682.255891
2021-04-21,101528.0,187636.0,232345.0,234447.0,294689.0,332526.0,224671.0,161756.0,100110.0,20448.0,4.0,7.0,36.0,133.0,697.0,2825.0,8345.0,19565.0,26093.0,7920.0,94484.0,172952.0,226031.0,245102.0,325276.0,346589.0,206789.0,156057.0,144184.0,64342.0,6.0,5.0,21.0,78.0,298.0,1089.0,3081.0,9284.0,21852.0,15157.0,1890156.0,65625.0,1981806.0,50871.0
2021-04-22,102164.42746,188495.331189,233167.833742,235320.754298,295722.978529,333614.467182,225413.701781,162259.038584,100306.948407,20484.839491,4.0,7.040305,36.447166,133.027268,700.412761,2837.470609,8384.793209,19650.965321,26175.531931,7944.338755,95097.393735,173762.004901,226847.95435,244976.369663,326401.240492,347703.506594,207529.597652,156587.458255,144467.929231,64426.876763,5.999999,5.038535,20.795745,78.062928,299.407816,1095.918258,3097.589727,9328.604737,21924.732525,15193.389757,1896950.0,65874.027325,1987800.0,51049.540027
2021-04-23,102814.421386,189353.916898,233975.912757,236180.705126,296759.329641,334689.965252,226147.320941,162749.11061,100496.908365,20521.150577,4.0,7.106184,36.830784,133.112352,703.546885,2849.479331,8422.405634,19731.287017,26250.392779,7967.561186,95718.174478,174563.618883,227653.386345,244365.925261,327519.532555,348808.059558,208258.641005,157101.019942,144744.313737,64511.302095,5.999998,5.102866,20.660389,78.105235,300.713635,1102.226836,3113.327677,9369.516535,21991.182157,15227.888958,1903689.0,66105.722152,1993244.0,51214.724286
2021-04-24,103478.956406,190210.623891,234767.09456,237024.497991,297800.119793,335753.096526,226871.509554,163224.616868,100679.483262,20556.996688,4.0,7.202803,37.12812,133.272251,706.287004,2860.952033,8457.187104,19804.586771,26315.850236,7989.469337,96345.545563,175354.389803,228445.732814,243196.709872,328630.682863,349903.358322,208975.943109,157596.359923,145013.037364,64595.742929,5.999997,5.198379,20.622016,78.126707,301.897091,1107.765033,3128.035542,9405.858782,22049.800699,15260.096842,1910367.0,66315.935658,1998058.0,51363.401088
2021-04-25,104159.007148,191064.318933,235539.236669,237849.778402,298847.415446,336804.463322,227585.919693,163683.958146,100854.276486,20592.441254,4.0,7.335329,37.316442,133.523966,708.517745,2871.814583,8488.487447,19869.486265,26370.171994,8009.86525,96978.710323,176131.865518,229223.430585,241394.766574,329734.498088,350990.102317,209681.317011,158072.153058,145273.983957,64680.666202,5.999997,5.330462,20.708707,78.127133,302.937821,1112.372148,3141.535015,9436.754864,22099.039953,15289.612648,1916981.0,66500.51902,2002161.0,51492.418748
2021-04-26,104855.548242,191913.868789,236290.196601,238654.191868,299903.283058,337844.667955,228290.203431,164125.535235,101020.891425,20627.547708,4.0,7.508926,37.373018,133.884496,710.123738,2881.992849,8515.656491,19924.607179,26411.625746,8028.550969,97616.872094,176893.593884,229984.916488,238886.138444,330830.784904,352068.990975,210374.575762,158527.074208,145527.037364,64766.538847,5.999997,5.504502,20.948546,78.106299,303.815459,1115.887482,3153.647787,9461.32817,22137.35172,15316.035617,1923526.0,66655.323413,2005477.0,51598.625578
2021-04-27,105569.554317,192758.140223,237017.831872,239435.383898,300969.78909,338874.312742,228984.012843,164547.748923,101178.931467,20662.37948,4.0,7.728761,37.275115,134.370841,710.989614,2891.412699,8538.044066,19968.571197,26438.479184,8045.328538,98259.234208,177637.12276,230728.62735,235596.86856,331919.349983,353140.723726,211055.532408,158959.798235,145772.081429,64853.827802,5.999998,5.725885,21.369616,78.063992,304.50964,1118.150332,3164.195551,9478.702086,22163.187801,15338.964988,1929998.0,66776.200015,2007923.0,51678.869891
2021-04-28,106302.0,193596.0,237720.0,240191.0,302049.0,339894.0,229667.0,164949.0,101328.0,20697.0,4.0,8.0,37.0,135.0,711.0,2900.0,8555.0,20000.0,26449.0,8060.0,98905.0,178360.0,231453.0,231453.0,333000.0,354206.0,211724.0,159369.0,146009.0,64943.0,6.0,6.0,22.0,78.0,305.0,1119.0,3173.0,9488.0,22175.0,15358.0,1936393.0,66859.0,2009422.0,51730.0


In [14]:
# Export the transformed tables to csv files (the index is written as the 'date' column).
for csv_name, table in (
    ('italy_cases_deaths_by_age_sex.csv', ds),
    ('italy_cases_deaths_by_age_sex_interp_linear.csv', ds_interp_linear),
    ('italy_cases_deaths_by_age_sex_interp_cubic.csv', ds_interp_cubic),
):
    table.to_csv(EXPORT_DIR + csv_name, index_label='date')

In [15]:
# Check roundtrip: read one exported file back, using its first column as a
# parsed date index, and display the last rows.
roundtrip_path = EXPORT_DIR+'italy_cases_deaths_by_age_sex_interp_cubic.csv'
chk_df = pd.read_csv(
    roundtrip_path,
    index_col=0,
    parse_dates=True
)
chk_df.tail(n=5)

Unnamed: 0_level_0,cases_male_0-9,cases_male_10-19,cases_male_20-29,cases_male_30-39,cases_male_40-49,cases_male_50-59,cases_male_60-69,cases_male_70-79,cases_male_80-89,cases_male_90-,deaths_male_0-9,deaths_male_10-19,deaths_male_20-29,deaths_male_30-39,deaths_male_40-49,deaths_male_50-59,deaths_male_60-69,deaths_male_70-79,deaths_male_80-89,deaths_male_90-,cases_female_0-9,cases_female_10-19,cases_female_20-29,cases_female_30-39,cases_female_40-49,cases_female_50-59,cases_female_60-69,cases_female_70-79,cases_female_80-89,cases_female_90-,deaths_female_0-9,deaths_female_10-19,deaths_female_20-29,deaths_female_30-39,deaths_female_40-49,deaths_female_50-59,deaths_female_60-69,deaths_female_70-79,deaths_female_80-89,deaths_female_90-,cases_male_total,deaths_male_total,cases_female_total,deaths_female_total
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
2021-04-24,103478.956406,190210.623891,234767.09456,237024.497991,297800.119793,335753.096526,226871.509554,163224.616868,100679.483262,20556.996688,4.0,7.202803,37.12812,133.272251,706.287004,2860.952033,8457.187104,19804.586771,26315.850236,7989.469337,96345.545563,175354.389803,228445.732814,243196.709872,328630.682863,349903.358322,208975.943109,157596.359923,145013.037364,64595.742929,5.999997,5.198379,20.622016,78.126707,301.897091,1107.765033,3128.035542,9405.858782,22049.800699,15260.096842,1910367.0,66315.935658,1998058.0,51363.401088
2021-04-25,104159.007148,191064.318933,235539.236669,237849.778402,298847.415446,336804.463322,227585.919693,163683.958146,100854.276486,20592.441254,4.0,7.335329,37.316442,133.523966,708.517745,2871.814583,8488.487447,19869.486265,26370.171994,8009.86525,96978.710323,176131.865518,229223.430585,241394.766574,329734.498088,350990.102317,209681.317011,158072.153058,145273.983957,64680.666202,5.999997,5.330462,20.708707,78.127133,302.937821,1112.372148,3141.535015,9436.754864,22099.039953,15289.612648,1916981.0,66500.51902,2002161.0,51492.418748
2021-04-26,104855.548242,191913.868789,236290.196601,238654.191868,299903.283058,337844.667955,228290.203431,164125.535235,101020.891425,20627.547708,4.0,7.508926,37.373018,133.884496,710.123738,2881.992849,8515.656491,19924.607179,26411.625746,8028.550969,97616.872094,176893.593884,229984.916488,238886.138444,330830.784904,352068.990975,210374.575762,158527.074208,145527.037364,64766.538847,5.999997,5.504502,20.948546,78.106299,303.815459,1115.887482,3153.647787,9461.32817,22137.35172,15316.035617,1923526.0,66655.323413,2005477.0,51598.625578
2021-04-27,105569.554317,192758.140223,237017.831872,239435.383898,300969.78909,338874.312742,228984.012843,164547.748923,101178.931467,20662.37948,4.0,7.728761,37.275115,134.370841,710.989614,2891.412699,8538.044066,19968.571197,26438.479184,8045.328538,98259.234208,177637.12276,230728.62735,235596.86856,331919.349983,353140.723726,211055.532408,158959.798235,145772.081429,64853.827802,5.999998,5.725885,21.369616,78.063992,304.50964,1118.150332,3164.195551,9478.702086,22163.187801,15338.964988,1929998.0,66776.200015,2007923.0,51678.869891
2021-04-28,106302.0,193596.0,237720.0,240191.0,302049.0,339894.0,229667.0,164949.0,101328.0,20697.0,4.0,8.0,37.0,135.0,711.0,2900.0,8555.0,20000.0,26449.0,8060.0,98905.0,178360.0,231453.0,231453.0,333000.0,354206.0,211724.0,159369.0,146009.0,64943.0,6.0,6.0,22.0,78.0,305.0,1119.0,3173.0,9488.0,22175.0,15358.0,1936393.0,66859.0,2009422.0,51730.0
