In [175]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.graph_objects as go
from datetime import datetime

## Constants

In [2]:
base_dir = Path('/Users/eandreas/projects/dev/covid-19/bag_data_download').resolve()

cases_dir = base_dir / 'downloads' / 'cases_data'
report_dir = base_dir / 'downloads' / 'report_data'
test_dir = base_dir / 'downloads' / 'test_data'
csv_dir = base_dir / 'csv'

## Methods

In [3]:
def get_latest(directory, prefix = '', suffix='', n = 1):
    time, latest = sorted((f.stat().st_mtime, f) for f in directory.glob(prefix + '*' + suffix))[-n]
    return latest

In [13]:
def load_data(nn = 1):
    # create DataFrame from Excel
    xlsx = get_latest(cases_dir, prefix='2020', suffix='.xlsx', n = nn)
    print(xlsx)
    df = pd.read_excel(xlsx)
    
    renames = {
        'replikation_dt': 'date',
        'fall_dt' : 'case_date',
        'ktn': 'canton',
        'akl': 'age_class',
        'fallklasse_3': 'conf',
        'pttod_1': 'deceased',
        'pttoddat': 'deceased_date'
    }
    
    # rename columns
    df.rename(columns = renames,inplace=True)
    
    # split datetime column into a date and a time column
    #time_list = pd.to_datetime(df['date'], dayfirst=True).dt.time
    #df.insert(loc=1, column='time', value=time_list)                 
    #df['date'] = pd.to_datetime(df['date'], dayfirst=True).dt.date
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df['case_date'] = pd.to_datetime(df['case_date'], dayfirst=True).dt.date
    df['case_date'] = pd.to_datetime(df['case_date'], dayfirst=True)
    df['deceased_date'] = pd.to_datetime(df['deceased_date'], dayfirst=True).dt.date
    df['deceased_date'] = pd.to_datetime(df['deceased_date'], dayfirst=True)

    # clean up
    df.loc[:,'sex'] = np.where(df['sex'] == 1, 'm', np.where(df['sex'] == 2, 'f', 'n/a'))
    df.drop('Geschlecht', axis=1, inplace=True)
    df.drop('Sexe', axis=1, inplace=True)

    # insert column for country
    df.insert(loc=3, column='country', value=np.where(df['canton'] == 'FL', 'FL', 'CH'))

    return df

## Laden der BAG-Daten

In [14]:
df_latest = load_data(1)
df_second_latest = load_data(2)

/Users/eandreas/projects/dev/covid-19/bag_data_download/downloads/cases_data/2020-10-23_13-00_Dashboards_1&2_COVID19_swiss_data_pv.xlsx
/Users/eandreas/projects/dev/covid-19/bag_data_download/downloads/cases_data/2020-10-22_13-00_Dashboards_1&2_COVID19_swiss_data_pv.xlsx


## Neue und bisherige Fälle / Tag - ganze Schweiz

In [25]:
index = pd.MultiIndex.from_frame(df_latest[['case_date']])
df_ch = df_latest.copy()
df_ch.set_index(index, inplace=True)
df_ch = df_ch.drop(['date', 'case_date', 'age_class', 'sex', 'deceased_date', 'deceased'], axis=1).reset_index()
df_ch = df_ch.groupby('case_date').sum().reset_index()

In [17]:
index = pd.MultiIndex.from_frame(df_second_latest[['case_date']])
df_ch_y = df_second_latest.copy()
df_ch_y.set_index(index, inplace=True)
df_ch_y = df_ch_y.drop(['date', 'case_date', 'age_class', 'sex', 'deceased_date', 'deceased'], axis=1).reset_index()
df_ch_y = df_ch_y.groupby('case_date').sum().reset_index()
df_ch_y.rename(columns = {'conf': 'conf_yesterday'}, inplace=True)

In [18]:
df_ch = df_ch.merge(df_ch_y, how='left', on='case_date')
df_ch.fillna(value = 0, inplace=True)
df_ch['new_conf'] = (df_ch.conf - df_ch.conf_yesterday).astype(int)
df_ch.drop(columns=['conf_yesterday'], axis = 1, inplace = True)

In [20]:
fig = go.Figure(data=[
    go.Bar(x=df_ch.case_date, y=df_ch.conf-df_ch.new_conf, marker_color='indianred', name = 'bisher'),
    go.Bar(x=df_ch.case_date, y=df_ch.new_conf, marker_color='lightsalmon', name = 'neu')
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()

## Write CSV

In [21]:
df_latest.head()

Unnamed: 0,date,case_date,canton,country,age_class,sex,conf,deceased_date,deceased
0,2020-10-23 07:52:18,2020-02-24,TI,CH,70 - 79,m,1,NaT,0
1,2020-10-23 07:52:18,2020-02-25,AG,CH,20 - 29,m,1,NaT,0
2,2020-10-23 07:52:18,2020-02-26,BL,CH,20 - 29,m,1,NaT,0
3,2020-10-23 07:52:18,2020-02-26,BS,CH,20 - 29,f,1,NaT,0
4,2020-10-23 07:52:18,2020-02-26,GE,CH,20 - 29,m,1,NaT,0


In [12]:
df_latest.to_csv(csv_dir / 'cases_all.csv', index = False)

In [13]:
url = 'https://raw.githubusercontent.com/eandreas/bag_data_download/master/csv/cases_all.csv'

In [16]:
df_tmp = pd.read_csv(url, parse_dates=[['date', 'time'], 'case_date', 'deceased_date'])
df_tmp.head()

Unnamed: 0,date_time,case_date,country,canton,age_class,sex,conf,deceased_date,deceased
0,2020-10-23 07:52:18,2020-02-24,CH,TI,70 - 79,m,1,NaT,0
1,2020-10-23 07:52:18,2020-02-25,CH,AG,20 - 29,m,1,NaT,0
2,2020-10-23 07:52:18,2020-02-26,CH,BL,20 - 29,m,1,NaT,0
3,2020-10-23 07:52:18,2020-02-26,CH,BS,20 - 29,f,1,NaT,0
4,2020-10-23 07:52:18,2020-02-26,CH,GE,20 - 29,m,1,NaT,0


## Playgrounds

In [209]:
df_ch = df_latest.copy()

In [210]:
df_c = pd.DataFrame()
df_c['date'] = df_ch['case_date']
df_c['country'] = df_ch['country']
df_c['canton'] = df_ch['canton']
df_c['age_class'] = df_ch['age_class']
df_c['sex'] = df_ch['sex']
df_c['conf'] = df_ch['conf']
df_c = df_c[df_c.date.notnull()]

In [211]:
df_c.head()

Unnamed: 0,date,country,canton,age_class,sex,conf
0,2020-02-24,CH,TI,70 - 79,m,1
1,2020-02-25,CH,AG,20 - 29,m,1
2,2020-02-26,CH,BL,20 - 29,m,1
3,2020-02-26,CH,BS,20 - 29,f,1
4,2020-02-26,CH,GE,20 - 29,m,1


In [212]:
df_d = pd.DataFrame()
df_d['date'] = df_ch['deceased_date']
df_d['country'] = df_ch['country']
df_d['canton'] = df_ch['canton']
df_d['age_class'] = df_ch['age_class']
df_d['sex'] = df_ch['sex']
df_d['deceased'] = df_ch['deceased']
df_d = df_d[df_d.date.notnull()]

In [213]:
df_d.head()

Unnamed: 0,date,country,canton,age_class,sex,deceased
196830,2020-05-26,CH,AG,0 - 9,m,1
196838,2020-03-28,CH,AG,30 - 39,f,1
196842,2020-05-01,CH,AG,40 - 49,f,1
196845,2020-08-23,CH,AG,50 - 59,m,1
196846,2020-09-11,CH,AG,50 - 59,m,1


In [214]:
df_c_d = pd.merge(df_c, df_d, how='outer', on=['date', 'country', 'canton', 'age_class', 'sex'])
df_c_d.fillna(0, inplace=True)
df_c_d.deceased = df_c_d.deceased.astype(int)

In [215]:
df_c_d.head()

Unnamed: 0,date,country,canton,age_class,sex,conf,deceased
0,2020-02-24,CH,TI,70 - 79,m,1,0
1,2020-02-25,CH,AG,20 - 29,m,1,0
2,2020-02-26,CH,BL,20 - 29,m,1,0
3,2020-02-26,CH,BS,20 - 29,f,1,0
4,2020-02-26,CH,GE,20 - 29,m,1,0


In [216]:
df_c.conf.sum(), df_d.deceased.sum(), df_c_d.conf.sum(), df_c_d.deceased.sum()

(103653, 1877, 103653, 1877)

In [217]:
df_c_d.to_csv(csv_dir / 'cases_all_indexed.csv', index = False)

In [251]:
# all cantons including FL
CANTONS = {
    1: "AG",
    2: "AI",
    3: "AR",
    4: "BE",
    5: "BL",
    6: "BS",
    7: "FL",
    8: "FR",
    9: "GE",
    10: "GL",
    11: "GR",
    12: "JU",
    13: "LU",
    14: "NE",
    15: "NW",
    16: "OW",
    17: "SG",
    18: "SH",
    19: "SO",
    20: "SZ",
    21: "TG",
    22: "TI",
    23: "UR",
    24: "VD",
    25: "VS",
    26: "ZG",
    27: "ZH"
}

AGE_CLASSES = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-50', '60-69', '70-79', '80+', 'ukn']

SEX = ['f', 'm', 'ukn']

VALUES = ['conf', 'dead']

In [255]:
idx = list()

for v in VALUES:
    for c in CANTONS.values():
        for s in SEX:
            for a in AGE_CLASSES:
                idx.append(f'{c}_{s}_{a}_{v}')

In [256]:
df_test = pd.DataFrame(columns=idx)

In [257]:
df_test

Unnamed: 0,AG_f_0-9_conf,AG_f_10-19_conf,AG_f_20-29_conf,AG_f_30-39_conf,AG_f_40-49_conf,AG_f_50-50_conf,AG_f_60-69_conf,AG_f_70-79_conf,AG_f_80+_conf,AG_f_ukn_conf,...,ZH_ukn_0-9_dead,ZH_ukn_10-19_dead,ZH_ukn_20-29_dead,ZH_ukn_30-39_dead,ZH_ukn_40-49_dead,ZH_ukn_50-50_dead,ZH_ukn_60-69_dead,ZH_ukn_70-79_dead,ZH_ukn_80+_dead,ZH_ukn_ukn_dead


In [275]:
df_ch.conf[(df_ch.canton == 'AG') & (df_ch.sex == 'f') & (df_ch.age_class == '20 - 29') & (df_ch.case_date == '2020-10-22')].sum()

10

In [271]:
df_ch.tail()

Unnamed: 0,date,case_date,canton,country,age_class,sex,conf,deceased_date,deceased
386083,2020-10-23 07:52:18,NaT,ZH,CH,80+,f,0,2020-10-23,0
386084,2020-10-23 07:52:18,NaT,ZH,CH,80+,,0,2020-10-23,0
386085,2020-10-23 07:52:18,NaT,ZH,CH,Unbekannt,m,0,2020-10-23,0
386086,2020-10-23 07:52:18,NaT,ZH,CH,Unbekannt,f,0,2020-10-23,0
386087,2020-10-23 07:52:18,NaT,ZH,CH,Unbekannt,,0,2020-10-23,0


In [219]:
df_c_d.replace('CH', 0, inplace=True)
df_c_d.replace('FL', 1, inplace=True)
df_c_d.replace('0 - 9', 0, inplace=True)
df_c_d.replace('10 - 19', 1, inplace=True)
df_c_d.replace('20 - 29', 2, inplace=True)
df_c_d.replace('30 - 39', 3, inplace=True)
df_c_d.replace('40 - 49', 4, inplace=True)
df_c_d.replace('50 - 59', 5, inplace=True)
df_c_d.replace('60 - 69', 6, inplace=True)
df_c_d.replace('70 - 79', 7, inplace=True)
df_c_d.replace('80+', 8, inplace=True)
df_c_d.replace('Unbekannt', 9, inplace=True)
df_c_d.replace('f', 0, inplace=True)
df_c_d.replace('m', 1, inplace=True)
df_c_d.replace('n/a', 2, inplace=True)
df_c_d.rename
for k, v in CANTONS.items():
    df_c_d.replace(v, k, inplace=True)

In [239]:
CANTONS.values()

dict_values(['AG', 'AI', 'AR', 'BE', 'BL', 'BS', 'FL', 'FR', 'GE', 'GL', 'GR', 'JU', 'LU', 'NE', 'NW', 'OW', 'SG', 'SH', 'SO', 'SZ', 'TG', 'TI', 'UR', 'VD', 'VS', 'ZG', 'ZH'])

In [None]:
columns = ['date', 'AG']