In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
#import plotly.graph_objects as go
#from datetime import datetime

## Constants

In [2]:
base_dir = Path('/Users/eandreas/projects/dev/covid-19/bag_data_download').resolve()

cases_dir = base_dir / 'downloads' / 'cases_data'
report_dir = base_dir / 'downloads' / 'report_data'
test_dir = base_dir / 'downloads' / 'test_data'
csv_dir = base_dir / 'csv'

In [3]:
# all cantons including FL
CANTONS = {
    1: "AG",
    2: "AI",
    3: "AR",
    4: "BE",
    5: "BL",
    6: "BS",
    7: "FL",
    8: "FR",
    9: "GE",
    10: "GL",
    11: "GR",
    12: "JU",
    13: "LU",
    14: "NE",
    15: "NW",
    16: "OW",
    17: "SG",
    18: "SH",
    19: "SO",
    20: "SZ",
    21: "TG",
    22: "TI",
    23: "UR",
    24: "VD",
    25: "VS",
    26: "ZG",
    27: "ZH"
}

AGE_CLASSES = {
    '0-9': '0 - 9',
    '10-19': '10 - 19',
    '20-29': '20 - 29',
    '30-39': '30 - 39',
    '40-49': '40 - 49',
    '50-59': '50 - 59',
    '60-69': '60 - 69',
    '70-79': '70 - 79',
    '80+': '80+',
    'ukn': 'Unbekannt'
}

SEX = {
    'f': 'f',
    'm': 'm',
    'ukn': 'n/a'
}

VALUES = ['conf', 'deceased']

## Methods

In [4]:
def get_latest(directory, prefix = '', suffix='', n = 1):
    time, latest = sorted((f.stat().st_mtime, f) for f in directory.glob(prefix + '*' + suffix))[-n]
    return latest

In [13]:
def load_bag_conf_excel(nn = 1):
    # create DataFrame from Excel
    xlsx = get_latest(cases_dir, prefix='2020', suffix='.xlsx', n = nn)
    print(xlsx)
    df = pd.read_excel(xlsx)

    # rename and clean up redundant
    df.loc[:,'sex'] = np.where(df['sex'] == 1, 'm', np.where(df['sex'] == 2, 'f', 'n/a'))
    df.drop('Geschlecht', axis=1, inplace=True)
    df.drop('Sexe', axis=1, inplace=True)
    
    df.replikation_dt = df.replikation_dt.astype('datetime64')
    df.fall_dt = df.fall_dt.astype('datetime64')
    df.pttoddat = df.pttoddat.astype('datetime64')
    df.ktn = df.ktn.astype('category')
    df.akl = df.akl.astype('category')
    df.sex = df.sex.astype('category')
    df.fallklasse_3 = df.fallklasse_3.astype('int64')
    df.pttod_1 = df.pttod_1.astype('int64')

    return df

In [147]:
## REPAIR AS FIXED BELOW
asdf()

def load_and_rearrange_data(n = 1, sparce=True):
    # load data from bag excel
    df = load_bag_conf_excel(n)
    # extract conf part only and remove rows without conf information
    df_c = pd.DataFrame()
    df_c['date'] = df['fall_dt']
    df_c['canton'] = df['ktn']
    df_c['age_class'] = df['akl']
    df_c['sex'] = df['sex']
    df_c['conf'] = df['fallklasse_3']
    df_c = df_c[df_c.date.notnull()]
    # extract decease part only and remove rows without decease information
    df_d = pd.DataFrame()
    df_d['date'] = df['pttoddat']
    df_d['canton'] = df['ktn']
    df_d['age_class'] = df['akl']
    df_d['sex'] = df['sex']
    df_d['deceased'] = df['pttod_1']
    df_d = df_d[df_d.date.notnull()]
    # merge conf and decease together
    df_c_d = pd.merge(df_c, df_d, how='outer', on=['date', 'canton', 'age_class', 'sex'])
    # build new rearranged data frame
    df_final = pd.DataFrame(columns=['date'])
    for c in CANTONS.values():
        for s_k, s_v in SEX.items():
            for a_k, a_v in AGE_CLASSES.items():
                df_tmp = df_c_d[
                    (df_c_d.canton == c) & 
                    (df_c_d.sex == s_v) & 
                    (df_c_d.age_class == a_v)].groupby('date').sum()
                df_final = pd.merge(df_final, df_tmp, how='outer', on=['date'])
                bn = f'{c}_{s_k}_{a_k}'
                df_final.rename(columns = {
                    'conf': f'{bn}_conf','deceased': f'{bn}_dead'},inplace=True)
    if (sparce):
        df_final.replace(0, np.nan, inplace=True)
    return df_final

NameError: name 'asdf' is not defined

In [15]:
def get_data_subset(df, a, start_date = None, end_date = None):
    col = (np.core.defchararray.find(df.columns.values.astype(str), a[0]) >= 0)
    for i in range(1, len(a)):
        col = col & (np.core.defchararray.find(df.columns.values.astype(str), a[i]) >= 0)
    col = col | (np.core.defchararray.find(df.columns.values.astype(str), 'date') >= 0)
    dfs = pd.DataFrame(df.values[:, col], df.index, df.columns[col])
    if start_date is not None:
        dfs = dfs[dfs.date >= start_date]
    if end_date is not None:
        dfs = dfs[dfs.date <= end_date]
    return dfs

In [24]:
df = load_and_rearrange_data(n=12, sparce=True)

/Users/eandreas/projects/dev/covid-19/bag_data_download/downloads/cases_data/2020-12-01_14-00_Dashboards_1&2_COVID19_swiss_data_pv.xlsx


## Write CSV

In [28]:
df.to_csv(csv_dir / 'cases_all.csv', index = False)

## Checks & Tests

In [29]:
df = pd.read_csv(csv_dir / 'cases_all.csv', parse_dates=['date'])

In [30]:
dfn = get_data_subset(df, ['AG'])
dfn

Unnamed: 0,date,AG_f_0-9_conf,AG_f_0-9_dead,AG_f_10-19_conf,AG_f_10-19_dead,AG_f_20-29_conf,AG_f_20-29_dead,AG_f_30-39_conf,AG_f_30-39_dead,AG_f_40-49_conf,...,AG_ukn_50-59_conf,AG_ukn_50-59_dead,AG_ukn_60-69_conf,AG_ukn_60-69_dead,AG_ukn_70-79_conf,AG_ukn_70-79_dead,AG_ukn_80+_conf,AG_ukn_80+_dead,AG_ukn_ukn_conf,AG_ukn_ukn_dead
0,2020-02-24,,,,,,,,,,...,,,,,,,,,,
1,2020-02-25,,,,,,,,,,...,,,,,,,,,,
2,2020-02-26,,,,,,,,,,...,,,,,,,,,,
3,2020-02-27,,,,,,,,,,...,,,,,,,,,,
4,2020-02-28,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,2020-11-27,,,20,,26,,27,,34,...,,,,,,,,,,
278,2020-11-28,1,,7,,22,,18,,21,...,,,,,,,,,,
279,2020-11-29,1,,5,,16,,12,,9,...,,,,,,,,,,
280,2020-11-30,,,15,,21,,21,,21,...,,,,,,,,,,


In [31]:
get_data_subset(df, ['_conf']).sum().sum()

355327.0

In [35]:
get_data_subset(df, ['_conf']).max()

date                 2020-12-01 00:00:00
AG_f_0-9_conf                          6
AG_f_10-19_conf                       31
AG_f_20-29_conf                       55
AG_f_30-39_conf                       46
                            ...         
ZH_ukn_50-59_conf                      1
ZH_ukn_60-69_conf                      2
ZH_ukn_70-79_conf                      1
ZH_ukn_80+_conf                      NaN
ZH_ukn_ukn_conf                      NaN
Length: 811, dtype: object

## Playgrounds - New confirmed cases with latest in different color

In [36]:
get_data_subset(df, ['_dead']).sum().sum()

50029.0

In [140]:
# load data from bag excel
#df = load_bag_conf_excel(12)
# extract conf part only and remove rows without conf information
df_c = pd.DataFrame()
df_c = df.loc[:,['fall_dt', 'ktn', 'akl', 'sex', 'fallklasse_3']]
df_c.rename(
    columns = {'fall_dt':'date', 'ktn':'canton', 'akl':'age_class', 'sex':'sex', 'fallklasse_3':'conf'},
    inplace = True
)
df_c = df_c.groupby(['date', 'canton', 'age_class', 'sex']).sum().reset_index()
#df_c['date'] = df['fall_dt']
#df_c['canton'] = df['ktn']
#df_c['age_class'] = df['akl']
#df_c['sex'] = df['sex']
#df_c['conf'] = df['fallklasse_3']
df_c = df_c[df_c.date.notnull()]
## extract decease part only and remove rows without decease information
df_d = pd.DataFrame()
df_d = df.loc[:,['pttoddat', 'ktn', 'akl', 'sex', 'pttod_1']]
df_d.rename(
    columns = {'pttoddat':'date', 'ktn':'canton', 'akl':'age_class', 'sex':'sex', 'pttod_1':'deceased'},
    inplace = True
)
df_d = df_d.groupby(['date', 'canton', 'age_class', 'sex']).sum().reset_index()
#df_d['date'] = df['pttoddat']
#df_d['canton'] = df['ktn']
#df_d['age_class'] = df['akl']
#df_d['sex'] = df['sex']
#df_d['deceased'] = df['pttod_1']
df_d = df_d[df_d.date.notnull()]
## merge conf and decease together
df_c_d = pd.merge(df_c, df_d, how='outer', on=['date', 'canton', 'age_class', 'sex'])
# build new rearranged data frame
df_final = pd.DataFrame(columns=['date'])
for c in CANTONS.values():
    for s_k, s_v in SEX.items():
        for a_k, a_v in AGE_CLASSES.items():
            df_tmp = df_c_d[
                (df_c_d.canton == c) & 
                (df_c_d.sex == s_v) & 
                (df_c_d.age_class == a_v)].groupby('date').sum()
            df_final = pd.merge(df_final, df_tmp, how='outer', on=['date'])
            bn = f'{c}_{s_k}_{a_k}'
            df_final.rename(columns = {
                'conf': f'{bn}_conf','deceased': f'{bn}_dead'},inplace=True)
#if (sparce):
df_final.replace(0, np.nan, inplace=True)
#return df_final

df_c['conf'].sum(), df_d['deceased'].sum(), df_c_d['conf'].sum(), df_c_d['deceased'].sum()

(330874, 4552, 330874, 4552.0)

In [141]:
df_c_d.conf.sum()

330874

In [146]:
get_data_subset(df_final, ['_dead']).sum().sum()

4552.0