In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Load the raw report table from the working directory.
# (File name suggests an RKI snapshot dated 2020-09-26 — TODO confirm provenance.)
rki_raw = pd.read_csv('rki_raw_200926.csv')

In [3]:
# NOTE(review): this writes the frame straight back to the very file it was just read
# from (without the index column). It looks like a leftover from an earlier download
# step — confirm whether it is still needed, since it overwrites the raw input.
rki_raw.to_csv('rki_raw_200926.csv',index=False)

In [4]:
# Expose the district identifier under the key name used by the rest of the notebook.
rki_raw = rki_raw.rename(columns={'IdLandkreis': 'districtId'})

# Map every row whose Bundesland is Berlin onto the single district id 11000.
# (Presumably the static data treats Berlin as one district — TODO confirm.)
rki_raw.loc[rki_raw['Bundesland'].eq('Berlin'), 'districtId'] = 11000


In [5]:
# Keep only rows with a non-negative NeuerFall flag.
# (Presumably negative values mark retracted/corrected case rows — verify against the
# data provider's documentation.)
rki_infect = rki_raw.loc[rki_raw['NeuerFall'] >= 0]

In [6]:
# Keep only rows with a non-negative NeuerTodesfall flag.
# (Presumably negative values mean "not a death" or a retracted entry — verify against
# the data provider's documentation.)
rki_deaths = rki_raw.loc[rki_raw['NeuerTodesfall'] >= 0]

In [7]:
# Columns needed for both cases and deaths.
cols = ['districtId','Altersgruppe','Geschlecht','Meldedatum']  # "Melde" instead of "Ref" — presumably Meldedatum rather than Refdatum; TODO confirm
# The district name (Landkreisname) is already in the static data; Bundesland is probably
# not needed and is contained in the AGS anyway.

In [8]:
def aggr_reports(df, count_col, key_cols=None):
    """Aggregate report rows per district, once per day and once per calendar week.

    For each value of 'Geschlecht' and 'Altersgruppe' an extra column is created
    (e.g. ``Geschlecht_M``) that holds ``count_col`` for rows of that category and
    0 otherwise, so that summing yields the count per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Report rows; must contain ``count_col`` and every column in ``key_cols``.
    count_col : str
        Name of the column holding the number of cases/deaths per row.
    key_cols : list of str, optional
        Columns to carry over next to ``count_col``. Must include 'districtId',
        'Meldedatum', 'Geschlecht' and 'Altersgruppe'. Defaults to the
        notebook-level ``cols`` list.

    Returns
    -------
    (daily_output, weekly_output) : tuple of pandas.DataFrame
        Sums indexed by (districtId, Meldedatum) and (districtId, week_no).

    Notes
    -----
    ``week_no`` is the ISO week number without the year, so weeks of different
    years would be pooled if the data ever spans more than one year.
    """
    if key_cols is None:
        key_cols = cols  # notebook-level default defined in an earlier cell
    category_cols = ['Geschlecht', 'Altersgruppe']

    df = df[list(key_cols) + [count_col]].copy()

    # Prepare datetime columns. Rows without a date cannot be assigned to a day or
    # week (groupby would discard them anyway), so drop them before the integer cast.
    df['Meldedatum'] = pd.to_datetime(df['Meldedatum'])
    df = df.dropna(subset=['Meldedatum'])
    # Series.dt.week was removed in pandas 2.0; isocalendar().week is the replacement.
    # Cast to a plain int64 so downstream range()/merge logic sees ordinary integers.
    df['week_no'] = df['Meldedatum'].dt.isocalendar().week.astype('int64')

    # One column per gender / age cohort, each holding the row's count for that
    # category (indicator multiplied by count_col).
    for col in category_cols:
        dummy_df = pd.get_dummies(df[col], prefix=col, dtype='int64')
        weighted = dummy_df.mul(df[count_col], axis=0)
        df = pd.concat([df, weighted], axis=1)

    # The raw category columns are strings; remove them explicitly so that sum()
    # does not try to aggregate (i.e. concatenate) them on newer pandas versions.
    numeric = df.drop(columns=category_cols)

    daily_output = numeric.drop(columns='week_no').groupby(['districtId', 'Meldedatum']).sum()
    weekly_output = numeric.drop(columns='Meldedatum').groupby(['districtId', 'week_no']).sum()

    return daily_output, weekly_output

In [9]:
# Aggregate cases and deaths separately; each call returns a (daily, weekly) pair.
daily_infects, weekly_infects = aggr_reports(df=rki_infect, count_col='AnzahlFall')
daily_deaths, weekly_deaths = aggr_reports(df=rki_deaths, count_col='AnzahlTodesfall')

In [10]:
# Outer-join the daily case and death aggregates on their shared
# (districtId, Meldedatum) index.
# pd.merge's `suffixes` are only applied to column names that occur in BOTH frames, so
# a gender/age category present in just one of them would keep its bare name and be
# indistinguishable from the other side. Tag every breakdown column explicitly instead;
# the two count columns keep their original names.
rki_daily = pd.merge(
    daily_infects.rename(columns=lambda c: c if c == 'AnzahlFall' else c + '_infected'),
    daily_deaths.rename(columns=lambda c: c if c == 'AnzahlTodesfall' else c + '_deceased'),
    how='outer', left_index=True, right_index=True,
).reset_index()

In [11]:
# Outer-join the weekly case and death aggregates on their shared
# (districtId, week_no) index.
# pd.merge's `suffixes` are only applied to column names that occur in BOTH frames, so
# a gender/age category present in just one of them would keep its bare name and be
# indistinguishable from the other side. Tag every breakdown column explicitly instead;
# the two count columns keep their original names.
rki_weekly = pd.merge(
    weekly_infects.rename(columns=lambda c: c if c == 'AnzahlFall' else c + '_infected'),
    weekly_deaths.rename(columns=lambda c: c if c == 'AnzahlTodesfall' else c + '_deceased'),
    how='outer', left_index=True, right_index=True,
).reset_index()

Next, make sure that no days or weeks are missing from the data because of zero reported cases:

In [12]:
# Sorted array of every distinct district id occurring in the raw reports.
districts = np.unique(rki_raw['districtId'])

In [13]:
# Date range covered by the aggregated reports.
first_d = rki_daily['Meldedatum'].min()
last_d = rki_daily['Meldedatum'].max()

# One entry per calendar day from first_d to last_d (inclusive).
all_dates = [first_d + dt.timedelta(days=offset) for offset in range((last_d - first_d).days + 1)]

# Every (district, day) combination, district-major.
all_combin_d = pd.MultiIndex.from_product(
    [districts, all_dates], names=['districtId', 'Meldedatum']
).to_frame(index=False)

# Attach the observed values; combinations without any report become 0.
rki_daily = all_combin_d.merge(rki_daily, how='left', on=['districtId', 'Meldedatum']).fillna(0)

In [14]:
# Range of week numbers covered by the aggregated reports.
first_w = rki_weekly['week_no'].min()
last_w = rki_weekly['week_no'].max()

# Every week number from first_w to last_w (inclusive).
all_weeks = [first_w + offset for offset in range(last_w - first_w + 1)]

# Every (district, week) combination, district-major.
all_combin_w = pd.MultiIndex.from_product(
    [districts, all_weeks], names=['districtId', 'week_no']
).to_frame(index=False)

# Attach the observed values; combinations without any report become 0.
rki_weekly = all_combin_w.merge(rki_weekly, how='left', on=['districtId', 'week_no']).fillna(0)

Read in static data for population sizes:

In [15]:
# Load the pre-processed static per-district table; the next cell takes
# 'districtId' and 'total_population' from it.
static_data = pd.read_csv('processed_static_data.csv')

In [16]:
# Two-column lookup table: district id -> total population.
populations = static_data.loc[:, ['districtId', 'total_population']]

Add cumulative case numbers and per capita values:

In [17]:
def cumulate_and_per_cap(df):
    """Return a copy of ``df`` extended by running totals and per-capita values.

    Every column from the third one onwards gets a ``<col>_cumul`` companion holding
    its cumulative sum within each 'districtId' (in the frame's current row order).
    The result is then merged with the notebook-level ``populations`` table, and the
    columns 'AnzahlFall', 'AnzahlTodesfall' and their ``_cumul`` companions are
    divided by 'total_population' into ``<col>_per_cap`` columns.

    The input frame itself is left unchanged.
    """
    # Everything after the two leading key columns is treated as a value column.
    value_cols = list(df.columns[2:])

    by_district = df.groupby('districtId')
    running_totals = {name + '_cumul': by_district[name].cumsum() for name in value_cols}
    enriched = df.assign(**running_totals)

    # Default (inner) merge: districts missing from `populations` drop out here.
    enriched = enriched.merge(populations, on='districtId')

    # Plain counts first, then their cumulative counterparts.
    per_cap_sources = [base + suffix
                       for suffix in ('', '_cumul')
                       for base in ('AnzahlFall', 'AnzahlTodesfall')]
    per_capita = {name + '_per_cap': enriched[name] / enriched['total_population']
                  for name in per_cap_sources}

    return enriched.assign(**per_capita)
        

In [18]:
# Add cumulative and per-capita columns to both tables.
# NOTE: each name is rebound to its own enriched result, so running this cell a second
# time without re-running the cells above would feed the already-extended frames back in.
rki_weekly = cumulate_and_per_cap(df=rki_weekly)
rki_daily = cumulate_and_per_cap(df=rki_daily)

In [19]:
# Write the final weekly and daily tables to CSV in the working directory,
# without the row index.
rki_weekly.to_csv('rki_weekly.csv',index=False)
rki_daily.to_csv('rki_daily.csv',index=False)