### Import libraries

In [7]:
import pandas as pd
import numpy as np
import datetime as dt
import holidays

### Prepare holiday calendars

In [8]:
# Province codes exposed by the `holidays` package for Germany; each one is
# later passed as `prov=` to holidays.DE to build a per-province calendar.
german_provinces = holidays.DE.PROVINCES

In [9]:
# One holiday calendar per province code, pre-populated for 2019-2021.
holiday_dict = {
    prov_code: holidays.DE(years=[2019, 2020, 2021], prov=prov_code)
    for prov_code in german_provinces
}

Bavaria has a regular holiday calendar (BY) plus a second one, BYP, for Protestant districts. Open question: I would simply ignore this distinction for now and treat Assumption Day (Mariä Himmelfahrt) as a day off in all Bavarian districts?

### Read in edge table and mobility volumes

In [10]:
# Load the edge table and the per-district, per-day mobility volumes
# from CSV files in the working directory.
edgetable, mobvolume = (
    pd.read_csv(csv_path)
    for csv_path in (
        'teralytics_2019_20_edge_table.csv',
        'teralytics_2019_20_mobility_per_district_and_day.csv',
    )
)

FileNotFoundError: [Errno 2] File teralytics_2019_20_edge_table.csv does not exist: 'teralytics_2019_20_edge_table.csv'

### Prepare datetime variables

In [None]:
# Parse the date strings and derive week number and year for both tables.
# `Series.dt.week` is deprecated (and removed in pandas 2.0), so the ISO
# calendar is queried explicitly. Week AND year both come from the ISO
# calendar: pairing an ISO week with the plain calendar year would put days
# around New Year (e.g. 2019-12-30, which is ISO week 1 of 2020) into week 1
# of the wrong year when grouping by (year, week_no).
for df in [edgetable, mobvolume]:
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    iso_calendar = df['date'].dt.isocalendar()
    # isocalendar() returns nullable UInt32 columns; cast to plain int64 so the
    # keys behave like ordinary integer columns in later groupbys and merges.
    # NOTE: the cast raises if a date is missing (NaT).
    df['week_no'] = iso_calendar['week'].astype('int64')
    df['year'] = iso_calendar['year'].astype('int64')

# Leave the day of the week in this string format for easier daily dummy
# creation if need be.
mobvolume['day_of_the_week'] = mobvolume['date'].dt.day_name()

### Get provinces from districtIds

In [None]:
# Lookup from the leading digits of the AGS district code to the province
# abbreviation used as key of the holiday calendars.
prov_ags_dict = {
    1: 'SH',
    2: 'HH',
    3: 'NI',
    4: 'HB',
    5: 'NW',
    6: 'HE',
    7: 'RP',
    8: 'BW',
    9: 'BY',
    10: 'SL',
    11: 'BE',
    12: 'BB',
    13: 'MV',
    14: 'SN',
    15: 'ST',
    16: 'TH',
}

# Floor-dividing districtId by 1000 strips the last three digits and leaves
# the province prefix of the AGS code.
mobvolume['province'] = (mobvolume['districtId'] // 1000).map(prov_ags_dict)

### Prepare workday vs day_off dummies

In [None]:
# Flag every row whose date is a public holiday in the row's province.
# The column is built as one list and assigned in a single step: the former
# row-wise `mobvolume.holiday.at[i] = ...` was a chained assignment (it writes
# through a temporary Series, which is a silent no-op under pandas
# Copy-on-Write), was slow, and left an object-dtype column behind.
# A province without a calendar (e.g. an unmapped districtId) still raises
# KeyError, as before.
mobvolume['holiday'] = [
    day in holiday_dict[province]
    for day, province in zip(mobvolume['date'], mobvolume['province'])
]

# Saturdays and Sundays, based on the English day names from dt.day_name().
mobvolume['weekend'] = mobvolume['day_of_the_week'].isin(['Saturday', 'Sunday'])

# A day off is any public holiday or weekend day; everything else is a workday.
mobvolume['day_off'] = mobvolume['holiday'] | mobvolume['weekend']

mobvolume['workday'] = ~mobvolume['day_off']

separate trip counts into workday and day_off columns

In [None]:
# Split each daily trip count into a workday and a day_off column: multiplying
# by the boolean flag keeps the count on matching days and zeroes it otherwise.
# Maybe add outgoing as well, but it should not matter for district infections.
variables_to_sum = []
for count_var in ('Count_internal', 'Count_incoming'):
    for day_flag in ('workday', 'day_off'):
        split_col = '_'.join([count_var, day_flag])
        mobvolume[split_col] = mobvolume[count_var].mul(mobvolume[day_flag])
        variables_to_sum.append(split_col)

Generate feature dataframe

In [None]:
# Weekly totals of the workday / day_off trip counts per district, with the
# grouping keys kept as ordinary columns.
mobi_volume_features = (
    mobvolume
    .groupby(['districtId', 'week_no', 'year'], as_index=False)[variables_to_sum]
    .sum()
)

Now for the incoming infection load. We need a dataframe of weekly infection rates per district. This will be available in the main analysis; here I load it from a downloaded CSV. It is important to note that these are NEW cases per inhabitant and week, not CURRENT cases, as the latter are unavailable. Since the ratio of new cases to (earlier) current cases can be assumed to be constant, the regression weight (or other model fitting) should account for this difference.

In [None]:
# Placeholder load -- this will be different in the general analysis.
# Every row is labelled with year 2020, then indexed by district, week and year.
inf_rates = (
    pd.read_csv('rki_weekly.csv')
    .assign(year=2020)
    .set_index(['districtId', 'week_no', 'year'])
)
###############################################################

transform daily full edge table to weekly edge table without self-loops

In [None]:
# Collapse the daily edge table to weekly totals per origin/destination pair,
# leaving out self-loops (rows whose origin equals their destination).
# numeric_only=True keeps the datetime `date` column out of the aggregation:
# older pandas dropped it silently, whereas pandas >= 2.0 raises a TypeError
# when asked to sum datetimes. Other non-numeric columns, if any, are
# excluded as well.
is_internal = edgetable.orig_ags5 == edgetable.dest_ags5
weekly_edge_tbl = (
    edgetable[~is_internal]
    .groupby(['year', 'week_no', 'orig_ags5', 'dest_ags5'])
    .sum(numeric_only=True)
    .reset_index()
)

reset index of infections for easier dataframe merging

In [None]:
# Turn the index levels back into regular columns so they can serve as merge keys.
inf_rates = inf_rates.reset_index()

merge infection rates and edge table. Infection rates of trip ORIGINS are added to the edges.

In [None]:
# Attach to every weekly edge the infection rate of its ORIGIN district.
# The outer join keeps edges without infection data and vice versa.
merged_inf_mobi = weekly_edge_tbl.merge(
    inf_rates,
    how='outer',
    left_on=['year', 'week_no', 'orig_ags5'],
    right_on=['year', 'week_no', 'districtId'],
)
# Trip count times per-capita case rate of the origin district.
merged_inf_mobi['incoming_infected'] = (
    merged_inf_mobi['AnzahlFall_per_cap'] * merged_inf_mobi['Count']
)

Group by trip DESTINATIONS, as this is where the incoming infections come together

In [None]:
# Sum the incoming infection load over all origins for each destination and week.
incoming_infections = (
    merged_inf_mobi
    .groupby(['year', 'dest_ags5', 'week_no'], as_index=False)['incoming_infected']
    .sum()
)

Rename the districtId, as we no longer need to differentiate between origin and destination districtIds

In [None]:
# The destination district becomes the plain district key for later merges.
incoming_infections = incoming_infections.rename(columns={'dest_ags5': 'districtId'})

get structure data for per_head standardisation

In [None]:
# Placeholder load -- this will be different in the general analysis.
static_data = pd.read_csv('processed_static_data.csv')
##########################################
# Only the district key and its total population are kept.
populations = static_data.loc[:, ['districtId', 'total_population']]

In [None]:
# Combine mobility volumes, incoming infection load and population per
# district (inner joins; the keys are the columns the frames have in common).
mobility_features = (
    mobi_volume_features
    .merge(incoming_infections, on=['districtId', 'week_no', 'year'])
    .merge(populations, on='districtId')
)

In [None]:
# Add a per-inhabitant version ("<column>_p_pop") of every count feature.
per_capita_inputs = [
    'Count_internal_workday',
    'Count_internal_day_off',
    'Count_incoming_workday',
    'Count_incoming_day_off',
    'incoming_infected',
]
for col in per_capita_inputs:
    mobility_features[col + '_p_pop'] = mobility_features[col].div(
        mobility_features['total_population']
    )

In [None]:
# Write the finished feature table to disk without the row index.
mobility_features.to_csv(path_or_buf='mobility_features.csv', index=False)