In [2]:
import numpy as np
import pandas as pd

In [5]:
# Load the combined dataset; later cells select their columns from this frame.
all_data = pd.read_csv(filepath_or_buffer='all_data.csv')

## Descriptive statistics

In [19]:
# Summary-statistics table for the selected variables: printed, then exported to CSV.

ex = 5  # power of ten applied to the infection-load row so its values are visibly non-zero

# Column name -> row label used in the table. The key order also fixes the row order.
var_dict = {
    'target': 'Log of R-factor',
    'R0_this_week': 'R-factor',
    'Count_internal_p_pop': 'Internal trips per inhabitant',
    'Count_incoming_p_pop': 'Incoming trips per inhabitant',
    'incoming_infected_p_pop': f'Incoming infection load (10^-{ex} per inhabitant)',
    'temperature': 'Average temperature (°C)',
    'humidity': 'Average humidity (%)',
    'precipitation': 'Average daily precipitation (cm)',
    'sunshine': 'Average daily sunshine (minutes)',
    'ses_z': 'Standardized socio-economic status index',
    'rur_z': 'Standardized ruralness index',
}
cols_to_include = list(var_dict)

# Keep every statistic except the count, then put one variable per row.
descriptions = (
    all_data[cols_to_include]
    .describe()
    .loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max']]
    .T
)

# Rescale the infection-load row; its label above carries the matching 10^-ex unit.
descriptions.loc['incoming_infected_p_pop'] = (
    descriptions.loc['incoming_infected_p_pop'].mul(10 ** ex)
)

# Swap the raw column names for their readable labels.
descriptions.index = [var_dict[name] for name in descriptions.index]

descriptions = descriptions.round(2)
print(descriptions)
descriptions.to_csv('descriptives.csv')

                                                  mean     std    min     25%  \
Log of R-factor                                  -0.02    0.78  -4.33   -0.53   
R-factor                                          1.35    1.48   0.01    0.59   
Internal trips per inhabitant                     9.98    2.98   3.62    8.01   
Incoming trips per inhabitant                     4.28    1.78   0.70    2.98   
Incoming infection load (10^-5 per inhabitant)   42.00   53.60   0.48   11.06   
Average temperature (°C)                         14.78    4.90   0.16   11.64   
Average humidity (%)                             66.42    9.11  20.11   60.43   
Average daily precipitation (cm)                  1.58    1.98   0.00    0.11   
Average daily sunshine (minutes)                473.67  145.73  34.03  354.92   
Standardized socio-economic status index          0.00    1.00  -2.66   -0.55   
Standardized ruralness index                      0.00    1.00  -4.18   -0.60   

                           

## Infection numbers

In [4]:
# Export the case, fatality and R0 columns of all_data under readable headers.

# Source column -> header written to the CSV. The key order is the column order of the export.
var_dict = {
    'districtId': 'ID/AGS-5',
    'week_no': 'Calendar week 2020',
    'AnzahlFall': 'Number of new cases',
    'AnzahlTodesfall': 'Number of new fatalities',
    'AnzahlFall_per_cap': 'New cases per inhabitant',
    'AnzahlTodesfall_per_cap': 'New fatalities per inhabitant',
    'AnzahlFall_cumul': 'Cumulated number of cases',
    'AnzahlTodesfall_cumul': 'Cumulated number of fatalities',
    'AnzahlFall_cumul_per_cap': 'Cumulated cases per inhabitant',
    'AnzahlTodesfall_cumul_per_cap': 'Cumulated fatalities per inhabitant',
    'R0_this_week': 'Weekly R0',
    'target': 'Weekly Logarithmic R0',
}
cols_to_include = list(var_dict)

infections = all_data.loc[:, cols_to_include]

(infections
 .rename(columns=var_dict)
 .to_csv('infection_numbers.csv', index=False))

## PD district descriptives (first lockdown and summer vacation)

In [83]:
# Build the table of PD districts: merge the deviance results with the static
# district attributes on districtId and keep only rows flagged for the lockdown
# or the summer period. Columns present in both files get a '_duplicate' suffix
# on the static-data side.
# .copy() turns the filtered result into an independent frame, so the column
# assignments below cannot trigger SettingWithCopyWarning.
pd_data = (pd.merge(pd.read_csv('deviance_df.csv'),
                    pd.read_csv('processed_static_data.csv'),
                    on='districtId',
                    suffixes=('', '_duplicate'))
           .query("is_lockdown_pd or is_summer_pd")
           .copy())

# districtId // 1000 is the code the Province mapping below is keyed on
# (BL presumably stands for Bundesland -- confirm).
pd_data['BL_code'] = pd_data.districtId // 1000
# Codes missing from this mapping become NaN.
pd_data['Province'] = pd_data.BL_code.map({3: 'Lower Saxony',
                                           5: 'North Rhine-Westphalia',
                                           7: 'Rhineland-Palatinate',
                                           8: 'Baden-Wuerttemberg',
                                           9: 'Bavaria'})
# Rows flagged is_lockdown_pd are labelled as lockdown, all remaining rows as summer.
pd_data['PD Timeframe'] = pd_data.is_lockdown_pd.map({True: 'First Lockdown',
                                                      False: 'Summer Vacation'})
# regex=False: plain substring replacement, independent of the pandas default for `regex`.
pd_data['District'] = pd_data.district_name.str.replace('KS ',
                                                        'Independent city of ',
                                                        regex=False)
# NOTE(review): 'Socioeconimic status' is misspelled, but the export cell selects
# the column by this exact label, so both places have to be changed together.
pd_data = pd_data.rename(columns={'lockdown_rank': 'Ranking during first lockdown',
                                  'summer_rank': 'Ranking during summer vacation',
                                  'rur_z': 'Ruralness',
                                  'ses_z': 'Socioeconimic status',
                                  'total_population': 'Population',
                                  'population_density': 'Population density (inhabitants per km²)'})

# Tidy numeric columns for display and export: a column whose values are all
# whole numbers is cast to int, any other numeric column is rounded to two
# decimals. Non-numeric columns are skipped by an explicit dtype check rather
# than a bare `except: pass`, which would also hide unrelated errors.
for c in pd_data.columns:
    column = pd_data[c]
    if not pd.api.types.is_numeric_dtype(column):
        continue
    # A missing value never compares equal to 0, so columns containing NaN
    # take the rounding branch instead of the int cast.
    if (column % 1 == 0).all():
        pd_data[c] = column.astype(int)
    else:
        pd_data[c] = column.round(2)

pd_data

Unnamed: 0,districtId,deviance_lockdown,deviance_summer,district_name,Ruralness,Socioeconimic status,lockdown_z,n_peer_kernel,shapiro_lockdown_z,shapiro_summer_z,...,Population density (inhabitants per km²),share_female,Ruralness_Index,rur_z_duplicate,Socioecon_Index,ses_z_duplicate,BL_code,Province,PD Timeframe,District
58,3460,0.21,0.27,Vechta,0.8,0.47,1.15,182,0.15,0.6,...,175,0.5,1.55,0.8,1.03,0.47,3,Lower Saxony,Summer Vacation,Vechta
98,5754,-0.03,0.34,Gütersloh,0.03,0.84,-0.83,142,0.2,0.97,...,377,0.5,0.07,0.03,1.86,0.84,5,North Rine-Westphalia,Summer Vacation,Gütersloh
142,7111,0.41,0.02,KS Koblenz,-1.16,0.69,2.64,61,0.94,0.54,...,1084,0.51,-2.26,-1.16,1.53,0.69,7,Rhineland-Palatinate,First Lockdown,Independent city of Koblenz
211,8336,0.37,-0.18,Lörrach,-0.03,0.97,2.24,128,0.44,0.98,...,284,0.51,-0.05,-0.03,2.13,0.97,8,Baden-Wuerttemberg,First Lockdown,Lörrach
256,9279,0.02,0.5,Dingolfing-Landau,0.83,1.99,-0.64,15,0.14,0.04,...,110,0.49,1.62,0.83,4.39,1.99,9,Bavaria,Summer Vacation,Dingolfing-Landau
294,9663,0.33,-0.4,KS Würzburg,-1.29,0.49,2.2,60,0.43,0.19,...,1460,0.52,-2.51,-1.29,1.08,0.49,9,Bavaria,First Lockdown,Independent city of Würzburg


In [85]:
# Columns written to pd_descriptives.csv, in output order.
cols_2_desc = ['District',
               'PD Timeframe',
               'Province',
               'Ranking during first lockdown',
               'Ranking during summer vacation',
               'Ruralness',
               'Socioeconimic status',
               'Population',
               'Population density (inhabitants per km²)']

# Districts in export order, identified by districtId. These are the IDs the
# displayed pd_data table shows for the row labels that used to be hard-coded
# here (142, 211, 294, 98, 256, 58). Row labels are produced by the merge and
# shift whenever the input files' row order changes; the IDs do not.
pd_district_ids = [7111, 8336, 9663, 5754, 9279, 3460]
row_label_by_id = pd.Series(pd_data.index, index=pd_data['districtId'].values)
# Raises KeyError if one of the IDs is absent instead of silently exporting other rows.
district_order = row_label_by_id.loc[pd_district_ids].tolist()

pd_data.loc[district_order, cols_2_desc].to_csv('pd_descriptives.csv', index=False)