In [None]:
import tabula
import pandas as pd
import numpy as np

In [None]:
!pip install tabula-py

# Case numbers

## Data Source RKI
<<https://www.arcgis.com/sharing/rest/content/items/f10774f1c63e40168479a1feb6c7ca74/data>>

Notebooks to be merged in preparation for Sarah's pipeline

--> Look at the notebook WS1_aw_create_germany_data_for_pipeline

# Health data for assessing the health situation or how well people are obeying the governmental rules

## DataSource: 
<<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/8UTBVA>>

## According to codebook.pdf, these columns are relevant for assessing the health situation of the population

- d1_health_1 to d1_health_9 (see codebook.pdf for the labels)
- d1_health_10: High cholesterol
- d1_health_11: HIV/ Aids
- d1_health_12: Mental health condition
- d1_health_13: Multiple Sclerosis
- d1_health_98: Prefer not to say
- d1_health_99: None of these

In [None]:
# Load the Harvard Dataverse health data for Germany (Latin-1 encoded CSV).
df_health = pd.read_csv(
    "/project_data/data_asset/dataverse_harvard_germany.csv",
    encoding="latin-1",
)

# Data showing the vulnerable population in Germany on regional level

## Datasource: 
<<https://wido.de/fileadmin/Dateien/Dokumente/News/Pressemitteilungen/2020/2020_Monitor_Vorerkrankungen_mit_erhoehtem_Risiko_fuer_schwere_COVID-19-Verlaeufe_final.pdf>>

In [None]:
# Extract the district table from the WIdO report, page by page.
# Every `area` below is given as (top, left, bottom, right).
WIDO_PDF = "/project_data/data_asset/wido_dat_correct_paper_covid-19_2020.pdf"


def read_wido_tables(pages, area):
    """Read the table(s) found inside `area` on `pages` of the WIdO PDF.

    Parameters
    ----------
    pages : str
        Page or page range in tabula syntax, e.g. "87" or "88-94".
    area : tuple of float
        Portion of the page to parse, as (top, left, bottom, right).

    Returns
    -------
    The result of ``tabula.read_pdf`` with ``multiple_tables=True``: a
    list-like of DataFrames without a header row (integer column labels).
    """
    return tabula.read_pdf(
        WIDO_PDF,
        pages=pages,
        multiple_tables=True,
        output_format="dataframe",
        stream=True,
        area=area,
        pandas_options={"header": None},
    )


# page 87
df_wido1 = read_wido_tables("87", (281.52, 100, 770, 495))

# page 88-94
df_wido2 = read_wido_tables("88-94", (132, 100, 770, 495))

# page 95 (the page is read in two separate areas)
df_wido3 = read_wido_tables("95", (130.6, 100.5, 301, 496.5))
df_wido4 = read_wido_tables("95", (333, 100, 760, 496))

# page 96-100
df_wido5 = read_wido_tables("96-100", (132, 100, 770, 495))

# page 101
df_wido6 = read_wido_tables("101", (132, 100, 195.51, 495))


In [None]:
# concat the dataframes
# Each tabula result is a list of tables. Concatenate every table of every
# list instead of only element [0], so that multi-page extractions (e.g. pages
# 88-94) are not reduced to their first table.
# NOTE(review): confirm with len(df_wido) that the row count matches the
# number of districts expected from the report.
wido_table_lists = [df_wido1, df_wido2, df_wido3, df_wido4, df_wido5, df_wido6]
df_wido = pd.concat(
    [table for tables in wido_table_lists for table in tables],
    ignore_index=True,
)

In [None]:
len(df_wido)

In [None]:
# 1 LK is missing due to bad formatting in the pdf
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# gives the same result on old and new pandas versions.
missing_lk = pd.DataFrame(
    [{0: "Neustadt an der Aisch", 1: 30, 2: "29,9", 3: "24,3 - 34,7"}]
)
# Only add the row if it is not there yet, so running this cell twice in a
# row does not duplicate the district.
if not (df_wido[0] == missing_lk.loc[0, 0]).any():
    df_wido = pd.concat([df_wido, missing_lk], ignore_index=True)

In [None]:
# Replace the integer column labels of the extracted table with descriptive
# names, then preview the first rows.
wido_column_names = [
    "LK",
    "no of patients with at least 1 pre-condition",
    "percentage",
    "CI",
]
df_wido.columns = wido_column_names
df_wido.head(5)

In [None]:
# proper formatting of the columns
patients_col = "no of patients with at least 1 pre-condition"
# Scale the counts by 1000 (presumably the PDF reports them in thousands --
# TODO confirm) and store them as integers. The original cell called
# .astype(int) without assigning the result, so the cast never took effect.
# round() guards against float artefacts such as 1004.9999 before truncation.
df_wido[patients_col] = (df_wido[patients_col] * 1000).round().astype(int)
# Convert the decimal comma to a decimal point; the values stay strings.
df_wido["percentage"] = df_wido["percentage"].str.replace(",", ".", regex=False)
df_wido["CI"] = df_wido["CI"].str.replace(",", ".", regex=False)

In [None]:
# Rearranging the names
# Creating the first alternate names
def _alternate_lk_name(name):
    """Return the alternate spelling for one district name of the WIdO table.

    * "X, Landkreis" becomes "Landkreis X" (both parts stripped of blanks).
    * A name without a comma, or a non-string value such as NaN, is returned
      unchanged.
    * A name with a comma whose second part does not contain "Landkreis" gets
      no alternate name (NaN), exactly as in the previous loop-based version.
    """
    if not isinstance(name, str):
        return name
    parts = name.split(",")
    if len(parts) < 2:
        return name
    if "Landkreis" in parts[1]:
        return parts[1].strip(" ") + " " + parts[0].strip(" ")
    # NOTE(review): comma-separated names that are not a "Landkreis" are left
    # empty on purpose to preserve behaviour -- confirm this is intended.
    return np.nan


# Explicit case handling replaces the former bare `except:`, which also
# swallowed unrelated errors.
df_wido["LK altered"] = df_wido["LK"].map(_alternate_lk_name)

In [None]:
# Load the table that maps RKI names to IBM Cognos names.
mapping = pd.read_csv(
    filepath_or_buffer="/project_data/data_asset/mapping_rki_cognos.csv"
)

In [None]:
# Load the additional Destatis mapping table; the first six rows of the
# district sheet are skipped.
destatis = pd.read_excel(
    io="/project_data/data_asset/destatis_germany_regions.xlsx",
    sheet_name="Kreisfreie Städte u. Landkreise",
    skiprows=6,
)

In [None]:
# destatis reformatting: name the nine columns, discard the five "tbr*"
# placeholder columns, keep only rows that carry a NUTS3 code and order the
# rows alphabetically by name with a fresh 0..n-1 index.
destatis.columns = ["Cca 2", "regiontype", "name", "NUTS3",
                    "tbr1", "tbr2", "tbr3", "tbr4", "tbr5"]
destatis = (
    destatis
    .drop(columns=["tbr1", "tbr2", "tbr3", "tbr4", "tbr5"])
    .dropna(subset=["NUTS3"])
    .sort_values("name")
    .reset_index(drop=True)
)
# Same alphabetical ordering and fresh index for the WIdO table.
df_wido = df_wido.sort_values("LK").reset_index(drop=True)

In [None]:
# Creating a table for the Bundesländer

# Bundesländer
#- 01 Schleswig-Holstein (SH)
#- 02 Hamburg (HH)
#- 03 Niedersachsen (NI)
#- 04 Bremen (HB)
#- 05 Nordrhein-Westfalen (NW)
#- 06 Hessen (HE)
#- 07 Rheinland-Pfalz (RP)
#- 08 Baden-Württemberg (BW)
#- 09 Bayern (BY)
#- 10 Saarland (SL)
#- 11 Berlin (BE)
#- 12 Brandenburg (BB)
#- 13 Mecklenburg-Vorpommern (MV)
#- 14 Sachsen (SN)
#- 15 Sachsen-Anhalt (ST)
#- 16 Thüringen (TH)
# (two-digit key, name) of the 16 federal states, in key order
bundesland_rows = [
    ("01", "Schleswig-Holstein"),
    ("02", "Hamburg"),
    ("03", "Niedersachsen"),
    ("04", "Bremen"),
    ("05", "Nordrhein-Westfalen"),
    ("06", "Hessen"),
    ("07", "Rheinland-Pfalz"),
    ("08", "Baden-Württemberg"),
    ("09", "Bayern"),
    ("10", "Saarland"),
    ("11", "Berlin"),
    ("12", "Brandenburg"),
    ("13", "Mecklenburg-Vorpommern"),
    ("14", "Sachsen"),
    ("15", "Sachsen-Anhalt"),
    ("16", "Thüringen"),
]
df_bundesland = pd.DataFrame(bundesland_rows, columns=["Nummer", "Name"])

In [None]:
# Derive the federal state of every district from the first two characters
# of its key. One vectorized lookup replaces 16 masked passes over the frame
# and no longer fails when a key is missing (NaN); rows without a matching
# prefix keep the empty string, as before.
state_by_prefix = dict(zip(df_bundesland["Nummer"], df_bundesland["Name"]))
destatis["Bundesland"] = (
    destatis["Cca 2"].str[:2].map(state_by_prefix).fillna("")
)

In [None]:
# Preview only the first rows instead of rendering the whole table.
destatis.head(5)

In [None]:
# now append the Cca 2,name,regiontype columns from destatis mapping to the wido table --> mapping to cognos
# WARNING: the two tables are combined purely by row position (both were
# sorted alphabetically and re-indexed beforehand), not by a shared key.
# A differing row count would silently shift or pad rows, so fail early.
# NOTE(review): equal length does not prove that the alphabetical orders of
# the two name columns agree row by row -- spot-check the result.
if len(df_wido) != len(destatis):
    raise ValueError(
        "Cannot align WIdO and Destatis tables by position: "
        "{} vs. {} rows".format(len(df_wido), len(destatis))
    )
df_wido = pd.concat(
    [df_wido, destatis[["Cca 2", "name", "regiontype"]]],
    axis=1,
    ignore_index=True,
)

In [None]:
# Restore column names after the positional concat (labels were reset to 0..7).
df_wido.columns = (
    ["LK", "no of patients with at least 1 pre-condition", "percentage", "CI"]
    + ["alternate name 1"]
    + ["Cca 2", "alternate name 2", "regiontype"]
)

In [None]:
df_wido.head(5)

In [None]:
mapping.head(5)

In [None]:
# Empty target table for the German mapping; the columns are filled in below.
mapping_ger_columns = [
    "Land",
    "Bundesland",
    "Landkreis Typ",
    "Landkreis Name 1",
    "Landkreis Name 2",
    "Landkreis Name 3",
    "Cca 2",
    "NUTS3",
    "IBM Cognos Name",
    "RKI Name",
]
mapping_ger = pd.DataFrame(columns=mapping_ger_columns)

In [None]:
# Copy the three columns over from the RKI/Cognos mapping table.
# Plain column assignment is used instead of `.loc[:, col] = ...`: at this
# point mapping_ger has no rows, and a `.loc` write aligns the Series to that
# empty index, which may leave the frame empty. Column assignment lets the
# empty frame adopt the index of `mapping`.
for target_col, source_col in (
    ("RKI Name", "rki name"),
    ("IBM Cognos Name", "cognos name"),
    ("Cca 2", "Cca 2"),
):
    mapping_ger[target_col] = mapping[source_col]

In [None]:
# Every row of the mapping belongs to the same country.
mapping_ger["Land"] = "Germany"

In [None]:
# Convert the district keys of both tables to integers, which also removes
# any leading zero.
df_wido["Cca 2"] = df_wido["Cca 2"].map(int)
destatis["Cca 2"] = destatis["Cca 2"].map(int)

In [None]:
# Fill the remaining mapping columns from the WIdO and Destatis tables,
# matched by district key. Iterating over (row label, key) pairs fills every
# mapping row, including rows that share a key with an earlier row (the old
# lookup by key always wrote to the first such row only).
# An IndexError here means the key is absent from df_wido or destatis.
for idx_1, code in mapping_ger["Cca 2"].items():
    idx_2 = df_wido.index[df_wido["Cca 2"] == code][0]
    idx_3 = destatis.index[destatis["Cca 2"] == code][0]
    mapping_ger.loc[idx_1, "Landkreis Typ"] = df_wido.loc[idx_2, "regiontype"]
    mapping_ger.loc[idx_1, "Landkreis Name 1"] = df_wido.loc[idx_2, "LK"]
    mapping_ger.loc[idx_1, "Landkreis Name 2"] = df_wido.loc[idx_2, "alternate name 1"]
    # Fixed copy-paste error: name 3 used to duplicate "alternate name 1".
    mapping_ger.loc[idx_1, "Landkreis Name 3"] = df_wido.loc[idx_2, "alternate name 2"]
    mapping_ger.loc[idx_1, "NUTS3"] = destatis.loc[idx_3, "NUTS3"]
    mapping_ger.loc[idx_1, "Bundesland"] = destatis.loc[idx_3, "Bundesland"]

In [None]:
mapping_ger.head(5)

In [None]:
# Persist the mapping table; index_label=False writes no header field for
# the index column.
mapping_ger.to_csv(
    path_or_buf="../mapping_table_germany.csv",
    index_label=False,
)

In [None]:
# Mapping of the precondition table to cognos names:
# for every district key, copy the Cognos name of the first mapping row with
# that key into the first WIdO row with that key.
for district_code in df_wido["Cca 2"]:
    wido_row = df_wido.index[df_wido["Cca 2"] == district_code][0]
    mapping_row = mapping_ger.index[mapping_ger["Cca 2"] == district_code][0]
    cognos_name = mapping_ger.loc[mapping_row, "IBM Cognos Name"]
    df_wido.loc[wido_row, "cognos name"] = cognos_name

In [None]:
# Persist the pre-condition table (index included).
df_wido.to_csv(path_or_buf="../Germany_population_precondition_regions.csv")

# Population by age and region

## Data Source: 
<<https://www-genesis.destatis.de/genesis//online?operation=table&code=12411-0017&bypass=true&levelindex=1&levelid=1594666073068>>

In [None]:
# Raw population table: semicolon-separated, Latin-1 encoded, with five
# leading lines skipped.
df_age = pd.read_csv(
    "../Germany_population_raw_12411-0017.csv",
    sep=";",
    encoding="latin-1",
    skiprows=5,
)

In [None]:
df_age.columns

In [None]:
# New column labels: three identifying columns, the age bands, and the total.
id_columns = ["Timestamp", "Cca 2", "Landkreis"]
age_bands = [
    "unter 3 Jahre",
    "3 bis unter 6 Jahre",
    "6 bis unter 10 Jahre",
    "10 bis unter 15 Jahre",
    "15 bis unter 18 Jahre",
    "18 bis unter 20 Jahre",
    "20 bis unter 25 Jahre",
    "25 bis unter 30 Jahre",
    "30 bis unter 35 Jahre",
    "35 bis unter 40 Jahre",
    "40 bis unter 45 Jahre",
    "45 bis unter 50 Jahre",
    "50 bis unter 55 Jahre",
    "55 bis unter 60 Jahre",
    "60 bis unter 65 Jahre",
    "65 bis unter 75 Jahre",
    "75 Jahre und mehr",
]
new_columns = id_columns + age_bands + ["Insgesamt"]
df_age.columns = new_columns
df_age.head(5)

In [None]:
# Drop rows with no entries (total column holds the placeholder "-"):
rows_without_total = df_age.index[df_age["Insgesamt"] == "-"]
df_age = df_age.drop(index=rows_without_total)

In [None]:
len(df_age)

In [None]:
# Mapping of the age table to cognos names
# District keys are compared as numbers on both sides, so keys that the raw
# table may store as strings (e.g. with a leading zero) still match the keys
# of the mapping table. Rows of df_age without a mapping entry get NaN and
# mapping keys that do not occur in df_age are simply ignored, instead of
# aborting the loop with an IndexError.
mapping_codes = pd.to_numeric(mapping_ger["Cca 2"], errors="coerce")
cognos_by_code = pd.Series(
    mapping_ger["IBM Cognos Name"].values, index=mapping_codes
)
# Keep the first entry per key (as the old lookup did) and drop invalid keys.
cognos_by_code = cognos_by_code[
    cognos_by_code.index.notna() & ~cognos_by_code.index.duplicated()
]
age_codes = pd.to_numeric(df_age["Cca 2"], errors="coerce")
df_age["cognos name"] = age_codes.map(cognos_by_code)

In [None]:
# Keep only the rows that received a Cognos name.
df_age = df_age.dropna(subset=["cognos name"])

In [None]:
df_age.tail(5)

In [None]:
# Persist the demographic table (index included).
df_age.to_csv(path_or_buf="../Germany_demographic_distribution_regions.csv")

# life expectancy

## Data Source
<<https://www.destatis.de/EN/Themes/Society-Environment/Population/Deaths-Life-Expectancy/Tables/life-expectancy-laender-male.html>>

The data is only available per federal state (Bundesland) and needs to be mapped down to the regional (Landkreis) level.


In [None]:
# Both life-expectancy files share one layout: semicolon-separated with one
# leading line to skip.
life_csv_options = {"sep": ";", "skiprows": 1}
df_life_male = pd.read_csv("../Life_expectancy_germany_male.csv", **life_csv_options)
df_life_female = pd.read_csv("../Life_expectancy_germany_female.csv", **life_csv_options)

In [None]:
# Remove the row with index label 0 from both tables.
df_life_female = df_life_female.drop(index=0)
df_life_male = df_life_male.drop(index=0)

# Column labels: the state name followed by the life expectancy at seven ages.
reference_ages = [0, 1, 20, 40, 60, 65, 80]
columns_fem = ["Bundesland"] + [
    "fem life expectancy at the age of {}".format(age) for age in reference_ages
]
columns_male = ["Bundesland"] + [
    "male life expectancy at the age of {}".format(age) for age in reference_ages
]
df_life_female.columns = columns_fem
df_life_male.columns = columns_male

df_life_female.head(5)

In [None]:
# Combine female and male values into one row per state (inner join on name).
df_life = df_life_female.merge(df_life_male, on="Bundesland")

In [None]:
# Not needed by this cell any more; kept so that any other cell relying on
# the name `Counter` keeps working.
from collections import Counter

# Identifying columns of the regional life-expectancy table.
life_id_columns = ["Bundesland", "Cca 2", "cognos name"]
life_reference_ages = [0, 1, 20, 40, 60, 65, 80]

# Value columns to copy from the state-level table: female block first, then
# male, each in ascending age order. Building the list directly replaces the
# former Counter subtraction, which relied on dict ordering to keep the
# column order.
columns_copy = [
    "{} life expectancy at the age of {}".format(sex, age)
    for sex in ("fem", "male")
    for age in life_reference_ages
]
columns_regions = life_id_columns + columns_copy

In [None]:
# Empty regional table with the identifying and life-expectancy columns.
df_life_regions = pd.DataFrame(data=None, columns=columns_regions)

In [None]:
# Map the state-level life expectancy down to the districts and align the
# names with the cognos names.
# add columns for Landkreise and add columns for Cca 2 and cognos name
df_life_regions[["Bundesland", "Cca 2", "cognos name"]] = mapping_ger[["Bundesland", "Cca 2", "IBM Cognos Name"]]

# One row per federal state, keyed by the state name stripped of surrounding
# blanks. The name is stripped on both sides of the lookup; the old loop
# stripped only the district-side name it had just read.
life_by_state = df_life.copy()
life_by_state["Bundesland"] = life_by_state["Bundesland"].str.strip(" ")
life_by_state = (
    life_by_state.drop_duplicates(subset="Bundesland").set_index("Bundesland")
)
state_keys = df_life_regions["Bundesland"].str.strip(" ")

# A state without life-expectancy data used to surface as an opaque
# IndexError; report the affected names explicitly instead.
missing_states = sorted(set(state_keys.dropna()) - set(life_by_state.index))
if missing_states:
    raise ValueError(
        "No life-expectancy row for Bundesland value(s): {}".format(missing_states)
    )

# One vectorized lookup per value column instead of one masked assignment
# per district row.
for column in columns_copy:
    df_life_regions[column] = state_keys.map(life_by_state[column])

In [None]:
# Persist the regional life-expectancy table; index_label=False writes no
# header field for the index column.
df_life_regions.to_csv(
    path_or_buf="../Germany_life_expectation.csv",
    index_label=False,
)

# Hospital beds per 1000 citizens

## Data Source
<<http://www.gbe-bund.de/oowa921-install/servlet/oowa/aw92/WS0100/_XWD_FORMPROC>>

In [None]:
# Hospital-bed table: semicolon-separated, two leading lines skipped, and
# every column read as text so the number formatting can be fixed afterwards.
df_beds = pd.read_csv(
    "../Hospital_beds_germany.csv",
    sep=";",
    skiprows=2,
    dtype=str,
)

In [None]:
df_beds.head(5)

In [None]:
# drop summation rows (index labels 0 and 17)
summation_rows = [0, 17]
df_beds = df_beds.drop(index=summation_rows)

In [None]:
# reformatting
# Bed counts: remove the "." thousands separator and convert to int.
df_beds[" Betten Anzahl"] = df_beds[" Betten Anzahl"].map(
    lambda count: int(count.replace(".", ""))
)
# Rates: replace the decimal comma by a decimal point (values stay strings).
for decimal_column in ("Betten je 100.000 Einwohner", " Nutzungsgrad in Prozent"):
    df_beds[decimal_column] = df_beds[decimal_column].map(
        lambda text: text.replace(",", ".")
    )

In [None]:
df_beds.head(5)

In [None]:
# District-level table: one row per mapping entry with state, district key
# and Cognos name; the beds column is filled in afterwards.
df_beds_lk = pd.DataFrame(
    columns=["beds per 100000 citizens", "Bundesland", "Cca 2", "cognos name"]
)
district_keys = mapping_ger[["Bundesland", "Cca 2", "IBM Cognos Name"]]
df_beds_lk[["Bundesland", "Cca 2", "cognos name"]] = district_keys

In [None]:
# Give every district the beds-per-100,000 value of its federal state.
# The state names of the raw table are stripped of surrounding blanks before
# matching. A single vectorized lookup replaces the row-by-row loop and no
# longer fails when a state name is missing (NaN); districts whose state has
# no entry stay NaN, as before.
beds_by_state = pd.Series(
    df_beds["Betten je 100.000 Einwohner"].values,
    index=df_beds["Unnamed: 0"].str.strip(" "),
)
beds_by_state = beds_by_state[
    beds_by_state.index.notna() & ~beds_by_state.index.duplicated()
]
df_beds_lk["beds per 100000 citizens"] = df_beds_lk["Bundesland"].map(beds_by_state)

In [None]:
df_beds_lk.head(5)

In [None]:
# Persist the district-level beds table; index_label=False writes no header
# field for the index column.
df_beds_lk.to_csv(
    path_or_buf="../Germany_hospital_beds_regions.csv",
    index_label=False,
)

# Population density

## Data Source
<<https://www-genesis.destatis.de/gis/genView?GenMLURL=https://www-genesis.destatis.de/regatlas/AI002-1.xml&CONTEXT=REGATLAS01>>

In [None]:
# Raw population-density table (semicolon-separated).
df_density = pd.read_csv(
    filepath_or_buffer="../population_density_germany_raw.csv",
    sep=";",
)

In [None]:
df_density.head(5)

In [None]:
df_density.columns

In [None]:
# Discard the unnamed filler columns and the name column, then label the two
# remaining columns.
unused_density_columns = ["Unnamed: 3", "Unnamed: 4", "Unnamed: 5", "Unnamed: 6", " Name"]
df_density = df_density.drop(columns=unused_density_columns)
df_density.columns = ["Cca 2", "population density"]

In [None]:
# Attach the Cognos name to every density row via the district key.
# Keys are compared as numbers on both sides, so keys that the raw table may
# store as strings still match. Rows without a mapping entry get NaN and
# mapping keys absent from the density table are ignored, instead of
# aborting the loop with an IndexError.
mapping_codes = pd.to_numeric(mapping_ger["Cca 2"], errors="coerce")
cognos_by_code = pd.Series(
    mapping_ger["IBM Cognos Name"].values, index=mapping_codes
)
# Keep the first entry per key (as the old lookup did) and drop invalid keys.
cognos_by_code = cognos_by_code[
    cognos_by_code.index.notna() & ~cognos_by_code.index.duplicated()
]
density_codes = pd.to_numeric(df_density["Cca 2"], errors="coerce")
df_density["cognos name"] = density_codes.map(cognos_by_code)

In [None]:
df_density.head(5)

In [None]:
# Persist the density table; index_label=False writes no header field for
# the index column.
df_density.to_csv(
    path_or_buf="../Germany_population_density.csv",
    index_label=False,
)

# ICU Bed capacity

## Data Source
<<https://www.intensivregister.de/#/intensivregister?tab=laendertabelle>>
<<https://www.divi.de/divi-intensivregister-tagesreport-archiv-csv/divi-intensivregister-2020-07-15-12-15/viewdocument/4108>>

## ATTENTION
The table is updated daily, but official usage is not allowed without permission; one needs to contact the DIVI first.

In the archive you can only download the daily files.

In [None]:
# Daily DIVI intensive-care register export of 2020-07-15, 12:15.
df_divi = pd.read_csv(
    filepath_or_buffer="../DIVI-Intensivregister_2020-07-15_12-15.csv"
)

In [None]:
df_divi.columns

In [None]:
# Discard the two count columns that are not used further.
unused_divi_columns = ["anzahl_meldebereiche", "anzahl_standorte"]
df_divi = df_divi.drop(columns=unused_divi_columns)

In [None]:
# English labels for the seven remaining columns.
df_divi.columns = [
    "Bundesland ID",
    "Cca 2",
    "Covid Cases Hospital",
    "Covid Cases ventilated",
    "free beds",
    "occupied beds",
    "date",
]

In [None]:
df_divi.head(5)

In [None]:
# For every district key of the DIVI table, copy the Cognos name of the first
# mapping row with that key into the first DIVI row with that key.
for district_code in df_divi["Cca 2"]:
    mapping_row = mapping_ger.index[mapping_ger["Cca 2"] == district_code][0]
    divi_row = df_divi.index[df_divi["Cca 2"] == district_code][0]
    cognos_name = mapping_ger.loc[mapping_row, "IBM Cognos Name"]
    df_divi.loc[divi_row, "cognos name"] = cognos_name

In [None]:
# Persist the ICU capacity table; index_label=False writes no header field
# for the index column.
df_divi.to_csv(
    path_or_buf="../Germany_ICU_capacity_20200715.csv",
    index_label=False,
)

# Combination of static data into one table

## Data Sources
### ICU Beds
### Population density
### Hospital beds per 100000 citizens
### Life expectancy