In [1]:
import pandas as pd
import numpy as np
import json

# Import datasets

Merge datasets with Covid19 statistics, Air pollution measurement and geographical data at the German county-level

## Import Covid Data

In [2]:
covid_df = pd.read_csv('../raw_data/RKI_corona_landskreise.csv')

covid_df.shape

(411, 47)

In [3]:
covid_df.head()

Unnamed: 0,OBJECTID,ADE,GF,BSG,RS,AGS,SDV_RS,GEN,BEZ,IBZ,...,cases7_per_100k,recovered,EWZ_BL,cases7_bl_per_100k,cases7_bl,death7_bl,cases7_lk,death7_lk,cases7_per_100k_txt,AdmUnitId
0,1,4.0,4.0,1.0,1001,1001.0,10010000000.0,Flensburg,Kreisfreie Stadt,40.0,...,111.192652,,2910875,97.908704,2850,4,100,0,1112,1001
1,2,4.0,4.0,1.0,1002,1002.0,10020000000.0,Kiel,Kreisfreie Stadt,40.0,...,122.870548,,2910875,97.908704,2850,4,303,0,1229,1002
2,3,4.0,4.0,1.0,1003,1003.0,10030000000.0,Lübeck,Kreisfreie Stadt,40.0,...,108.410626,,2910875,97.908704,2850,4,234,0,1084,1003
3,4,4.0,4.0,1.0,1004,1004.0,10040000000.0,Neumünster,Kreisfreie Stadt,40.0,...,127.651586,,2910875,97.908704,2850,4,102,0,1277,1004
4,5,4.0,4.0,1.0,1051,1051.0,10510040000.0,Dithmarschen,Kreis,42.0,...,75.796805,,2910875,97.908704,2850,4,101,0,758,1051


## Import Pollution Data

In [4]:
pollution_df = pd.read_csv('../raw_data/APexpose.csv')
pollution_df.shape

(12060, 19)

In [5]:
pollution_df.head()

Unnamed: 0,year,county,kreis_code,scenario,ISO_code,Kreis_Scluessel,Lon,Lat,NO2_annualMean,NO2_hrOver200,NO_annualMean,O3_annualMean,O3_daysOver120,O3_dailyMaxAnnualMean,O3_dailyHourlyMax,O3_daily8HrMax,PM10_annualMean,PM10_daysOver50,PM2.5_annualMean
0,2019,SK Freiburg i.Breisgau,12,remote,DE.BW.FB,8311,7.818076,47.992523,15.75711,0.0,6.600048,55.4519,6.0,83.70603,218.5,206.925,12.47323,2.0,8.984028
1,2019,LK Dillingen a.d.Donau,68,remote,DE.BY.DD,9773,10.527764,48.596404,18.953264,0.004595,9.519503,55.639033,3.80237,78.827799,180.314018,163.814437,17.288283,5.366906,12.952426
2,2019,SK Nürnberg,107,remote,DE.BY.NR,9564,11.082755,49.436211,25.38007,0.0,12.51538,45.32018,0.0,73.69548,160.34,155.6725,15.367712,4.895583,11.51754
3,2019,LK Neumarkt i.d.OPf.,110,remote,DE.BY.NO,9373,11.566558,49.215961,15.709243,0.003532,8.176603,52.094916,2.182587,76.590555,179.684101,162.612711,16.092153,4.895583,12.269832
4,2019,SK Rosenheim,122,remote,DE.BY.RH,9163,12.108725,47.844378,17.524585,0.004127,8.682464,52.593257,2.410346,76.905136,179.772674,162.781687,16.414083,4.895583,12.485468


## Import GeoJSON

In [6]:
url = 'https://raw.githubusercontent.com/isellsoap/deutschlandGeoJSON/main/4_kreise/1_sehr_hoch.geo.json'
dfObj = pd.read_json(url)
dfObj.head()

Unnamed: 0,type,features
0,FeatureCollection,"{'type': 'Feature', 'id': 0, 'properties': {'I..."
1,FeatureCollection,"{'type': 'Feature', 'id': 1, 'properties': {'I..."
2,FeatureCollection,"{'type': 'Feature', 'id': 2, 'properties': {'I..."
3,FeatureCollection,"{'type': 'Feature', 'id': 3, 'properties': {'I..."
4,FeatureCollection,"{'type': 'Feature', 'id': 4, 'properties': {'I..."


In [7]:
dfObj.shape

(434, 2)

In [8]:
list_ids = []
list_counties = []
list_coords = []
list_code_counties = []
for i in range(len(dfObj['features'])):
    list_ids.append(dfObj["features"][i]["id"])
    list_counties.append(dfObj["features"][i]["properties"]["NAME_3"])
    list_coords.append(dfObj["features"][i]["geometry"]['coordinates'][0])
    list_code_counties.append(dfObj["features"][i]["properties"]["ID_3"])
    

geo_df = pd.DataFrame(list(zip(list_ids, list_counties, list_coords,list_code_counties)), columns =['Id', 'County', 'Coordinates','Id_counties'])

In [9]:
geo_df.head()

Unnamed: 0,Id,County,Coordinates,Id_counties
0,0,Oldenburg,"[[8.65347957611084, 53.11003112792969], [8.665...",244
1,1,Osnabrück Städte,"[[7.963789939880371, 52.3254508972168], [7.969...",245
2,2,Osnabrück,"[[8.026549339294434, 52.68434906005865], [8.03...",246
3,3,Vechta,"[[8.462139129638672, 52.80015182495117], [8.45...",247
4,4,Wesermarsch,"[[[8.30749988555931, 53.61819458007818], [8.30...",248


In [10]:
geo_df.to_csv('../raw_data/geoJSON.csv')

# Covid Data Preproc 

## Keep only relevant features

In [11]:
covid_df = covid_df[['BL','county','EWZ','Shape__Area', 'death_rate', 'cases', 'deaths','cases_per_100k']]
covid_df.head()

Unnamed: 0,BL,county,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k
0,Schleswig-Holstein,SK Flensburg,89934,49182930.0,1.223721,3187,39,3543.709832
1,Schleswig-Holstein,SK Kiel,246601,112231400.0,1.409469,8301,117,3366.166398
2,Schleswig-Holstein,SK Lübeck,215846,211677100.0,1.392355,7613,106,3527.051694
3,Schleswig-Holstein,SK Neumünster,79905,71402240.0,0.889996,2809,25,3515.424567
4,Schleswig-Holstein,LK Dithmarschen,133251,1425511000.0,1.915323,2976,57,2233.379112


## Merge Berlin Counties

Pollution data-set considers Berlin as 1 county, whereas Covid dataset considers 11 counties within Berlin. Collapse 11 counties in Covid dataset into one to match pollution dataset.

In [12]:
berlin = covid_df[covid_df["BL"] == 'Berlin']
berlin.head()

Unnamed: 0,BL,county,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k
399,Berlin,SK Berlin Reinickendorf,259169,89436650.0,1.618289,19465,315,7510.543313
400,Berlin,SK Berlin Charlottenburg-Wilmersdorf,315393,64774500.0,1.568608,20464,321,6488.412869
401,Berlin,SK Berlin Treptow-Köpenick,272429,168005200.0,1.689394,13200,223,4845.299142
402,Berlin,SK Berlin Pankow,403607,103363000.0,1.068934,21049,225,5215.221738
403,Berlin,SK Berlin Neukölln,318128,44996870.0,1.522467,28375,432,8919.36579


In [13]:
## Sum relevant features
berlin_sum = berlin[['Shape__Area', 'cases', 'deaths', 'EWZ']].sum()
berlin_sum

Shape__Area    8.933202e+08
cases          2.428130e+05
deaths         3.759000e+03
EWZ            3.657463e+06
dtype: float64

In [14]:
## Average relevant features
berlin_average = berlin[['death_rate','cases_per_100k']].mean()
berlin_average

death_rate           1.591636
cases_per_100k    6640.688066
dtype: float64

In [15]:
covid_df['county'][399] = 'Berlin'
covid_df['cases'][399] = berlin_sum.cases
covid_df['Shape__Area'][399] = berlin_sum.Shape__Area
covid_df['EWZ'][399] = berlin_sum.EWZ
covid_df['deaths'][399] = berlin_sum.deaths
covid_df['death_rate'][399] = berlin_average.death_rate
covid_df['cases_per_100k'][399] = berlin_average.cases_per_100k
covid_df.loc[399] 

BL                          Berlin
county                      Berlin
EWZ                        3657463
Shape__Area       893320245.521974
death_rate                1.591636
cases                       242813
deaths                        3759
cases_per_100k         6640.688066
Name: 399, dtype: object

In [16]:
covid_df.drop(index=[400,401,402,403,404,405,406,407,408,409,410], axis=0, inplace=True)

In [17]:
covid_df.shape

(400, 8)

## Feature Engineering

### Create deaths per 100k feature

In [18]:
### EWZ is einwohnerzahl is population count
covid_df['deaths_per_100k'] = covid_df['deaths']/covid_df['EWZ']*100_000
covid_df.head()

Unnamed: 0,BL,county,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k
0,Schleswig-Holstein,SK Flensburg,89934,49182930.0,1.223721,3187,39,3543.709832,43.365134
1,Schleswig-Holstein,SK Kiel,246601,112231400.0,1.409469,8301,117,3366.166398,47.445063
2,Schleswig-Holstein,SK Lübeck,215846,211677100.0,1.392355,7613,106,3527.051694,49.109087
3,Schleswig-Holstein,SK Neumünster,79905,71402240.0,0.889996,2809,25,3515.424567,31.287153
4,Schleswig-Holstein,LK Dithmarschen,133251,1425511000.0,1.915323,2976,57,2233.379112,42.776414


### Create Population Density features

divide number of people per county by area per county, multiply by 1 million to have result in number of people per square kilometer

In [19]:
covid_df['Population density'] = covid_df['EWZ']/covid_df['Shape__Area']*1_000_000

### Create vaccination rate feature

In [20]:
covid_df['BL'].unique()

array(['Schleswig-Holstein', 'Hamburg', 'Niedersachsen', 'Bremen',
       'Nordrhein-Westfalen', 'Hessen', 'Rheinland-Pfalz',
       'Baden-Württemberg', 'Bayern', 'Saarland', 'Brandenburg',
       'Mecklenburg-Vorpommern', 'Sachsen', 'Sachsen-Anhalt', 'Thüringen',
       'Berlin'], dtype=object)

In [21]:
### Map vaccinationrate per bundesland to the dataset
d = {'Berlin': 0.688, 'Rheinland-Pfalz': 0.678, 'Bayern': 0.663, 'Baden-Württemberg': 0.662,
       'Thüringen': 0.621, 'Sachsen-Anhalt': 0.645, 'Niedersachsen': 0.699, 'Brandenburg': 0.617,
       'Sachsen': 0.578, 'Hessen': 0.672, 'Nordrhein-Westfalen': 0.715, 'Schleswig-Holstein': 0.725,
       'Mecklenburg-Vorpommern': 0.665, 'Saarland':0.746, 'Bremen': 0.797, 'Hamburg':0.738}

In [22]:
covid_df['Fully vaccinated'] = covid_df['BL'].map(d)
covid_df

Unnamed: 0,BL,county,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k,Population density,Fully vaccinated
0,Schleswig-Holstein,SK Flensburg,89934,4.918293e+07,1.223721,3187,39,3543.709832,43.365134,1828.561262,0.725
1,Schleswig-Holstein,SK Kiel,246601,1.122314e+08,1.409469,8301,117,3366.166398,47.445063,2197.254026,0.725
2,Schleswig-Holstein,SK Lübeck,215846,2.116771e+08,1.392355,7613,106,3527.051694,49.109087,1019.694565,0.725
3,Schleswig-Holstein,SK Neumünster,79905,7.140224e+07,0.889996,2809,25,3515.424567,31.287153,1119.082513,0.725
4,Schleswig-Holstein,LK Dithmarschen,133251,1.425511e+09,1.915323,2976,57,2233.379112,42.776414,93.475981,0.725
...,...,...,...,...,...,...,...,...,...,...,...
395,Thüringen,LK Saale-Holzland-Kreis,82816,8.156104e+08,2.363552,6431,152,7765.407651,183.539413,101.538672,0.621
396,Thüringen,LK Saale-Orla-Kreis,79632,1.151821e+09,2.385050,8134,194,10214.486639,243.620655,69.135716,0.621
397,Thüringen,LK Greiz,96668,8.465426e+08,2.504856,9781,245,10118.136302,253.444780,114.191538,0.621
398,Thüringen,LK Altenburger Land,88356,5.697932e+08,3.458946,9049,313,10241.522930,354.248721,155.066795,0.621


# Pollution Data Preproc

## Keep only 'average' scenario

3 scenarios (rural, urban, average): average combines rural and urban > keep only average

In [23]:
pollution_df.shape

(12060, 19)

In [24]:
pollution_df = pollution_df[pollution_df.scenario == 'average']
pollution_df.shape

(4020, 19)

## Match counties to Covid

Counties were present during 2010-2019 (pollution dataset), but were later merged and are therefore not present in covid dataset (2021)

In [25]:
pollution_df = pollution_df[pollution_df.county != 'Eisenach']
pollution_df.shape

(4010, 19)

In [26]:
pollution_df = pollution_df[pollution_df.county != 'Osterode am Harz']
pollution_df.shape

(4000, 19)

## Keep relevant features only

In [27]:
pollution_df.columns

Index(['year', 'county', 'kreis_code', 'scenario', 'ISO_code',
       'Kreis_Scluessel', 'Lon', 'Lat', 'NO2_annualMean', 'NO2_hrOver200',
       'NO_annualMean', 'O3_annualMean', 'O3_daysOver120',
       'O3_dailyMaxAnnualMean', 'O3_dailyHourlyMax', 'O3_daily8HrMax',
       'PM10_annualMean', 'PM10_daysOver50', 'PM2.5_annualMean'],
      dtype='object')

In [28]:
pollution_df = pollution_df[['county', 'year', 'NO2_annualMean', 'NO2_hrOver200', 'NO_annualMean',
       'O3_annualMean', 'O3_daysOver120', 'O3_dailyMaxAnnualMean',
       'O3_dailyHourlyMax', 'O3_daily8HrMax', 'PM10_annualMean',
       'PM10_daysOver50', 'PM2.5_annualMean']]

## Sort per county and year

In [29]:
pollution_df.sort_values(['county','year'], axis=0, ascending=True,inplace=True,ignore_index=True)
pollution_df.head()

Unnamed: 0,county,year,NO2_annualMean,NO2_hrOver200,NO_annualMean,O3_annualMean,O3_daysOver120,O3_dailyMaxAnnualMean,O3_dailyHourlyMax,O3_daily8HrMax,PM10_annualMean,PM10_daysOver50,PM2.5_annualMean
0,Berlin,2010,21.33097,0.0,4.689645,48.14162,1.321674,75.383964,179.320467,161.91258,25.21095,30.28571,20.56858
1,Berlin,2011,21.10792,0.0,5.591758,46.78272,1.405013,75.515257,179.361973,161.986524,22.83172,25.57143,20.17655
2,Berlin,2012,20.656,0.0,5.372472,45.26885,1.513209,75.685711,179.415858,162.082523,20.53782,11.0,17.18541
3,Berlin,2013,19.16632,0.0,4.37616,47.91164,0.142857,73.14514,171.8557,157.5654,21.05746,13.0,15.83933
4,Berlin,2014,20.46666,0.0,15.75506,47.80345,0.0,71.65981,181.9686,166.5395,24.20021,22.0,19.77463


# GeoJSON Data Preprocessing

Mannually matched counties in GeoJSON with Covid an Pollution Data

In [30]:
geo_df = pd.read_csv('../lung_pollution/data/covid_pollution_complete.csv')

geo_df.shape

(4370, 31)

# Merge Covid and Pollution Data

In [31]:
merge_df = pollution_df.merge(covid_df, how = 'inner', on='county')
merge_df.shape

(4000, 23)

In [32]:
merge_df.columns

Index(['county', 'year', 'NO2_annualMean', 'NO2_hrOver200', 'NO_annualMean',
       'O3_annualMean', 'O3_daysOver120', 'O3_dailyMaxAnnualMean',
       'O3_dailyHourlyMax', 'O3_daily8HrMax', 'PM10_annualMean',
       'PM10_daysOver50', 'PM2.5_annualMean', 'BL', 'EWZ', 'Shape__Area',
       'death_rate', 'cases', 'deaths', 'cases_per_100k', 'deaths_per_100k',
       'Population density', 'Fully vaccinated'],
      dtype='object')

In [33]:
merge_df = merge_df.rename(columns={'PM2.5_annualMean': 'PM2_5_annualMean'})

In [34]:
merge_df = merge_df[['county', 'year', 'NO2_annualMean', 'NO2_hrOver200', 'NO_annualMean',
       'O3_annualMean', 'O3_daysOver120', 'O3_dailyMaxAnnualMean',
       'O3_dailyHourlyMax', 'O3_daily8HrMax','cases_per_100k', 'deaths_per_100k',
       'Population density', 'Fully vaccinated']]

In [35]:
merge_df.to_csv('../lung_pollution/data/covid_pollution.csv', index=False)