# Load clean dataframe

In [1]:
import pandas as pd

# Read the cleaned table from the project's data folder. The path is relative,
# so it resolves against the notebook's current working directory.
# (Presumably merged COVID / air-pollution data, going by the file name.)
df_covid_AP = pd.read_csv(
    filepath_or_buffer='../lung_pollution/data/covid_pollution_clean.csv',
)

# Last expression of the cell: display (n_rows, n_columns) of the loaded frame.
df_covid_AP.shape

(4000, 24)

In [2]:
# Preview the first five rows of the freshly loaded frame.
df_covid_AP.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,county,year,NO2_annualMean,NO2_hrOver200,NO_annualMean,O3_annualMean,O3_daysOver120,O3_dailyMaxAnnualMean,...,PM2.5_annualMean,BL,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k,Fully vaccinated
0,0,0,Berlin,2010,21.33097,0.0,4.689645,48.14162,1.321674,75.383964,...,20.56858,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688
1,1,1,Berlin,2011,21.10792,0.0,5.591758,46.78272,1.405013,75.515257,...,20.17655,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688
2,2,2,Berlin,2012,20.656,0.0,5.372472,45.26885,1.513209,75.685711,...,17.18541,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688
3,3,3,Berlin,2013,19.16632,0.0,4.37616,47.91164,0.142857,73.14514,...,15.83933,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688
4,4,4,Berlin,2014,20.46666,0.0,15.75506,47.80345,0.0,71.65981,...,19.77463,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688


# Feature engineering: population density

In [3]:
# Derived feature: EWZ divided by Shape__Area, scaled by 1e6.
# Presumably EWZ is the number of inhabitants and Shape__Area is in m², which
# would make this "people per km²" — TODO confirm the units of both columns.
df_covid_AP['Population density'] = (
    df_covid_AP['EWZ']
    .div(df_covid_AP['Shape__Area'])
    .mul(1_000_000)
)

In [4]:
# Display the frame to check the newly added 'Population density' column.
df_covid_AP

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,county,year,NO2_annualMean,NO2_hrOver200,NO_annualMean,O3_annualMean,O3_daysOver120,O3_dailyMaxAnnualMean,...,BL,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k,Fully vaccinated,Population density
0,0,0,Berlin,2010,21.330970,0.0,4.689645,48.14162,1.321674,75.383964,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
1,1,1,Berlin,2011,21.107920,0.0,5.591758,46.78272,1.405013,75.515257,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
2,2,2,Berlin,2012,20.656000,0.0,5.372472,45.26885,1.513209,75.685711,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
3,3,3,Berlin,2013,19.166320,0.0,4.376160,47.91164,0.142857,73.145140,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
4,4,4,Berlin,2014,20.466660,0.0,15.755060,47.80345,0.000000,71.659810,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3995,3995,StädteRegion Aachen,2015,19.503560,0.0,18.259300,55.32369,3.500000,74.671370,...,Nordrhein-Westfalen,556631,7.046180e+08,1.834700,33902,622,6090.569875,111.743687,0.715,789.975585
3996,3996,3996,StädteRegion Aachen,2016,19.119410,0.0,20.256560,53.63288,1.000000,73.430130,...,Nordrhein-Westfalen,556631,7.046180e+08,1.834700,33902,622,6090.569875,111.743687,0.715,789.975585
3997,3997,3997,StädteRegion Aachen,2017,19.371040,0.0,20.455340,56.32148,1.500000,76.194330,...,Nordrhein-Westfalen,556631,7.046180e+08,1.834700,33902,622,6090.569875,111.743687,0.715,789.975585
3998,3998,3998,StädteRegion Aachen,2018,18.783950,0.0,18.301830,60.82965,5.500000,83.249880,...,Nordrhein-Westfalen,556631,7.046180e+08,1.834700,33902,622,6090.569875,111.743687,0.715,789.975585


# Feature engineering: vaccination rate

In [5]:
# List the distinct values of the 'BL' column; these are the keys the
# vaccination-rate mapping has to cover.
df_covid_AP.loc[:, 'BL'].unique()

array(['Berlin', 'Rheinland-Pfalz', 'Bayern', 'Baden-Württemberg',
       'Thüringen', 'Sachsen-Anhalt', 'Niedersachsen', 'Brandenburg',
       'Sachsen', 'Hessen', 'Nordrhein-Westfalen', 'Schleswig-Holstein',
       'Mecklenburg-Vorpommern', 'Saarland', 'Bremen', 'Hamburg'],
      dtype=object)

In [6]:
# Fraction of fully vaccinated people per value of 'BL' (federal state).
# NOTE(review): the source and reference date of these figures are not
# recorded anywhere in the notebook — TODO add provenance.
d = {
    'Berlin': 0.688,
    'Rheinland-Pfalz': 0.678,
    'Bayern': 0.663,
    'Baden-Württemberg': 0.662,
    'Thüringen': 0.621,
    'Sachsen-Anhalt': 0.645,
    'Niedersachsen': 0.699,
    'Brandenburg': 0.617,
    'Sachsen': 0.578,
    'Hessen': 0.672,
    'Nordrhein-Westfalen': 0.715,
    'Schleswig-Holstein': 0.725,
    'Mecklenburg-Vorpommern': 0.665,
    'Saarland': 0.746,
    'Bremen': 0.797,
    'Hamburg': 0.738,
}

# Look up each row's state in the mapping. Any 'BL' value that is missing from
# `d` becomes NaN. The head() output of the loaded file already lists a
# 'Fully vaccinated' column, so this assignment overwrites it.
df_covid_AP['Fully vaccinated'] = df_covid_AP['BL'].map(d)

# Show the frame with the (re)computed column.
df_covid_AP

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,county,year,NO2_annualMean,NO2_hrOver200,NO_annualMean,O3_annualMean,O3_daysOver120,O3_dailyMaxAnnualMean,...,BL,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k,Fully vaccinated,Population density
0,0,0,Berlin,2010,21.330970,0.0,4.689645,48.14162,1.321674,75.383964,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
1,1,1,Berlin,2011,21.107920,0.0,5.591758,46.78272,1.405013,75.515257,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
2,2,2,Berlin,2012,20.656000,0.0,5.372472,45.26885,1.513209,75.685711,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
3,3,3,Berlin,2013,19.166320,0.0,4.376160,47.91164,0.142857,73.145140,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
4,4,4,Berlin,2014,20.466660,0.0,15.755060,47.80345,0.000000,71.659810,...,Berlin,259169,8.933202e+08,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3995,3995,StädteRegion Aachen,2015,19.503560,0.0,18.259300,55.32369,3.500000,74.671370,...,Nordrhein-Westfalen,556631,7.046180e+08,1.834700,33902,622,6090.569875,111.743687,0.715,789.975585
3996,3996,3996,StädteRegion Aachen,2016,19.119410,0.0,20.256560,53.63288,1.000000,73.430130,...,Nordrhein-Westfalen,556631,7.046180e+08,1.834700,33902,622,6090.569875,111.743687,0.715,789.975585
3997,3997,3997,StädteRegion Aachen,2017,19.371040,0.0,20.455340,56.32148,1.500000,76.194330,...,Nordrhein-Westfalen,556631,7.046180e+08,1.834700,33902,622,6090.569875,111.743687,0.715,789.975585
3998,3998,3998,StädteRegion Aachen,2018,18.783950,0.0,18.301830,60.82965,5.500000,83.249880,...,Nordrhein-Westfalen,556631,7.046180e+08,1.834700,33902,622,6090.569875,111.743687,0.715,789.975585


In [7]:
# Persist the enriched frame to the notebook's working directory.
# index=False: the row index is just the default 0..n-1 counter. Writing it
# adds an unnamed leading column that comes back as another 'Unnamed: 0*'
# column on the next read_csv — the frame already carries several of these
# from earlier save/load round-trips.
df_covid_AP.to_csv('clean_merged_dataset_covid_pollution.csv', index=False)

In [9]:
# Re-read the cleaned CSV from the data folder (same path as the first load
# cell) and rebind `df_covid_AP` to the result, replacing the in-memory frame.
# NOTE(review): this is not the file name written by the to_csv cell —
# presumably the saved file was copied over this path by hand; confirm.
df_covid_AP = pd.read_csv(
    filepath_or_buffer='../lung_pollution/data/covid_pollution_clean.csv',
)

# Preview the first five rows of the reloaded frame.
df_covid_AP.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,county,year,NO2_annualMean,NO2_hrOver200,NO_annualMean,O3_annualMean,O3_daysOver120,...,BL,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k,Fully vaccinated,Population density
0,0,0,0,Berlin,2010,21.33097,0.0,4.689645,48.14162,1.321674,...,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
1,1,1,1,Berlin,2011,21.10792,0.0,5.591758,46.78272,1.405013,...,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
2,2,2,2,Berlin,2012,20.656,0.0,5.372472,45.26885,1.513209,...,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
3,3,3,3,Berlin,2013,19.16632,0.0,4.37616,47.91164,0.142857,...,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
4,4,4,4,Berlin,2014,20.46666,0.0,15.75506,47.80345,0.0,...,Berlin,259169,893320200.0,1.591636,242813,3759,6640.688066,103.703091,0.688,290.118803
