# To-do list

### IT setup

- new virtenv
- create project structure
- create trello board
- invite collaboration
- test GH pull/push/merge

### Quick and dirty

- Streamlit (Plotly Dash) frontend
- Google Cloud Platform backend
- upload data to Google Cloud Storage
- Test that all 3 interact

### Talk extensively about dataset and different steps

### Preprocessing and start

- numerical features
- categorical features
- feature engineering
- dumb baseline model
- create dashboard design


# Load Covid dataset

## Counties that exist as LK and SK

Regensburg, Bamberg, Bayreuth, Coburg, Hof, Ansbach, Fürth, Rostock, Karlsruhe, Kaiserslautern, Rosenheim, Offenbach, Oldenburg, Landshut, Passau, Heilbronn

## Load CSV

In [1]:
import pandas as pd

covid_data = pd.read_csv("/Users/saraisidebroggini/Desktop/RKI_corona_landskreise.csv")

covid_data.shape

(411, 47)

In [2]:
covid_data.columns

Index(['OBJECTID', 'ADE', 'GF', 'BSG', 'RS', 'AGS', 'SDV_RS', 'GEN', 'BEZ',
       'IBZ', 'BEM', 'NBD', 'SN_L', 'SN_R', 'SN_K', 'SN_V1', 'SN_V2', 'SN_G',
       'FK_S3', 'NUTS', 'RS_0', 'AGS_0', 'WSK', 'EWZ', 'KFL', 'DEBKG_ID',
       'Shape__Area', 'Shape__Length', 'death_rate', 'cases', 'deaths',
       'cases_per_100k', 'cases_per_population', 'BL', 'BL_ID', 'county',
       'last_update', 'cases7_per_100k', 'recovered', 'EWZ_BL',
       'cases7_bl_per_100k', 'cases7_bl', 'death7_bl', 'cases7_lk',
       'death7_lk', 'cases7_per_100k_txt', 'AdmUnitId'],
      dtype='object')

In [3]:
covid_data = covid_data[['BL','county','EWZ','Shape__Area', 'death_rate', 'cases', 'deaths','cases_per_100k']]
covid_data.head()

Unnamed: 0,BL,county,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k
0,Schleswig-Holstein,SK Flensburg,89934,49182930.0,1.223721,3187,39,3543.709832
1,Schleswig-Holstein,SK Kiel,246601,112231400.0,1.409469,8301,117,3366.166398
2,Schleswig-Holstein,SK Lübeck,215846,211677100.0,1.392355,7613,106,3527.051694
3,Schleswig-Holstein,SK Neumünster,79905,71402240.0,0.889996,2809,25,3515.424567
4,Schleswig-Holstein,LK Dithmarschen,133251,1425511000.0,1.915323,2976,57,2233.379112


In [4]:
covid_data['deaths_per_100k'] = covid_data['deaths']/covid_data['EWZ']*100000
covid_data.head()

Unnamed: 0,BL,county,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k
0,Schleswig-Holstein,SK Flensburg,89934,49182930.0,1.223721,3187,39,3543.709832,43.365134
1,Schleswig-Holstein,SK Kiel,246601,112231400.0,1.409469,8301,117,3366.166398,47.445063
2,Schleswig-Holstein,SK Lübeck,215846,211677100.0,1.392355,7613,106,3527.051694,49.109087
3,Schleswig-Holstein,SK Neumünster,79905,71402240.0,0.889996,2809,25,3515.424567,31.287153
4,Schleswig-Holstein,LK Dithmarschen,133251,1425511000.0,1.915323,2976,57,2233.379112,42.776414


## Merge all Berlin 'counties' to one (to match APexpose)

In [5]:
berlin = covid_data[covid_data["BL"] == 'Berlin']
berlin

Unnamed: 0,BL,county,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k
399,Berlin,SK Berlin Reinickendorf,259169,89436650.0,1.618289,19465,315,7510.543313,121.542314
400,Berlin,SK Berlin Charlottenburg-Wilmersdorf,315393,64774500.0,1.568608,20464,321,6488.412869,101.777782
401,Berlin,SK Berlin Treptow-Köpenick,272429,168005200.0,1.689394,13200,223,4845.299142,81.85619
402,Berlin,SK Berlin Pankow,403607,103363000.0,1.068934,21049,225,5215.221738,55.747299
403,Berlin,SK Berlin Neukölln,318128,44996870.0,1.522467,28375,432,8919.36579,135.794397
404,Berlin,SK Berlin Lichtenberg,291622,52198000.0,1.69755,16082,273,5514.673104,93.614336
405,Berlin,SK Berlin Marzahn-Hellersdorf,273676,61914770.0,1.768566,14475,256,5289.100981,93.541268
406,Berlin,SK Berlin Spandau,238922,92940420.0,1.444666,18551,268,7764.458694,112.170499
407,Berlin,SK Berlin Steglitz-Zehlendorf,290866,102687200.0,2.656722,16223,431,5577.482415,148.178199
408,Berlin,SK Berlin Mitte,374232,39452110.0,1.156393,30353,351,8110.744137,93.792086


In [6]:
covid_data['county'][399] = 'Berlin'
covid_data.loc[399] 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data['county'][399] = 'Berlin'


BL                          Berlin
county                      Berlin
EWZ                         259169
Shape__Area        89436651.129883
death_rate                1.618289
cases                        19465
deaths                         315
cases_per_100k         7510.543313
deaths_per_100k         121.542314
Name: 399, dtype: object

In [7]:
berlin_sum = covid_data[['Shape__Area', 'cases', 'deaths']].sum()
berlin_sum

Shape__Area    3.571745e+11
cases          5.021469e+06
deaths         9.767200e+04
dtype: float64

In [8]:
berlin_average = covid_data[['death_rate','cases_per_100k', 'deaths_per_100k']].mean()
berlin_average

death_rate            2.049408
cases_per_100k     6034.971334
deaths_per_100k     125.071466
dtype: float64

In [9]:
covid_data['cases'][399] = berlin_sum.cases
covid_data['Shape__Area'][399] = berlin_sum.Shape__Area
covid_data['deaths'][399] = berlin_sum.deaths
covid_data.loc[399] 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data['cases'][399] = berlin_sum.cases
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data['Shape__Area'][399] = berlin_sum.Shape__Area
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data['deaths'][399] = berlin_sum.deaths


BL                              Berlin
county                          Berlin
EWZ                             259169
Shape__Area        357174478948.466675
death_rate                    1.618289
cases                          5021469
deaths                           97672
cases_per_100k             7510.543313
deaths_per_100k             121.542314
Name: 399, dtype: object

In [10]:
covid_data['death_rate'][399] = berlin_average.death_rate
covid_data['cases_per_100k'][399] = berlin_average.cases_per_100k
covid_data['deaths_per_100k'][399] = berlin_average.deaths_per_100k
covid_data.loc[399] 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data['death_rate'][399] = berlin_average.death_rate
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data['cases_per_100k'][399] = berlin_average.cases_per_100k
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data['deaths_per_100k'][399] = berlin_average.deaths_per_100k


BL                              Berlin
county                          Berlin
EWZ                             259169
Shape__Area        357174478948.466675
death_rate                    2.049408
cases                          5021469
deaths                           97672
cases_per_100k             6034.971334
deaths_per_100k             125.071466
Name: 399, dtype: object

In [11]:
covid_data.drop(index=[400,401,402,403,404,405,406,407,408,409,410], axis=0, inplace=True)

In [13]:
covid_data.tail()

Unnamed: 0,BL,county,EWZ,Shape__Area,death_rate,cases,deaths,cases_per_100k,deaths_per_100k
395,Thüringen,LK Saale-Holzland-Kreis,82816,815610400.0,2.363552,6431,152,7765.407651,183.539413
396,Thüringen,LK Saale-Orla-Kreis,79632,1151821000.0,2.38505,8134,194,10214.486639,243.620655
397,Thüringen,LK Greiz,96668,846542600.0,2.504856,9781,245,10118.136302,253.44478
398,Thüringen,LK Altenburger Land,88356,569793200.0,3.458946,9049,313,10241.52293,354.248721
399,Berlin,Berlin,259169,357174500000.0,2.049408,5021469,97672,6034.971334,125.071466


# Load APexpose air pollution dataset

## Load data

In [None]:
!pip install chardet
import chardet
with open("/Users/saraisidebroggini/Desktop/APexpose.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

In [None]:
import pandas as pd 
pollution_data = pd.read_csv("../raw_data/APexpose.csv",
                             sep=';',
                             decimal='.',
                            encoding = 'Windows-1252'
                            )
pollution_data.head()

In [None]:
pollution_data.shape

## Clean AP expose county column to match Covid dataset

In [None]:
pollution_data['county'][:20]

In [None]:
pollution_data['county'] = pollution_data['county'].apply(lambda x: x.replace('Ÿ','ü'))
pollution_data['county'] = pollution_data['county'].apply(lambda x: x.replace('š','ö'))
pollution_data['county'] = pollution_data['county'].apply(lambda x: x.replace('§','ß'))
pollution_data['county'] = pollution_data['county'].apply(lambda x: x.replace('Š','ä'))
pollution_data['county'][150:200]

## Convert columns to floats 

In [None]:
pollution_data.dtypes

In [None]:
for column in ['NO2_annualMean', 'NO_annualMean','O3_annualMean', 'O3_daysOver120', 'O3_dailyMaxAnnualMean','O3_dailyHourlyMax', 'O3_daily8HrMax', 'PM10_annualMean','PM10_daysOver50', 'PM2.5_annualMean']:
    pollution_data[column] = pollution_data[column].apply(lambda x: x.replace(',','.'))
pollution_data[['NO2_annualMean', 'NO_annualMean',
       'O3_annualMean', 'O3_daysOver120', 'O3_dailyMaxAnnualMean',
       'O3_dailyHourlyMax', 'O3_daily8HrMax', 'PM10_annualMean',
       'PM10_daysOver50', 'PM2.5_annualMean']]

In [None]:
for column in ['NO2_annualMean', 'NO_annualMean','O3_annualMean', 'O3_daysOver120', 'O3_dailyMaxAnnualMean','O3_dailyHourlyMax', 'O3_daily8HrMax', 'PM10_annualMean','PM10_daysOver50', 'PM2.5_annualMean']:
    pollution_data[column] = pollution_data[column].apply(lambda x: float(x))

In [None]:
pollution_data.dtypes

## Drop counties present in APexpore but not Covid

Eisenach, Osterode am Harz: drop those rows for all time points

In [None]:
pollution_data.shape

In [None]:
pollution_data = pollution_data[pollution_data.county != 'Eisenach']
pollution_data.shape

In [None]:
pollution_data = pollution_data[pollution_data.county != 'Osterode am Harz']
pollution_data.shape


In [None]:
time_point_order = [12,13,14, 15,16,17, 9,10,11, 3,4,5, 27,28,29, 6,7,8, 21,22,23, 18,19,20, 24,25,26, 0,1,2]
time_point_order = [i*400 for i in time_point_order]

In [None]:
temporal_time_points = [pollution_data['PM2.5_annualMean'][300+i] for i in time_point_order]

In [None]:
import matplotlib.pyplot as plt
plt.plot(temporal_time_points)

# Merge APexpose and Covid datasets

In [None]:
merge_df = pol_df.merge(covid_data, how='inner', on='county')
merge_df

In [None]:
merge_df.columns

In [None]:
merge_df.dtypes

# Covid & Air pollution - quick correlation check

In [None]:
merge_df = merge_df[['county', 'NO2_annualMean', 'NO2_hrOver200', 'NO_annualMean',
       'O3_annualMean', 'O3_daysOver120', 'O3_dailyMaxAnnualMean',
       'O3_dailyHourlyMax', 'O3_daily8HrMax', 'PM10_annualMean',
       'PM10_daysOver50', 'PM2.5_annualMean', 'cases_per_100k','deaths_per_100k']]

In [None]:
merge_df = merge_df.rename(columns={'PM2.5_annualMean': 'PM2_5_annualMean'})

In [None]:
num_fact = ['NO2_annualMean', 'NO2_hrOver200', 'NO_annualMean',
       'O3_annualMean', 'O3_daysOver120', 'O3_dailyMaxAnnualMean',
       'O3_dailyHourlyMax', 'O3_daily8HrMax', 'PM10_annualMean',
       'PM10_daysOver50', 'PM2_5_annualMean']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15,15))

for i, feature in enumerate(num_fact):
    # First subplot
    plt.subplot(4,3,i+1)
    sns.histplot(merge_df[feature])
    # Global figure methods
plt.suptitle('Feature distributions')
plt.show()

In [None]:
plt.figure(figsize = (10,10))

sns.heatmap(merge_df.corr(), 
            cmap='coolwarm', 
            annot = True, 
            annot_kws={"size": 10})

In [None]:
# Start a figure
plt.figure(figsize=(10,3))
# First subplot
plt.subplot(1,2,1)
plt.scatter(merge_df['NO2_hrOver200'], merge_df['deaths_per_100k'])
plt.xlabel("NO2 #hours over 200")
plt.ylabel("deaths per 100K")
plt.title('unfiltered')
# Second subplot
plt.subplot(1,2,2) 
x = merge_df[merge_df['NO2_hrOver200']>0.001]
plt.scatter(x['NO2_hrOver200'], x['deaths_per_100k'])
plt.xlabel("NO2 #hours over 200")
plt.ylabel("deaths per 100K")
plt.title("filtered")
# Global figure methods
plt.suptitle('NO2_hrOver200')
plt.show()

In [None]:
import statsmodels.formula.api as smf

In [None]:
model = smf.ols(formula = 'deaths_per_100k ~ NO2_hrOver200', data = x).fit()

In [None]:
model.summary()

In [None]:
# Start a figure
plt.figure(figsize=(10,3))
# First subplot
plt.subplot(1,2,1)
plt.scatter(merge_df['PM2_5_annualMean'], merge_df['deaths_per_100k'])
plt.xlabel("PM2_5_annualMean")
plt.ylabel("deaths per 100K")
plt.title('unfiltered')
# Second subplot
plt.subplot(1,2,2) 
x = merge_df[merge_df['PM2_5_annualMean']>10**14]
plt.scatter(x['PM2_5_annualMean'], x['deaths_per_100k'])
plt.xlabel("PM2_5_annualMean")
plt.ylabel("deaths per 100K")
plt.title("filtered")
# Global figure methods
plt.suptitle('PM2_5_annualMean')
plt.show()

In [None]:
model = smf.ols(formula = 'deaths_per_100k ~ PM2_5_annualMean', data = x).fit()

In [None]:
model.summary()