In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
df_covid_AP = pd.read_csv('../lung_pollution/data/covid_pollution_complete.csv')
df_covid_AP.drop(columns=['Column1'], inplace = True)

In [3]:
df_covid_AP = df_covid_AP.rename(columns={"fully_vaccinated": "Fully_vaccinated"})

In [4]:
df_covid_AP.sort_values(['year'], axis=0, ascending=False,inplace=True,ignore_index=True)

In [5]:
df_covid_AP.head()

Unnamed: 0,id,county_new,county,year,NO2_annualMean,NO2_hrOver200,NO_annualMean,O3_annualMean,O3_daysOver120,O3_dailyMaxAnnualMean,...,deaths,cases_per_100k,deaths_per_100k,Fully_vaccinated,Population_density,NO2_totMean,NO_totMean,O3_totMean,PM10_totMean,PM2_5_totMean
0,285,Aachen,StädteRegion Aachen,2019,8.702983,0.0,0.743928,58.89937,8.0,80.62513,...,622,6090.569875,111.743687,0.715,789.975585,15.177349,11.589918,54.640123,17.472704,11.515066
1,414,Erfurt Städte,SK Erfurt,2019,15.71719,0.0,7.001803,52.44306,0.0,77.22419,...,290,6234.674204,135.709339,0.621,790.697991,21.353773,12.723735,47.732662,18.477456,12.834581
2,10,Hagen Städte,SK Hagen,2019,18.69454,0.00726,10.355297,52.20485,2.163659,76.71044,...,345,8593.066825,182.842485,0.715,1177.241586,20.609004,12.457916,50.831379,18.888289,13.836452
3,225,Göttingen,LK Göttingen,2019,11.42638,0.0,2.897505,52.75934,0.0,78.70706,...,289,3246.372337,89.225069,0.699,184.684859,13.785343,6.399425,50.439701,14.907914,11.067791
4,415,Gera Städte,SK Gera,2019,17.19362,0.0,7.878415,48.66422,0.0,77.85933,...,224,8066.126826,243.145258,0.621,604.684425,19.635684,11.617202,46.148697,20.010453,13.342688


In [6]:
X = df_covid_AP[['NO2_totMean', 'NO_totMean', 'O3_totMean','PM2_5_totMean', 'Fully_vaccinated', 'Population_density']]
y = df_covid_AP['cases_per_100k']

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # Instanciate Robust Scaler

scaler.fit(X)  # Fit scaler to feature
scaled_num = scaler.transform(X)  #Scale
X = pd.DataFrame(scaled_num, columns = X.columns)

# Models

## Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

linreg = LinearRegression()

cv_results_linreg = cross_validate(linreg, X, y, cv=5, 
                            scoring='r2'
                                     )
linreg.fit(X, y)
print(linreg.score(X, y))
print(cv_results_linreg['test_score'].mean())

0.400489263990241
0.400489263990241


## Lasso

In [20]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet

lasso = Lasso()

cv_results_lasso = cross_validate(lasso, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error'])

cv_results_lasso['test_r2'].mean()

0.40048659124870567
0.40048659124870556


## Ridge

In [11]:
ridge = Ridge()

cv_results_ridge = cross_validate(ridge, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error'])

cv_results_ridge['test_r2'].mean()

0.4004891871085713

## Elastic Net

In [12]:
elasticnet = ElasticNet()

cv_results_elasticnet = cross_validate(elasticnet, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error'])

cv_results_elasticnet['test_r2'].mean()

0.34234067621999753

## K-Nearest Neighbours

In [13]:
from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=4, weights='distance', leaf_size=10, p=1)

cv_results_neigh = cross_validate(neigh, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error'])

cv_results_neigh['test_r2'].mean()

1.0

## Support Vector Regression

In [15]:
from sklearn.svm import SVR

SVR = SVR()

cv_results_svr = cross_validate(SVR, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error'])

cv_results_svr['test_r2'].mean()

0.05670722434812718

## AdaBoost

In [16]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

adaboost = AdaBoostRegressor(n_estimators=10, 
    learning_rate= 1.2)

cv_results_adaboost = cross_validate(adaboost, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error'])

cv_results_adaboost['test_r2'].mean()

0.5947311169992925

## Random Forest Regressor

In [17]:
from sklearn.ensemble import  RandomForestRegressor

forest = RandomForestRegressor(min_samples_leaf=15, min_samples_split=10, n_estimators=100)


cv_results_forest = cross_validate(forest, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error'])

cv_results_forest['test_r2'].mean()

0.925514136935296

## XGBoost

In [18]:
from xgboost import XGBRegressor

xgbr = XGBRegressor()

cv_results_xgbr = cross_validate(xgbr, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error'])

cv_results_xgbr['test_r2'].mean()

0.9999862321109042

# Creating the Table

In [None]:

data = [['Linear Regression', cv_results_linreg['test_r2'].mean()], ['Lasso', cv_results_lasso['test_r2'].mean()], ['Ridge', cv_results_ridge['test_r2'].mean()], ['Elastic Net', cv_results_elasticnet['test_r2'].mean()], ['K-Nearest Neighbours', cv_results_neigh['test_r2'].mean()], ['Support Vector Regression', cv_results_svr['test_r2'].mean()], ['AdaBoost', cv_results_adaboost['test_r2'].mean()], ['Random Forest Regressor', cv_results_forest['test_r2'].mean()], ['XGBoost', cv_results_xgbr['test_r2'].mean()]]
  
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Model', 'R-squared'])

In [None]:
df

# Heatmap R-square

In [None]:
fig = px.imshow(data,
                labels=dict(color="R-squared"),
                x=['Model', 'Generalization'],
                y=['Linear Regression', 'Lasso', 'Ridge','Elastic Net','K-Nearest Neighbours','Support Vector Regression','AdaBoost','Random Forest Regressor','XGBoost']
               )
fig.update_xaxes(side="top")
fig.show()