In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
df_covid_AP = pd.read_csv('../lung_pollution/data/covid_pollution_complete.csv')
df_covid_AP.drop(columns=['Column1'], inplace = True)

In [3]:
df_covid_AP = df_covid_AP.rename(columns={"fully_vaccinated": "Fully_vaccinated"})

In [4]:
df_covid_AP.sort_values(['year'], axis=0, ascending=False,inplace=True,ignore_index=True)

In [5]:
df_covid_AP = df_covid_AP[0:400]

In [6]:
df_covid_AP.shape

(400, 30)

In [7]:
X = df_covid_AP[['NO2_totMean', 'NO_totMean', 'O3_totMean','PM2_5_totMean', 'Fully_vaccinated', 'Population_density']]
y = df_covid_AP['cases_per_100k']

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # Instanciate Robust Scaler

scaler.fit(X)  # Fit scaler to feature
scaled_num = scaler.transform(X)  #Scale
X = pd.DataFrame(scaled_num, columns = X.columns)

# Models

## Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

linreg = LinearRegression()

cv_results_linreg = cross_validate(linreg, X, y, cv=5, 
                            scoring='r2'
                                     )
linreg.fit(X, y)
print(linreg.score(X, y))
print(cv_results_linreg['test_score'].mean())

0.3955199153556628
0.35289087239363387


## Lasso

In [10]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet

lasso = Lasso()

cv_results_lasso = cross_validate(lasso, X, y, cv=5, 
                            scoring='r2'
                                     )
lasso.fit(X, y)
print(lasso.score(X, y))
print(cv_results_lasso['test_score'].mean())

0.3955171958463194
0.3528586343002059


## Ridge

In [11]:
ridge = Ridge()

cv_results_ridge = cross_validate(ridge, X, y, cv=5, 
                            scoring='r2'
                                     )

ridge.fit(X, y)
print(ridge.score(X, y))
print(cv_results_ridge['test_score'].mean())

0.3955137562576416
0.3531052227776959


## Elastic Net

In [12]:
elasticnet = ElasticNet()

cv_results_elasticnet = cross_validate(elasticnet, X, y, cv=5, 
                            scoring='r2'
                                     )

elasticnet.fit(X, y)
print(elasticnet.score(X, y))
print(cv_results_elasticnet['test_score'].mean())

0.3369366720911692
0.3107156495813002


## K-Nearest Neighbours

In [14]:
from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=4, weights='distance', leaf_size=10, p=1)

cv_results_neigh = cross_validate(neigh, X, y, cv=5, 
                            scoring='r2'
                                     )

neigh.fit(X, y)
print(neigh.score(X, y))
print(cv_results_neigh['test_score'].mean())

1.0
0.3957329190632219


## Support Vector Regression

In [16]:
from sklearn.svm import SVR

SVR = SVR()

cv_results_svr = cross_validate(SVR, X, y, cv=5, 
                            scoring='r2'
                                     )

SVR.fit(X, y)
print(SVR.score(X, y))
print(cv_results_svr['test_score'].mean())

0.003542530945239153
-0.009264780842408404


## AdaBoost

In [17]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

adaboost = AdaBoostRegressor(n_estimators=10, 
    learning_rate= 1.2)

cv_results_adaboost = cross_validate(adaboost, X, y, cv=5, 
                            scoring='r2'
                                     )

adaboost.fit(X, y)
print(adaboost.score(X, y))
print(cv_results_adaboost['test_score'].mean())

0.6118806140654992
0.4626354984193714


## Random Forest Regressor

In [18]:
from sklearn.ensemble import  RandomForestRegressor

forest = RandomForestRegressor(min_samples_leaf=15, min_samples_split=10, n_estimators=100)


cv_results_forest = cross_validate(forest, X, y, cv=5, 
                            scoring='r2'
                                     )

forest.fit(X, y)
print(forest.score(X, y))
print(cv_results_forest['test_score'].mean())

0.6203330664190791
0.46186556335126727


## XGBoost

In [19]:
from xgboost import XGBRegressor

xgbr = XGBRegressor()

cv_results_xgbr = cross_validate(xgbr, X, y, cv=5, 
                            scoring='r2'
                                     )


xgbr.fit(X, y)
print(xgbr.score(X, y))
print(cv_results_xgbr['test_score'].mean())

0.9999681484364248
0.4470820183835441


# Creating the Table

In [20]:

data = [['Linear Regression', cv_results_linreg['test_r2'].mean()], ['Lasso', cv_results_lasso['test_r2'].mean()], ['Ridge', cv_results_ridge['test_r2'].mean()], ['Elastic Net', cv_results_elasticnet['test_r2'].mean()], ['K-Nearest Neighbours', cv_results_neigh['test_r2'].mean()], ['Support Vector Regression', cv_results_svr['test_r2'].mean()], ['AdaBoost', cv_results_adaboost['test_r2'].mean()], ['Random Forest Regressor', cv_results_forest['test_r2'].mean()], ['XGBoost', cv_results_xgbr['test_r2'].mean()]]
  
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Model', 'R-squared'])

KeyError: 'test_r2'

In [None]:
df

# Heatmap R-square

In [None]:
fig = px.imshow(data,
                labels=dict(color="R-squared"),
                x=['Model', 'Generalization'],
                y=['Linear Regression', 'Lasso', 'Ridge','Elastic Net','K-Nearest Neighbours','Support Vector Regression','AdaBoost','Random Forest Regressor','XGBoost']
               )
fig.update_xaxes(side="top")
fig.show()