In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

## Import data and transform

In [85]:
# Load the refined asthma prevalence-rate table.
# The path is relative, so it is resolved against the kernel's working directory.
df_raw = pd.read_csv("../../Data/Refined/1021/asthma_prevalence_rate.csv")
# Bare expression: renders the raw frame as this cell's output.
df_raw

Unnamed: 0,Measure,Country Name,Disease,Metric,Year,Value,Country Code,Access to clean fuels and technologies for cooking (% of population),Access to electricity (% of population),Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita),...,Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population),u10,v10,d2m,t2m,sst,sp,skt,blh
0,Prevalence,Japan,Asthma,Rate,2014,5192.177057,JPN,100,100.0,10.0274639366416,...,53543.70,19.6,2.180324,1.293077,288.367888,292.485756,293.779580,102115.801254,293.610257,885.832315
1,Prevalence,Japan,Asthma,Rate,2015,5206.382139,JPN,100,100.0,9.71691822464823,...,54171.10,18.2,2.247194,1.573335,288.342591,292.509937,293.900725,102082.760935,293.728914,905.719678
2,Prevalence,Japan,Asthma,Rate,2016,5222.775846,JPN,100,100.0,9.70188627278164,...,51006.40,18.3,1.896303,1.843566,288.471362,292.607193,293.833551,102084.147374,293.667026,903.834044
3,Prevalence,Japan,Asthma,Rate,2017,5259.014818,JPN,100,100.0,9.59147213558895,...,52248.52,17.7,2.231108,1.180876,288.519120,292.625293,293.974890,101947.341061,293.809990,896.601952
4,Prevalence,Japan,Asthma,Rate,2018,5302.827203,JPN,100,100.0,9.29901664682086,...,52331.77,17.8,2.239119,1.458739,288.927352,293.018189,294.475368,102117.550669,294.303794,904.800901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,Prevalence,Türkiye,Asthma,Rate,2017,4499.719504,TUR,95,100.0,5.39364397893842,...,54098.00,,1.828292,0.463260,284.082554,287.737981,288.662293,101909.206715,288.544441,840.552676
489,Prevalence,Türkiye,Asthma,Rate,2018,4779.019886,TUR,95.1,100.0,5.26873027109493,...,60020.00,,1.563059,0.545625,284.546408,288.057154,288.875183,102145.624975,288.762852,806.899897
490,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.00,28.0,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818
491,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.00,19.6,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818


In [86]:
# Keep every column except the four descriptive ones, preserving the original
# column order; columns_to_keep stays a plain list of column names.
columns_to_keep = df_raw.columns[
    ~df_raw.columns.isin(['Country Name', 'Measure', 'Metric', 'Disease'])
].tolist()
df_excluded = df_raw.loc[:, columns_to_keep]

In [87]:
# Number of missing entries per remaining column.
df_excluded.isna().sum()

Year                                                                            0
Value                                                                           0
Country Code                                                                    0
Access to clean fuels and technologies for cooking (% of population)            0
Access to electricity (% of population)                                         0
Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita)      0
Compulsory education, duration (years)                                          0
GDP (current US$)                                                               0
GDP per capita (constant 2015 US$)                                              0
Gini index                                                                      0
Life expectancy at birth, total (years)                                         0
Mortality rate, infant (per 1,000 live births)                                  0
People using at 

Our dataset contains missing values, although relatively few compared with the number of rows we have.
Before training, we need to:
- Handle missing values (replace placeholders such as '..' or 'N/A' with np.nan); for models that can handle NaN natively, we can leave these values in place
- Use numeric encoding for categorical columns (e.g., one-hot encoding for Country Code)

In [88]:
# Detach df_excluded from df_raw and turn the textual placeholders
# '..' and 'nan' into real missing values.
df_excluded = df_excluded.copy().replace(['..', 'nan'], np.nan)

# Build the model matrix in one chain:
#   1. one-hot encode 'Country Code' (first level dropped as the baseline),
#   2. force every column to a numeric dtype, turning unparsable cells into NaN.
df_encoded = (
    pd.get_dummies(df_excluded, columns=["Country Code"], drop_first=True)
    .apply(pd.to_numeric, errors='coerce')
)

## Hist Gradient Boosting Regressor

HistGradientBoostingRegressor (from sklearn.ensemble) is a tree-based model. Tree-based models split data based on thresholds, not on absolute magnitudes, so scaling does not affect them.
Therefore, we do not need to normalize our features.

In [89]:
# Target is the 'Value' column; every other encoded column is a feature.
y = df_encoded['Value']
X = df_encoded.drop(columns=['Value'])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a gradient-boosted tree regressor with default hyper-parameters.
model = HistGradientBoostingRegressor().fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions; RMSE is derived from MSE.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R²: {r2:.3f}")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")

R²: 0.952
MAE: 244.237
RMSE: 556.536


In [90]:
# Compute permutation importance
importance = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=42)

# Put results in a DataFrame
perm_importances = pd.Series(importance.importances_mean, index=X_test.columns)
perm_importances = perm_importances.sort_values(ascending=False)
print(perm_importances)

GDP per capita (constant 2015 US$)                                   1.006798
Life expectancy at birth, total (years)                              0.096757
Mortality rate, infant (per 1,000 live births)                       0.073566
Gini index                                                           0.072911
People using at least basic sanitation services (% of population)    0.043334
                                                                       ...   
Country Code_THA                                                     0.000000
Country Code_TUR                                                     0.000000
Country Code_UKR                                                     0.000000
Country Code_USA                                                     0.000000
Country Code_ZAF                                                     0.000000
Length: 78, dtype: float64


## Fill in missing values

In [91]:
# Rows with no total-area value, shown with their year and country code.
df_excluded.loc[
    df_excluded["Total area (Square Km)"].isna(),
    ["Year", "Country Code", "Total area (Square Km)"],
]

Unnamed: 0,Year,Country Code,Total area (Square Km)
156,2016,SRB,
157,2021,SRB,


In [92]:
# Air-quality, emission, pesticide and smoking indicators for the SRB rows.
df_excluded.loc[
    df_excluded["Country Code"].eq("SRB"),
    [
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
156,30.93375,22.667,24.2475,9.027099,368.4847,,
157,36.739889,23.5795,23.0679,9.060741,323.8058,,


Serbia is missing many environmental variables: land-use data are not available for any year, and the same is true for pesticide use and the percentage of smokers. These values cannot be imputed from other years. In addition, Serbia has only two years of data, so it is not a very informative country to keep.

In [93]:
# Start the imputation frame from df_excluded with the SRB rows removed.
df_no_nan = df_excluded.drop(
    index=df_excluded.index[df_excluded["Country Code"] == "SRB"]
)

In [94]:
# Which country/year rows still lack a PM10 reading?
df_no_nan.loc[
    df_no_nan["PM10_ConcentrationAvg"].isna(),
    ["Year", "Country Code"],
]

Unnamed: 0,Year,Country Code
0,2014,JPN
1,2015,JPN
2,2016,JPN
3,2017,JPN
4,2018,JPN
5,2019,JPN
24,2015,IDN
36,2020,CHN
49,2018,RUS
154,2018,KAZ


In [95]:
# Yearly air-quality, emission, pesticide and smoking indicators for the JPN rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("JPN"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
0,2014,,14.8352,26.890867,10.5282,,53543.7,19.6
1,2015,,,,10.23948,672.6071,54171.1,18.2
2,2016,,,,10.10148,638.3277,51006.4,18.3
3,2017,,,,9.992245,596.0964,52248.52,17.7
4,2018,,11.471067,19.6464,9.639091,562.3302,52331.77,17.8
5,2019,,10.107067,18.619533,9.373204,524.7334,51968.7,16.7


In the case of Japan, we can impute PM25 and NO2 with the previous year's values, and sulphur oxides with the following year's value. We cannot impute PM10 from other years, so for now we leave the PM10 values as NaN.

In [96]:
# JPN: fill gaps in the selected columns from neighbouring rows of the same
# country, forward first and then backward for any leading gap.
# NOTE(review): relies on the rows being ordered by Year; confirm.
cols_to_fill = [
    "PM25_ConcentrationAvg",
    "NO2_ConcentrationAvg",
    "Sulphur oxides (tonnes)",
]
mask = df_no_nan["Country Code"].eq("JPN")
df_no_nan.loc[mask, cols_to_fill] = (
    df_no_nan.loc[mask, cols_to_fill]
    .ffill()
    .bfill()
)

We'll apply the same reasoning to all the other countries and variables.

In [97]:
# Yearly air-quality, emission, pesticide and smoking indicators for the IDN rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("IDN"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
24,2015,,31.486,,3.153344,,1597.0,32.0
25,2017,23.1515,9.3175,15.69,3.26231,,1597.0,
26,2018,24.896,18.66525,27.6275,3.378282,,1597.0,32.3
27,2019,29.119167,26.697833,20.925,3.556219,,1597.0,32.6
28,2020,16.593167,15.773333,34.79,3.339661,,1597.0,32.6
29,2021,19.293667,18.667333,21.616,3.461016,,,


In [98]:
# IDN: fill gaps in the selected columns from neighbouring rows of the same
# country, forward first and then backward for any leading gap.
# NOTE(review): relies on the rows being ordered by Year; confirm.
cols_to_fill = [
    "PM10_ConcentrationAvg",
    "NO2_ConcentrationAvg",
    "Total sales of agricultural pesticides (tonnes)",
    "Share of population who are daily smokers (Pct population)",
]
mask = df_no_nan["Country Code"].eq("IDN")
df_no_nan.loc[mask, cols_to_fill] = (
    df_no_nan.loc[mask, cols_to_fill]
    .ffill()
    .bfill()
)

In [99]:
# Yearly air-quality, emission, pesticide and smoking indicators for the CHN rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("CHN"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
30,2013,116.962458,71.229986,44.389831,,,350926.63,
31,2014,109.186207,52.12969,42.560377,,,349083.93,
32,2015,89.738608,49.557086,35.851279,,,345983.05,25.8
33,2016,81.512459,45.739159,31.994571,,,338181.08,
34,2018,34.314375,40.153056,,,,294511.21,25.4
35,2019,34.58625,38.475538,,,,273375.75,25.4
36,2020,,33.631841,,9.652851,,273375.75,25.3


In [100]:
# CHN: fill gaps in the selected columns from neighbouring rows of the same
# country, forward first and then backward for any leading gap.
# NOTE(review): relies on the rows being ordered by Year; confirm.
cols_to_fill = [
    "PM10_ConcentrationAvg",
    "NO2_ConcentrationAvg",
    "Greenhouse gases (Kg CO2-equivalent Per Person)",
    "Share of population who are daily smokers (Pct population)",
]
mask = df_no_nan["Country Code"].eq("CHN")
df_no_nan.loc[mask, cols_to_fill] = (
    df_no_nan.loc[mask, cols_to_fill]
    .ffill()
    .bfill()
)

In [101]:
# Yearly air-quality, emission, pesticide and smoking indicators for the RUS rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("RUS"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
47,2016,28.0,14.0,,13.45777,4110.0,71057.0,30.3
48,2017,19.727,,,13.67797,3809.0,79522.64,27.5
49,2018,,,,14.07665,3703.0,74671.56,26.7


In [102]:
# RUS: fill gaps in the selected columns from neighbouring rows of the same
# country, forward first and then backward for any leading gap.
# NOTE(review): relies on the rows being ordered by Year; confirm.
cols_to_fill = [
    "PM10_ConcentrationAvg",
    "PM25_ConcentrationAvg",
]
mask = df_no_nan["Country Code"].eq("RUS")
df_no_nan.loc[mask, cols_to_fill] = (
    df_no_nan.loc[mask, cols_to_fill]
    .ffill()
    .bfill()
)

In [103]:
# Yearly air-quality, emission, pesticide and smoking indicators for the KAZ rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("KAZ"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
154,2018,,32.248,,21.34503,1917.309,13058.0,
155,2019,,21.773,,19.34913,1751.677,13102.4,


In [104]:
# Remove the KAZ rows: neither of the country's two rows has a PM10, NO2 or
# daily-smoker value, so there is nothing to impute from.
# The drop is applied to df_no_nan itself (not to df_excluded) so that the
# earlier SRB removal and the per-country ffill/bfill imputations already
# written into df_no_nan are kept instead of being silently discarded.
df_no_nan = df_no_nan.drop(df_no_nan[df_no_nan["Country Code"] == "KAZ"].index)

In [105]:
# Yearly air-quality, emission, pesticide and smoking indicators for the KOR rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("KOR"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
158,2012,44.225444,,0.023556,,,17438.0,21.6
159,2012,44.225444,,0.023556,,,17438.0,14.9
160,2013,47.152632,,0.023842,,,18708.0,19.9
161,2013,47.152632,,0.023842,,,18708.0,15.2
162,2014,48.62804,,0.024421,14.33164,,19788.0,20.0
163,2014,48.62804,,0.024421,14.33164,,19788.0,12.4
164,2015,49.7142,27.202313,1.95,14.24184,352.21,19482.0,17.3
165,2015,49.7142,27.202313,1.95,14.24184,352.21,19482.0,13.2
166,2016,46.2928,27.230412,2.9979,14.37503,313.13,19798.0,18.4
167,2016,46.2928,27.230412,2.9979,14.37503,313.13,19798.0,14.5


In [73]:
# Yearly air-quality, emission, pesticide and smoking indicators for the LUX rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("LUX"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
24,2015,23.1515,31.486,15.69,3.153344,,1597.0,32.0
25,2017,23.1515,9.3175,15.69,3.26231,,1597.0,32.0
26,2018,24.896,18.66525,27.6275,3.378282,,1597.0,32.3
27,2019,29.119167,26.697833,20.925,3.556219,,1597.0,32.6
28,2020,16.593167,15.773333,34.79,3.339661,,1597.0,32.6
29,2021,19.293667,18.667333,21.616,3.461016,,1597.0,32.6


In [73]:
# Yearly air-quality, emission, pesticide and smoking indicators for the UKR rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("UKR"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
24,2015,23.1515,31.486,15.69,3.153344,,1597.0,32.0
25,2017,23.1515,9.3175,15.69,3.26231,,1597.0,32.0
26,2018,24.896,18.66525,27.6275,3.378282,,1597.0,32.3
27,2019,29.119167,26.697833,20.925,3.556219,,1597.0,32.6
28,2020,16.593167,15.773333,34.79,3.339661,,1597.0,32.6
29,2021,19.293667,18.667333,21.616,3.461016,,1597.0,32.6


In [73]:
# Yearly air-quality, emission, pesticide and smoking indicators for the DNK rows.
df_no_nan.loc[
    df_no_nan["Country Code"].eq("DNK"),
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
24,2015,23.1515,31.486,15.69,3.153344,,1597.0,32.0
25,2017,23.1515,9.3175,15.69,3.26231,,1597.0,32.0
26,2018,24.896,18.66525,27.6275,3.378282,,1597.0,32.3
27,2019,29.119167,26.697833,20.925,3.556219,,1597.0,32.6
28,2020,16.593167,15.773333,34.79,3.339661,,1597.0,32.6
29,2021,19.293667,18.667333,21.616,3.461016,,1597.0,32.6


In [73]:
# Yearly air-quality, emission, pesticide and smoking indicators for the Netherlands.
# The ISO 3166-1 alpha-3 code is "NLD"; the previous "NDL" is not a valid code,
# so the filter could never match a row.
df_no_nan.loc[
    df_no_nan["Country Code"] == "NLD",
    [
        "Year",
        "PM10_ConcentrationAvg",
        "PM25_ConcentrationAvg",
        "NO2_ConcentrationAvg",
        "Greenhouse gases (Kg CO2-equivalent Per Person)",
        "Sulphur oxides (tonnes)",
        "Total sales of agricultural pesticides (tonnes)",
        "Share of population who are daily smokers (Pct population)",
    ],
]

Unnamed: 0,Year,PM10_ConcentrationAvg,PM25_ConcentrationAvg,NO2_ConcentrationAvg,Greenhouse gases (Kg CO2-equivalent Per Person),Sulphur oxides (tonnes),Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population)
24,2015,23.1515,31.486,15.69,3.153344,,1597.0,32.0
25,2017,23.1515,9.3175,15.69,3.26231,,1597.0,32.0
26,2018,24.896,18.66525,27.6275,3.378282,,1597.0,32.3
27,2019,29.119167,26.697833,20.925,3.556219,,1597.0,32.6
28,2020,16.593167,15.773333,34.79,3.339661,,1597.0,32.6
29,2021,19.293667,18.667333,21.616,3.461016,,1597.0,32.6
