In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

In [2]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable, so that
# project-level modules located there can be imported from this notebook.
project_root = Path.cwd().parent
project_root_str = str(project_root.resolve())

# Guard the append: re-running this cell must not pile up duplicate
# entries in sys.path.
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

import data_preparation   # regular import works

## Import data and transform

In [3]:
# Load the refined asthma prevalence-rate table (path is relative to the
# notebook's working directory) and show it as the cell output.
asthma_csv_path = "../../Data/Refined/1021/asthma_prevalence_rate.csv"
df_raw = pd.read_csv(asthma_csv_path)
df_raw

Unnamed: 0,Measure,Country Name,Disease,Metric,Year,Value,Country Code,Access to clean fuels and technologies for cooking (% of population),Access to electricity (% of population),Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita),...,Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population),u10,v10,d2m,t2m,sst,sp,skt,blh
0,Prevalence,Japan,Asthma,Rate,2014,5192.177057,JPN,100,100.0,10.0274639366416,...,53543.70,19.6,2.180324,1.293077,288.367888,292.485756,293.779580,102115.801254,293.610257,885.832315
1,Prevalence,Japan,Asthma,Rate,2015,5206.382139,JPN,100,100.0,9.71691822464823,...,54171.10,18.2,2.247194,1.573335,288.342591,292.509937,293.900725,102082.760935,293.728914,905.719678
2,Prevalence,Japan,Asthma,Rate,2016,5222.775846,JPN,100,100.0,9.70188627278164,...,51006.40,18.3,1.896303,1.843566,288.471362,292.607193,293.833551,102084.147374,293.667026,903.834044
3,Prevalence,Japan,Asthma,Rate,2017,5259.014818,JPN,100,100.0,9.59147213558895,...,52248.52,17.7,2.231108,1.180876,288.519120,292.625293,293.974890,101947.341061,293.809990,896.601952
4,Prevalence,Japan,Asthma,Rate,2018,5302.827203,JPN,100,100.0,9.29901664682086,...,52331.77,17.8,2.239119,1.458739,288.927352,293.018189,294.475368,102117.550669,294.303794,904.800901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,Prevalence,Türkiye,Asthma,Rate,2016,4269.509103,TUR,95,100.0,4.93489476936857,...,50054.00,22.3,3.125512,0.639595,283.793607,287.765514,288.800703,101928.934464,288.677710,927.511750
349,Prevalence,Türkiye,Asthma,Rate,2017,4499.719504,TUR,95,100.0,5.39364397893842,...,54098.00,,1.828292,0.463260,284.082554,287.737981,288.662293,101909.206715,288.544441,840.552676
350,Prevalence,Türkiye,Asthma,Rate,2018,4779.019886,TUR,95.1,100.0,5.26873027109493,...,60020.00,,1.563059,0.545625,284.546408,288.057154,288.875183,102145.624975,288.762852,806.899897
351,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.00,23.8,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818


In [4]:
# Drop the descriptive label columns; every other column is kept, in its
# original order. Labels that are absent from the frame are simply ignored.
label_columns = ['Country Name', 'Measure', 'Metric', 'Disease']
keep_mask = ~df_raw.columns.isin(label_columns)
columns_to_keep = df_raw.columns[keep_mask].tolist()
df_excluded = df_raw.loc[:, columns_to_keep]

In [5]:
# Run the project's preprocessing step on the trimmed frame.
# NOTE(review): the transformations applied live in data_preparation.preprocessing
# and are not visible in this notebook — check that module for details.
df_prepared = data_preparation.preprocessing(df_excluded)

In [6]:
# Count the missing values in each column of the preprocessed frame.
df_prepared.isna().sum()

Year                                                                            0
Value                                                                           0
Country Code                                                                    0
Access to clean fuels and technologies for cooking (% of population)            9
Access to electricity (% of population)                                         0
Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita)      2
Compulsory education, duration (years)                                          0
GDP (current US$)                                                               0
GDP per capita (constant 2015 US$)                                              0
Gini index                                                                     43
Life expectancy at birth, total (years)                                         0
Mortality rate, infant (per 1,000 live births)                                  0
People using at 

In [7]:
# Fill the missing values of the preprocessed frame with the project's imputation helper.
# NOTE(review): the warning fragment emitted by this cell suggests a group-wise
# ffill().bfill(); back-filling copies later observations into earlier rows —
# confirm that this is acceptable given the train/test split performed later.
df_imputed = data_preparation.impute_nans(df_prepared)

  .transform(lambda g: g.ffill().bfill())


In [8]:
# Display the imputed frame to inspect it before modelling.
df_imputed

Unnamed: 0,Year,Value,Access to clean fuels and technologies for cooking (% of population),Access to electricity (% of population),Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita),"Compulsory education, duration (years)",GDP (current US$),GDP per capita (constant 2015 US$),Gini index,"Life expectancy at birth, total (years)",...,Country Code_RUS,Country Code_SRB,Country Code_SVK,Country Code_SVN,Country Code_SWE,Country Code_THA,Country Code_TUR,Country Code_UKR,Country Code_USA,Country Code_ZAF
96,2015.0,3553.507526,76.6,100.0,1.688303,9.0,1.147017e+10,3981.726623,32.8,78.358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,2016.0,3605.780840,78.3,99.9,1.575153,9.0,1.198867e+10,4143.989883,33.7,78.643,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,2017.0,3660.174600,79.6,99.9,1.858911,9.0,1.325827e+10,4283.982627,33.1,78.900,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,2018.0,3694.900293,80.7,100.0,1.850978,9.0,1.537951e+10,4452.237147,30.1,79.238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,2019.0,3741.546750,82.0,100.0,1.735518,9.0,1.558511e+10,4563.467363,30.1,79.467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,2017.0,3584.550156,85.8,84.4,8.151474,9.0,3.814488e+11,6125.692051,63.0,65.422,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
343,2018.0,4039.051490,86.6,84.7,8.081821,9.0,4.052607e+11,6117.270141,63.0,65.726,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
344,2019.0,4419.402469,87.4,85.0,8.031639,9.0,3.893300e+11,6032.829726,63.0,66.071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
345,2020.0,4588.454963,88.1,90.0,6.914741,9.0,3.379747e+11,5569.584833,63.0,65.150,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Hist Gradient Boosting Regressor

HistGradientBoostingRegressor (from sklearn.ensemble) is a tree-based model. Tree-based models split data based on thresholds, not on absolute magnitudes, so scaling does not affect them.
Therefore, we do not need to normalize our features.

In [9]:
# Modelling constants, named so that they are easy to find and tune.
TARGET = 'Value'
TEST_SIZE = 0.2
SEED = 42

# Features are every column except the target.
X = df_imputed.drop(columns=[TARGET])
y = df_imputed[TARGET]

# Split data into training and testing sets.
# NOTE(review): this is a random row-level split of what looks like country-year
# data, so rows of the same country can land in both splits — confirm that this
# is the intended evaluation setting.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Train the model. The estimator is seeded so that refits stay reproducible
# whenever its stochastic options (e.g. early stopping) come into play.
model = HistGradientBoostingRegressor(random_state=SEED)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)     # returns MSE
rmse = np.sqrt(mse)                          # RMSE = sqrt(MSE)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"R²: {r2:.3f}")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")

R²: 0.940
MAE: 374.186
RMSE: 690.760


In [10]:
# Compute permutation importance: each feature is shuffled in turn and the
# resulting drop in the model's score is averaged over n_repeats shuffles.
# The importances are measured on the training split, so they describe what
# the fitted model relies on rather than what generalizes to unseen rows.
X_perm, y_perm = X_train, y_train
importance = permutation_importance(model, X_perm, y_perm, n_repeats=10, random_state=42)

# Put results in a Series labelled with the columns of the very frame that was
# permuted, so labels and scores cannot drift apart if the split used here changes.
perm_importances = pd.Series(importance.importances_mean, index=X_perm.columns)
perm_importances = perm_importances.sort_values(ascending=False)
print(perm_importances)

GDP per capita (constant 2015 US$)                                   0.808478
Life expectancy at birth, total (years)                              0.095304
Sulphur oxides (tonnes)                                              0.060307
People using at least basic sanitation services (% of population)    0.040098
v10                                                                  0.025999
                                                                       ...   
Country Code_THA                                                     0.000000
Country Code_TUR                                                     0.000000
Country Code_UKR                                                     0.000000
Country Code_USA                                                     0.000000
Country Code_ZAF                                                     0.000000
Length: 78, dtype: float64
