In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

## Import data and transform

In [25]:
# Read the refined asthma prevalence-rate CSV; the path is relative to the
# notebook's working directory.
asthma_csv_path = "../../Data/Refined/1021/asthma_prevalence_rate.csv"
df_raw = pd.read_csv(asthma_csv_path)
df_raw

Unnamed: 0,Measure,Country Name,Disease,Metric,Year,Value,Country Code,Access to clean fuels and technologies for cooking (% of population),Access to electricity (% of population),Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita),...,Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population),u10,v10,d2m,t2m,sst,sp,skt,blh
0,Prevalence,Japan,Asthma,Rate,2014,5192.177057,JPN,100,100.0,10.0274639366416,...,53543.70,19.6,2.180324,1.293077,288.367888,292.485756,293.779580,102115.801254,293.610257,885.832315
1,Prevalence,Japan,Asthma,Rate,2015,5206.382139,JPN,100,100.0,9.71691822464823,...,54171.10,18.2,2.247194,1.573335,288.342591,292.509937,293.900725,102082.760935,293.728914,905.719678
2,Prevalence,Japan,Asthma,Rate,2016,5222.775846,JPN,100,100.0,9.70188627278164,...,51006.40,18.3,1.896303,1.843566,288.471362,292.607193,293.833551,102084.147374,293.667026,903.834044
3,Prevalence,Japan,Asthma,Rate,2017,5259.014818,JPN,100,100.0,9.59147213558895,...,52248.52,17.7,2.231108,1.180876,288.519120,292.625293,293.974890,101947.341061,293.809990,896.601952
4,Prevalence,Japan,Asthma,Rate,2018,5302.827203,JPN,100,100.0,9.29901664682086,...,52331.77,17.8,2.239119,1.458739,288.927352,293.018189,294.475368,102117.550669,294.303794,904.800901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,Prevalence,Türkiye,Asthma,Rate,2017,4499.719504,TUR,95,100.0,5.39364397893842,...,54098.00,,1.828292,0.463260,284.082554,287.737981,288.662293,101909.206715,288.544441,840.552676
489,Prevalence,Türkiye,Asthma,Rate,2018,4779.019886,TUR,95.1,100.0,5.26873027109493,...,60020.00,,1.563059,0.545625,284.546408,288.057154,288.875183,102145.624975,288.762852,806.899897
490,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.00,28.0,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818
491,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.00,19.6,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818


In [26]:
# Descriptive columns that are set aside before modelling; every other column
# is kept, in its original order.
descriptive_columns = {'Country Name', 'Measure', 'Metric', 'Disease'}
columns_to_keep = [name for name in df_raw.columns if name not in descriptive_columns]
df_excluded = df_raw.loc[:, columns_to_keep]

In [27]:
# Count of missing entries in each remaining column.
df_excluded.isna().sum()

Year                                                                            0
Value                                                                           0
Country Code                                                                    0
Access to clean fuels and technologies for cooking (% of population)            0
Access to electricity (% of population)                                         0
Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita)      0
Compulsory education, duration (years)                                          0
GDP (current US$)                                                               0
GDP per capita (constant 2015 US$)                                              0
Gini index                                                                      0
Life expectancy at birth, total (years)                                         0
Mortality rate, infant (per 1,000 live births)                                  0
People using at 

Our dataset contains some missing values, although few relative to the number of rows we have.
Before training, we need to:
- Handle missing values (replace placeholders such as '..', 'N/A', etc. with np.nan) -> for models that can deal with missing values natively, we can leave the resulting NaNs in place
- Use numeric encoding for categorical columns (e.g., one-hot for Country Code)

In [28]:
# One-hot encode the country code, turn textual missing-value placeholders into
# NaN, then force every column to a numeric dtype (anything unparseable -> NaN).
df_encoded = (
    pd.get_dummies(df_excluded, columns=["Country Code"], drop_first=True)
    .replace(['..', 'nan'], np.nan)
    .apply(pd.to_numeric, errors='coerce')
)

## Hist Gradient Boosting Regressor

HistGradientBoostingRegressor (from sklearn.ensemble) is a tree-based model. Tree-based models split data based on thresholds, not on absolute magnitudes, so scaling does not affect them. It also handles missing values (NaN) natively, so no imputation is required.
Therefore we do not need to normalize our features.

In [29]:
# Features are every encoded column except the target; the target is 'Value'.
X = df_encoded.drop(columns=['Value'])
y = df_encoded['Value']

# Split data into training and testing sets.
# NOTE(review): the country is one-hot encoded and the split is purely random,
# so rows from the same country can land in both sets. The score below then
# measures interpolation within known countries, not generalization to unseen
# ones -- confirm this is the intended evaluation, and check the source table
# for repeated country/year rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model; seeded like the split so a Restart & Run All is reproducible.
model = HistGradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)     # returns MSE
rmse = np.sqrt(mse)                          # RMSE = sqrt(MSE)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"R²: {r2:.3f}")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")

R²: 0.952
MAE: 244.237
RMSE: 556.536


In [30]:
# Compute permutation importance on the training split. The frame being
# permuted is named once so the importances and their labels always come from
# the same data.
# NOTE(review): importances measured on training data show what the fitted
# model relies on; scoring on X_test / y_test instead would show which features
# matter for held-out performance -- confirm which is intended.
X_eval, y_eval = X_train, y_train
importance = permutation_importance(model, X_eval, y_eval, n_repeats=10, random_state=42)

# Put the mean importance per feature in a Series, most important first
perm_importances = pd.Series(importance.importances_mean, index=X_eval.columns)
perm_importances = perm_importances.sort_values(ascending=False)
print(perm_importances)

GDP per capita (constant 2015 US$)                                   1.006798
Life expectancy at birth, total (years)                              0.096757
Mortality rate, infant (per 1,000 live births)                       0.073566
Gini index                                                           0.072911
People using at least basic sanitation services (% of population)    0.043334
                                                                       ...   
Country Code_THA                                                     0.000000
Country Code_TUR                                                     0.000000
Country Code_UKR                                                     0.000000
Country Code_USA                                                     0.000000
Country Code_ZAF                                                     0.000000
Length: 78, dtype: float64
