In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Import data and transform

In [76]:
df_raw = pd.read_csv("../Data/Refined/1021/asthma_prevalence_rate.csv")
df_raw

Unnamed: 0,Measure,Country Name,Disease,Metric,Year,Value,Country Code,Access to clean fuels and technologies for cooking (% of population),Access to electricity (% of population),Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita),...,Total sales of agricultural pesticides (tonnes),Share of population who are daily smokers (Pct population),u10,v10,d2m,t2m,sst,sp,skt,blh
0,Prevalence,Japan,Asthma,Rate,2014,5192.177057,JPN,100,100.0,10.0274639366416,...,53543.7,8.5,2.180324,1.293077,288.367888,292.485756,293.779580,102115.801254,293.610257,885.832315
1,Prevalence,Japan,Asthma,Rate,2014,5192.177057,JPN,100,100.0,10.0274639366416,...,53543.7,32.2,2.180324,1.293077,288.367888,292.485756,293.779580,102115.801254,293.610257,885.832315
2,Prevalence,Japan,Asthma,Rate,2014,5192.177057,JPN,100,100.0,10.0274639366416,...,53543.7,19.6,2.180324,1.293077,288.367888,292.485756,293.779580,102115.801254,293.610257,885.832315
3,Prevalence,Japan,Asthma,Rate,2015,5206.382139,JPN,100,100.0,9.71691822464823,...,54171.1,7.9,2.247194,1.573335,288.342591,292.509937,293.900725,102082.760935,293.728914,905.719678
4,Prevalence,Japan,Asthma,Rate,2015,5206.382139,JPN,100,100.0,9.71691822464823,...,54171.1,30.1,2.247194,1.573335,288.342591,292.509937,293.900725,102082.760935,293.728914,905.719678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.0,28.0,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818
1127,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.0,31.0,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818
1128,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.0,7.9,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818
1129,Prevalence,Türkiye,Asthma,Rate,2019,5017.556119,TUR,95.2,100.0,5.00896106827559,...,51297.0,19.6,1.701146,0.641168,285.172083,288.868438,289.786199,102025.270326,289.665676,820.898818


In [84]:
columns_to_keep = [col for col in df_raw.columns if col not in ['Country Name', 'Measure', 'Metric', 'Disease']]
df_excluded = df_raw[columns_to_keep]

In [81]:
df_excluded.isnull().sum()

Year                                                                            0
Value                                                                           0
Country Code                                                                    0
Access to clean fuels and technologies for cooking (% of population)            0
Access to electricity (% of population)                                         0
Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita)      0
Compulsory education, duration (years)                                          0
GDP (current US$)                                                               0
GDP per capita (constant 2015 US$)                                              0
Gini index                                                                      0
Life expectancy at birth, total (years)                                         0
Mortality rate, infant (per 1,000 live births)                                  0
People using at 

In our dataset, there are many missing values (even though not so many relatively to the number of rows we have). 
Before training, we need to:
- Handle missing values (replace '..', 'N/A', etc. with np.nan) -> for models that can deal with them, we can leave these values there
- Use numeric encoding for categorical columns (e.g., one-hot for Country Name)

In [85]:
df_encoded = pd.get_dummies(df_excluded, columns=["Country Code"], drop_first=True)
df_encoded.replace(['..', 'nan'], np.nan, inplace=True)
df_encoded = df_encoded.apply(pd.to_numeric, errors='coerce')

## Hist Gradient Boosting Regressor

HistGradientBoostingRegressor (from sklearn.ensemble) is a tree-based model. Tree-based models split data based on thresholds, not on absolute magnitudes, so scaling does not affect them.
Therefore we can avoid to normalize our features.

In [90]:
X = df_encoded[[col for col in df_encoded.columns if col not in ['Value']]] # Replace with actual feature names
y = df_encoded['Value'] 

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = HistGradientBoostingRegressor()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)     # returns MSE
rmse = np.sqrt(mse)                          # RMSE = sqrt(MSE)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"R²: {r2:.3f}")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")

R²: 0.994
MAE: 66.713
RMSE: 240.309
