# First prediction model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
import warnings

# Suppress warnings to keep the notebook output clean.
# NOTE: with no category/module filter this silences every warning raised
# after this cell (deprecations included), not only cosmetic ones.
warnings.filterwarnings('ignore')

In [2]:
# Base directory of the project data (machine-specific absolute path).
BASE_PATH = "/mnt/c/Users/ala78/OneDrive - HESSO/BA5/GML/data-gml-health"

# Name of the dataset file to load.
file_name = "COPD_prevalence_rate.csv"

# Load the dataset. Only the read itself is guarded, so an unrelated error in
# the inspection code below can never be mistaken for a missing file.
try:
    df = pd.read_csv(f"{BASE_PATH}/Refined/1021/{file_name}")
except FileNotFoundError:
    print(f"Erreur: Le fichier '{file_name}' n'a pas été trouvé.")
    # Re-raise instead of continuing: every later cell needs `df`, and
    # swallowing the error here would only surface as a NameError further down.
    raise

# Show the first rows.
print(f"--- Aperçu des données ({file_name}) ---")
print(df.head())

# Show column names, dtypes and non-null counts.
print(f"\n--- Informations sur les colonnes ({file_name}) ---")
df.info()

# Count how many rows are available for each year.
print("\n--- Répartition des données par année ---")
print(df['Year'].value_counts())

--- Aperçu des données (COPD_prevalence_rate.csv) ---
      Measure Country Name                                Disease Metric  \
0  Prevalence    Indonesia  Chronic obstructive pulmonary disease   Rate   
1  Prevalence    Indonesia  Chronic obstructive pulmonary disease   Rate   
2  Prevalence    Indonesia  Chronic obstructive pulmonary disease   Rate   
3  Prevalence        China  Chronic obstructive pulmonary disease   Rate   
4  Prevalence        China  Chronic obstructive pulmonary disease   Rate   

   Year        Value Country Code  \
0  2015  1588.006576          IDN   
1  2017  1646.266156          IDN   
2  2018  1680.660819          IDN   
3  2013  3138.935866          CHN   
4  2014  3190.618304          CHN   

  Access to clean fuels and technologies for cooking (% of population)  \
0                                               67.3                     
1                                               75.5                     
2                                           

## Analysis

### Cols
We can see that we have 38 different columns. The one we will focus on and try to predict is "Value"; this will be our target. All the other columns will be the features, apart from the identifier columns, which are dropped during preprocessing.

### Issue
- One issue with this dataframe is that some columns have the `object` dtype, meaning they are stored as text, although some of them should be numeric. For example, the column "Access to clean fuels and technologies for cooking (% of population)" is a percentage and should be numeric.
- Another issue is that some columns, like "Total sales of agricultural pesticides (tonnes)", have missing values. We will fix this issue later; at first we only test models on this dataframe.


## Preprocessing

In [3]:
# --- Target and feature selection ---
target_column = 'Value'
# Identifier / label columns that carry no predictive signal for the model.
id_columns = ['Measure', 'Country Name', 'Disease', 'Metric', 'Country Code']

y = df[target_column]
# Every column that is neither an identifier nor the target is a feature.
X = df.drop(columns=[*id_columns, target_column])

print(f"Initial features: {X.shape[1]}")

# --- Numeric conversion ---
# Values that cannot be parsed as numbers are turned into NaN.
X = X.apply(pd.to_numeric, errors='coerce')

# --- Missing values: keep only complete rows ---
# Features and target are glued together so that both lose the same rows.
df_combined = pd.concat([X, y], axis=1)
rows_before = len(df_combined)
print(f"\nNb of rows before deletion: {rows_before}")

df_clean = df_combined.dropna(how='any')
rows_after = len(df_clean)
print(f"Nb of rows after deletion: {rows_after}")
rows_lost = rows_before - rows_after
print(f"Rows lost: {rows_lost} ({(rows_lost / rows_before) * 100:.2f}%)")

# --- Rebuild X / y from the clean frame and split ---
if rows_after == 0:
    print("\nERROR: No more data after deletion of NaNs")
    regression_data = None
else:
    y_clean = df_clean[target_column]
    X_clean = df_clean.drop(columns=[target_column])

    # 80 % train / 20 % test, fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X_clean,
        y_clean,
        test_size=0.2,
        random_state=42,
    )

    print("\n--- Data size ---")
    print(f"X_train: {X_train.shape}")
    print(f"X_test: {X_test.shape}")

    # Keep the split around for the regression cell.
    regression_data = (X_train, X_test, y_train, y_test)

Initial features: 32

Nb of rows before deletion: 353
Nb of rows after deletion: 53
Rows lost: 300 (84.99%)

--- Data size ---
X_train: (42, 32)
X_test: (11, 32)


### Analysis

With this simple method, we lost a lot of information. We had 353 rows. Now we have only 53 of them (about 85% were dropped), and our training set contains only 42 rows.

## Regression

We will train a DecisionTreeRegressor and a RandomForestRegressor "out of the box". We will evaluate them with R2 and RMSE.

In [4]:
# Unpack the train/test split produced by the preprocessing cell.
X_train, X_test, y_train, y_test = regression_data

# --- 1. Decision Tree regressor (default hyper-parameters, fixed seed) ---
print("\n--- 1. Decision Tree (Regression) ---")
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)
y_pred_dt = dt_regressor.predict(X_test)

# RMSE is the square root of the MSE, so it is in the same unit as the target.
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print("Metrics for Decision Tree:")
print(f"  R-squared (R2): {r2_dt:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_dt:.4f}")

# --- 2. Random Forest regressor (default hyper-parameters, fixed seed) ---
print("\n--- 2. Random Forest (Regression) ---")
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train, y_train)
y_pred_rf = rf_regressor.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Metrics for Random Forest:")
print(f"  R-squared (R2): {r2_rf:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_rf:.4f}")


--- 1. Decision Tree (Regression) ---
Metrics for Decision Tree:
  R-squared (R2): -0.6947
  Root Mean Squared Error (RMSE): 1493.9528

--- 2. Random Forest (Regression) ---
Metrics for Random Forest:
  R-squared (R2): 0.3265
  Root Mean Squared Error (RMSE): 941.8366


### Analysis

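Scores are poor: the Decision Tree has a negative R2 (-0.69, i.e. worse than always predicting the mean) and the Random Forest only reaches an R2 of 0.33. With only 42 training rows and 11 test rows, these numbers are also very unstable. We should find a way to inspect how the models make their predictions to understand whether this behaviour is expected or not.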

## Binary classification

We will transform our continuous target y_clean into a binary target:
- 0 if Value is below or equal to the median
- 1 if Value is strictly above the median

In [5]:
# --- Binary target ---
# Rows strictly above the median become class 1, all the others class 0.
median_value = y_clean.median()
y_binary = y_clean.gt(median_value).astype(int)

print(f"\nBinary target (par rapport à la médiane {median_value:.2f}):")
print(y_binary.value_counts())

# --- Train/test split for classification ---
# Stratifying on the label keeps the class balance in both subsets.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clean, y_binary, test_size=0.2, random_state=42, stratify=y_binary
)

print("\n--- Size of datasets (Classification) ---")
print(f"X_train_c: {X_train_c.shape}")
print(f"y_train_c:\n{y_train_c.value_counts(normalize=True)}")

# Keep the split around for the classification cell.
classification_data = (X_train_c, X_test_c, y_train_c, y_test_c)


Binary target (par rapport à la médiane 4266.62):
Value
0    27
1    26
Name: count, dtype: int64

--- Size of datasets (Classification) ---
X_train_c: (42, 32)
y_train_c:
Value
1    0.5
0    0.5
Name: proportion, dtype: float64


## Classification

We will train the classifier versions of our models on binary data. We will evaluate accuracy and classification_report.

In [6]:
# Unpack the stratified split produced by the previous cell.
X_train_c, X_test_c, y_train_c, y_test_c = classification_data

# --- 1. Decision Tree classifier (default hyper-parameters, fixed seed) ---
print("\n--- 1. Decision Tree (Classification) ---")
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train_c, y_train_c)
y_pred_dt_c = dt_classifier.predict(X_test_c)

# Overall accuracy plus the per-class precision / recall / F1 table.
acc_dt_c = accuracy_score(y_test_c, y_pred_dt_c)
report_dt_c = classification_report(y_test_c, y_pred_dt_c)

print("Metrics for Decision Tree:")
print(f"  Accuracy: {acc_dt_c:.4f}")
print("  Classification report:\n", report_dt_c)

# --- 2. Random Forest classifier (default hyper-parameters, fixed seed) ---
print("\n--- 2. Random Forest (Classification) ---")
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_c, y_train_c)
y_pred_rf_c = rf_classifier.predict(X_test_c)

acc_rf_c = accuracy_score(y_test_c, y_pred_rf_c)
report_rf_c = classification_report(y_test_c, y_pred_rf_c)

print("Metrics for Random Forest:")
print(f"  Accuracy: {acc_rf_c:.4f}")
print("  Classification report:\n", report_rf_c)


--- 1. Decision Tree (Classification) ---
Metrics for Decision Tree:
  Accuracy: 0.8182
  Classification report:
               precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       1.00      0.60      0.75         5

    accuracy                           0.82        11
   macro avg       0.88      0.80      0.80        11
weighted avg       0.86      0.82      0.81        11


--- 2. Random Forest (Classification) ---
Metrics for Random Forest:
  Accuracy: 0.8182
  Classification report:
               precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       1.00      0.60      0.75         5

    accuracy                           0.82        11
   macro avg       0.88      0.80      0.80        11
weighted avg       0.86      0.82      0.81        11



### Analysis

Here the accuracy is high for both models (0.82), although the test set contains only 11 rows. We need to find out whether this is due to overfitting or to a feature that is a duplicate of the "Value" column (target leakage).

In [None]:
# Pairwise correlations between all columns of the cleaned frame
# (pandas' default method, Pearson).
corr_matrix = df_clean.corr()

# Keep only the correlations against the target and rank them by absolute
# strength, so strong negative correlations are not pushed to the bottom.
corr_with_target = corr_matrix.loc[:, target_column]
corr_with_target_sorted = corr_with_target.abs().sort_values(ascending=False)

print("\n--- Strongest correlations with 'Value' ---")
# The target correlates perfectly (1.0) with itself, so leave it out.
top_correlations = corr_with_target_sorted.drop(labels=target_column)

# Ten strongest remaining correlations.
print(top_correlations.iloc[:10])


Calcul de la matrice de corrélation...

--- 5. Corrélations les plus fortes avec la cible 'Value' ---
GDP (current US$)                                                             0.581604
Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita)    0.494271
Population, total                                                             0.466731
Greenhouse gases (Kg CO2-equivalent Per Person)                               0.441710
Total sales of agricultural pesticides (tonnes)                               0.439943
Life expectancy at birth, total (years)                                       0.411421
u10                                                                           0.385698
People using at least basic sanitation services (% of population)             0.335515
Unemployment, total (% of total labor force) (national estimate)              0.309608
sp                                                                            0.297547
Name: Value, dtype: float64

### Conclusion

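In this last part, we showed that the strongest correlation with the target is with GDP (|r| ≈ 0.58). No feature is therefore a near-duplicate of "Value", which means that good scores are more likely explained by overfitting on a very small dataset than by the models being particularly good.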