# Dengue Analysis:
---

In [None]:
import pandas as pd
import numpy as np
import subprocess
import os
import sys
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
import seaborn as sns
import matplotlib.pyplot as plt
from utils.utils import (
    print_with_colors,
    is_int,
    process_num_like_cols,
    print_with_multiple_columns,
)

## Download:
---
* Download the Dataset from Web if not already downloaded.

In [None]:
# Make sure the download directory exists (no error if it is already there).
os.makedirs("./raw_data", exist_ok=True)

if not os.path.exists("./raw_data/arbovirus_clinical_data"):
    # Download of .zip file
    url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/2d3kr8zynf-4.zip"
    output = "./raw_data/dataset.zip"
    # NOTE(review): `--no-check-certificate` turns off TLS verification for this
    # download. It is kept to preserve behavior; consider removing it.
    # check=True makes a failed download raise CalledProcessError instead of
    # letting the next steps run against a missing or partial archive.
    subprocess.run(["wget", "--quiet", "--no-check-certificate", url, "-O", output], check=True)

    # Extraction of .zip file
    # unzip exits with status 1 for non-fatal warnings, so only higher codes abort.
    unzip_status = subprocess.run(["unzip", output]).returncode
    if unzip_status > 1:
        raise subprocess.CalledProcessError(unzip_status, ["unzip", output])
    subprocess.run(["mv", "./2d3kr8zynf-4", "./raw_data/arbovirus_clinical_data"], check=True)
    # The archive is no longer needed once its content has been moved in place.
    os.remove(output)

## Reading the Dataset:
---
* Using `chunksize` in `pd.read_csv()` to reduce RAM usage while reading.

<font color='yellow'>Note: if you already have the .parquet file, you can skip to [this](#file) section.</font>

In [None]:
import warnings

# Tokens that pd.read_csv must parse as NaN. Besides the usual empty markers,
# every column name is listed, so header text found inside the data rows is
# read as missing.
# NOTE(review): presumably the CSV contains repeated header rows - confirm.
missing_values = [
    '', ' ', 'NA', 'N/A', 'NULL',
    'ID_AGRAVO', 'DT_NOTIFIC', 'SEM_NOT', 'NU_ANO', 'SG_UF_NOT',
    'ID_MUNICIP', 'ID_REGIONA', 'ID_UNIDADE', 'DT_SIN_PRI', 'SEM_PRI',
    'DT_NASC', 'NU_IDADE_N', 'CS_SEXO', 'CS_GESTANT', 'CS_RACA',
    'CS_ESCOL_N', 'SG_UF', 'ID_MN_RESI', 'ID_RG_RESI', 'ID_PAIS',
    'DT_INVEST', 'FEBRE', 'MIALGIA', 'CEFALEIA', 'EXANTEMA',
    'VOMITO', 'NAUSEA', 'DOR_COSTAS', 'CONJUNTVIT', 'ARTRITE',
    'ARTRALGIA', 'PETEQUIA_N', 'LEUCOPENIA', 'LACO', 'DOR_RETRO',
    'DIABETES', 'HEMATOLOG', 'HEPATOPAT', 'RENAL', 'HIPERTENSA',
    'ACIDO_PEPT', 'AUTO_IMUNE', 'RESUL_SORO', 'RESUL_NS1', 'RESUL_VI_N',
    'RESUL_PCR_', 'HISTOPA_N', 'IMUNOH_N', 'HOSPITALIZ', 'TPAUTOCTO',
    'COUFINF', 'COPAISINF', 'COMUNINF', 'CLASSI_FIN', 'EVOLUCAO', 'DT_ENCERRA', '.'
]

# Warnings are silenced only inside this context manager; on exit the previous
# warning filters are restored instead of being replaced by a global "default".
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Low memory safe reading of the CSV file
    splitted_df = pd.read_csv(
        './raw_data/arbovirus_clinical_data/dengue.csv',
        sep=',',
        header=0,
        na_values=missing_values,
        chunksize=100_000,
    )

    # Concatenate all chunks into a single DataFrame.
    # The chunks are parsed lazily here, so this must stay inside the context.
    dengue_df = pd.concat(splitted_df, ignore_index=True)

* The file `attributes.csv` has important information about the features

In [None]:
# Load the data dictionary and forward-fill its empty cells.
# NOTE(review): presumably "Attribute"/"Description" are only written on the
# first row of each attribute - confirm against the CSV.
attributes = pd.read_csv(
    "raw_data/arbovirus_clinical_data/attributes.csv",
    sep=",",
    header=0,
    low_memory=False,
).ffill()

# Collapse to one row per attribute, joining all of its values with "; ".
values_per_attribute = attributes.groupby(["Attribute", "Description"])["Value"]
attributes = values_per_attribute.apply('; '.join).reset_index(name="Values")

## Pre Processing
---
### Null Data Removal:
* Features in which 60% or more of the values are null are dropped.
* Also, columns like `["CS_FLXRET", "TP_SISTEMA", "CRITERIO", "TP_NOT", "Unnamed: 0"]` don't carry useful information, so they are dropped as well.

In [None]:
# Fraction of missing values per column; only columns below 60% are kept.
null_fraction = dengue_df.isnull().mean()
dengue_df = dengue_df.loc[:, null_fraction < .60]

# Columns removed by name because they are not useful for the analysis.
uninformative_columns = ["CS_FLXRET", "TP_SISTEMA", "CRITERIO", "TP_NOT", "Unnamed: 0"]
dengue_df = dengue_df.drop(columns=uninformative_columns)

* Printing unique values for each feature to check their data type.

In [None]:
# Names documented in the data dictionary. Built once as a set so the
# membership test does not rebuild a list for every column.
documented_attributes = set(attributes["Attribute"])

for col in dengue_df.columns.to_list():
    if str(col) not in documented_attributes:
        print_with_colors(f"Column '{col}' not in attributes. Skipping display...", "yellow", end="\n\n")
        continue

    # unique() scans the whole column, so compute it a single time.
    unique_values = dengue_df[col].unique()
    print(f"Column '{col}' has {unique_values.size} unique values.")
    if unique_values.size < 50:
        print(unique_values, end="\n\n")
    else:
        print("To many unique values, skipping...", end="\n\n")

### Standardization of column values:
* Since the notification system has changed over the years, multiple codes were used to represent the same type of Dengue. In the cell below we standardize these codes into a single label per class.

In [None]:
# --- CLASSI_FIN: collapse all historical numeric codes into one label each ---
# The column is cast to object so it can hold the string labels.
dengue_df['CLASSI_FIN'] = dengue_df['CLASSI_FIN'].astype('object')

# label -> numeric codes that are rewritten to that label
classification_codes = {
    'Dengue': (1, 10),
    'Dengue Grave': (3, 4, 12),
    'Dengue com sinais de alarme': (2, 11),
    # Discarded/Inconclusive
    'Discarded/Inconclusive': (5, 6, 8),
}
for label, codes in classification_codes.items():
    for code in codes:
        dengue_df.loc[dengue_df['CLASSI_FIN'] == code, 'CLASSI_FIN'] = label

# Rows without a final classification are treated as discarded/inconclusive.
dengue_df['CLASSI_FIN'] = dengue_df['CLASSI_FIN'].fillna('Discarded/Inconclusive')
dengue_df['CLASSI_FIN'] = dengue_df['CLASSI_FIN'].astype('category')


# --- NU_IDADE_N: normalize the encoded age ---
# Vectorized with mask/between instead of eight Python-level apply() passes.
# The rules are applied one after another, in the same order as before, so the
# output of one rule can still be matched by a later one (e.g. 4150 -> 150 -> 1).
# NOTE(review): confirm that this chaining is intended for values >= 100.
age = dengue_df["NU_IDADE_N"]

# Codes in 4000-4999 and 400-499 have their prefix subtracted.
age = age.mask(age.between(4000, 4999), age - 4000)
age = age.mask(age.between(400, 499), age - 400)

# Every remaining coded range is replaced by the value 1.
ranges_set_to_one = (
    (3000, 3999), (300, 399),
    (2000, 2999), (200, 299),
    (1000, 1999), (100, 199),
)
for lower, upper in ranges_set_to_one:
    age = age.mask(age.between(lower, upper), 1)

dengue_df["NU_IDADE_N"] = age

### Null data padding with default values:
The remaining attributes that still had null values were filled with the default values given in the data dictionary.

In [None]:
# Exam result columns: missing values receive the code 4.
exam_cols = [
    "RESUL_SORO",
    "RESUL_NS1",
    "RESUL_VI_N",
    "RESUL_PCR_",
    "HISTOPA_N",
    "IMUNOH_N"
]
for col in exam_cols:
    missing = dengue_df[col].isnull()  # computed once, reused for the assignment
    if missing.any():
        dengue_df.loc[missing, col] = 4

dengue_df['CS_SEXO'] = dengue_df['CS_SEXO'].fillna('I')

# In the other attributes, the value of "not informed" is 9.
columns_to_be_filled = [
    col
    for col in dengue_df.columns
    if col not in exam_cols
    and 'DT_' not in str(col)  # for datetime columns it doesn't make sense
    and str(col) != 'CS_SEXO'  # CS_SEXO has the special value 'I' for NaNs
]
for col in columns_to_be_filled:
    missing = dengue_df[col].isnull()
    # Only touch columns that really have gaps; columns without nulls
    # (e.g. an already complete categorical) are left exactly as they are.
    if missing.any():
        dengue_df.loc[missing, col] = 9


Removing the column 'ID_AGRAVO', because it has only 1 fixed value: `A90`

In [None]:
# A constant column carries no information for the model.
dengue_df = dengue_df.drop(columns='ID_AGRAVO')

In [None]:
# Columns that are stored as integers; every other listed column is categorical.
_integer_dtypes = {
    'NU_ANO': 'int16',
    'SEM_PRI': 'int32',
    'NU_IDADE_N': 'int8',
}

# All columns that receive an explicit dtype, in their original order.
_typed_columns = [
    'SEM_NOT', 'NU_ANO', 'SG_UF_NOT', 'ID_MUNICIP', 'ID_REGIONA',
    'ID_UNIDADE', 'SEM_PRI', 'NU_IDADE_N', 'CS_SEXO', 'CS_GESTANT',
    'CS_RACA', 'CS_ESCOL_N', 'SG_UF', 'ID_MN_RESI', 'ID_RG_RESI',
    'ID_PAIS', 'FEBRE', 'MIALGIA', 'CEFALEIA', 'EXANTEMA',
    'VOMITO', 'NAUSEA', 'DOR_COSTAS', 'CONJUNTVIT', 'ARTRITE',
    'ARTRALGIA', 'PETEQUIA_N', 'LEUCOPENIA', 'LACO', 'DOR_RETRO',
    'DIABETES', 'HEMATOLOG', 'HEPATOPAT', 'RENAL', 'HIPERTENSA',
    'ACIDO_PEPT', 'AUTO_IMUNE', 'RESUL_SORO', 'RESUL_NS1', 'RESUL_VI_N',
    'RESUL_PCR_', 'HISTOPA_N', 'IMUNOH_N', 'HOSPITALIZ', 'TPAUTOCTO',
    'COUFINF', 'COPAISINF', 'COMUNINF', 'EVOLUCAO',
]

# Column name -> dtype mapping, later passed to DataFrame.astype().
dtypes = {
    name: _integer_dtypes.get(name, 'category')
    for name in _typed_columns
}

In [None]:
# Pass the whole frame through the project helper imported from utils.utils.
# NOTE(review): presumably it converts number-like columns to numeric values -
# confirm in utils/utils.py, the helper is not visible here.
dengue_df = process_num_like_cols(dengue_df)

### Setting dtypes to columns:

In [None]:
# Parse the date columns that survived the earlier column removal;
# values that cannot be parsed become NaT (errors='coerce').
date_cols = ['DT_NOTIFIC', 'DT_SIN_PRI', 'DT_NASC', 'DT_INVEST', 'DT_ENCERRA']
for col in date_cols:
    if col in dengue_df.columns:
        dengue_df[col] = pd.to_datetime(dengue_df[col], errors='coerce')

# Remove '-' from string values of SEM_PRI; non-string values are kept as they are.
dengue_df['SEM_PRI'] = dengue_df['SEM_PRI'].apply(lambda x: x.replace('-', '') if isinstance(x, str) else x)

# Exam result columns: missing values receive the code 4.
exam_cols = [
    "RESUL_SORO",
    "RESUL_NS1",
    "RESUL_VI_N",
    "RESUL_PCR_",
    "HISTOPA_N",
    "IMUNOH_N"
]
for col in exam_cols:
    missing = dengue_df[col].isnull()  # computed once, reused for the assignment
    if missing.any():
        dengue_df.loc[missing, col] = 4

dengue_df['CS_SEXO'] = dengue_df['CS_SEXO'].fillna('I')

# In the other attributes, the value of "not informed" is 9.
columns_to_be_filled = [
    col
    for col in dengue_df.columns
    if col not in exam_cols
    and 'DT_' not in str(col)  # for datetime columns it doesn't make sense
    and str(col) != 'CS_SEXO'  # CS_SEXO has the special value 'I' for NaNs
]
for col in columns_to_be_filled:
    missing = dengue_df[col].isnull()
    # Only touch columns that really have gaps.
    if missing.any():
        dengue_df.loc[missing, col] = 9

# Apply the final dtypes declared in the `dtypes` mapping.
dengue_df = dengue_df.astype(dtypes)

### Saving processed DataFrame to parquet:

In [None]:
# Nothing else in the notebook creates the output directory, and writing into a
# missing directory fails, so create it first (no error if it already exists).
os.makedirs("./preprocessed_data", exist_ok=True)
dengue_df.to_parquet("./preprocessed_data/dengue.parquet")

## Reading Parquet File:
---
* Uncomment the cell below if you already have the `.parquet` file and just want to load it.

In [None]:
# dengue_df = pd.read_parquet("./preprocessed_data/dengue.parquet")

# dtypes = {
#     'SEM_NOT': 'int32', 'NU_ANO': 'int16', 'SG_UF_NOT': 'category', 'ID_MUNICIP': 'category',
#     'ID_REGIONA': 'category', 'ID_UNIDADE': 'category', 'SEM_PRI': 'int32',
#     'NU_IDADE_N': 'int8', 'CS_SEXO': 'category', 'CS_GESTANT': 'category',
#     'CS_RACA': 'category', 'CS_ESCOL_N': 'category', 'SG_UF': 'category',
#     'ID_MN_RESI': 'category', 'ID_RG_RESI': 'category', 'ID_PAIS': 'category',
#     'FEBRE': 'category', 'MIALGIA': 'category', 'CEFALEIA': 'category',
#     'EXANTEMA': 'category', 'VOMITO': 'category', 'NAUSEA': 'category',
#     'DOR_COSTAS': 'category', 'CONJUNTVIT': 'category', 'ARTRITE': 'category',
#     'ARTRALGIA': 'category', 'PETEQUIA_N': 'category', 'LEUCOPENIA': 'category',
#     'LACO': 'category', 'DOR_RETRO': 'category', 'DIABETES': 'category',
#     'HEMATOLOG': 'category', 'HEPATOPAT': 'category', 'RENAL': 'category',
#     'HIPERTENSA': 'category', 'ACIDO_PEPT': 'category', 'AUTO_IMUNE': 'category',
#     'RESUL_SORO': 'category', 'RESUL_NS1': 'category', 'RESUL_VI_N': 'category',
#     'RESUL_PCR_': 'category', 'HISTOPA_N': 'category', 'IMUNOH_N': 'category',
#     'HOSPITALIZ': 'category', 'TPAUTOCTO': 'category', 'COUFINF': 'category',
#     'COPAISINF': 'category', 'COMUNINF': 'category', 'EVOLUCAO': 'category',
# }

# dengue_df = dengue_df.astype(dtypes)

## Removing Data Leakage Features:
---
Some features in the dataset have information from the future (*i.e.* after the `CLASSI_FIN` has been diagnosed)

In [None]:
# Columns that are only known after the final classification was made.
leaky_columns = [
    'RESUL_SORO', 'RESUL_NS1', 'RESUL_VI_N', 'RESUL_PCR_', 'HISTOPA_N',
    'IMUNOH_N', 'EVOLUCAO', 'DT_ENCERRA', 'TPAUTOCTO', 'COUFINF', 'COPAISINF',
    'COMUNINF', 'CODISINF', 'CO_BAINFC', 'NOBAIINF', 'DOENCA_TRA', 'DT_OBITO',
]
# Some of them may already have been dropped; keep only the ones still present.
leaky_columns = [col for col in leaky_columns if col in dengue_df.columns]

dengue_df = dengue_df.drop(columns=leaky_columns)

# Age < 0 doesn't make sense.
# The comparison is parenthesized so that `~` negates the boolean mask. Without
# the parentheses `~` was applied to the age values themselves, which only
# selected the right rows for integer dtypes and raises on floats.
dengue_df = dengue_df[~(dengue_df["NU_IDADE_N"] < 0)]

## Feature Engineering
---
* Trying to extract useful information from features:

In [None]:
# Columns with error: 'DT_SIN_PRI' cannot be equal to 'DT_NASC'.
dengue_df = dengue_df[~(dengue_df['DT_NASC'] == dengue_df['DT_SIN_PRI'])]

# Time gap between notification and first simptoms.
dengue_df['time_until_report'] = (dengue_df['DT_NOTIFIC'] - dengue_df['DT_SIN_PRI']).dt.days

# Negative values for 'time_until_report' or 'time_until_report' >= 30 doesn't make sense, since the virus
# expresses their simptoms in a shorter period of time.
dengue_df = dengue_df.loc[(dengue_df['time_until_report'] <= 30) & (dengue_df['time_until_report'] >= 0)]

# 'SEM_NOT' is the epidemiological week of the notification date.
# We don't need the year, so we can convert it to a string and remove the first characters.
dengue_df['SEM_NOT'] = dengue_df['SEM_NOT'].astype('str')
dengue_df['SEM_NOT'] = dengue_df['SEM_NOT'].apply(lambda x: x[4:] if len(x) > 4 else x[2:])
dengue_df['SEM_NOT'] = dengue_df['SEM_NOT'].astype('category')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal box plot of the delay between first symptoms and notification.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=dengue_df['time_until_report'], orient='horizontal', ax=ax)
ax.set_title('Distribuição de "time_until_report"')
ax.set_xlabel('Dias entre Sintomas e Notificação')
plt.show()

In [None]:
# Reload the data dictionary and collapse it to one row per attribute.
attributes = pd.read_csv(
    "raw_data/arbovirus_clinical_data/attributes.csv",
    sep=",",
    header=0,
    low_memory=False,
).ffill()
attributes = (
    attributes.groupby(["Attribute", "Description"])["Value"]
    .apply('; '.join)
    .reset_index(name="Values")
)

# Keep only attributes that are not leaky and still exist in dengue_df.
is_leaky = attributes["Attribute"].isin(leaky_columns)
is_in_dataset = attributes["Attribute"].isin(dengue_df.columns)
attributes = attributes[~is_leaky & is_in_dataset].reset_index(drop=True)

# Manually written rows appended to the dictionary.
acido_pept = {
    "Attribute": "ACIDO_PEPT",
    "Description": "Pre-existing disease - Acid peptic disease",
    "Values": "1: Yes; 2: No",
}

time_until_report = {
    "Attribute": "time_until_report",
    "Description": "Time until report",
    "Values": "0: 0 days; 1: 1 day; 2: 2 days; ...; 30: 30 days",
}

extra_rows = pd.DataFrame([acido_pept, time_until_report])
attributes = pd.concat([attributes, extra_rows], ignore_index=True)

In [None]:
# Show the assembled attribute dictionary as the cell output.
attributes

In [None]:
# Label encoding the target variable
le = LabelEncoder()
y = le.fit_transform(dengue_df['CLASSI_FIN'])

In [None]:
# Keep mapping of target variable
target_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"Target mapping: {target_mapping}")

In [None]:
# Preparing the feature matrix
X = dengue_df.drop(columns=['CLASSI_FIN'])

In [None]:
# Extracting month and day of the week from the notification date
if 'DT_NOTIFIC' in X.columns:
    X['notif_month'] = X['DT_NOTIFIC'].dt.month
    X['notif_week_day'] = X['DT_NOTIFIC'].dt.dayofweek

In [None]:
# Remove raw date columns
cols_to_drop = [col for col in X.columns if 'DT_' in col]
X = X.drop(columns=cols_to_drop)

In [None]:
# List the final feature names, laid out through the project helper.
feature_names = X.columns.tolist()
print("Features to train with:")
print_with_multiple_columns(feature_names, 5)

In [None]:
# Double-checking for 'object' columns and converting them to appropriate types
for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

## Hyperparameter Tuning In a Small Subset:
---
* Instead of using all 12 million records, we will do Hyperparameter Tuning using only 500k samples, to save time and computing power.

In [None]:
# Setting optuna objective function
def objective_parallel(trial, X_inner, y_inner):
    """
    Optuna objective: fit one LightGBM candidate and return its macro F1.

    Version meant for parallel studies: each trial builds its model with
    ``n_jobs=1``, i.e. a single core per trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_inner: Feature matrix available for tuning.
        y_inner: Encoded target aligned with ``X_inner``.

    Returns:
        Macro-averaged F1 score on a stratified 25% hold-out of the input.
    """
    # Fixed, stratified hold-out: every trial is scored on the same rows.
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        X_inner, y_inner, test_size=0.25, random_state=42, stratify=y_inner
    )

    # Settings shared by every trial.
    fixed_params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': len(np.unique(y_inner)),
        'verbosity': -1,
        'boosting_type': 'gbdt',
    }

    # Search space, sampled from the trial.
    sampled_params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    params = {**fixed_params, **sampled_params, 'n_jobs': 1}

    candidate = lgb.LGBMClassifier(**params, n_estimators=1000)

    # Stop adding trees once the hold-out log-loss has not improved for 15 rounds.
    candidate.fit(
        fit_X, fit_y,
        eval_set=[(holdout_X, holdout_y)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(15, verbose=False)]
    )

    holdout_predictions = candidate.predict(holdout_X)
    return f1_score(holdout_y, holdout_predictions, average='macro')

In [None]:
# Testing on a small subset
# Stratified sampling for training...
dengue_df['strata'] = (
    dengue_df['NU_ANO'].astype(str) + '_' +
    dengue_df['CLASSI_FIN'].astype(str)
)

# We will use a large ~ 5% (500k) sample size to ensure we have enough data for training.
sample_size = 500_000
sample_ratio = sample_size / len(dengue_df)

_, df_subset = train_test_split(
    dengue_df,
    test_size=sample_ratio,
    stratify=dengue_df['strata'],
    random_state=42
)

print("Sample subset created with size:", len(df_subset))

df_subset = df_subset.drop(columns=['strata'])

In [None]:
# Label encoding the target variable
le = LabelEncoder()
y = le.fit_transform(df_subset['CLASSI_FIN'])

# Keep mapping of target variable
target_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"Target mapping: {target_mapping}")

# Preparing the feature matrix
X = df_subset.drop(columns=['CLASSI_FIN'])

# Extracting month and day of the week from the notification date
if 'DT_NOTIFIC' in X.columns:
    X['notif_month'] = X['DT_NOTIFIC'].dt.month
    X['notif_week_day'] = X['DT_NOTIFIC'].dt.dayofweek
    
# Remove raw date columns
cols_to_drop = [col for col in X.columns if 'DT_' in col]
X = X.drop(columns=cols_to_drop)

# Double-checking for 'object' columns and converting them to appropriate types
for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

In [None]:
# Nested Cross Validation:
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)

outer_scores = []
best_params_per_fold = []

In [None]:
print("Starting Nested Cross Validation...")

# Start from empty result lists so that re-running this cell does not mix the
# scores of several runs.
outer_scores.clear()
best_params_per_fold.clear()

n_outer_folds = outer_cv.get_n_splits()

for i, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    print(f"\n--- Processing over external fold {i+1}/{n_outer_folds} ---")

    X_train_outer, X_test_outer = X.iloc[train_idx], X.iloc[test_idx]
    y_train_outer, y_test_outer = y[train_idx], y[test_idx]

    # Hyperparameter search sees the outer training fold only.
    study = optuna.create_study(direction='maximize')
    study.optimize(
        lambda trial: objective_parallel(trial, X_train_outer, y_train_outer),
        n_trials=20,  # You can adjust the number of trials based on your computational resources
        n_jobs=-1     # <-- Here we use all available cores for parallel execution
    )

    # Copy the params so the stored trial is not modified by the line below.
    best_params = dict(study.best_trial.params)
    best_params['n_jobs'] = 1
    best_params_per_fold.append(best_params)
    print(f"Best parameters found: {best_params}")

    # Early stopping needs a validation set. It is carved out of the outer
    # TRAINING fold: stopping on the outer test fold would let test data pick
    # the number of trees and bias the score reported below.
    X_fit, X_stop, y_fit, y_stop = train_test_split(
        X_train_outer, y_train_outer,
        test_size=0.25, random_state=42, stratify=y_train_outer
    )

    final_model = lgb.LGBMClassifier(**best_params, n_estimators=1000, random_state=42)

    final_model.fit(
        X_fit, y_fit,
        eval_set=[(X_stop, y_stop)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(15, verbose=False)]
    )

    # The outer test fold is used for scoring only.
    y_pred_outer = final_model.predict(X_test_outer)
    score = f1_score(y_test_outer, y_pred_outer, average='macro')
    outer_scores.append(score)
    print(f"F1-Macro for External Fold: {i+1}: {score:.4f}")

In [None]:
# Summary of the outer-fold scores: individual values, mean and spread.
rounded_scores = np.round(outer_scores, 4)
mean_score = np.mean(outer_scores)
score_std = np.std(outer_scores)

print("\n--- Final Evaluation of the Optimized Nested Cross-Validation ---")
print(f"Macro F1-Scores for each outer fold: {rounded_scores}")
print(f"Mean Macro F1-Score: {mean_score:.4f}")
print(f"Macro F1-Score Standard Deviation: {score_std:.4f}")

In [None]:
# --- SIMPLIFY WITH A SINGLE SPLIT ---
# We don't need Nested CV to find the leak
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# --- STEP 1: TRAIN A MODEL AND CHECK FEATURE IMPORTANCE ---
print("Training a LightGBM model for importance analysis...")
simple_lgbm = lgb.LGBMClassifier(objective='multiclass', random_state=42)
simple_lgbm.fit(X_train, y_train)

print("\n--- FEATURE IMPORTANCE PLOT (LGBM) ---")
print("Look for a bar that is MUCH taller than all the others.")
lgb.plot_importance(simple_lgbm, max_num_features=20, figsize=(10, 8),
                    importance_type='gain', title='Feature Importance (Gain)')
plt.show()

# The feature name at the top of the plot is our SUSPECT #1.

# --- STEP 2: VISUALIZE A SIMPLE DECISION TREE ---
print("\n--- SIMPLE DECISION TREE PLOT ---")
print("The feature at the top of the tree (root node) is the most likely culprit.")
# We need an X with only numeric columns for plot_tree
X_train_numeric = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)

simple_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
simple_tree.fit(X_train_numeric, y_train)

plt.figure(figsize=(20, 12))
plot_tree(simple_tree,
          feature_names=X_train_numeric.columns,
          class_names=le.classes_,
          filled=True,
          rounded=True,
          fontsize=10)
plt.title("Simple Decision Tree to Identify the Dominant Feature")
plt.show()

In [None]:
import shap

# TODO more tests are needed to check if the feature is really a leak