## Automated running decision tree
- Experiment variables
	- feature sets:
		- feature set1: all features
		- feature set2: all features - plant types
		- feature set3: all features - plant types - soil temp
		- feature set4: all features - soil temp
	- model parameters
		- DecisionTreeClassifier(random_state=42)
		- DecisionTreeClassifier(random_state=42, max_depth=3)
- Output includes
	- WHO FPPL (species/pathogen)
	- Feature
	- Condition (only for non-existing species)
	- Training metrics
		- accuracy
		- precision
		- recall
	- Test data
		- number of samples that matched condition
		- accuracy (based on only samples that matched conditions)

In [3]:
import pandas as pd
import os
import re

import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
import importlib

import sys
sys.path.append('../scripts')
from decision_tree_helpers import find_paths_with_gini_zero, evaluate_path

## Helpers

In [2]:
def round_condition_numbers(conditions, decimals=2):
    """Round the numeric thresholds inside a list of condition strings.

    Only decimal literals that directly follow a comparison operator
    (``<``, ``<=``, ``>``, ``>=``) are rewritten. Signed values and values in
    scientific notation are rounded as well; integer thresholds and any other
    text are left untouched.

    Args:
        conditions: Iterable of condition strings such as
            ``"water_content <= 12.3456"``.
        decimals: Number of decimal places to keep (default 2).

    Returns:
        A new list with one rewritten string per input condition.
    """
    # group 1: operator plus optional whitespace; group 2: the threshold literal
    # (optional sign, mandatory decimal point, optional exponent).
    threshold_pattern = re.compile(r'([<>]=?\s*)([-+]?\d+\.\d+(?:[eE][-+]?\d+)?)')

    def _format_threshold(match):
        return f"{match.group(1)}{float(match.group(2)):.{decimals}f}"

    return [threshold_pattern.sub(_format_threshold, cond) for cond in conditions]

## Input

In [None]:
# Resolve the project root (three levels above the current working directory)
# and derive the data / results folders from it.
_root_relative = os.path.join(os.getcwd(), "..", "..", "..")
project_root = os.path.abspath(_root_relative)
data_folder, results_folder = (
    os.path.join(project_root, subfolder) for subfolder in ("data", "results")
)

In [None]:
# training data
# Load the labelled training table from the data folder into `df`.
df = pd.read_csv(f"{data_folder}/DroughtITS_mapping_w_labels_training_data.csv")

In [None]:
# Read the pathogen group names (one per line), stripping surrounding whitespace.
group_names_path = f"{results_folder}/group_names/pathogen_group_names_training_data.txt"
with open(group_names_path, "r") as file:
    group_names = list(map(str.strip, file))

In [None]:
# we don't run Fusarium spp. because it appears in all samples
# Filter into a new list instead of calling list.remove(): remove() mutates the
# list in place and raises ValueError once the entry is gone, so re-running this
# cell would fail. The filter gives the same result and is safe to re-run.
group_names = [name for name in group_names if name != "Fusarium spp."]

In [7]:
# Raw (pre-encoding) column names; 'plant' is the categorical column that is
# one-hot encoded in the pre-processing step.
features = [
    'zone',
    'province',
    'lat',
    'lon',
    'drought',
    'plant',
    'water_content',
    'organic_matter',
    'nitrogen',
    'phosphorus',
    'potassium',
    'temp_soil',
    'pH',
]

### Test data

In [None]:
# Load the labelled test table from the data folder.
# NOTE(review): the file name ends in ".csv.csv" — confirm this matches the file on disk.
test_df = pd.read_csv(f'{data_folder}/DroughtITS_mapping_w_labels_test_data.csv.csv')
# Copy 'n_nitrate' into a 'nitrogen' column, the name used by the feature lists.
test_df['nitrogen'] = test_df['n_nitrate']

## Data pre-processing

In [None]:
# One-hot encode categorical features
cat_features = ['plant']

# Keep dummy column names free of spaces: conditions built from feature names
# are later evaluated with DataFrame.query().
dummy_renames = {'plant_Rubber tree': 'plant_Rubber_tree'}

# training data (data from 2022)
df_encoded = pd.get_dummies(df, columns=cat_features).rename(columns=dummy_renames)

# test data (data from 2025)
test_df_encoded = pd.get_dummies(test_df, columns=cat_features).rename(columns=dummy_renames)

# Align the test dummies with the training dummies. Any category seen in
# training but absent from the test data (e.g. 'plant_Rice') gets an all-False
# column. Columns already present in the test data are left untouched, so real
# observations are never overwritten.
training_dummy_columns = [col for col in df_encoded.columns if col not in df.columns]
for dummy_col in training_dummy_columns:
    if dummy_col not in test_df_encoded.columns:
        test_df_encoded[dummy_col] = False


## Features

In [11]:
# Building blocks shared by the four feature sets.
_common_features = ['drought', 'water_content', 'organic_matter',
                    'nitrogen', 'phosphorus', 'potassium']
_plant_dummies = ['plant_Cassava', 'plant_Rice', 'plant_Rubber_tree', 'plant_Sugarcane']
_with_soil_temp = _common_features + ['temp_soil', 'pH']
_without_soil_temp = _common_features + ['pH']

# set1: everything; set2: no plant dummies; set3: no plant dummies and no
# soil temperature; set4: no soil temperature.
features_sets = {
    'features_set1': _with_soil_temp + _plant_dummies,
    'features_set2': list(_with_soil_temp),
    'features_set3': list(_without_soil_temp),
    'features_set4': _without_soil_temp + _plant_dummies,
}

## Decision Trees

In [12]:
# Identifiers of the model configurations to run; the training loop maps each
# name to a DecisionTreeClassifier and uses it as an output sub-folder name.
model_names = ['DecisionTreeClassifier', 'DecisionTreeClassifier_max_depth_3' ]

In [None]:
# Runs every (group, feature set, model) combination. For each one: fit a tree,
# save its plot, export the extracted paths with their training metrics, and
# score every path on the test data. One record per path goes to `output_metrics`.
output_metrics = []

# Explicit column order for the per-model path export. Passing it to
# pd.DataFrame keeps the 'path' column present even when a tree yields no paths.
path_columns = ["model", "path", "sample_count", "accuracy", "precision", "recall"]

for idx, group_name in enumerate(group_names):
    print(f"{idx}. Running for group: {group_name}")
    # The target does not depend on the feature set or the model.
    y = df_encoded[group_name]

    for feature_set, selected_features in features_sets.items():
        print(f"Running for feature set: {feature_set}")
        X = df_encoded[selected_features]

        for model_name in model_names:
            print(f"Running for model: {model_name}")

            # STEP1: Run the model
            if model_name == 'DecisionTreeClassifier':
                final_model = DecisionTreeClassifier(random_state=42)
            elif model_name == 'DecisionTreeClassifier_max_depth_3':
                final_model = DecisionTreeClassifier(random_state=42, max_depth=3)
            else:
                # Fail loudly instead of silently reusing the previous model.
                raise ValueError(f"Unknown model name: {model_name}")
            final_model.fit(X, y)

            # STEP2: Plotting the decision tree
            save_dir = f"{results_folder}/decision_trees/{group_name}/{model_name}/{feature_set}"
            os.makedirs(save_dir, exist_ok=True)

            fig = plt.figure(figsize=(40, 20))
            plot_tree(final_model, filled=True, feature_names=selected_features,
                      class_names=['0', '1'], rounded=True)
            plt.savefig(f"{save_dir}/decision_tree_plot.png")
            plt.close(fig)  # release the figure; many are created in this loop

            # STEP3: Identifying paths
            paths = find_paths_with_gini_zero(final_model, selected_features)

            # Evaluate training metrics for each path
            path_metrics = []
            for path, sample_count in paths:
                accuracy, precision, recall = evaluate_path(path, df_encoded, target_column=group_name)
                path_metrics.append({
                    "model": final_model,
                    "path": path,
                    "sample_count": sample_count,
                    "accuracy": accuracy,
                    "precision": precision,
                    "recall": recall
                })

            # Sort paths by recall in descending order
            sorted_paths = sorted(path_metrics, key=lambda metrics: metrics["recall"], reverse=True)

            sorted_paths_df = pd.DataFrame(sorted_paths, columns=path_columns)
            sorted_paths_df['path'] = sorted_paths_df['path'].apply(" & ".join)
            sorted_paths_df.to_excel(f"{save_dir}/decision_tree_paths_model.xlsx", index=False)

            # STEP4: Test output for a given path
            for path in sorted_paths:
                # round to 2 decimal places because water_content in test data has only 2 decimal places
                rounded_path = round_condition_numbers(path['path'], decimals=2)
                condition = " and ".join(rounded_path)
                matched_test_df = test_df_encoded.query(condition)

                n_matched_test_samples = len(matched_test_df)
                # A matched sample counts as correct when its label is False
                # (paths are assumed to describe absent species — verify against
                # find_paths_with_gini_zero).
                n_correct_matched_test_samples = int(
                    (matched_test_df[group_name] == False).sum()  # noqa: E712
                )
                # Accuracy is undefined when no test sample matches the path.
                test_accuracy = (
                    n_correct_matched_test_samples / n_matched_test_samples
                    if n_matched_test_samples > 0 else None
                )

                output_metrics.append({
                    'group_name': group_name,
                    'feature_set': feature_set,
                    'features': selected_features,
                    'model_name': model_name,
                    'path': " & ".join(path['path']),
                    'training_sample_count': path['sample_count'],
                    'training_accuracy': path['accuracy'],
                    'training_precision': path['precision'],
                    'training_recall': path['recall'],
                    'n_matched_test_samples': n_matched_test_samples,
                    'n_correct_matched_test_samples': n_correct_matched_test_samples,
                    'test_accuracy': test_accuracy
                })

0. Running for group: Acremonium spp.
Running for feature set: features_set1
Running for model: DecisionTreeClassifier
Running for model: DecisionTreeClassifier_max_depth_3
Running for feature set: features_set2
Running for model: DecisionTreeClassifier
Running for model: DecisionTreeClassifier_max_depth_3
Running for feature set: features_set3
Running for model: DecisionTreeClassifier
Running for model: DecisionTreeClassifier_max_depth_3
Running for feature set: features_set4
Running for model: DecisionTreeClassifier
Running for model: DecisionTreeClassifier_max_depth_3
1. Running for group: Candida tropicalis
Running for feature set: features_set1
Running for model: DecisionTreeClassifier
Running for model: DecisionTreeClassifier_max_depth_3
Running for feature set: features_set2
Running for model: DecisionTreeClassifier
Running for model: DecisionTreeClassifier_max_depth_3
Running for feature set: features_set3
Running for model: DecisionTreeClassifier
Running for model: DecisionTre

In [None]:
# Collect the per-path records gathered by the training loop into one table.
output_metrics_df = pd.DataFrame(output_metrics)

In [None]:
# Write the combined metrics table to a single Excel file.
# The 'metrics' folder is not created by any earlier cell shown in this
# notebook, so create it here; otherwise to_excel fails on a fresh run.
metrics_dir = f"{results_folder}/decision_trees/metrics"
os.makedirs(metrics_dir, exist_ok=True)
output_metrics_df.to_excel(f"{metrics_dir}/decision_tree_paths_metrics.xlsx", index=False)

In [None]:
# Write one worksheet per group into a single workbook.
# Create the output folder first so the cell also works on a fresh run.
metrics_dir = f"{results_folder}/decision_trees/metrics"
os.makedirs(metrics_dir, exist_ok=True)

with pd.ExcelWriter(f"{metrics_dir}/decision_tree_paths_metrics_by_group.xlsx") as writer:
    for group, df_group in output_metrics_df.groupby("group_name"):
        # Excel sheet names can't be longer than 31 characters and must not
        # contain any of [ ] : * ? / \ — replace those before truncating.
        sheet_name = re.sub(r'[\[\]:*?/\\]', '_', str(group))[:31]
        df_group.to_excel(writer, sheet_name=sheet_name, index=False)