This script tests the limits of the classifier training.

It uses the embeddings calculated from a demo dataset (currently the prenylated flavonoids and chalcones from the MSnLib [public database]) and repeatedly removes a growing fraction of the **train - relevant** spectra to see how well the **validation** data is still predicted.

In [None]:
## Parameters

# Pickle file that is loaded as the input DataFrame
df_file = "../demo/output/PrenylatedCompounds_PublicDBs/df_embeddings.pkl"

# Maps a subset name to a predicate that is evaluated on a single row and tells
# whether that row belongs to the subset. The predicate reads the
# "fragmentation_method", "ionMode" and "CE" fields of the row.
training_subsets = {
    "hcd_pos_step[20,45,70]": lambda row: (
        (row["fragmentation_method"] == "hcd")
        & (row["ionMode"] == "positive")
        & (row["CE"] in ("45.0", "stepped20,45,70ev(absolute)"))
    ),
}

# Root folder below which all results are written
output_dir = "./GTLimits_output/PrenylatedCompounds_PublicDBs_TrainingSetLimits"

In [None]:
## Script
# Import necessary libraries
import pandas as pd

from AnnoMe.Classification import (
    train_and_classify,
    generate_embedding_plots,
    generate_prediction_overview,
    set_random_seeds,
)

import pandas as pd

from IPython.display import display

from collections import OrderedDict
import os
import re

import plotnine as p9

from natsort import natsorted

# Create the output directory up front (no error if it is already there)
os.makedirs(output_dir, exist_ok=True)

# Read the input DataFrame from the pickle file given in the parameters
df = pd.read_pickle(df_file)

# Only when BOTH "ms2deepscore:"-prefixed columns are present, rename them to
# their un-prefixed names
ms2deepscore_renames = {
    "ms2deepscore:cleaned_spectra": "cleaned_spectra",
    "ms2deepscore:embeddings": "embeddings",
}
if all(column in df.columns for column in ms2deepscore_renames):
    df = df.rename(columns=ms2deepscore_renames)

# Show how many rows there are for each value of the "type" column
print("DataFrame type counts:")
type_counts = df["type"].value_counts()
display(type_counts)

In [None]:
# Fraction of the "train - relevant" rows that is kept in a run, paired with the
# label used in the log output
reduction_levels = [
    ("all", 1.0),
    ("90Percent", 0.9),
    ("80Percent", 0.8),
    ("70Percent", 0.7),
    ("60Percent", 0.6),
    ("50Percent", 0.5),
    ("40Percent", 0.4),
    ("30Percent", 0.3),
    ("20Percent", 0.2),
    ("10Percent", 0.1),
]

# This split depends neither on the reduction level nor on the iteration, so it
# is done once instead of copying and re-filtering `df` in every inner iteration.
# Boolean-mask indexing returns new DataFrames, so `df` itself is never modified.
df_other = df[df["type"] != "train - relevant"]
df_train_relevant = df[df["type"] == "train - relevant"]

for red_name, red_n in reduction_levels:
    for iteration in range(1, 6):
        # Keep a random red_n fraction of the "train - relevant" rows. Seeding the
        # sampling with the iteration number gives every repetition a different,
        # but reproducible, selection.
        df_relevant_sample = df_train_relevant.sample(frac=red_n, random_state=iteration)

        # Recombine the sampled rows with all rows of any other type
        df_reduced = pd.concat([df_other, df_relevant_sample])

        # df_other holds no "train - relevant" rows, so the sample size is the
        # number of such rows in df_reduced
        num_train_relevant = len(df_relevant_sample)
        print(f"Number of rows where type is 'train - relevant': {num_train_relevant}")

        # Run the training once per configured subset
        for subset_name, subset_fn in training_subsets.items():
            print(f"Processing subset: {subset_name}, with reduction: {red_name}, iteration: {iteration}")
            print("##############################################################################")

            # Set random seeds for reproducibility
            set_random_seeds(42)

            # One output folder per (subset, number of kept rows, iteration)
            c_output_dir = f"{output_dir}/subset_{subset_name}_traRel{num_train_relevant}_it{iteration}/"
            os.makedirs(c_output_dir, exist_ok=True)

            # Keep only the rows for which the subset predicate holds
            df_subset = df_reduced[df_reduced.apply(subset_fn, axis=1)].reset_index(drop=True)

            # Best effort: a failing run is reported and skipped so that the
            # remaining runs of the sweep are still executed
            try:
                df_train, df_validation, df_inference, df_metrics = train_and_classify(df_subset, subsets=subset_fn, output_dir=c_output_dir)
                generate_prediction_overview(df_subset, df_validation, c_output_dir, "validation", min_prediction_threshold=13)
            except Exception as e:
                print(f"Error during training and classification for subset {subset_name}: {e}")



In [None]:
# Collect the "overall" sheet of every run's validation_data.xlsx, then summarise
# the results as a table, a box plot and an Excel export
print("\nGenerating an overview of the validation datasets measured in-house")
# Initialize an empty list to store DataFrames
all_validation_results = []

# Iterate through all folders in the output directory
for folder_name in os.listdir(output_dir):
    folder_path = os.path.join(output_dir, folder_name)
    if not os.path.isdir(folder_path):  # Skip plain files
        continue
    file_path = os.path.join(folder_path, "validation_data.xlsx")
    if not os.path.exists(file_path):  # Skip folders without a result file
        continue
    # Read the contents of the sheet 'overall'. A dedicated name is used here so
    # that the notebook-level `df` holding the input data is not overwritten.
    df_overall = pd.read_excel(file_path, sheet_name="overall")
    # Add a new column with the folder name
    df_overall["subset"] = folder_name
    all_validation_results.append(df_overall)

if not all_validation_results:
    # Nothing to summarise: report it and skip table, plot and export instead of
    # failing on a missing DataFrame
    print("No validation datasets found in the output directory.")
    all_validation_results = None
else:
    # Concatenate all DataFrames into a single DataFrame
    all_validation_results = pd.concat(all_validation_results, ignore_index=True)
    all_validation_results["annotated_as"] = all_validation_results["annotated_as_times:relevant"].map(lambda x: "relevant" if x != 0 else "other")
    all_validation_results.rename(columns={"row_count": "n_features"}, inplace=True)
    # Share of features within each (source, subset) group, in percent
    all_validation_results["percent_features"] = (100.0 * all_validation_results["n_features"] / all_validation_results.groupby(["source", "subset"])["n_features"].transform("sum")).round(1)
    # Split the 'subset' column (the folder name) into five new columns. The pattern
    # expects five underscore-separated fields after "subset_"; as the groups are
    # greedy, any additional underscore ends up in the first field.
    all_validation_results[["fragmentation_method", "polarity", "collision_energy", "reduction", "iteration"]] = all_validation_results["subset"].str.extract(r"subset_(.*)_(.*)_(.*)_(.*)_(.*)")
    # Drop the " - gt " infix, then split the source into its name and ground-truth type
    all_validation_results["source"] = all_validation_results["source"].str.replace(" - gt ", " - ", regex=False)
    all_validation_results[["source", "gt_type"]] = all_validation_results["source"].str.extract(r"(.*) - (other|relevant)")
    # Order the rows by source, run settings, ground-truth type and annotation
    all_validation_results.sort_values(by=["source", "polarity", "fragmentation_method", "collision_energy", "reduction", "iteration", "gt_type", "annotated_as"], inplace=True)
    # Reorder the columns; the explicit copy makes the column assignment below
    # operate on an independent DataFrame rather than on a selection of another one
    all_validation_results = all_validation_results[["source", "polarity", "fragmentation_method", "collision_energy", "reduction", "iteration", "gt_type", "annotated_as", "n_features", "percent_features"]].copy()

    display(all_validation_results.head())

    # Ensure the 'reduction' column is sorted in natural order
    all_validation_results["reduction"] = pd.Categorical(
        all_validation_results["reduction"],
        categories=natsorted(all_validation_results["reduction"].unique()),
        ordered=True
    )

    # Box plot (with jittered points) of the feature percentages per reduction level
    p = (
        p9.ggplot(all_validation_results, p9.aes(x="reduction", y="percent_features", colour="gt_type + ': ' + annotated_as"))
        + p9.theme_bw()
        + p9.geom_boxplot()
        + p9.geom_jitter(size=1, alpha=0.5, height = 0, width = 0.1)
        + p9.facet_grid("source ~ polarity + fragmentation_method + collision_energy")
        + p9.theme(axis_text_x=p9.element_text(rotation=90, hjust=1))
        + p9.labs(title="Overview of validation datasets measured in-house", x="Reduction", y="Percent of features", colour="GT Type")
        + p9.scale_color_manual(values={"other: other": "#41e541", "other: relevant": "#2999e9", "relevant: other": "#1f77b4", "relevant: relevant": "#2ca02c"})
    )
    display(p)
    # Save the plot to a file
    output_plot_file = os.path.join(output_dir, "validation_overview.png")
    p.save(output_plot_file, width=12, height=8, dpi=300)

    # Export the summary table to an Excel file
    output_excel_file = os.path.join(output_dir, "summary_tables.xlsx")
    with pd.ExcelWriter(output_excel_file, engine="openpyxl") as writer:
        all_validation_results.to_excel(writer, sheet_name="all_validation_results")

    print(f"Exported tables to {output_excel_file}")