# Comparison of predicted to true chemical formulae

## Get the true molecular formula

In [None]:
import pandas as pd
import numpy as np

import pickle

import rdkit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

In [None]:
# Ground-truth table; made in 2025-05-12_SIRIUS_output_comparisons.ipynb in the 'output_no_mass_correction' folder
ground_truth_path = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/2025-05-13_ground_truth_molecular_formula_no_sirius.pkl'

# Unpickle the stored object (only do this with pickles you created yourself)
with open(ground_truth_path, 'rb') as f:
    ground_truth = pickle.load(f)

In [None]:
# Count the ground-truth rows per distinct value of the 'source' column
ground_truth.value_counts(subset='source')

In [None]:
# Display the unpickled ground-truth table
ground_truth

## Get the predicted molecular formula

In [None]:
import glob
import os

In [None]:
# Root folder of this SIRIUS run; every dataset has its own sub-folder below it.
# Keeping the root in one place means a moved or re-dated run needs a single edit.
sirius_output_root = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13'

# Folder paths for the SIRIUS output files, one per dataset
# (the sub-folder name is what ends up in the 'dataset' column later on)
exp_output_folder = f'{sirius_output_root}/ellinor_data'
iris_dry_output_folder = f'{sirius_output_root}/iris_data_dry'
iris_wet_output_folder = f'{sirius_output_root}/iris_data_wet'
isabell_output_folder = f'{sirius_output_root}/isabell_data'
library_data = f'{sirius_output_root}/library_data'

In [None]:
def get_formula_predictions(output_folder):
    '''
    Collect the SIRIUS formula candidates of every compound in one output folder.

    Reads each ``<output_folder>/<compound>/formula_candidates.tsv`` file, keeps
    the columns used downstream and labels every row with the name of the
    compound sub-folder the file was found in.

    Parameters
    ----------
    output_folder : str
        Folder whose direct sub-folders each contain a ``formula_candidates.tsv``.

    Returns
    -------
    pandas.DataFrame
        One row per formula candidate with the columns ``compound_name``,
        ``molecularFormula``, ``formulaRank``, ``adduct``, ``precursorFormula``,
        ``SiriusScore``, ``numExplainedPeaks`` and ``massErrorPrecursor(ppm)``
        (in that order) and a fresh 0..n-1 index. Files are read in sorted
        path order. If no candidate file is found, an empty frame with these
        columns is returned.

    Raises
    ------
    KeyError
        If a candidate file lacks one of the expected columns.
    '''
    output_columns = ['compound_name', 'molecularFormula', 'formulaRank', 'adduct', 'precursorFormula',
                      'SiriusScore', 'numExplainedPeaks', 'massErrorPrecursor(ppm)']

    per_compound_frames = []

    # sorted(): glob returns paths in filesystem order, which is not reproducible
    for filename in sorted(glob.glob(f'{output_folder}/*/formula_candidates.tsv')):
        candidates = pd.read_csv(filename, sep='\t')
        compound_name = os.path.basename(os.path.dirname(filename))
        # assign() returns a new frame, so no column is ever set on a slice
        per_compound_frames.append(candidates.assign(compound_name=compound_name)[output_columns])

    # pd.concat raises on an empty list; return an empty frame with the usual schema instead
    if not per_compound_frames:
        return pd.DataFrame(columns=output_columns)

    # One concat at the end instead of growing a frame inside the loop
    return pd.concat(per_compound_frames, ignore_index=True)

In [None]:
# Gather the formula predictions of every SIRIUS output folder into one table
sirius_output_folder_list = [exp_output_folder, iris_dry_output_folder, iris_wet_output_folder, isabell_output_folder, library_data]

per_dataset_predictions = []

for output_folder in sirius_output_folder_list:
    formula_predictions = get_formula_predictions(output_folder)
    # Label the rows with the dataset they came from (= last component of the folder path)
    formula_predictions['dataset'] = os.path.basename(output_folder)
    per_dataset_predictions.append(formula_predictions)

# Concatenate once at the end: growing a frame that starts out as an empty DataFrame()
# copies all rows on every pass and triggers the empty-entry FutureWarning of pandas >= 2.1
formula_predictions_df = pd.concat(per_dataset_predictions, ignore_index=True)

In [None]:
# Align the formula column with the name used as merge key later on, and mark the
# compound name as coming from the SIRIUS output folder
column_renames = {
    'molecularFormula': 'molecular_formula',
    'compound_name': 'compound_name_sirius_output',
}
formula_predictions_df.rename(columns=column_renames, inplace=True)

In [None]:
def _clean_sirius_compound_name(folder_name):
    '''
    Reduce a SIRIUS result-folder name to a lower-case compound name.

    The name is split on underscores and the first token is dropped. If the
    last remaining token is 'e', 'H' or purely numeric (presumably a trailing
    tag rather than part of the name -- TODO confirm), the token before it is
    used; otherwise the last token is used. The characters ``()[]{}<>, :'"``
    are then removed and the result is lower-cased.

    A name without any underscore is kept whole; the previous inline version
    raised IndexError for such names because nothing was left after dropping
    the first token.

    Parameters
    ----------
    folder_name : str
        Name of the compound sub-folder in the SIRIUS output.

    Returns
    -------
    str
        The cleaned, lower-cased compound name.
    '''
    tokens = folder_name.split('_')[1:] or [folder_name]
    last_token = tokens[-1]
    ends_with_tag = last_token in ('e', 'H') or last_token.isnumeric()

    # Step back one token only when there is one to step back to
    chosen = tokens[-2] if ends_with_tag and len(tokens) > 1 else last_token

    unwanted_characters = str.maketrans('', '', '()[]{}<>, :\'\"')
    return chosen.translate(unwanted_characters).lower()


# Single pass over the column; no list-typed intermediate values are stored in the frame
formula_predictions_df['compound_name_sirius_output'] = formula_predictions_df['compound_name_sirius_output'].map(_clean_sirius_compound_name)

In [None]:
# Inspect the cleaned names of the 'isabell_data' dataset: one row per distinct name, sorted by name
formula_predictions_df[formula_predictions_df.dataset=='isabell_data'].drop_duplicates(subset='compound_name_sirius_output').sort_values(by='compound_name_sirius_output')

In [None]:
# List the rows whose (cleaned name, adduct) pair already occurred in an earlier row
# (keep='first' leaves the first occurrence of each pair unflagged), sorted by name and adduct
formula_predictions_df[formula_predictions_df.duplicated(subset=['compound_name_sirius_output', 'adduct'], keep='first')].sort_values(['compound_name_sirius_output', 'adduct'])

In [None]:
# Split each name on hyphens and drop the final piece when it is 'e', 'h' or purely
# numeric; all other names are put back together unchanged
formula_predictions_df['compound_name_sirius_output'] = [
    '-'.join(pieces[:-1] if (pieces[-1] in ('e', 'h') or pieces[-1].isnumeric()) else pieces)
    for pieces in (name.split('-') for name in formula_predictions_df['compound_name_sirius_output'])
]

## Combine predicted and true molecular formula

In [None]:
# Left join: keep every predicted formula and attach the ground-truth rows that share
# the same molecular formula (predictions without a match get NaN in the ground-truth columns)
formula_predict_ground_truth = formula_predictions_df.merge(
    ground_truth,
    how='left',
    on='molecular_formula',
)

In [None]:
# Display the merged prediction / ground-truth table
formula_predict_ground_truth

In [None]:
# Persist the merged table next to the SIRIUS output it was built from
output_directory = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13'
merged_table_path = f'{output_directory}/2025-05-16_formula_predictions_to_true_formula.pkl'

with open(merged_table_path, 'wb') as f:
    pickle.dump(formula_predict_ground_truth, f)

In [None]:
# A merged row counts as a correct prediction when the ground-truth 'id' equals the
# compound name derived from the SIRIUS output folder
id_matches_name = formula_predict_ground_truth['id'] == formula_predict_ground_truth['compound_name_sirius_output']

# Columns carried forward into the summary tables
summary_columns = ['compound_name_sirius_output', 'id', 'molecular_formula', 'formulaRank', 'adduct', 'source']

correct_prediction = formula_predict_ground_truth[id_matches_name][summary_columns]

In [None]:
# Order by data source, then compound name. The sorted result is re-bound instead of
# sorting in place: correct_prediction comes from chained indexing, and an in-place
# sort on such a slice can raise pandas' SettingWithCopyWarning.
correct_prediction = correct_prediction.sort_values(by=['source', 'compound_name_sirius_output'], ascending=[True, True])

# keep=False flags every row whose 'id' occurs more than once (not just the repeats),
# so the two tables below are exact complements of each other
id_is_repeated = correct_prediction['id'].duplicated(keep=False)

ducplicated_correct_prediction = correct_prediction[id_is_repeated]
unique_correct_prediction = correct_prediction[~id_is_repeated]

In [None]:
# Number of unique_correct_prediction rows per (source, adduct) combination
unique_correct_prediction[['source', 'adduct']].value_counts()

In [None]:
# Rows per (source, adduct) combination among the unique_correct_prediction rows with formulaRank == 1
unique_correct_prediction[unique_correct_prediction['formulaRank']==1][['source', 'adduct']].value_counts()

In [None]:
# Display the matched predictions whose 'id' occurs more than once
ducplicated_correct_prediction

In [None]:
# List the column labels of ducplicated_correct_prediction
ducplicated_correct_prediction.columns

In [None]:
# Remove rows that repeat an earlier row in every column, keeping the first occurrence
duplicated_unqiue_ionization = ducplicated_correct_prediction.drop_duplicates(keep='first')

# Tally the remaining rows per (source, adduct) combination
duplicated_unqiue_ionization[['source', 'adduct']].value_counts()

In [None]:
# Rows per (source, adduct) combination among the duplicated_unqiue_ionization rows with formulaRank == 1
duplicated_unqiue_ionization[duplicated_unqiue_ionization.formulaRank==1][['source', 'adduct']].value_counts()

In [None]:
# Display the de-duplicated table of predictions whose 'id' occurs more than once
duplicated_unqiue_ionization

In [None]:
# Rows per (source, adduct) combination among the duplicated_unqiue_ionization rows with formulaRank == 1
# NOTE(review): same selection and tally as the earlier `.formulaRank==1` cell, only written with
# bracket column access -- likely redundant
duplicated_unqiue_ionization[duplicated_unqiue_ionization['formulaRank']==1][['source', 'adduct']].value_counts()

In [None]:
# Show the duplicated_unqiue_ionization rows whose source is 'ellinor'
duplicated_unqiue_ionization[(duplicated_unqiue_ionization.source=='ellinor')]

In [None]:
# Show selected columns of the rows that have a non-missing 'name'
# NOTE(review): `formula_predictions_df_merged` is not defined anywhere in the visible notebook, and
# the columns 'compound_name_prediction' and 'name' are not created by the visible code either --
# presumably a leftover from an earlier version; raises NameError unless it is defined elsewhere
formula_predictions_df_merged[formula_predictions_df_merged['name'].notna()][['compound_name_prediction', 'name', 'molecular_formula', 'dataset', 'formulaRank', 'adduct', 'precursorFormula', 'SiriusScore', 'numExplainedPeaks','massErrorPrecursor(ppm)']]

In [None]:
# Show the rows that have a non-missing 'id'
# NOTE(review): `formula_predictions_df_merged` is not defined in the visible notebook -- confirm
# whether this cell is still needed
formula_predictions_df_merged[formula_predictions_df_merged['id'].notna()]

In [None]:
# Show the rows that have a non-missing 'InChIKey14'
# NOTE(review): `formula_predictions_df_merged` is not defined in the visible notebook -- confirm
# whether this cell is still needed
formula_predictions_df_merged[formula_predictions_df_merged['InChIKey14'].notna()]

In [None]:
# Left-join the 'InChIKey14' column of each table in list_of_dfs onto the predictions,
# matching rows on 'molecular_formula'
# NOTE(review): `list_of_dfs` is not defined in the visible notebook -- presumably a leftover cell.
# NOTE(review): every pass adds another 'InChIKey14' column, so from the second pass on pandas has
# to disambiguate them with suffixes (_x/_y); confirm that one merge per table is really intended.
for df in list_of_dfs:
    # Add the InChIKey14 belonging to each molecular formula to the dataframe
    formula_predictions_df = pd.merge(formula_predictions_df, df[['InChIKey14', 'molecular_formula']], on='molecular_formula', how='left')