# Comparison of probabilistic and deterministic fingerprints

In [None]:
import pandas as pd
import numpy as np
import pickle

import glob
import os

In [None]:
with open(f'/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/2025-05-16_formula_predictions_to_true_formula.pkl', 'rb') as f:
    formula_predictions = pickle.load(f)

In [None]:
# Keep only the rows whose 'id' equals 'compound_name_sirius_output'.
# .copy() makes the result an independent frame: later cells assign columns to
# correct_formula, and assigning into a boolean-mask selection is ambiguous in
# pandas (SettingWithCopyWarning; the write may not land where expected).
correct_formula = formula_predictions[formula_predictions['id']==formula_predictions['compound_name_sirius_output']].copy()

In [None]:
correct_formula

In [None]:
# Remove all spaces from the adduct strings; missing values are passed through unchanged.
# pd.notnull(x) returns a bool, so the former test `pd.notnull(x) is not None` was always
# True and a NaN adduct would have raised AttributeError on .replace.
correct_formula['adduct'] = correct_formula['adduct'].apply(
    lambda x: x.replace(' ', '') if pd.notnull(x) else x
)

In [None]:
correct_formula

## Get deterministic fingerprints for all compounds

### Standardize SMILES for deterministic fingerprint conversion

In [None]:
from rdkit import Chem
from rdkit.Chem import PandasTools, SaltRemover, MolStandardize, inchi
from rdkit.Chem.MolStandardize import rdMolStandardize

In [None]:
def standardize_mol(data):
  """Standardize the structures in ``data`` and derive identifiers from them.

  Steps, in order: build RDKit molecules from the ``SMILES`` column, strip the
  fragments listed in ``parts2remove``, uncharge the molecules, remove
  stereochemistry, then overwrite ``SMILES`` and add ``InChIKey`` and
  ``InChIKey14`` (the part of the InChIKey before the first '-').

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a ``SMILES`` column. It is modified in place: the columns
      ``ROMol``, ``SMILES``, ``InChIKey`` and ``InChIKey14`` are written.

  Returns
  -------
  pandas.DataFrame
      The same ``data`` object that was passed in.
  """
  # Gets mol objects from SMILES
  PandasTools.AddMoleculeColumnToFrame(data, 'SMILES', 'ROMol') # Assuming that SMILES notations are given in column SMILES
  #data['ROMol'] = data.InChI.apply(lambda x: Chem.MolFromInchi(x))

  # Fragment definitions handed to SaltRemover one at a time.
  # The comma after 'F[P-](F)(F)(F)(F)F' was missing before, so Python joined it
  # with the following string into one meaningless pattern and neither
  # hexafluorophosphate nor triflate was ever stripped.
  parts2remove = ['[F,Cl,Br,I]', '[Na,Mg,K,Ca,Li,Ba]', 'CC(=O)O', '[O,N]', 'CS(=O)(=O)O', 'O=S(=O)(O)O', 'O=[N+]([O-])O', 'O=S(=O)(O)CCO',
                  'F[P-](F)(F)(F)(F)F', 'O=S(=O)([O-])C(F)(F)F', 'F[B-](F)(F)F',
                  '[Co,Pd,Ni,Al,Sn,Zn,Cu,Hg]'] # additional ions to remove

  for part in parts2remove:
    # One remover per definition, reused for every molecule in the column
    remover = SaltRemover.SaltRemover(defnData=part)
    data['ROMol'] = data.ROMol.apply(remover.StripMol)

  uncharger = rdMolStandardize.Uncharger()  # neutralize the molecule (if possible)
  data['ROMol'] = data.ROMol.apply(uncharger.uncharge)

  # RemoveStereochemistry works in place on each molecule, so nothing is reassigned
  for mol in data.ROMol:
    Chem.RemoveStereochemistry(mol)

  data['SMILES'] = data.ROMol.apply(Chem.MolToSmiles)
  data['InChIKey'] = data.ROMol.apply(inchi.MolToInchiKey)
  data['InChIKey14'] = data.InChIKey.apply(lambda key: key.split('-')[0])
  return data

correct_formula_std = standardize_mol(correct_formula)

In [None]:
correct_formula_std[correct_formula_std.SMILES.str.contains(r'\.')]

In [None]:
%pip install -e 'git+https://github.com/boecker-lab/standardizeUtils/#egg=standardizeUtils'

In [None]:
from standardizeUtils.standardizeUtils import standardize_structure_with_pubchem
from standardizeUtils.standardizeUtils import standardize_structure_list_with_pubchem

import time
import requests  



def standardize_in_batches(smiles_list, initial_batch_size=1000, min_batch_size=100):
    """
    Standardize SMILES via PubChem in batches, halving the batch size on request failures.

    Every batch starts at ``initial_batch_size``. When the request raises
    ``requests.exceptions.RequestException`` the batch size is halved and the
    request is retried after a 2 s pause, until the size would fall below
    ``min_batch_size``. The entries of a batch that could not be processed are
    then filled with ``None`` so the result stays positionally aligned with the
    input (previously they were dropped, which shortened the output and broke
    the column assignment that follows this function).

    Input:
        smiles_list: list of SMILES strings
        initial_batch_size: number of SMILES sent per request at the start of each batch
        min_batch_size: smallest batch size that is still attempted
    Output:
        List of standardized SMILES with ``None`` for skipped entries.
        NOTE(review): alignment also assumes standardize_structure_list_with_pubchem
        returns one entry per input SMILES - confirm against that library.
    """
    total_smiles = len(smiles_list)
    processed_smiles = []
    idx = 0

    while idx < total_smiles:
        attempt_success = False
        current_batch_size = initial_batch_size

        while current_batch_size >= min_batch_size:
            print(f"Processing batch {idx}:{idx + current_batch_size} (size={current_batch_size})")
            batch = smiles_list[idx : idx + current_batch_size]
            try:
                standardized_batch = standardize_structure_list_with_pubchem(batch, 'smiles')
            except requests.exceptions.RequestException as e:
                print(f"Error occurred: {e}, reducing batch size...")
                current_batch_size //= 2  # Reduce batch size by half on failure > can be changed
                time.sleep(2)
            else:
                processed_smiles.extend(standardized_batch)
                idx += current_batch_size
                attempt_success = True
                break

        if not attempt_success:
            # Always advance by at least one entry so a batch size that reached 0
            # cannot stall the outer loop.
            skip_size = max(current_batch_size, 1)
            skipped = smiles_list[idx : idx + skip_size]
            print(f"Skipping batch {idx}:{idx + skip_size} due to repeated failures.")
            # Placeholders keep output position i matched to input position i
            processed_smiles.extend([None] * len(skipped))
            idx += len(skipped)

    return processed_smiles

correct_formula_std['std_SMILES'] = standardize_in_batches(correct_formula_std.SMILES.to_list())

In [None]:
correct_formula_std['std_SMILES'].to_csv('2025-05-17_std_SMILES_experimental.tsv', sep='\t', index=False)

In [None]:
# with open('2025-05-17_correct_formula_w_std_SMILES.pkl', 'wb') as f:
#     pickle.dump(correct_formula_std, f)

### Connect to SIRIUS through command-line and get deterministic fingerprints

In [None]:
import getpass
import os
import subprocess

import pandas as pd
import numpy as np

import requests

# SECURITY: the password used to be hard-coded in this cell. It is now read from the
# SIRIUS_PASSWORD environment variable, falling back to an interactive prompt.
# The previously committed password has been exposed and should be changed.
email = 'ellinor.samuelsson-hoppe@su.se'
password = os.environ.get('SIRIUS_PASSWORD') or getpass.getpass('SIRIUS password: ')

# Log in to SIRIUS; '-p' is given without a value and the password is supplied on stdin
process = subprocess.Popen(['/Applications/sirius.app/Contents/MacOS/sirius', 'login', '-u', email, '-p'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           stdin=subprocess.PIPE,
                           text=True)

# communicate() must be called exactly once: it sends the input, reads both pipes to
# EOF and waits for the process. The old code discarded this result and called
# communicate() a second time on the finished process.
stdout, stderr = process.communicate(input=password + '\n')
print('STDOUT:\n', stdout)
print('STDERR:\n', stderr)

# Input SMILES table, fingerprint output table and fingerprint-definition table
smiles_file = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/2025-05-17_std_SMILES_experimental.tsv'
output_file = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/2025-05-17_std_SMILES_SIRIUS5_FP_experimental.tsv'
fp_version_file = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/2025-05-17_SIRIUS5_fp_version.tsv'
charge = 1 # 1 for ESI+/ -1 for ESI-
command = ['/Applications/sirius.app/Contents/MacOS/sirius', '-i', smiles_file, 'fingerprinter', '--charge', str(charge), '-o', output_file, '-v', fp_version_file]

# Argument list (no shell) so the paths are passed to SIRIUS verbatim
result = subprocess.run(command, capture_output=True, text=True)

print('STDOUT:\n', result.stdout)
print('STDERR:\n', result.stderr)

#### Update fingerprints to get binary matrix

In [None]:
correct_formula_fp = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/2025-05-17_std_SMILES_SIRIUS5_FP_experimental.tsv', sep='\t')

correct_formula_fp.columns = ['std_SMILES', 'FP']

correct_formula_fp.head()

In [None]:
# Split the strings in column 1 into lists of numbers
correct_formula_fp_matrix = correct_formula_fp.FP.str.split(',', expand=True)

# Convert the resulting DataFrame to numeric values
correct_formula_fp_matrix = correct_formula_fp_matrix.apply(pd.to_numeric, errors='coerce')

# Display the resulting matrix
correct_formula_fp_matrix.head()

In [None]:
# get fp descriptors
fp_desc = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/2025-05-17_SIRIUS5_fp_version.tsv', sep = '\t')

fp_desc.head()

In [None]:
# Create an all-zero DataFrame with the same row index as correct_formula_fp_matrix
# and one column per entry of fp_desc.absoluteIndex
correct_formula_fp_binary_matrix = pd.DataFrame(0, index=correct_formula_fp_matrix.index, columns=fp_desc.absoluteIndex)

# Iterate over each row in correct_formula_fp_matrix
for idx, row in correct_formula_fp_matrix.iterrows():
    # Drop NaN values (padding from the split) and convert the remaining entries to integers
    indices = row.dropna().astype(int)
    # Set the columns labelled by those values to 1 (.loc is label-based, so the
    # values are matched against the fp_desc.absoluteIndex column labels)
    correct_formula_fp_binary_matrix.loc[idx, indices] = 1

# Put the std_SMILES column in front of the binary fingerprint columns
correct_formula_fp_binary_matrix_smiles = pd.concat([correct_formula_fp.std_SMILES, correct_formula_fp_binary_matrix], axis=1)

correct_formula_fp_binary_matrix_smiles.head()

In [None]:
# Compress the fingerprints into a single list in a column
correct_formula_fp_binary_matrix_smiles['sirius5_fp'] = correct_formula_fp_binary_matrix_smiles.apply(lambda row: row[1:].tolist(), axis=1)

# Display the resulting DataFrame
correct_formula_fp_compressed = correct_formula_fp_binary_matrix_smiles[['std_SMILES', 'sirius5_fp']]

correct_formula_fp_compressed.head()

#### Concatenate to existing dataframe

In [None]:
correct_formula_fp_compressed

In [None]:
correct_formula_std_fp = pd.merge(correct_formula_std, correct_formula_fp_compressed, on='std_SMILES', how='left')

correct_formula_std_fp = correct_formula_std_fp.drop_duplicates(subset=correct_formula_std_fp.columns[0:-1], keep='first')

In [None]:
correct_formula_std_fp

## Get probabilistic fingerprints from correct formula

In [None]:
with open('2025-05-17_correct_formula_w_std_SMILES.pkl', 'rb') as f:
    correct_formula_std = pickle.load(f)

In [None]:
# Get the folder path for the SIRIUS output files
exp_output_folder = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/ellinor_data'
iris_dry_output_folder = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/iris_data_dry'
isabell_output_folder = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/isabell_data'
library_data = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/MSMS/SIRIUS_output/output_mass_correction_2025-05-13/library_data'

In [None]:
fp_info = pd.read_csv(f'{exp_output_folder}/csi_fingerid.tsv', sep='\t')

# Generating the dataframe for SIRIUS+CSI:FingerID results
columns = np.concatenate([['id', 'clean_id',  'adduct', 'formula'], [str(idx) for idx in fp_info.absoluteIndex.values]], axis=0)
fp_data = pd.DataFrame(columns=columns)

In [None]:
without_fp = [] # Array for MS features without predicted fingerprints
fp_rows = []    # One array per feature; turned into a DataFrame once after the loop

for directory in [exp_output_folder, iris_dry_output_folder, isabell_output_folder, library_data]:
    for filename in glob.glob(f'{directory}/*'):
        # Matching ID as in correct_formula df: last '_'-separated token of the entry name, lower-cased.
        # (named feature_id rather than id so the builtin id() is not shadowed)
        feature_id = filename.split('/')[-1].split('_')[-1].lower()
        id_split = feature_id.split('-')
        # Clean id from a trailing '-e', '-h' or '-<number>' suffix
        if id_split[-1] in ('e', 'h') or id_split[-1].isnumeric():
            id_clean = '-'.join(id_split[0:-1])
        else:
            id_clean = feature_id

        # Look the id up once; entries that are not in correct_formula_std are ignored
        matches = correct_formula_std[correct_formula_std['id'] == id_clean]
        if matches.empty:
            continue
        formula = matches['molecular_formula'].values[0]
        adduct = matches['adduct'].values

        first_path = f'{filename}/fingerprints/{formula}_{adduct[0]}.fpt'
        if os.path.exists(first_path):
            fp = pd.read_csv(first_path, header=None).T.values.flatten()
            fp_rows.append(np.concatenate([[feature_id, id_clean, adduct[0], formula], fp], axis=0))
        elif len(adduct) > 1:
            # Fall back to the second adduct listed for this id
            second_path = f'{filename}/fingerprints/{formula}_{adduct[1]}.fpt'
            try:
                fp = pd.read_csv(second_path, header=None).T.values.flatten()
            except (OSError, pd.errors.EmptyDataError):
                # Missing or empty fingerprint file: record the feature instead of
                # hiding every possible error behind a bare except
                without_fp.append(id_clean)
            else:
                fp_rows.append(np.concatenate([[feature_id, id_clean, adduct[1], formula], fp], axis=0))
        else:
            without_fp.append(id_clean)

# Build the frame in one step; appending with fp_data.loc[len(fp_data)] copies the
# frame on every insert and is quadratic in the number of features.
# The frame is rebuilt from scratch, so re-running this cell does not duplicate rows.
fp_data = pd.DataFrame(fp_rows, columns=fp_data.columns)

# Only the fingerprint columns (everything after id, clean_id, adduct, formula) are numeric.
# pd.to_numeric(errors='ignore') is deprecated and would also convert an id column that
# happens to look numeric, so convert the fingerprint columns explicitly instead.
fp_columns = fp_data.columns[4:]
fp_data[fp_columns] = fp_data[fp_columns].apply(pd.to_numeric)

In [None]:
fp_data.sort_values(by='id', inplace=True)

In [None]:
fp_data

### Save predicted fingerprints to correct_formula df

In [None]:
fp_data_compressed = fp_data.copy()
# Compress the fingerprints into a single list in a column
fp_data_compressed['sirius5_predicted_fp'] = fp_data_compressed.apply(lambda row: row[4:].tolist(), axis=1)

# Display the resulting DataFrame
fp_data_compressed = fp_data_compressed[['id', 'clean_id', 'adduct', 'formula', 'sirius5_predicted_fp']]

fp_data_compressed = fp_data_compressed.rename(columns={'formula': 'molecular_formula',
                                                        'id': 'sirius_output_name',
                                                        'clean_id': 'id'})

fp_data_compressed

In [None]:
fp_data_compressed

In [None]:
# add compressed fingerprints to correct_formula_std_fp
correct_formula_std_fp_pred = pd.merge(correct_formula_std_fp, fp_data_compressed, on=['id', 'adduct', 'molecular_formula'], how='left')

correct_formula_std_fp_pred = correct_formula_std_fp_pred[['id', 'sirius_output_name', 'molecular_formula', 'adduct',
                                                           'formulaRank',  'sirius5_fp','sirius5_predicted_fp', 'SiriusScore', 'numExplainedPeaks', 'massErrorPrecursor(ppm)', 'InChIKey14', 'SMILES',  'std_SMILES', 'source', 'ROMol', 'InChIKey']]

correct_formula_std_fp_pred

In [None]:
# with open('2025-05-17_experimental_pred_fp_true_fp_correct_formula.pkl', 'wb') as f:
#     pickle.dump(correct_formula_std_fp_pred, f)

In [None]:
# Rows that found no match in fp_data during the left merge above: sirius_output_name is
# NaN for them. That column exists only on the merged frame, so the lookup has to use
# correct_formula_std_fp_pred; correct_formula has no such column at this point.
correct_formula_std_fp_pred[correct_formula_std_fp_pred.sirius_output_name.isna()]

## Determine accuracy in prediction using the correct-class likelihood (the term inside the Binary Cross-Entropy Loss)

In [None]:
import pandas as pd
import numpy as np
import pickle

import glob
import os

In [None]:
with open('2025-05-17_experimental_pred_fp_true_fp_correct_formula.pkl', 'rb') as f:
    correct_formula = pickle.load(f)

In [None]:
correct_formula.sort_values(by='id', inplace=True)

In [None]:
correct_formula

In [None]:
fp = correct_formula[correct_formula['sirius5_predicted_fp'].notnull() & correct_formula['sirius5_fp'].notnull()]

In [None]:
fp.reset_index(drop=True, inplace=True)

fp

In [None]:
type(fp)

In [None]:
fp_no_dupl = fp.drop_duplicates(subset=['id', 'molecular_formula'], keep='first')

In [None]:
fp_no_dupl

In [None]:
fp_duplicate = fp[fp.duplicated(subset=['id', 'molecular_formula'], keep=False)]

In [None]:
fp_duplicate


In [None]:
fp_m = fp[fp['adduct']=='[M]+']

fp_mh = fp[fp['adduct']=='[M+H]+']

In [None]:
fp_m = fp_no_dupl[fp_no_dupl['adduct']=='[M]+']

fp_mh = fp_no_dupl[fp_no_dupl['adduct']=='[M+H]+']

In [None]:
fp_m.shape, fp_mh.shape

### Computing the correct-class likelihood for each prediction (the per-bit term whose negative log is the binary cross-entropy loss)

In [None]:
def get_pred_prob(df):
    """Return the probability assigned to the correct class of every fingerprint bit.

    For a true bit ``t`` (0 or 1) and a predicted probability ``p`` the value is
    ``t * p + (1 - t) * (1 - p)``: ``p`` where the bit is set, ``1 - p`` where
    it is not.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``sirius5_predicted_fp`` and ``sirius5_fp``, each
        holding one sequence of per-bit values per row.

    Returns
    -------
    pred_prob : pandas.DataFrame
        One row per row of ``df`` (same index), one column per bit position.
    mean_pred_prob_mf : pandas.Series
        Row-wise mean of ``pred_prob``.
    mean_pred_prob_feat : pandas.Series
        Column-wise mean of ``pred_prob``.
    """
    # Expand the list columns in one step. Series.apply(pd.Series) creates a
    # Series object per row, which is very slow on wide fingerprints.
    pred_fp = pd.DataFrame(df['sirius5_predicted_fp'].tolist(), index=df.index)
    true_fp = pd.DataFrame(df['sirius5_fp'].tolist(), index=df.index)

    pred_prob = true_fp * pred_fp + (1 - true_fp) * (1 - pred_fp)

    mean_pred_prob_mf = pred_prob.mean(axis=1)
    mean_pred_prob_feat = pred_prob.mean(axis=0)

    return pred_prob, mean_pred_prob_mf, mean_pred_prob_feat

pred_prob_m, mean_pred_prob_mf_m, mean_pred_prob_feat_m = get_pred_prob(fp_m)
pred_prob_mh, mean_pred_prob_mf_mh, mean_pred_prob_feat_mh = get_pred_prob(fp_mh)

In [None]:
mean_pred_prob_feat_m.mean(), mean_pred_prob_feat_mh.mean()

In [None]:
from scipy import stats

# calculate confidence interval of mean predicted probabilities
def calc_ci(data, confidence=0.95):
    """
    Two-sided t-based confidence interval for the mean of ``data``.

    Returns the tuple ``(lower bound, upper bound, half-width)``, where the
    half-width is the standard error of the mean times the critical t value
    for ``len(data) - 1`` degrees of freedom.
    """
    sample_size = len(data)
    centre = np.mean(data)

    # Critical value of the two-tailed t-distribution at the requested confidence
    t_crit = stats.t.ppf((1 + confidence) / 2, sample_size - 1)

    # Half-width of the interval: standard error of the mean scaled by t_crit
    h = stats.sem(data) * t_crit

    return centre - h, centre + h, h

ci_lower_m, ci_upper_m, ci_m = calc_ci(mean_pred_prob_mf_m)
ci_lower_mh, ci_upper_mh, ci_mh = calc_ci(mean_pred_prob_mf_mh)

ci_m, ci_mh

In [None]:
# import numpy as np

# # true_fp_expanded contains the binary labels (0 or 1)
# # pred_fp_expanded contains the predicted probabilities (between 0 and 1) of a compound being in class 1
# # Assume both are DataFrames with same shape and matching indices/columns

# # Small constant to avoid log(0)
# epsilon = 1e-20

# # Clip predicted values to avoid log(0) issues
# df_pred_clipped = pred_fp_expanded.clip(epsilon, 1 - epsilon)

# # Calculate BCE for each element
# average_prob = true_fp_expanded * (df_pred_clipped) + (1 - true_fp_expanded) * (1 - df_pred_clipped)

# # Optionally: compute mean BCE across all elements
# mean_average_prob = average_prob.mean().mean()  # mean over all columns and rows

# print("Mean Binary Cross Entropy:", mean_average_prob)

In [None]:
import matplotlib.pyplot as plt

# Set the figure parameters
plt.rcParams.update({'figure.figsize':[9.8,9.8],
                'font.size': 16, 
                'font.weight': 'normal',
                'axes.titlesize': 12,
                'axes.labelsize': 12,
                'xtick.labelsize': 12,
                'ytick.labelsize': 12,
                'legend.fontsize': 12,
                'legend.title_fontsize': 12,
                'axes.titleweight': 'bold',
                'font.family': 'serif',
                'font.serif': ['Times New Roman'],
                'figure.dpi':300,
                
                })

In [None]:
import matplotlib.pyplot as plt

# Set the figure parameters
plt.rcParams.update({'figure.figsize':[9.8,9.8],
                'font.size': 16, 
                'font.weight': 'normal',
                'axes.titlesize': 12,
                'axes.labelsize': 12,
                'xtick.labelsize': 12,
                'ytick.labelsize': 12,
                'legend.fontsize': 12,
                'legend.title_fontsize': 12,
                'axes.titleweight': 'bold',
                'font.family': 'serif',
                'font.serif': ['Times New Roman'],
                'figure.dpi':300,
                
                })

plt.figure(figsize=(8,4))

hist_mp = plt.hist(pred_prob_mh.values.flatten(), bins=100, density=True, log=True,
                   color='#FD9E02', alpha=1, label='[M+H]+')
hist_m = plt.hist(pred_prob_m.values.flatten(), bins=100, density=True, log=True,
                  color='#228CBD', alpha=1, label='[M]+')

plt.xlabel('Correct class likelihood, %')
plt.ylabel('Density, %')

plt.legend()

plt.title('Correct class likelihood of fingerprint-bits for compounds with\ncorrect molecular formula prediction')

#plt.savefig('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Visualizations/2025-05-18_correct_class_lieklihood_fp.pdf', dpi=300, bbox_inches='tight')

In [None]:
min(mean_pred_prob_mf_m)

In [None]:
import matplotlib.pyplot as plt

from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle

plt.figure(figsize=(8,4))

hist_mh = plt.hist(mean_pred_prob_mf_mh.values.flatten(), bins=18, density=True, log=False,
                   color='#FD9E02', alpha=1, label=r'[M+H]$^+$')
# plt.axvline(mean_pred_prob_mf_mh.mean(), color='#FD9E02', linestyle='--')
# plt.axvspan(xmin=ci_lower_mh, xmax=ci_upper_mh, facecolor='#FD9E02', edgecolor='none', alpha=0.3)

hist_m = plt.hist(mean_pred_prob_mf_m.values.flatten(), bins=20, density=True, log=False,
                  color='#228CBD', alpha=0.8, label=r'[M]$^{\cdot +}$')
# plt.axvline(mean_pred_prob_mf_m.mean(), color='#228CBD', linestyle='--')

# plt.axvspan(xmin=ci_lower_mh, xmax=ci_upper_mh, facecolor='#FD9E02', edgecolor='none', alpha=0.05)
# plt.axvspan(xmin=ci_lower_m, xmax=ci_upper_m, facecolor='#228CBD',edgecolor='none', alpha=0.2)

plt.xlabel('Mean MF likelihood')
plt.ylabel('')

ax = plt.gca()
ax.set_yticks([])             # Removes tick locations
ax.set_yticklabels([])        # Removes tick labels

plt.legend()

plt.savefig('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Visualizations/2025-05-24_correct_class_lieklihood_mean_spectra_wo_mean_and_ci.pdf', dpi=300, bbox_inches='tight')

In [None]:
import pickle
# Load the pickled object stored in evaluation_set_uncorrected_mass.pkl
# (this cell only reads the file; nothing is saved here).
# NOTE: pickle.load executes arbitrary code from the file - only open pickles you created yourself.
with open('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Code/Model_training/evaluation_set_uncorrected_mass.pkl', 'rb') as f:
    fp_data_uncorr = pickle.load(f)

In [None]:
with open('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Code/Model_training/evaluation_set_corrected_mass.pkl', 'rb') as f:
    fp_data_corr = pickle.load(f)

In [None]:
fp_data_uncorr

In [None]:
fp.value_counts(subset='adduct')

### Accuracy of feature prediction for models

In [None]:
with open('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Code/Model_training/AHR/ahr_features.pkl', 'rb') as f:
    ahr_features = pickle.load(f)

with open('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Code/Model_training/MMP/mmp_features.pkl', 'rb') as f:
    mmp_features = pickle.load(f)

In [None]:
# Difference between the two sets of features
diff = list(set(mmp_features) - set(ahr_features))
diff

In [None]:
ahr_diff = list(set(ahr_features)-set(mmp_features))

In [None]:
# Report, for every feature in diff, whether it also occurs in ahr_features.
# NOTE(review): diff is built as set(mmp_features) - set(ahr_features), so none of its
# elements should be in ahr_features and only the else branch is expected to print -
# confirm whether this sanity check is still needed.
for feature in diff:
    if feature in ahr_features:
        print(f"{feature} is in ahr set")
    else:
        print(f"{feature} is only in MMP features")

In [None]:
fp_desc = pd.read_csv('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Data/Tox21/2025-03-12_fp_description.tsv', sep='\t')

In [None]:
fp_desc

In [None]:
fp_desc.iloc[diff, :]

In [None]:
fp_desc.iloc[ahr_diff, :]

In [None]:
ahr_features

In [None]:
mmp_features

In [None]:
ahr_features_accuracy_m = pd.DataFrame(mean_pred_prob_feat_m.iloc[ahr_features])
ahr_features_accuracy_mh = pd.DataFrame(mean_pred_prob_feat_mh.iloc[ahr_features])

mmp_features_accuracy_m = pd.DataFrame(mean_pred_prob_feat_m.iloc[mmp_features])
mmp_features_accuracy_mh = pd.DataFrame(mean_pred_prob_feat_mh.iloc[mmp_features])

In [None]:
ahr_features_accuracy_m[0]

In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle

import seaborn as sns

fig, ax = plt.subplots(1, 2, figsize=(12, 5), sharey=True)


sns.histplot(data=ahr_features_accuracy_m, x=0, label=r'[M]$^{\cdot +}$', color='#219EBC', edgecolor=None, kde=False, stat='density', bins=30, ax=ax[0])
sns.histplot(data=ahr_features_accuracy_mh, x=0, label=r'[M+H]$^+$', color='#FB8500', edgecolor=None, kde=False, stat='density', bins=30, ax=ax[0])
ax[0].set_title(r'AhR features ($\it{N}$=363)', fontweight='bold', fontsize=14)

ax[0].set_xlabel('')
ax[0].set_ylabel('')

ax[0].tick_params(axis='y', left=False, labelleft=False)

sns.histplot(data=mmp_features_accuracy_m, x=0, label=r'[M]$^{\cdot +}$', color='#219EBC', edgecolor=None,  kde=False, stat='density', bins=30, ax=ax[1])
sns.histplot(data=mmp_features_accuracy_mh, x=0, label=r'[M+H]$^+$', color='#FB8500', edgecolor=None,  kde=False, stat='density', bins=30, ax=ax[1])
ax[1].set_title(r'MMP features ($\it{N}$=364)', fontweight='bold', fontsize=14)

ax[1].set_xlabel('')

ax[1].tick_params(axis='y', left=False, labelleft=False)

# Add legend and labels
ax[0].legend()
ax[1].legend()
plt.tight_layout()

fig.supxlabel('Mean feature accuracy, %', y=-0.02, fontsize=16)


plt.savefig('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Master_thesis/Visualizations/2025-05-24_feature_accuracy_for_selected_features.pdf', dpi=300, bbox_inches='tight')