In [None]:
from typing import Any

import pandas as pd
import numpy as np

from pubchempy import get_compounds, Compound

from molmass import Formula
from rdkit import Chem
from rdkit.Chem import Descriptors

from rdkit.Chem import AllChem as Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

In [None]:
import logging

# switch the logger named 'pubchempy' to DEBUG level
pubchempy_logger = logging.getLogger('pubchempy')
pubchempy_logger.setLevel(logging.DEBUG)

In [None]:
# define all variables

# --- library identification ---
# lib_id is part of every input/output file name; add is an optional file name suffix
lib_id = "mce"
add = ""

# --- acquisition settings ---
# usually empty unless, e.g., second measurement or other parameters
# always ends with underscore _
prefix = "100AGC_60000Res_"
instrument_method = r"C:\Xcalibur\methods\Corinna_Brungs\Library6_100AGC_60000Res_MS5_POS_mz115-2000"

# --- plate layout ---
plates = [
    "1D1",
    "1D2",
    "1D3",
]
plate_id_header = "mixed_location_plate1"

# --- column names used in the library table ---
unique_id_header = "lib_plate_well"
raw_filename = "raw_filename"

# --- input files (tab separated) ---
library_file = "data/{}_library.tsv".format(lib_id)
library_file_extension = "data/{}_library{}.tsv".format(lib_id, add)

In [None]:
# read the tab separated library table that gets annotated in the cells below
add_df = pd.read_csv(
    library_file_extension,
    delimiter="\t",
)
add_df

## Getting compound information from CAS or name (CAS to PubChem)

In [None]:
def compound_score(comp: Compound):
    """
    Score a PubChem hit so that hits with fewer dot-separated SMILES components rank higher.

    :param comp: compound whose canonical_smiles is inspected
    :return: 0 if the compound has no canonical SMILES, otherwise 1000 minus the number of "." in it
    """
    canonical = comp.canonical_smiles
    # every "." separates another component (e.g. a salt partner) and costs one point
    return 1000 - str(canonical).count(".") if canonical else 0


def search_pubchem_by_name(name_or_cas: str) -> Compound | None:
    """
    In pubchem many entries contain the cas as an alternative name - so searching for cas in name works often

    :param name_or_cas: input name or cas
    :return: first compound or None
    """
    if name_or_cas == "NaN":
        return None
    compounds = get_compounds(name_or_cas, "name")
    if not compounds:
        logging.info("cas:{} had NO entries".format(name_or_cas))
        return None
    else:
        compounds.sort(key=lambda comp: compound_score(comp), reverse=True)
        return compounds[0]

In [None]:
# first pass: look up every row by its CAS number; rows without CAS keep a NaN placeholder
# (np.nan is the spelling that is also available in NumPy >= 2.0)
compounds = [
    np.nan if pd.isnull(cas) else search_pubchem_by_name(str(cas))
    for cas in add_df["cas"]
]

In [None]:
# second pass: rows still without a hit are searched by product name
# a missing name is skipped instead of being queried as the string "nan"
compounds = [
    search_pubchem_by_name(str(name)) if pd.isnull(comp) and not pd.isnull(name) else comp
    for comp, name in zip(compounds, add_df["Product Name"])
]
# only one compound was found as CAS-
# third pass: retry with the "CAS-" prefixed number; a missing CAS is skipped instead of querying "CAS-nan"
compounds = [
    search_pubchem_by_name("CAS-{}".format(cas)) if pd.isnull(comp) and not pd.isnull(cas) else comp
    for comp, cas in zip(compounds, add_df["cas"])
]


In [None]:
# copy the PubChem results into the table; rows without a hit stay empty
# (np.nan is the spelling that is also available in NumPy >= 2.0)
add_df["PubChemID"] = pd.array(
    [np.nan if pd.isnull(compound) else compound.cid for compound in compounds],
    dtype=pd.Int64Dtype(),  # nullable integer keeps the ids integral despite missing rows
)
add_df["isomeric_smiles"] = [np.nan if pd.isnull(compound) else compound.isomeric_smiles for compound in compounds]
add_df["canonical_smiles"] = [np.nan if pd.isnull(compound) else compound.canonical_smiles for compound in compounds]
add_df

In [None]:
# store the PubChem annotated table (tab separated, although the file name ends with .csv)
smiles_table_file = "data/{}_library{}_smiles.csv".format(lib_id, add)
add_df.to_csv(smiles_table_file, sep="\t", index=False)

## Cleaning SMILES and getting the exact mass

In [None]:
def exact_mass(formula):
    """
    Mass of a molecular formula via molmass.

    Only the text before the first "." of the formula is used; the returned value is
    ``Formula(...).isotope.mass``.

    :param formula: molecular formula string, e.g. "H" or "Na"
    :return: the mass, or NaN if the formula is missing or cannot be processed
    """
    try:
        clean = formula.split(".")[0]
        return Formula(clean).isotope.mass
    except Exception:
        # best effort: a bad formula must not abort the whole table, but KeyboardInterrupt/SystemExit
        # are no longer swallowed; np.nan is the spelling that also exists in NumPy >= 2.0
        return np.nan

In [None]:
# returns canonical smiles
def mol_to_canon_smiles(mol):
    """
    Convert an RDKit molecule into a SMILES string (called with isomericSmiles=True).

    :param mol: RDKit molecule; placeholders such as None or NaN are tolerated
    :return: SMILES string or None if the conversion fails
    """
    try:
        return Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception:
        # best effort for placeholder inputs; interrupts (KeyboardInterrupt/SystemExit) still propagate
        return None

# def smi_to_canon_smiles(smi):
#     try:
#         return Chem.MolToSmiles(Chem.MolFromSmiles(smi), isomericSmiles=False)
#     except:
#         pass

# shared RDKit Uncharger instance; cleaned_mol() calls its uncharge() on molecules with a non-zero formal charge
uncharger = rdMolStandardize.Uncharger()
# smiles_stats = {'n_dots': Counter(), 'charge': Counter(), 'invalid_smiles': []}


def cleaned_mol(smiles: str):
    """
    Build an RDKit molecule for the main component of a SMILES string.

    Steps:
    1. split the input at "." and keep the longest part (assumed to be the main molecule when the
       SMILES also contains a salt partner etc.)
    2. parse that part with RDKit
    3. if the molecule carries a net formal charge, pass it through the shared ``uncharger``

    If the input cannot be parsed as SMILES, or anything fails on the way, the original input is
    handed to ``mol_from_pepseq`` as a fallback.

    :param smiles: SMILES string (any other value is converted with str())
    :return: RDKit molecule, or whatever mol_from_pepseq returns for the original input
    """
    original_input = smiles
    try:
        # find the longest smiles that might be the main molecule
        # for smiles that contain the salt partner etc
        main_smiles = max(str(smiles).split('.'), key=len)

        mol = Chem.MolFromSmiles(main_smiles)
        # check the parse result before touching the molecule - invalid SMILES should reach the
        # fallback through this explicit branch rather than through an exception
        if mol is not None and Chem.GetFormalCharge(mol) != 0:
            mol = uncharger.uncharge(mol)

        if mol is None:
            return mol_from_pepseq(original_input)
        return mol
    except Exception:
        # best effort: any RDKit failure falls back to the peptide reader;
        # interrupts (KeyboardInterrupt/SystemExit) are no longer swallowed
        return mol_from_pepseq(original_input)


def mol_from_pepseq(original_input):
    """
    Fallback reader: treat the input as a sequence and parse it with RDKit's MolFromSequence.

    Square brackets and the literal text " (TFA salt)" are removed from the input first.

    :param original_input: text that was not usable as SMILES (converted with str())
    :return: whatever MolFromSequence returns, or None if parsing raises
    """
    # read protein seq
    try:
        sequence = str(original_input)
        for removable in ("[", "]", " (TFA salt)"):
            sequence = sequence.replace(removable, "")
        return Chem.MolFromSequence(sequence)
    except Exception:
        # best effort; interrupts (KeyboardInterrupt/SystemExit) still propagate
        return None


def exact_mass_from_mol(mol):
    """
    Exact molecular weight of an RDKit molecule (Descriptors.ExactMolWt).

    :param mol: RDKit molecule; placeholders such as None or NaN are tolerated
    :return: the exact mass or None if it cannot be computed
    """
    try:
        # canonical
        return Descriptors.ExactMolWt(mol)
    except Exception:
        # best effort for placeholder inputs; interrupts (KeyboardInterrupt/SystemExit) still propagate
        return None

# def exact_mass_from_smiles(smiles: str):
#     try:
#         # find the longest smiles that might be the main molecule
#         # for smiles that contain the salt partner etc
#         split_smiles = smiles.split('.')
#         if len(split_smiles) > 1:
#             # smiles_stats['n_dots'][len(split_smiles)-1] += 1
#             smiles = max(split_smiles, key=len)
#         else:
#             smiles = split_smiles[0]
#
#
#         # for those smiles provided as salts (e.g., .Na+) add H+ until charge is neutral
#         # if charge is neutral already (N+ and PO-) keep both charges
#         mol = Chem.MolFromSmiles(smiles)
#         charge = Chem.GetFormalCharge(mol)
#         if abs(charge) > 0:
#             # smiles_stats['charge'][charge] += 1
#             mol = uncharger.uncharge(mol)
#
#         # canonical
#         return Descriptors.ExactMolWt(mol)
#     except:
#         return np.NAN

In [None]:
# define file names
add_df[unique_id_header] = ["pluskal_{}_{}".format(lib_id, plate_id) for plate_id in add_df[plate_id_header]]
# lib_df[raw_filename] = ["{}_{}{}".format(current_date, prefix, unique_id) for unique_id in lib_df[unique_id_header]]


# mass shifts added to the neutral exact mass below: mass of H / Na minus one electron
# electron_mass is the electron mass in Da
electron_mass = 0.00054857
mzh = exact_mass("H")-electron_mass
mzna = exact_mass("Na")-electron_mass

# # define exact mass
# if not "exact_mass" in lib_df:
#     add_df["exact_mass"] = [exact_mass(formula) for formula in add_df["Formula"]]
#     add_df["mz_h"] = add_df["exact_mass"] + mzh
#     add_df["mz_na"] = add_df["exact_mass"] + mzna

# from smiles
# rows without SMILES keep a NaN placeholder that the two helpers below turn into None
# (np.nan is the spelling that is also available in NumPy >= 2.0)
mols = [np.nan if pd.isnull(smiles) else cleaned_mol(smiles) for smiles in add_df["isomeric_smiles"]]
add_df["cleaned_smiles"] = [mol_to_canon_smiles(mol) for mol in mols]
add_df["exact_mass_smiles"] = [exact_mass_from_mol(mol) for mol in mols]
add_df["mz_h_smiles"] = add_df["exact_mass_smiles"] + mzh
add_df["mz_na_smiles"] = add_df["exact_mass_smiles"] + mzna

# add_df["mass_matches"] = [abs(a-b) < 0.01 for a, b in zip(add_df["exact_mass_smiles"], add_df["exact_mass"])]


# tab separated export, although the file name ends with .csv
add_df.to_csv("data/lib_formatted_{}{}.csv".format(lib_id, add), sep="\t", index=False)

add_df

In [None]:
pg_