In [1]:
import numpy as np
import pandas as pd
from datetime import date

import rdkit.Chem
from molmass import Formula
from rdkit import Chem
from rdkit.Chem import Descriptors

from rdkit.Chem import AllChem as Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

from pubchempy import get_compounds, Compound


In [2]:
import logging

# Lower the threshold of the 'pubchempy' logger to DEBUG so its debug records are not filtered out.
logging.getLogger('pubchempy').setLevel(logging.DEBUG)

In [None]:
# define all variables
# Library identifier; interpolated into the library file path below and into
# the per-row unique IDs built further down in the notebook.
lib_id = "mce"

# usually empty unless, e.g., second measurement or other parameters
# always ends with underscore _
prefix = "100AGC_60000Res_"
# NOTE(review): absolute, machine-specific path; not referenced by any cell visible in this notebook.
instrument_method = r"C:\Xcalibur\methods\Corinna_Brungs\Library6_100AGC_60000Res_MS5_POS_mz115-2000"

plates = ["1D1","1D2","1D3"]
# Name of the library-table column whose values are used to build the unique IDs.
plate_id_header = "mixed_location_plate1"

# plates are inserted into the BLUE B compartment
plate_loc_in_autosampler = "B"


# final values
# Names of the output columns (unique ID and raw file name).
unique_id_header = "lib_plate_well"
raw_filename = "raw_filename"


# Path of the input library table, derived from lib_id.
library_file = "data/{}_library.csv".format(lib_id)

## Import library

In [None]:
# The library file is parsed as tab-separated even though its name ends in .csv.
lib_df = pd.read_csv(library_file, sep="\t")
lib_df


## Add unique column with internal ID and well location
Use internal ID of plate and then library ID

In [None]:
def exact_mass(formula):
    """Return the mass of a molecular formula, or NaN if it cannot be computed.

    Only the part of ``formula`` before the first "." is used; everything
    after it is discarded before the formula is handed to molmass.

    :param formula: molecular formula string
    :return: ``Formula(...).isotope.mass`` as reported by molmass, or NaN for
        missing, non-string or unparsable input
    """
    try:
        clean = formula.split(".")[0]
        return Formula(clean).isotope.mass
    except Exception:
        # np.nan (lowercase) is the spelling that exists in every NumPy
        # release; the np.NAN alias was removed in NumPy 2.0.
        return np.nan



## Get exact mass from cleaned SMILES

In [None]:
def mol_to_canon_smiles(mol):
    """Return the isomeric SMILES RDKit writes for ``mol``.

    :param mol: RDKit molecule
    :return: SMILES string, or None if RDKit cannot convert the input
        (e.g. ``mol`` is None or NaN)
    """
    try:
        return Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception:
        # Best-effort conversion: failures become None, but KeyboardInterrupt
        # still propagates so a long-running loop can be stopped.
        return None

# def smi_to_canon_smiles(smi):
#     try:
#         return Chem.MolToSmiles(Chem.MolFromSmiles(smi), isomericSmiles=False)
#     except:
#         pass

# Single Uncharger instance shared by every cleaned_mol() call.
uncharger = rdMolStandardize.Uncharger()
# smiles_stats = {'n_dots': Counter(), 'charge': Counter(), 'invalid_smiles': []}


def cleaned_mol(smiles: str):
    """Build a molecule from a library SMILES entry.

    For multi-component SMILES (parts separated by "."), only the longest
    part is kept, on the assumption that it is the main molecule and the
    rest are salt partners. A molecule with a non-zero formal charge is
    passed through the shared ``uncharger``. If the SMILES cannot be
    processed, the original input is retried as a peptide sequence via
    ``mol_from_pepseq``.

    :param smiles: SMILES string (or peptide sequence) from the library table
    :return: RDKit molecule, or None for missing input / when every parser fails
    """
    # A missing value would otherwise be stringified to "nan" and handed to
    # both parsers below.
    if smiles is None or (isinstance(smiles, float) and smiles != smiles):
        return None

    try:
        # find the longest smiles that might be the main molecule
        # for smiles that contain the salt partner etc
        main_fragment = max(str(smiles).split('.'), key=len)

        mol = Chem.MolFromSmiles(main_fragment)
        # MolFromSmiles signals a parse failure by returning None, so check
        # that explicitly before asking for the charge.
        if mol is not None and Chem.GetFormalCharge(mol) != 0:
            mol = uncharger.uncharge(mol)
    except Exception:
        mol = None

    if mol is None:
        return mol_from_pepseq(smiles)
    return mol


def mol_from_pepseq(original_input):
    """Try to read ``original_input`` as a sequence with RDKit's ``MolFromSequence``.

    Square brackets and the literal suffix " (TFA salt)" are removed from the
    text before parsing.

    :param original_input: raw library entry that could not be used as SMILES
    :return: whatever ``Chem.MolFromSequence`` returns, or None if an error is raised
    """
    # read protein seq
    try:
        sequence = str(original_input).replace("[", "").replace("]", "").replace(" (TFA salt)", "")
        return Chem.MolFromSequence(sequence)
    except Exception:
        # Best-effort fallback; KeyboardInterrupt is deliberately not caught.
        return None


def exact_mass_from_mol(mol):
    """Return ``Descriptors.ExactMolWt(mol)``, or None if RDKit raises.

    :param mol: RDKit molecule (None/NaN placeholders are tolerated)
    :return: exact molecular weight as computed by RDKit, or None on failure
    """
    try:
        return Descriptors.ExactMolWt(mol)
    except Exception:
        # Best-effort: invalid molecules yield None, while KeyboardInterrupt
        # still propagates.
        return None

# def exact_mass_from_smiles(smiles: str):
#     try:
#         # find the longest smiles that might be the main molecule
#         # for smiles that contain the salt partner etc
#         split_smiles = smiles.split('.')
#         if len(split_smiles) > 1:
#             # smiles_stats['n_dots'][len(split_smiles)-1] += 1
#             smiles = max(split_smiles, key=len)
#         else:
#             smiles = split_smiles[0]
#
#
#         # for those smiles provided as salts (e.g., .Na+) add H+ until charge is neutral
#         # if charge is neutral already (N+ and PO-) keep both charges
#         mol = Chem.MolFromSmiles(smiles)
#         charge = Chem.GetFormalCharge(mol)
#         if abs(charge) > 0:
#             # smiles_stats['charge'][charge] += 1
#             mol = uncharger.uncharge(mol)
#
#         # canonical
#         return Descriptors.ExactMolWt(mol)
#     except:
#         return np.NAN

In [None]:
# define file names
lib_df[unique_id_header] = [
    "pluskal_{}_{}".format(lib_id, plate_id)
    for plate_id in lib_df[plate_id_header]
]
# lib_df[raw_filename] = ["{}_{}{}".format(current_date, prefix, unique_id) for unique_id in lib_df[unique_id_header]]


# offsets added to the neutral mass for the mz_h / mz_na columns:
# mass of H (or Na) minus one electron mass
electron_mass = 0.00054857
mzh = exact_mass("H") - electron_mass
mzna = exact_mass("Na") - electron_mass

# define exact mass from the formula, unless the table already has that column
if "exact_mass" not in lib_df:
    lib_df["exact_mass"] = [exact_mass(formula) for formula in lib_df["Formula"]]
    lib_df["mz_h"] = lib_df["exact_mass"] + mzh
    lib_df["mz_na"] = lib_df["exact_mass"] + mzna

# from smiles
mols = [cleaned_mol(smiles) for smiles in lib_df["Smiles"]]
lib_df["cleaned_smiles"] = [mol_to_canon_smiles(mol) for mol in mols]
lib_df["exact_mass_smiles"] = [exact_mass_from_mol(mol) for mol in mols]
lib_df["mz_h_smiles"] = lib_df["exact_mass_smiles"] + mzh
lib_df["mz_na_smiles"] = lib_df["exact_mass_smiles"] + mzna

# True where the SMILES-derived and formula-derived masses agree within 0.01;
# rows with a missing mass on either side compare as False
lib_df["mass_matches"] = (lib_df["exact_mass_smiles"] - lib_df["exact_mass"]).abs().lt(0.01)


# lib_df.to_csv("data/lib_formatted_{}.csv".format(lib_id), sep="\t", index=False)

lib_df

## Getting compound information from CAS or Name (CAS to PubChem)

In [None]:
def compound_score(comp: Compound):
    """Rank a PubChem hit: 0 without a canonical SMILES, otherwise 1000 minus its number of "." characters."""
    smiles = comp.canonical_smiles
    return (1000 - str(smiles).count(".")) if smiles else 0


def search_pubchem_by_name(name_or_cas: str) -> Compound | None:
    """
    In pubchem many entries contain the cas as an alternative name - so searching for cas in name works often

    Missing values are not sent to PubChem: None, empty/whitespace-only text and
    any capitalisation of "nan" (the text a stringified NaN produces) return None.

    :param name_or_cas: input name or cas
    :return: the hit with the highest ``compound_score`` (first one on ties) or None
    """
    if name_or_cas is None:
        return None
    query = str(name_or_cas).strip()
    # Callers pass str(value), and str(float("nan")) is "nan" rather than
    # "NaN", so the comparison has to be case-insensitive.
    if not query or query.lower() == "nan":
        return None

    compounds = get_compounds(query, "name")
    if not compounds:
        logging.info("cas:{} had NO entries".format(name_or_cas))
        return None
    # max() returns the first best-scoring hit - the same element a stable
    # descending sort would put first - without reordering the result list.
    return max(compounds, key=compound_score)

In [None]:
# Primary lookup: search PubChem by CAS number; rows without a CAS number get NaN.
# (np.nan instead of np.NAN: the uppercase alias was removed in NumPy 2.0.)
compounds = [search_pubchem_by_name(str(cas)) if not pd.isnull(cas) else np.nan for cas in lib_df["CAS No."]]

In [None]:
# Fallback 1: retry unresolved rows with the product name.
# Rows whose name is missing are left unresolved instead of querying the text "nan".
compounds = [
    search_pubchem_by_name(str(name)) if pd.isnull(comp) and not pd.isnull(name) else comp
    for comp, name in zip(compounds, lib_df["Product Name"])
]
# Fallback 2: retry with a "CAS-" prefix.
# only one compound was found as CAS-
# Rows without a CAS number are skipped so that "CAS-nan" is never sent to PubChem.
compounds = [
    search_pubchem_by_name("CAS-{}".format(cas)) if pd.isnull(comp) and not pd.isnull(cas) else comp
    for comp, cas in zip(compounds, lib_df["CAS No."])
]


In [None]:
# Copy the PubChem hit's CID and SMILES into the table; unresolved rows get missing values.
# np.nan replaces np.NAN, which no longer exists in NumPy 2.0.
# The nullable Int64 dtype keeps the CIDs as integers despite the missing entries.
lib_df["PubChemID"] = pd.array([compound.cid if not pd.isnull(compound) else np.nan for compound in compounds], dtype=pd.Int64Dtype())
lib_df["isomeric_smiles"] = [compound.isomeric_smiles if not pd.isnull(compound) else np.nan for compound in compounds]
lib_df["canonical_smiles"] = [compound.canonical_smiles if not pd.isnull(compound) else np.nan for compound in compounds]
lib_df

## Cleaning PubChem Smiles and calculating exact mass

In [None]:
# offsets added to the neutral mass for the mz_h / mz_na columns:
# mass of H (or Na) minus one electron mass
electron_mass = 0.00054857
mzh = exact_mass("H")-electron_mass
mzna = exact_mass("Na")-electron_mass

# from smiles
# Rows without a PubChem SMILES keep a NaN placeholder; the helpers below turn it into None.
# np.nan replaces np.NAN, which no longer exists in NumPy 2.0.
mols = [cleaned_mol(smiles) if not pd.isnull(smiles) else np.nan for smiles in lib_df["isomeric_smiles"]]
lib_df["cleaned_psmiles"] = [mol_to_canon_smiles(mol) for mol in mols]
lib_df["exact_mass_psmiles"] = [exact_mass_from_mol(mol) for mol in mols]
lib_df["mz_h_psmiles"] = lib_df["exact_mass_psmiles"] + mzh
lib_df["mz_na_psmiles"] = lib_df["exact_mass_psmiles"] + mzna

# True where the masses from the provided SMILES and from the PubChem SMILES agree within 0.01;
# a missing mass on either side compares as False
lib_df["mce_smiles_vs_pubchem_smiles"] = [abs(a-b) < 0.01 for a, b in zip(lib_df["exact_mass_smiles"], lib_df["exact_mass_psmiles"])]


lib_df.to_csv("data/lib_formatted_pubchem_{}.csv".format(lib_id), sep="\t", index=False)

lib_df

In [None]:
# Show the rows for which no exact mass could be derived from the provided SMILES.
lib_df[lib_df["exact_mass_smiles"].isna()]

### Creating one cleaned SMILES column. Compounds can have multiple entries if the provided SMILES and the PubChem SMILES are different

In [None]:
# Reload the table written by the PubChem cell, plus the table of additional compounds.
lib_df = pd.read_csv("data/lib_formatted_pubchem_mce.csv", sep="\t")
# The separator must be the tab escape "\t"; "/t" would be taken as the
# literal two-character delimiter and the file would not split into columns.
add_df = pd.read_csv("data/lib_formatted_mce_add_compounds.csv", sep="\t")

In [None]:
# Copy of the selected metadata columns that keeps the original "Smiles" values; tagged as source "MCE".
lib1_df = lib_df[["Cat. No.", "Product Name", "Synonyms", "CAS No.", "Smiles", "PubChemID", "isomeric_smiles", "canonical_smiles", "lib_plate_well", "URL", "Target", "Information", "Pathway", "Research Area", "Clinical Information"]].copy()

lib1_df["Source"] = "MCE"
lib1_df

In [None]:
# Same column selection as lib1_df, but "Smiles" is overwritten with the PubChem
# canonical SMILES and the rows are tagged as source "PubChem".
lib2_df = lib_df[["Cat. No.", "Product Name", "Synonyms", "CAS No.", "Smiles", "PubChemID", "isomeric_smiles", "canonical_smiles", "lib_plate_well", "URL", "Target", "Information", "Pathway", "Research Area", "Clinical Information"]].copy()
lib2_df["Smiles"] = lib2_df["canonical_smiles"]
lib2_df["Source"] = "PubChem"
lib2_df

In [None]:
# Stack the PubChem-sourced rows on top of the MCE-sourced rows; ignore_index gives a fresh 0..n-1 index.
merged_df = pd.concat([lib2_df, lib1_df], ignore_index=True, sort=False)
merged_df

In [3]:
# Tab-separated metadata table used by the synonym-collection cells below.
df = pd.read_csv("data/test_metadata_cleaned.tsv", sep="\t")

In [60]:

def get_all_synonyms(row):
    """Collect every name-like value of a row into one de-duplicated list.

    Sources, in order: "Product Name", "CAS No.", "CAS", the existing
    "synonyms" value (a single string or a collection of names) and the
    ";"-separated "Synonyms" text. Each value is converted to text and
    stripped; missing and empty values are dropped; duplicates are removed
    case-insensitively, keeping the first spelling seen.

    :param row: mapping-like row (e.g. a DataFrame row) accepted by ``get_or_else``
    :return: list of unique, stripped names in first-seen order
    """
    candidates = [
        get_or_else(row, "Product Name"),
        get_or_else(row, "CAS No."),
        get_or_else(row, "CAS"),
    ]

    old = get_or_else(row, "synonyms", [])
    if isinstance(old, (list, tuple, set)):
        candidates.extend(old)
    else:
        # a single string (or other scalar) is one name, not a collection of names
        candidates.append(old)

    candidates.extend(str(get_or_else(row, "Synonyms", "")).split(";"))

    seen = set()
    unique = []
    for candidate in candidates:
        if candidate is None:
            continue
        # str() so that non-text values (e.g. a numeric CAS) do not break .strip();
        # empties are filtered after stripping so whitespace-only entries are dropped too
        text = str(candidate).strip()
        folded = text.lower()
        if not text or folded in seen:
            continue
        seen.add(folded)
        unique.append(text)
    return unique

def get_or_else(row, key, default=None):
    """Return ``row[key]``, or ``default`` if the key is absent or its value is missing.

    :param row: mapping-like object supporting ``key in row`` and ``row[key]``
    :param key: label to look up
    :param default: value returned for absent keys and null (None/NaN) values
    :return: the stored value, or ``default``
    """
    if key not in row:
        return default
    value = row[key]
    # pd.isnull() works element-wise on list-likes and returns an array, which
    # cannot be used as a single truth value; a collection is never "missing".
    if pd.api.types.is_list_like(value):
        return value
    return default if pd.isnull(value) else value



# Build the merged synonym list for every row (the function is applied row-wise).
df["synonyms2"] = df.apply(get_all_synonyms, axis=1)

df["synonyms2"]

KeyboardInterrupt: 

In [47]:
# Scratch example: extract the bare identifier from synonyms that mention "UNII".
synonyms = ['4egi-1', '315706-13-9', 'CHEMBL254578', 'UNIi ;-H57R:EU3DHP', 'H57REU3DHP', '4EGI1', '4EGI 1', 'SCHEMBL3334288', 'alpha-(2-(4-(3,4-Dichlorophenyl)-2-thiazolyl)hydrazinylidene)-2-nitrobenzenepropano', "UNII-123124dawdawd"]

import re
# [name for name in synonyms if "UNII" in name]
# Raw string: "\-" inside a normal string literal is an invalid escape sequence
# and triggers a warning on current Python versions. The pattern is unchanged.
# Each matching name is upper-cased, then the "UNII" tag and the characters
# space . ; : - are removed.
gen = (re.sub(r'[ .;:\-]|UNII', '', name.upper()) for name in synonyms if "UNII" in name.upper())

next(gen)
# re.sub('[ .;:\-]|UNII', '', next(gen))

False