In [1]:
import numpy as np
import pandas as pd
from datetime import date

import rdkit.Chem
from molmass import Formula
from rdkit import Chem
from rdkit.Chem import Descriptors

from rdkit.Chem import AllChem as Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

In [2]:
# Run configuration: every value a user may need to adapt lives in this cell.

# short library identifier; it is inserted into file names and unique IDs
lib_id = "mce"

# Optional tag for the acquisition (e.g. second measurement or other parameters).
# Usually empty; when set it must always end with an underscore "_".
prefix = "100AGC_60000Res_"
instrument_method = r"C:\Xcalibur\methods\Corinna_Brungs\Library6_100AGC_60000Res_MS5_POS_mz115-2000"

plates = [
    "1D1",
    "1D2",
    "1D3",
]
# column of the library table that is read to build the unique IDs
plate_id_header = "mixed_location_plate1"

# plates are inserted into the BLUE B compartment
plate_loc_in_autosampler = "B"


# final values: header names used for derived columns
unique_id_header = "lib_plate_well"
raw_filename = "raw_filename"


# path of the library table for the selected library
library_file = "data/{}_library.csv".format(lib_id)

## Import library

In [4]:
# Load the tab-separated library table; the bare name on the last line
# makes the notebook render the frame as the cell output.
lib_df = pd.read_csv(library_file, delimiter="\t")
lib_df


Unnamed: 0,RackCode,Plate Location,VialCode,Cat. No.,Product Name,Synonyms,CAS No.,M.Wt,Target,Saltdata,...,Pathway,Research Area,Clinical Information,WellNumber,WellLetter,MixedWell,MixedPlate,mixed_location_plate1,mixed_location_plate2,mixed_location_plate3
0,HYCPK16574,A2,,HY-15338,TG003,,719277-26-6,249.33,CDK,Free Base,...,Cell Cycle/DNA Damage,Cancer,No Development Reported,2,A,A1,1,1D1_A1,2D1_A1,3D1_A1
1,HYCPK16574,A3,,HY-15440B,Fostemsavir Tris,BMS-663068 (Tris),864953-39-9,704.62,HIV,Tris,...,Anti-infection,Infection,Launched,3,A,A1,1,1D1_A1,2D1_A1,3D1_A1
2,HYCPK16574,A4,,HY-114315,NQO1 substrate,,2304503-05-5,268.18,Others,Free Base,...,Others,Cancer,No Development Reported,4,A,A1,1,1D1_A1,2D1_A1,3D1_A1
3,HYCPK16574,A5,,HY-15357,ALK inhibitor 1,,761436-81-1,562.48,ALK; FAK,Free Base,...,Protein Tyrosine Kinase/RTK,Cancer,No Development Reported,5,A,A1,1,1D1_A1,2D1_A1,3D1_A1
4,HYCPK16574,A6,,HY-15880,CCT007093,,176957-55-4,272.39,Apoptosis; Autophagy; Phosphatase,Free Base,...,Apoptosis; Autophagy; Metabolic Enzyme/Protease,Cancer,No Development Reported,6,A,A1,1,1D1_A1,2D1_A1,3D1_A1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10264,HYCPK16762,A4,,HY-B0272,Rifampicin,Rifampin; Rifamycin AMP,13292-46-1,822.94,Antibiotic; Bacterial; Influenza Virus,Free Base,...,Anti-infection,Infection,Launched,4,A,K19,3,1D3_K19,2D3_K19,3D3_K19
10265,HYCPK16762,A5,,HY-Y0546,Benzophenone,,119-61-9,182.22,Endogenous Metabolite,Free Base,...,Metabolic Enzyme/Protease,Metabolic Disease,No Development Reported,5,A,K19,3,1D3_K19,2D3_K19,3D3_K19
10266,HYCPK16762,A6,,HY-B1008,4-Aminobenzoic acid,PABA; Vitamin Bx; Vitamin H1,150-13-0,137.14,Endogenous Metabolite,Free Acid,...,Metabolic Enzyme/Protease,Others,No Development Reported,6,A,K19,3,1D3_K19,2D3_K19,3D3_K19
10267,HYCPK16762,A7,,HY-N0115,Gastrodin,Gastrodine,62499-27-8,286.28,Others,Free Base,...,Others,Inflammation/Immunology; Neurological Disease,Launched,7,A,K19,3,1D3_K19,2D3_K19,3D3_K19


## Add unique column with internal ID and well location
The unique ID is built as `pluskal_<library ID>_<plate and well location>` (e.g. `pluskal_mce_1D1_A1`).

In [None]:
def exact_mass(formula):
    """Return the exact mass for a molecular formula string.

    Only the part before the first "." is evaluated, so a formula written with
    a dot-separated partner is reduced to its first component.

    Parameters
    ----------
    formula : str
        Molecular formula that ``molmass.Formula`` can parse.

    Returns
    -------
    float
        ``Formula(...).isotope.mass`` of the first component, or ``np.nan``
        when the input is not a string (e.g. a missing value) or the formula
        cannot be evaluated.
    """
    try:
        clean = formula.split(".")[0]
        return Formula(clean).isotope.mass
    except Exception:
        # Best effort: unparsable or missing formulas become NaN so that the
        # rest of the table can still be processed. ``np.nan`` is the
        # canonical spelling; legacy aliases such as ``np.NaN`` were removed
        # in NumPy 2.0. ``Exception`` (not a bare except) keeps
        # KeyboardInterrupt/SystemExit working.
        return np.nan



## Get exact mass from cleaned SMILES

In [None]:
def mol_to_canon_smiles(mol):
    """Return the canonical, isomeric SMILES of an RDKit molecule.

    Parameters
    ----------
    mol : rdkit.Chem.Mol or None
        Molecule to serialise.

    Returns
    -------
    str or None
        Result of ``Chem.MolToSmiles(mol, isomericSmiles=True)``; ``None`` when
        ``mol`` is ``None`` or RDKit raises while writing the SMILES.
    """
    # a missing molecule is an expected case, not an error
    if mol is None:
        return None
    try:
        return Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception:
        # best effort: a molecule that cannot be written yields no SMILES
        return None

# def smi_to_canon_smiles(smi):
#     try:
#         return Chem.MolToSmiles(Chem.MolFromSmiles(smi), isomericSmiles=False)
#     except:
#         pass

# Module-level rdMolStandardize.Uncharger instance; cleaned_mol() calls its
# uncharge() method on molecules whose formal charge is non-zero.
uncharger = rdMolStandardize.Uncharger()
# smiles_stats = {'n_dots': Counter(), 'charge': Counter(), 'invalid_smiles': []}


def cleaned_mol(smiles: str):
    """Build a cleaned RDKit molecule from a SMILES string (or peptide text).

    Steps:
    1. For dot-separated SMILES (e.g. a compound plus its salt partner) only
       the longest component is kept as the presumed main molecule.
    2. The component is parsed with ``Chem.MolFromSmiles``.
    3. If the molecule carries a net formal charge it is passed through the
       module-level ``uncharger``.
    4. If no molecule could be obtained, the *original* input is handed to
       ``mol_from_pepseq`` as a fallback.

    Parameters
    ----------
    smiles : str
        SMILES string; missing values (``None`` / float NaN) are tolerated.

    Returns
    -------
    rdkit.Chem.Mol or None
        The cleaned molecule, the peptide fallback result, or ``None`` for
        missing / empty input.
    """
    original_input = smiles

    # Missing values must not be stringified to "None"/"nan" and handed to the
    # parsers (NaN is the only float that is not equal to itself).
    if smiles is None or (isinstance(smiles, float) and smiles != smiles):
        return None
    # An empty string would otherwise parse to an empty molecule with mass 0.
    if not str(smiles).strip():
        return None

    try:
        # find the longest smiles that might be the main molecule
        # for smiles that contain the salt partner etc
        main_smiles = max(str(smiles).split('.'), key=len)

        mol = Chem.MolFromSmiles(main_smiles)
        if mol is not None:
            if Chem.GetFormalCharge(mol) != 0:
                mol = uncharger.uncharge(mol)
            if mol is not None:
                return mol
    except Exception:
        # any RDKit failure: fall through to the peptide-sequence fallback
        pass

    return mol_from_pepseq(original_input)


def mol_from_pepseq(original_input):
    """Try to read the input as a peptide sequence and build a molecule.

    Square brackets and the literal text " (TFA salt)" are removed before the
    remaining string is passed to ``Chem.MolFromSequence``.

    Parameters
    ----------
    original_input : object
        Raw cell value; it is converted with ``str()`` before parsing.

    Returns
    -------
    rdkit.Chem.Mol or None
        Whatever ``Chem.MolFromSequence`` returns, or ``None`` when the input
        is missing (``None`` / float NaN) or RDKit raises.
    """
    # a missing value would otherwise be parsed as the text "None" / "nan"
    if original_input is None or (isinstance(original_input, float) and original_input != original_input):
        return None
    # read protein seq
    try:
        sequence = str(original_input).replace("[", "").replace("]", "").replace(" (TFA salt)", "")
        return Chem.MolFromSequence(sequence)
    except Exception:
        # best effort: text that is not a readable sequence gives no molecule
        return None


def exact_mass_from_mol(mol):
    """Return the exact molecular weight of an RDKit molecule.

    Parameters
    ----------
    mol : rdkit.Chem.Mol or None
        Molecule to evaluate.

    Returns
    -------
    float or None
        ``Descriptors.ExactMolWt(mol)``; ``None`` (not NaN) when ``mol`` is
        ``None`` or RDKit raises, so callers that need a numeric column have
        to convert the result themselves.
    """
    # a missing molecule is an expected case, not an error
    if mol is None:
        return None
    try:
        return Descriptors.ExactMolWt(mol)
    except Exception:
        # best effort: no mass for molecules RDKit cannot evaluate
        return None

# def exact_mass_from_smiles(smiles: str):
#     try:
#         # find the longest smiles that might be the main molecule
#         # for smiles that contain the salt partner etc
#         split_smiles = smiles.split('.')
#         if len(split_smiles) > 1:
#             # smiles_stats['n_dots'][len(split_smiles)-1] += 1
#             smiles = max(split_smiles, key=len)
#         else:
#             smiles = split_smiles[0]
#
#
#         # for those smiles provided as salts (e.g., .Na+) add H+ until charge is neutral
#         # if charge is neutral already (N+ and PO-) keep both charges
#         mol = Chem.MolFromSmiles(smiles)
#         charge = Chem.GetFormalCharge(mol)
#         if abs(charge) > 0:
#             # smiles_stats['charge'][charge] += 1
#             mol = uncharger.uncharge(mol)
#
#         # canonical
#         return Descriptors.ExactMolWt(mol)
#     except:
#         return np.NAN

In [None]:
# define file names
# unique ID per row: "pluskal_<library id>_<value of the plate location column>"
lib_df[unique_id_header] = ["pluskal_{}_{}".format(lib_id, plate_id) for plate_id in lib_df[plate_id_header]]
# lib_df[raw_filename] = ["{}_{}{}".format(current_date, prefix, unique_id) for unique_id in lib_df[unique_id_header]]


# m/z offsets for the H and Na adducts: mass of the added atom minus one electron
electron_mass = 0.00054857
mzh = exact_mass("H") - electron_mass
mzna = exact_mass("Na") - electron_mass

# define exact mass from the molecular formula, unless the table already has it
if "exact_mass" not in lib_df:
    lib_df["exact_mass"] = [exact_mass(formula) for formula in lib_df["Formula"]]
    lib_df["mz_h"] = lib_df["exact_mass"] + mzh
    lib_df["mz_na"] = lib_df["exact_mass"] + mzna

# from smiles
mols = [cleaned_mol(smiles) for smiles in lib_df["Smiles"]]
lib_df["cleaned_smiles"] = [mol_to_canon_smiles(mol) for mol in mols]
# exact_mass_from_mol() returns None on failure; force a float column so that
# None becomes NaN and the additions below work even if every row failed
lib_df["exact_mass_smiles"] = pd.Series(
    [exact_mass_from_mol(mol) for mol in mols], index=lib_df.index, dtype="float64"
)
lib_df["mz_h_smiles"] = lib_df["exact_mass_smiles"] + mzh
lib_df["mz_na_smiles"] = lib_df["exact_mass_smiles"] + mzna

# formula-based and SMILES-based masses agree within this absolute tolerance;
# comparisons involving NaN evaluate to False, i.e. "no match"
mass_tolerance = 0.01
lib_df["mass_matches"] = (lib_df["exact_mass_smiles"] - lib_df["exact_mass"]).abs() < mass_tolerance


lib_df.to_csv("data/lib_formatted_{}.csv".format(lib_id), sep="\t", index=False)

lib_df

In [None]:
# Manual check of RDKit's sequence parser on a few example inputs.
# Previously each result overwrote the one before, so only the last sequence
# was ever evaluated; now every sequence is parsed and its mass reported.
# NOTE(review): the first entry presumably mirrors the notation found in the
# library table (brackets, {Aib}, terminal groups) - confirm it is meant to be
# passed to the parser unmodified.
peptide_examples = [
    "[H-{Aib}-EGTFTSDVSSYLEGQAAKEFIAWLVK-{Aib}-R-NH2]",
    "EGTFTSDVSSYLEGQAAKEFIAWLVK",
    "LPSDDLEFWCHVMY",
]
peptide_masses = {}
for sequence in peptide_examples:
    mol = Chem.MolFromSequence(sequence)
    peptide_masses[sequence] = exact_mass_from_mol(mol)
peptide_masses

In [None]:
# show the rows whose "exact_mass_smiles" value is missing
lib_df.loc[lib_df["exact_mass_smiles"].isna()]