In [None]:
import numpy as np
import pandas as pd
from datetime import date

from rdkit.Chem import AllChem as Chem
from chembl_structure_pipeline import standardizer

from pubchempy import get_compounds, Compound

import mol_identifiers as molid

import database_client as db_client

In [None]:
import logging

# Lower the threshold of the 'pubchempy' logger to DEBUG.
# NOTE(review): no log handler is configured in the visible cells, so whether
# these records are actually displayed depends on the environment's logging setup.
pubchempy_logger = logging.getLogger('pubchempy')
pubchempy_logger.setLevel(logging.DEBUG)

In [None]:
# Read the two tab-separated input tables: the formatted library and the
# table of additional compounds.
lib_df = pd.read_csv("data/lib_formatted_pubchem_mce.tsv", delimiter="\t")
add_df = pd.read_csv("data/lib_formatted_mce_add_compounds.tsv", delimiter="\t")

### Getting information out of PubChem: isomeric and canonical SMILES, looked up by CAS number or name

In [None]:
# compounds = [search_pubchem_by_name(str(cas)) if not pd.isnull(cas) else np.NAN for cas in lib_df["CAS No."]]
# compounds = [search_pubchem_by_name(str(name)) if pd.isnull(comp) else comp for comp, name in
#              zip(compounds, lib_df["Product Name"])]
# # Fallback: retry with a "CAS-" prefix (only one compound was found this way)
# compounds = [search_pubchem_by_name("CAS-{}".format(cas)) if pd.isnull(comp) else comp for comp, cas in
#              zip(compounds, lib_df["CAS No."])]
#
# lib_df["PubChemID"] = pd.array([compound.cid if not pd.isnull(compound) else np.NAN for compound in compounds],
#                                dtype=pd.Int64Dtype())
# lib_df["isomeric_smiles"] = [compound.isomeric_smiles if not pd.isnull(compound) else np.NAN for compound in compounds]
# lib_df["canonical_smiles"] = [compound.canonical_smiles if not pd.isnull(compound) else np.NAN for compound in
#                               compounds]
# lib_df

### Creating two libraries: one using the SMILES provided by the company and one using the SMILES extracted from PubChem

In [None]:
# Library variant 1: the "Smiles" column is kept as it appears in lib_df and
# every row is tagged with "MCE" in a new "Source" column.
lib1_columns = [
    "Cat. No.", "Product Name", "Synonyms", "CAS No.", "Smiles",
    "PubChemID", "isomeric_smiles", "canonical_smiles", "lib_plate_well",
    "URL", "Target", "Information", "Pathway", "Research Area",
    "Clinical Information",
]
# assign() returns a new frame, so lib_df itself is left untouched.
lib1_df = lib_df[lib1_columns].assign(Source="MCE")
lib1_df

In [None]:
# Library variant 2: same columns as variant 1, but "Smiles" is overwritten
# with the "canonical_smiles" column and the rows are tagged "PubChem".
lib2_columns = [
    "Cat. No.", "Product Name", "Synonyms", "CAS No.", "Smiles",
    "PubChemID", "isomeric_smiles", "canonical_smiles", "lib_plate_well",
    "URL", "Target", "Information", "Pathway", "Research Area",
    "Clinical Information",
]
lib2_df = lib_df.loc[:, lib2_columns].copy()
lib2_df = lib2_df.assign(Smiles=lib2_df["canonical_smiles"], Source="PubChem")
lib2_df

In [None]:
# Additional compounds: select the shared column set from add_df, use the
# "canonical_smiles" column as "Smiles" and tag the rows "PubChem".
add1_columns = [
    "Cat. No.", "Product Name", "Synonyms", "CAS No.", "Smiles",
    "PubChemID", "isomeric_smiles", "canonical_smiles", "lib_plate_well",
    "URL", "Target", "Information", "Pathway", "Research Area",
    "Clinical Information",
]
add1_df = add_df[add1_columns].assign(
    Smiles=lambda frame: frame["canonical_smiles"],
    Source="PubChem",
)
add1_df

### Merging the DataFrames

In [None]:
# Stack the three tables into one frame with a fresh 0..n-1 index.
# Order matters for the later de-duplication with keep="first":
# the PubChem-SMILES variant comes before the company-SMILES variant.
frames_to_merge = [lib2_df, lib1_df, add1_df]
merged_df = pd.concat(frames_to_merge, ignore_index=True, sort=False)
merged_df

### Cleaning SMILES (provided either by the company or by PubChem) to get the canonical SMILES

In [None]:
# Build molecules from the "Smiles" column, standardize them and derive the
# structure identifiers that are stored back on merged_df.
#
# NOTE(review): chembl_standardize_mol, mol_to_canon_smiles, exact_mass_from_mol,
# inchi_from_mol, inchikey_from_mol and formula_from_mol are neither defined nor
# imported in the visible cells. Presumably they come from mol_identifiers
# (imported as molid) -- confirm, otherwise this cell raises NameError on a
# fresh kernel.
#
# np.nan is used instead of the np.NAN alias, which was removed in NumPy 2.0.

# from smiles
# Missing SMILES stay NaN. Chem.MolFromSmiles returns None for strings it cannot
# parse; pd.isnull(None) is True, so those also become NaN in the next step.
mols = [Chem.MolFromSmiles(smiles) if not pd.isnull(smiles) else np.nan for smiles in merged_df["Smiles"]]
mols = [chembl_standardize_mol(mol) if not pd.isnull(mol) else np.nan for mol in mols]
# The helpers below receive NaN for missing molecules.
# NOTE(review): verify that each helper tolerates NaN input.
merged_df["canonical_smiles"] = [mol_to_canon_smiles(mol) for mol in mols]
# merged_df["isomerical_smiles"] = [mol_to_canon_smiles(mol) for mol in mols]
merged_df["exact_mass"] = [exact_mass_from_mol(mol) for mol in mols]
merged_df["inchi"] = [inchi_from_mol(mol) for mol in mols]
merged_df["inchi_key"] = [inchikey_from_mol(mol) for mol in mols]
merged_df["formula"] = [formula_from_mol(mol) for mol in mols]

In [None]:
# Rows that agree on product name, plate well and exact mass are treated as the
# same entry; only the first occurrence (in concat order) is kept.
dedup_keys = ['Product Name', 'lib_plate_well', "exact_mass"]
merged_df = merged_df.drop_duplicates(subset=dedup_keys, keep="first").sort_index()

In [None]:
# Show merged_df as the cell output for a visual check.
merged_df

In [None]:
# Persist the current merged_df. The file is written tab-separated even though
# it carries a .csv extension; the index is not written.
cleanup_path = "data/library_cleanup.csv"
merged_df.to_csv(cleanup_path, sep="\t", index=False)

### Getting PubChem information

In [None]:
# Look up every row of merged_df in PubChem by its canonical SMILES and store
# the IUPAC name of the hit.
#
# Fixes relative to the previous version of this cell:
#   * The lookup iterated over lib_df while the result was assigned to merged_df.
#     merged_df is built from several concatenated tables and then de-duplicated,
#     so its row count differs from lib_df and the assignment would fail or
#     misalign. The lookup now runs over merged_df itself.
#   * The "iupac" column was filled with compound.cid (the PubChem ID); it now
#     receives compound.iupac_name.
#   * np.nan replaces the np.NAN alias, which was removed in NumPy 2.0.
#
# NOTE(review): search_pubchem_by_smiles is not defined or imported in the
# visible cells -- confirm where it comes from and that it returns either a
# single pubchempy Compound or a null value.
compounds = [search_pubchem_by_smiles(str(smiles)) if not pd.isnull(smiles) else np.nan
             for smiles in merged_df["canonical_smiles"]]

merged_df["iupac"] = pd.array([compound.iupac_name if not pd.isnull(compound) else np.nan for compound in compounds])
# lib_df["isomeric_smiles"] = [compound.isomeric_smiles if not pd.isnull(compound) else np.NAN for compound in compounds]
# lib_df["canonical_smiles"] = [compound.canonical_smiles if not pd.isnull(compound) else np.NAN for compound in compounds]
merged_df

In [None]:
# Exploratory PubChem query by SMILES; this string uses "/" on both sides of
# the C=N double bond ("/C(=N/N...").
pubchem_query_smiles = r"C1=CC=C(C(=C1)C/C(=N/NC2=NC(=CS2)C3=CC(=C(C=C3)Cl)Cl)/C(=O)O)[N+](=O)[O-]"
get_compounds(pubchem_query_smiles, namespace="smiles")

In [None]:
# Parse the "/C(=N/N..." form of the SMILES with RDKit; the resulting molecule
# is the cell output.
smiles_slash_form = r"C1=CC=C(C(=C1)C/C(=N/NC2=NC(=CS2)C3=CC(=C(C=C3)Cl)Cl)/C(=O)O)[N+](=O)[O-]"
Chem.MolFromSmiles(smiles_slash_form)

In [None]:
# Same SMILES as the "/C(=N/N..." form except for "\" instead of "/" after
# "=N". The raw-string prefix is required: in a normal string literal "\N"
# would start a named-unicode escape.
smiles_backslash_form = r"C1=CC=C(C(=C1)C/C(=N\NC2=NC(=CS2)C3=CC(=C(C=C3)Cl)Cl)/C(=O)O)[N+](=O)[O-]"
Chem.MolFromSmiles(smiles_backslash_form)


In [None]:
# Parse a SMILES written with lowercase (aromatic) atom symbols.
# NOTE(review): presumably the same compound as the other probe cells -- confirm.
smiles_aromatic_form = r"O=C(O)/C(Cc1ccccc1[N+](=O)[O-])=N\Nc1nc(-c2cc(Cl)c(Cl)cc2)cs1"
Chem.MolFromSmiles(smiles_aromatic_form)

In [None]:
# Exploratory PubChem query by InChI; this InChI ends in the "/b22-14-" layer.
pubchem_query_inchi = "InChI=1S/C18H12Cl2N4O4S/c19-12-6-5-10(7-13(12)20)15-9-29-18(21-15)23-22-14(17(25)26)8-11-3-1-2-4-16(11)24(27)28/h1-7,9H,8H2,(H,21,23)(H,25,26)/b22-14-"
get_compounds(pubchem_query_inchi, namespace="inchi")

In [None]:
# Exploratory PubChem query using the SMILES written with lowercase (aromatic)
# atom symbols. Raw string because of the "\N" sequence.
pubchem_query_aromatic_smiles = r"O=C(O)/C(Cc1ccccc1[N+](=O)[O-])=N\Nc1nc(-c2cc(Cl)c(Cl)cc2)cs1"
get_compounds(pubchem_query_aromatic_smiles, namespace="smiles")

In [None]:
# Exploratory PubChem query by InChIKey.
pubchem_query_inchikey = "KFRKRECSIYXARE-HMAPJEAMSA-N"
get_compounds(pubchem_query_inchikey, namespace="inchikey")

In [None]:
# Two InChI bodies that are identical except for the final character of the
# "/b22-14" layer ("+" versus "-").
InChI1 = "1S/C18H12Cl2N4O4S/c19-12-6-5-10(7-13(12)20)15-9-29-18(21-15)23-22-14(17(25)26)8-11-3-1-2-4-16(11)24(27)28/h1-7,9H,8H2,(H,21,23)(H,25,26)/b22-14+"
InChI2 = "1S/C18H12Cl2N4O4S/c19-12-6-5-10(7-13(12)20)15-9-29-18(21-15)23-22-14(17(25)26)8-11-3-1-2-4-16(11)24(27)28/h1-7,9H,8H2,(H,21,23)(H,25,26)/b22-14-"

# Cell output: plain string equality of the two InChI bodies.
InChI1 == InChI2


In [None]:
# Exploratory PubChem query by SMILES for a further structure.
pubchem_query_other_smiles = r"CN(C1=CC=CC2=C1C=NN2)C3=NC(NC4=CC(N5CCOCC5)=CC(N6CCOCC6)=C4)=NC=C3"
get_compounds(pubchem_query_other_smiles, namespace="smiles")
