In [1]:
import pandas as pd
import pubchempy as pcp

In [2]:
# Load the "Chemical Data" sheet of the workbook and keep a tab-separated copy
# next to it.
excel_path = "./apex_bio.xlsx"
tsv_path = "./apex_bio.tsv"

df = pd.read_excel(excel_path, sheet_name="Chemical Data")
df.to_csv(tsv_path, sep="\t", index=False)

In [3]:
from cache_decorator import Cache


@Cache()
def compound_to_df(cas_number: str, namespace: str = "name") -> pd.DataFrame:
    """Fetch PubChem compounds for an identifier as a de-duplicated DataFrame.

    Parameters
    ----------
    cas_number:
        Identifier handed to ``pcp.get_compounds``. Despite the parameter
        name, it is interpreted according to ``namespace``.
    namespace:
        PubChem namespace of the identifier; defaults to ``"name"``.

    Returns
    -------
    pd.DataFrame
        The compounds returned by PubChem with two extra columns:
        ``inchikey_2D`` (first 14 characters of ``inchikey``) and ``cid``
        (the index of the frame returned by ``pcp.get_compounds``). Only the
        first row for each ``inchikey_2D`` value is kept.

    Notes
    -----
    Any exception raised by the lookup or by the missing ``inchikey`` column
    propagates to the caller.
    """
    hits = pcp.get_compounds(cas_number, namespace, as_dataframe=True)
    hits = hits.assign(
        inchikey_2D=lambda frame: frame["inchikey"].str[:14],
        cid=lambda frame: frame.index,
    )
    return hits.reset_index(drop=True).drop_duplicates(subset="inchikey_2D")

In [4]:
# CAS numbers of every catalogue row; these are the primary PubChem lookup keys.
cas_numbers = df.loc[:, "CAS Number"]

In [5]:
from tqdm.auto import tqdm

# Maps each CAS number to the PubChem DataFrame that was found for it.
found_cas_numbers = {}

# First pass: query PubChem with the CAS number itself.
failed_cas = []
for cas_num in tqdm(cas_numbers):
    try:
        found_cas_numbers[cas_num] = compound_to_df(cas_num)
    except Exception:  # best-effort lookup; KeyboardInterrupt still propagates
        failed_cas.append(cas_num)


def _retry_with_column(cas_list, column, namespace, message):
    """Retry failed lookups using another column of ``df`` as the identifier.

    Parameters
    ----------
    cas_list:
        CAS numbers whose previous lookup failed.
    column:
        Column of ``df`` holding the alternative identifier.
    namespace:
        PubChem namespace passed on to ``compound_to_df``.
    message:
        Prefix printed (followed by ``": <cas>"``) when the retry fails too.

    Returns
    -------
    list
        The CAS numbers that still could not be resolved.

    Successful lookups are stored in ``found_cas_numbers`` under the CAS number.
    """
    still_failing = []
    for cas_num in cas_list:
        try:
            candidates = df.loc[df["CAS Number"] == cas_num, column].dropna()
            # Pass a single scalar identifier rather than the raw NumPy array:
            # arrays with zero or several elements are not valid identifiers.
            # A missing value raises IndexError here and counts as a failure.
            identifier = candidates.iloc[0]
            found_cas_numbers[cas_num] = compound_to_df(identifier, namespace)
        except Exception:
            print(f"{message}: {cas_num}")
            still_failing.append(cas_num)
    return still_failing


# Second pass: fall back to the product name.
second_fail = _retry_with_column(failed_cas, "Item Name", "name", "Failed again")

# Third pass: fall back to the SMILES string; keep what is still unresolved.
final_fail = _retry_with_column(
    second_fail, "SMILES", "smiles", "Failed again a final time"
)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 550/550 [00:08<00:00, 64.17it/s] 


Failed again: 1405-87-4
Failed again: 1264-72-8
Failed again: 76135-82-5
Failed again a final time: 1264-72-8


In [6]:
# CAS numbers whose PubChem lookup returned more than one distinct compound.
multiple_entries = [
    cas for cas, hits in found_cas_numbers.items() if len(hits) > 1
]
for cas in multiple_entries:
    print(f"Multiple entries for {cas}")

Multiple entries for 25316-40-9
Multiple entries for 24390-14-5
Multiple entries for 5908-99-6
Multiple entries for 53-84-9
Multiple entries for 1476-53-5
Multiple entries for 68-19-9
Multiple entries for 58-58-2
Multiple entries for 52152-93-9


In [7]:
# PubChem record fields (plus the inner index level created by ``pd.concat``)
# that are removed from the combined table.
pubchem_columns_to_drop = [
    "record",
    "level_1",
    "bonds",
    "atoms",
    "atom_stereo_count",
    "cactvs_fingerprint",
    "bond_stereo_count",
    "complexity",
    "conformer_id_3d",
    "coordinate_type",
    "covalent_unit_count",
    "defined_atom_stereo_count",
    "defined_bond_stereo_count",
    "effective_rotor_count_3d",
    "elements",
    "feature_selfoverlap_3d",
    "fingerprint",
    "conformer_rmsd_3d",
    "volume_3d",
    "pharmacophore_features_3d",
    "rotatable_bond_count",
    "shape_fingerprint_3d",
    "multipoles_3d",
    "shape_selfoverlap_3d",
    "undefined_atom_stereo_count",
    "undefined_bond_stereo_count",
    "mmff94_energy_3d",
    "mmff94_partial_charges_3d",
    "h_bond_acceptor_count",
    "h_bond_donor_count",
    "iupac_name",
]

# Stack the per-CAS frames; the dict keys (CAS numbers) become ``level_0``
# after ``reset_index``.
pubchem_stacked = pd.concat(found_cas_numbers).reset_index()
df_from_pubchem = pubchem_stacked.drop(columns=pubchem_columns_to_drop)

  pd.concat(found_cas_numbers)


In [8]:
# Outer join so that catalogue rows without a PubChem hit are kept as well.
df_merged = df.merge(
    df_from_pubchem,
    how="outer",
    left_on="CAS Number",
    right_on="level_0",
)

In [23]:
# Inspect the columns available for one of the CAS numbers with several hits.
is_target_cas = df_merged["CAS Number"] == "25316-40-9"
df_merged.loc[is_target_cas].columns

Index(['CatalogNumber', 'Item Name', 'CAS Number', 'Plate Location',
       'Rack Number', 'M.w.', 'Solvent', 'Formula', 'SMILES', 'Solubility',
       'Pathway', 'Target', 'Information', 'URL', 'level_0',
       'canonical_smiles', 'charge', 'exact_mass', 'heavy_atom_count', 'inchi',
       'inchikey', 'isomeric_smiles', 'isotope_atom_count',
       'molecular_formula', 'molecular_weight', 'monoisotopic_mass', 'tpsa',
       'xlogp', 'inchikey_2D', 'cid'],
      dtype='object')

In [9]:
from molvs import standardize_smiles, Standardizer
from rdkit import Chem

# Build the standardizer once instead of on every loop iteration.
standardizer = Standardizer()

x = []  # super-parent molecules for every SMILES that could be processed
failed_smiles = []  # inputs that were missing, unparsable, or failed to standardize

for smi in tqdm(df["SMILES"], total=len(df["SMILES"])):
    # Missing values arrive as None or NaN; neither can be parsed.
    if pd.isna(smi):
        failed_smiles.append(smi)
        continue

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # RDKit reports a parse failure by returning None rather than raising.
        failed_smiles.append(smi)
        continue

    try:
        x.append(standardizer.super_parent(mol))
    except Exception:  # keep going, but remember which input failed
        failed_smiles.append(smi)

 16%|█▌        | 89/550 [00:21<00:38, 12.05it/s] [15:50:34] SMILES Parse Error: syntax error while parsing: O=C([C@]([H])(C)NC([C@](N)([H])CNC1=O)=O)N[C@@]([H])(CNC(CC(CCCN)N)=O)C(NC(C(N[C@@]1([H])[C@@H]2CCNC(N)=N2)=O)=CNC(N)=O)=O.[xH2SO4]
[15:50:34] SMILES Parse Error: check for mistakes around position 125:
[15:50:34] N2)=O)=CNC(N)=O)=O.[xH2SO4]
[15:50:34] ~~~~~~~~~~~~~~~~~~~~^
[15:50:34] SMILES Parse Error: Failed parsing SMILES 'O=C([C@]([H])(C)NC([C@](N)([H])CNC1=O)=O)N[C@@]([H])(CNC(CC(CCCN)N)=O)C(NC(C(N[C@@]1([H])[C@@H]2CCNC(N)=N2)=O)=CNC(N)=O)=O.[xH2SO4]' for input: 'O=C([C@]([H])(C)NC([C@](N)([H])CNC1=O)=O)N[C@@]([H])(CNC(CC(CCCN)N)=O)C(NC(C(N[C@@]1([H])[C@@H]2CCNC(N)=N2)=O)=CNC(N)=O)=O.[xH2SO4]'
 18%|█▊        | 98/550 [00:29<02:15,  3.34it/s][15:50:34] Explicit valence for atom # 17 P, 7, is greater than permitted
[15:50:34] Explicit valence for atom # 17 P, 7, is greater than permitted
[15:50:34] Explicit valence for atom # 17 P, 7, is greater than permitted
[15:50:34] Expl

In [11]:
# Serialize every standardized molecule back to a SMILES string.
smiles = list(map(Chem.MolToSmiles, x))

In [13]:
# Number of SMILES strings currently held in `smiles`.
len(smiles)

527

In [None]:
from rdkit import Chem

mols = []  # successfully parsed RDKit molecules
failed_mols = []  # SMILES inputs that could not be turned into a molecule
for smi in tqdm(df.SMILES):
    try:
        mol = Chem.MolFromSmiles(smi)
    except Exception:
        # Non-string input (e.g. a missing value) makes RDKit raise.
        mol = None

    # An invalid SMILES string does not raise: RDKit returns None instead,
    # so that case has to be checked explicitly to be recorded as a failure.
    if mol is None:
        failed_mols.append(smi)
    else:
        mols.append(mol)

In [None]:
# Skip entries that are None, then derive the three identifiers from the
# same list of molecules so the resulting lists stay aligned.
valid_mols = [mol for mol in mols if mol is not None]

smiles = [Chem.MolToSmiles(mol) for mol in valid_mols]
inchikey = [Chem.MolToInchiKey(mol) for mol in valid_mols]
inchi = [Chem.MolToInchi(mol) for mol in valid_mols]

In [None]:
# One row per parsed molecule, with the identifiers computed by RDKit.
rdkit_columns = dict(SMILES=smiles, InChIKey=inchikey, InChI=inchi)
data_from_rdkit = pd.DataFrame(rdkit_columns)

In [None]:
# SMILES strings produced by RDKit that do not appear verbatim in the catalogue.
set(data_from_rdkit.SMILES).difference(df.SMILES)

In [None]:
# Look up a single InChIKey in PubChem and display the hits as a DataFrame.
pcp.get_compounds(
    "VBTZKFAHKJXHBA-UHFFFAOYSA-N",
    namespace="inchikey",
    as_dataframe=True,
)

In [None]:
# Catalogue rows that have no SMILES string at all.
df[df["SMILES"].isna()]