In [1]:
from typing import Any

import pandas as pd
import numpy as np

import cirpy
from pandas import Series, DataFrame
from pandas.core.generic import NDFrame

from pubchempy import get_compounds, Compound

In [2]:
import logging

# Raise pubchempy's own logger to DEBUG so that every lookup it performs is
# reported in the cell output while the CAS numbers are being resolved.
pubchempy_logger = logging.getLogger('pubchempy')
pubchempy_logger.setLevel(logging.DEBUG)

In [3]:
# The library export is tab-separated despite its .csv extension;
# read_table uses a tab as its default delimiter.
add_df = pd.read_table("data/mce_library_add_compounds.csv")
add_df

Unnamed: 0,RackCode,Plate Location,mixed_location_plate1,mixed_location_plate2,mixed_location_plate3,VialCode,Cat. No.,Product Name,Synonyms,CAS No.,...,Solubility,Solvent,Batch No.,Quantity,URL,Pathway,Research Area,Clinical Information,WellNumber,WellLetter
0,HYCPK16568,A2,1D3_K20,2D3_K20,3D3_K20,,HY-50880,Elacridar (hydrochloride),,143851-98-3,...,,,,,,,,,2,A
1,HYCPK16568,A3,1D3_K20,2D3_K20,3D3_K20,,HY-15989,SM-164,,957135-43-2,...,,,,,,,,,3,A
2,HYCPK16568,A4,1D3_K20,2D3_K20,3D3_K20,,HY-U00094,INO5042,,14782-19-5,...,,,,,,,,,4,A
3,HYCPK16568,A5,1D3_K20,2D3_K20,3D3_K20,,HY-12484,BMH-21,,896705-16-1,...,,,,,,,,,5,A
4,HYCPK16568,A6,1D3_K20,2D3_K20,3D3_K20,,HY-B1434,7-Aminophelosporanic acid,,957-68-6,...,,,,,,,,,6,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,HYCPK16573,B3,1D3_L19,2D3_L19,3D3_L19,,HY-101916,Heparan Sulfate,,9050-30-0,...,,,,,,,,,3,B
231,HYCPK16573,B4,1D3_L19,1D3_L19,1D3_L19,,HY-17567C,Heparin (sodium salt) (MW 135 kDa),,9041-8-1,...,,,,,,,,,4,B
232,HYCPK16573,B5,1D3_L19,1D3_L19,1D3_L19,,HY-107910,Hyaluronidase,,37326-33-3,...,,,,,,,,,5,B
233,HYCPK16573,B6,1D3_L19,1D3_L19,1D3_L19,,HY-17567B,Heparin (lithium salt),,9045-22-1,...,,,,,,,,,6,B


In [14]:
def compound_score(comp: Compound):
    """Rank a PubChem record for candidate selection; higher is better.

    A record without a canonical SMILES scores 0. Any other record scores
    1000 minus the number of "." characters in its canonical SMILES, so a
    record whose SMILES has fewer dot-separated parts ranks higher.

    Args:
        comp: Record exposing a ``canonical_smiles`` attribute.

    Returns:
        Integer score as described above.
    """
    smiles = comp.canonical_smiles
    return 1000 - str(smiles).count(".") if smiles else 0


def cas_to_pubchem(cas: str) -> Compound | None:
    """Look up a CAS number on PubChem and return the best-scoring record.

    The CAS number is submitted to PubChem as a name query. When several
    records match, the one with the highest ``compound_score`` is returned;
    on a tie the record PubChem listed first wins.

    Args:
        cas: CAS registry number. ``None``, blank strings and textual
            missing-value markers ("nan", "none", "<NA>", in any letter
            case) are treated as "no CAS number". Surrounding whitespace
            is ignored.

    Returns:
        The best-scoring ``Compound``, or ``None`` when the input is missing
        or PubChem returns no match.
    """
    if cas is None:
        return None
    query = str(cas).strip()
    # str(float("nan")) is "nan" (lower case), so missing-value markers have
    # to be matched case-insensitively rather than against "NaN" only.
    if query.lower() in {"", "nan", "none", "<na>"}:
        return None

    matches = get_compounds(query, "name")
    if not matches:
        logging.info("cas:%s had NO entries", query)
        return None
    # max() yields the first of several equally scored records, which is the
    # same element a stable descending sort would put first, without
    # reordering the list returned by get_compounds.
    return max(matches, key=compound_score)

In [15]:
# Resolve every entry of the "CAS No." column through cas_to_pubchem (one
# PubChem lookup per non-missing row). Rows without a CAS number are kept as
# NaN placeholders so the list stays aligned with the rows of add_df.
# np.nan is used instead of the legacy np.NAN alias, which is not available
# in NumPy 2.x.
compounds = [
    np.nan if pd.isnull(cas) else cas_to_pubchem(str(cas))
    for cas in add_df["CAS No."]
]

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/JSON
DEBUG:pubchempy:Request data: b'name=143851-98-3'
DEBUG:pubchempy:Created Compound(170320)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/JSON
DEBUG:pubchempy:Request data: b'name=957135-43-2'
DEBUG:pubchempy:Created Compound(17756618)
DEBUG:pubchempy:Created Compound(73409610)
DEBUG:pubchempy:Created Compound(122172970)
DEBUG:pubchempy:Created Compound(129626571)
DEBUG:pubchempy:Created Compound(134692765)
DEBUG:pubchempy:Created Compound(138107685)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/JSON
DEBUG:pubchempy:Request data: b'name=14782-19-5'
DEBUG:pubchempy:Created Compound(10446437)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/JSON
DEBUG:pubchempy:Request data: b'name=896705-16-1'
DEBUG:pubchempy:Created Compound(3508054)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm

In [17]:
# Copy the PubChem CID and both SMILES strings of each resolved record into
# add_df. Rows whose lookup produced nothing (None or NaN in `compounds`)
# stay missing. np.nan replaces the legacy np.NAN alias, which is not
# available in NumPy 2.x.
add_df["PubChemID"] = pd.array(
    [np.nan if pd.isnull(compound) else compound.cid for compound in compounds],
    dtype=pd.Int64Dtype(),  # nullable integers: missing CIDs do not turn the column into floats
)
add_df["isomeric_smiles"] = [
    np.nan if pd.isnull(compound) else compound.isomeric_smiles
    for compound in compounds
]
add_df["canonical_smiles"] = [
    np.nan if pd.isnull(compound) else compound.canonical_smiles
    for compound in compounds
]
add_df

Unnamed: 0,RackCode,Plate Location,mixed_location_plate1,mixed_location_plate2,mixed_location_plate3,VialCode,Cat. No.,Product Name,Synonyms,CAS No.,...,Quantity,URL,Pathway,Research Area,Clinical Information,WellNumber,WellLetter,PubChemID,isomeric_smiles,canonical_smiles
0,HYCPK16568,A2,1D3_K20,2D3_K20,3D3_K20,,HY-50880,Elacridar (hydrochloride),,143851-98-3,...,,,,,,2,A,170320,COC1=CC=CC2=C1NC3=C(C2=O)C=CC=C3C(=O)NC4=CC=C(...,COC1=CC=CC2=C1NC3=C(C2=O)C=CC=C3C(=O)NC4=CC=C(...
1,HYCPK16568,A3,1D3_K20,2D3_K20,3D3_K20,,HY-15989,SM-164,,957135-43-2,...,,,,,,3,A,17756618,C[C@@H](C(=O)N[C@H]1CCCC[C@H]2CC[C@H](N2C1=O)C...,CC(C(=O)NC1CCCCC2CCC(N2C1=O)C(=O)NC(C3=CC=CC=C...
2,HYCPK16568,A4,1D3_K20,2D3_K20,3D3_K20,,HY-U00094,INO5042,,14782-19-5,...,,,,,,4,A,10446437,C1=CC=C2C(=C1)C(=O)C3=C(C2=O)SC(=N3)C4=CC=CO4,C1=CC=C2C(=C1)C(=O)C3=C(C2=O)SC(=N3)C4=CC=CO4
3,HYCPK16568,A5,1D3_K20,2D3_K20,3D3_K20,,HY-12484,BMH-21,,896705-16-1,...,,,,,,5,A,3508054,CN(C)CCNC(=O)C1=CC=CN2C1=NC3=CC4=CC=CC=C4C=C3C2=O,CN(C)CCNC(=O)C1=CC=CN2C1=NC3=CC4=CC=CC=C4C=C3C2=O
4,HYCPK16568,A6,1D3_K20,2D3_K20,3D3_K20,,HY-B1434,7-Aminophelosporanic acid,,957-68-6,...,,,,,,6,A,441328,CC(=O)OCC1=C(N2[C@@H]([C@@H](C2=O)N)SC1)C(=O)O,CC(=O)OCC1=C(N2C(C(C2=O)N)SC1)C(=O)O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,HYCPK16573,B3,1D3_L19,2D3_L19,3D3_L19,,HY-101916,Heparan Sulfate,,9050-30-0,...,,,,,,3,B,137699201,CO[C@@H]1[C@@H]([C@@H]([C@@H](OC1C(=O)[O-])O[C...,COC1C(C(C(OC1C(=O)[O-])OC2C(OC(C(C2O)NS(=O)(=O...
231,HYCPK16573,B4,1D3_L19,1D3_L19,1D3_L19,,HY-17567C,Heparin (sodium salt) (MW 135 kDa),,9041-8-1,...,,,,,,4,B,,,
232,HYCPK16573,B5,1D3_L19,1D3_L19,1D3_L19,,HY-107910,Hyaluronidase,,37326-33-3,...,,,,,,5,B,91820602,CC1(C(=C)N(C2=CC=CC=C21)CCCCCC(=O)O)C.Br,CC1(C(=C)N(C2=CC=CC=C21)CCCCCC(=O)O)C.Br
233,HYCPK16573,B6,1D3_L19,1D3_L19,1D3_L19,,HY-17567B,Heparin (lithium salt),,9045-22-1,...,,,,,,6,B,44336410,COC1C(C(C(OC1C(=O)O)OC2C(OC(C(C2O)NS(=O)(=O)O)...,COC1C(C(C(OC1C(=O)O)OC2C(OC(C(C2O)NS(=O)(=O)O)...


In [31]:
# Spot check: resolve a single CAS number and display the monoisotopic mass
# of the returned record as a float.
# NOTE(review): cas_to_pubchem returns None when PubChem has no match; in that
# case the attribute access below raises AttributeError.
comp = cas_to_pubchem('143851-98-3')
float(comp.monoisotopic_mass)

599.2186989

In [54]:
# Probe a CAS number for which PubChem reports no matching CID.
# NOTE(review): the cas_to_pubchem defined in this notebook returns either a
# Compound or None, so the `False` recorded as this cell's output appears to
# come from a different version of the function - re-run after a kernel restart.
cas_to_pubchem("1401-20-3")

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/JSON
DEBUG:pubchempy:Request data: b'name=1401-20-3'
INFO:pubchempy:'PUGREST.NotFound: No CID found that matches the given name'


False