In [67]:
import pandas as pd
from io import StringIO
import requests
from pathlib import Path
import time

In [30]:
def pug_url_request(smile):
    
    # get cid number
    
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f'{prolog}/compound/smiles/{smile}/cids/txt'
    
    r = requests.get(url)
    #assert r.status_code == 200, f'invalid request for {smile}'
    
    print(r.url)
    print(r.text)

In [31]:
smile1 = "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
pug_url_request(smile1)

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/CC(C)CC1=CC=C(C=C1)C(C)C(=O)O/cids/txt
3672



In [32]:
# forward slash will not work in url path
smile2 = "CC1=C([C@@](SC1=O)(C)/C=C(\C)/C=C)O"
pug_url_request(smile2)

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/CC1=C(%5BC@@%5D(SC1=O)(C)/C=C(%5CC)/C=C)O/cids/txt
Status: 400
Code: PUGREST.BadRequest
Message: Unable to standardize the given structure - perhaps some special characters need to be escaped or data packed in a MIME form?
Detail: error: 
Detail: status: 400
Detail: output: Caught ncbi::CException: Standardization failed
Detail: Output Log:
Detail: Record 1: Error: Unable to convert input into a compound object
Detail: 
Detail: 



In [28]:
def pug_url_request2(smile):
    
    # get cid number
    # encode structure as url argument to avoid special character issues
    
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f'{prolog}/compound/smiles/cids/txt?smiles={smile}'
    
    r = requests.get(url)
    #assert r.status_code == 200, f'invalid request for {smile}'
    
    print(r.url)
    print(r.text)

In [29]:
pug_url_request2(smile2)

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/cids/txt?smiles=CC1=C(%5BC@@%5D(SC1=O)(C)/C=C(%5CC)/C=C)O
135403829



In [26]:
def pug_url_request3(smile):
    
    # get cid number
    # pass structure query as dictionary 
    
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f'{prolog}/compound/smiles/cids/txt'
    
    struct = {'smiles' : smile}
    
    r = requests.get(url, params = struct)
    #assert r.status_code == 200, f'invalid request for {smile}'
    
    print(r.url)
    print(r.text)

In [27]:
pug_url_request3(smile2)

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/cids/txt?smiles=CC1%3DC%28%5BC%40%40%5D%28SC1%3DO%29%28C%29%2FC%3DC%28%5CC%29%2FC%3DC%29O
135403829



In [36]:
# retrieve in csv format
# hbond donor and acceptor counts, TPSA, XLogP

def pug_prop(smile, props):
    
    props_str = ','.join(props)
    
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f'{prolog}/compound/smiles/property/{props_str}/csv?smiles={smile}'
    
    r = requests.get(url)
    print(r.url)
    assert r.status_code == 200, 'invalid request'
    
    df = pd.read_csv(StringIO(r.text))
    
    return df

In [39]:
smile = "C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl"
props = ['HBondDonorCount', 'HBondAcceptorCount', 'TPSA', 'XLogP']
pug_prop(smile, props)

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/HBondDonorCount,HBondAcceptorCount,TPSA,XLogP/csv?smiles=C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl


Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,TPSA,XLogP
0,5564,1,2,29.5,5.0


In [52]:
def get_cid(inchi):
    
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f'{prolog}/compound/InChI/cids/txt'
    
    struct = {'inchi' : inchi}
    
    r = requests.get(url, params=struct)
    print(r.url)
    
    assert r.status_code == 200, 'invalid request'
    
    return r.text.strip()
    

In [53]:
inchi = "InChI=1S/C17H14O4S/c1-22(19,20)14-9-7-12(8-10-14)15-11-21-17(18)16(15)13-5-3-2-4-6-13/h2-10H,11H2,1H3"
get_cid(inchi)

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/InChI/cids/txt?inchi=InChI%3D1S%2FC17H14O4S%2Fc1-22%2819%2C20%2914-9-7-12%288-10-14%2915-11-21-17%2818%2916%2815%2913-5-3-2-4-6-13%2Fh2-10H%2C11H2%2C1H3


'5090'

In [56]:
def pug_post(smile):
    
    # use post method for multiple line structure, like sdf
    
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f'{prolog}/compound/smiles/cids/txt'
    
    struct = {'smiles' : smile}
    
    # params sends as parameter and stored in url
    # data not stored in url
    
    #r = requests.post(url, params=struct)
    r = requests.post(url, data=struct)
    
    print(r.url)
    print(r.text)


In [57]:
pug_post(smile2)

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/cids/txt
135403829



In [58]:
mysdf = '''1983
  -OEChem-07241917072D

 20 20  0     0  0  0  0  0  0999 V2000
    2.8660   -2.5950    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    4.5981    1.4050    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    2.8660    1.4050    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    2.8660    0.4050    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.7320   -0.0950    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.0000   -0.0950    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.7320   -1.0950    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.0000   -1.0950    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.8660   -1.5950    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.7320    1.9050    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.7320    2.9050    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    4.2690    0.2150    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
    1.4631    0.2150    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
    2.3291    1.7150    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
    4.2690   -1.4050    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
    1.4631   -1.4050    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
    4.3520    2.9050    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
    3.7320    3.5250    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
    3.1120    2.9050    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
    2.3291   -2.9050    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0
  1  9  1  0  0  0  0
  1 20  1  0  0  0  0
  2 10  2  0  0  0  0
  3  4  1  0  0  0  0
  3 10  1  0  0  0  0
  3 14  1  0  0  0  0
  4  5  2  0  0  0  0
  4  6  1  0  0  0  0
  5  7  1  0  0  0  0
  5 12  1  0  0  0  0
  6  8  2  0  0  0  0
  6 13  1  0  0  0  0
  7  9  2  0  0  0  0
  7 15  1  0  0  0  0
  8  9  1  0  0  0  0
  8 16  1  0  0  0  0
 10 11  1  0  0  0  0
 11 17  1  0  0  0  0
 11 18  1  0  0  0  0
 11 19  1  0  0  0  0
M  END
> <PUBCHEM_COMPOUND_CID>
1983

> <PUBCHEM_COMPOUND_CANONICALIZED>
1

> <PUBCHEM_CACTVS_COMPLEXITY>
139

> <PUBCHEM_CACTVS_HBOND_ACCEPTOR>
2

> <PUBCHEM_CACTVS_HBOND_DONOR>
2

> <PUBCHEM_CACTVS_ROTATABLE_BOND>
1
$$$$
'''

In [63]:
def pug_sdf(sdf, out):
    
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f'{prolog}/compound/sdf/{out}/txt'
    
    sdf = {'sdf' : sdf}
    
    r = requests.post(url, data=sdf)
    
    print(r.url)
    print(r.text)
    

In [64]:
pug_sdf(mysdf, 'cids')

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/sdf/cids/txt
1983



In [66]:
mysdf = Path('data/Structure2D_CID_5288826.sdf').read_text()
pug_sdf(mysdf, 'synonyms')

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/sdf/synonyms/txt
morphine
Morphia
Morphinum
Morphium
Morphina
Morphin
(-)-Morphine
Duromorph
DepoDur
Meconium
Morphinism
Moscontin
Ospalivina
Morfina
MS Contin
l-Morphine
57-27-2
Dulcontin
Nepenthe
Roxanol
MORPHINE SULFATE
Infumorph
Dreamer
Morpho
Avinza
Hocus
Kadian
Unkie
Cube juice
Hard stuff
Oramorph SR
Statex SR
Ms Emma
Morphin [German]
Morfina [Italian]
Duramorph
Morphina [Italian]
M-Eslon
Morphine [BAN]
CCRIS 5762
Dolcontin
HSDB 2134
(5R,6S,9R,13S,14R)-4,5-Epoxy-N-methyl-7-morphinen-3,6-diol
UNII-76I7G6D29C
CHEBI:17303
CHEMBL70
EINECS 200-320-2
(5alpha,6alpha)-17-methyl-7,8-didehydro-4,5-epoxymorphinan-3,6-diol
4,5alpha-Epoxy-17-methyl-7-morphinen-3,6alpha-diol
7,8-Didehydro-4,5-epoxy-17-methyl-morphinan-3,6-diol
(7R,7AS,12BS)-3-METHYL-2,3,4,4A,7,7A-HEXAHYDRO-1H-4,12-METHANO[1]BENZOFURO[3,2-E]ISOQUINOLINE-7,9-DIOL
DEA No. 9300
Morphine Anhydrate
76I7G6D29C
Morphine (BAN)
RMS
(5alpha,6alpha)-Didehydro-4,5-epoxy-17-methylmorphinan-

In [73]:
def pug_post_2a():

    # glob all sdf files
    compounds = list(Path('data').glob('lecture02*.sdf'))
    
    # properties to parse
    props = ['XLogP', 'MolecularWeight', 'HBondDonorCount', 'HBondAcceptorCount', 'TPSA']
    prop_str = ','.join(props)
    
    # setup url
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = f'{prolog}/compound/sdf/property/{prop_str}/csv'
    
    df = pd.DataFrame()
    for compound in compounds:
        
        # read sdf
        sdf = compound.read_text()
        
        # setup request
        sdf_dict = {'sdf' : sdf}
        r = requests.post(url, data=sdf_dict)
        
        # concat csv data
        df = pd.concat([df, pd.read_csv(StringIO(r.text))], ignore_index=True)
        
        # delay
        time.sleep(1)
        
    return df

In [74]:
pug_post_2a()

Unnamed: 0,CID,XLogP,MolecularWeight,HBondDonorCount,HBondAcceptorCount,TPSA
0,126941,-1.8,454.4,5,12,211.0
1,3385,-0.9,130.08,2,3,58.2
2,3657,-1.8,76.055,3,2,75.4
3,667490,0.0,152.18,2,2,85.2
4,2723601,-0.1,167.19,3,2,111.0
