In [57]:
import requests
import time
from collections import defaultdict
import pandas as pd

In [3]:
# look up the substance ids (sids) linked to one compound id (cid)
# PUG REST URL layout: <prolog>/<input>/<operation>/<output>
prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
pr_input = 'compound/cid/129825914'
pr_oper = 'sids'
pr_output = 'txt'
url = '/'.join([prolog, pr_input, pr_oper, pr_output])
res = requests.get(url)
print(res.text)

341669951



In [6]:
# get sids for multiple cids: the cids are passed as one comma-separated list
pugin = 'compound/cid/129825914,129742624,129783988'
url = '/'.join((prolog, pugin, pr_oper, pr_output))
res = requests.get(url)
print(res.text)

341669951
341492923
341577059
345261280
368769438



In [8]:
# group the returned sids per input cid with list_return=grouped, output as json
# (grouped: default for json/xml)

pugin = 'compound/cid/129825914,129742624,129783988'
pugoper = 'sids'
pugout = 'json'
pugopt = 'list_return=grouped'
# options go after the path as a query string
url = '/'.join([prolog, pugin, pugoper, pugout]) + '?' + pugopt
res = requests.get(url)
print(res.text)

{
  "InformationList": {
    "Information": [
      {
        "CID": 129825914,
        "SID": [
          341669951
        ]
      },
      {
        "CID": 129742624,
        "SID": [
          341492923
        ]
      },
      {
        "CID": 129783988,
        "SID": [
          341577059,
          345261280,
          368769438
        ]
      }
    ]
  }
}



In [10]:
# flat output: one merged sid list instead of one list per cid
# (flat: default for txt)
# reuses pugin / pugoper / pugout from the previous cell

pugopt = 'list_return=flat'
url = '/'.join([prolog, pugin, pugoper, pugout]) + '?' + pugopt
res = requests.get(url)
print(res.text)

{
  "IdentifierList": {
    "SID": [
      341492923,
      341577059,
      341669951,
      345261280,
      368769438
    ]
  }
}



In [12]:
# implicit input list
# compound -> cids
# cids -> sids
# substance -> sid

# cids whose compound name matches 'lactose'
url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/lactose/cids/txt'
res = requests.get(url)
cids = res.text.split()
print('# cids returned: {}'.format(len(cids)))
print(','.join(cids))

# method 1: sids reached through the compound name (compound domain)
url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/lactose/sids/txt'
res = requests.get(url)
sids = res.text.split()
print('# sids returned (method 1): {}'.format(len(sids)))

# method 2: sids reached through the substance name (substance domain)
# uses https like every other request in this notebook (was plain http)
url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/name/lactose/sids/txt'
res = requests.get(url)
sids = res.text.split()
print('# sids returned (method 2): {}'.format(len(sids)))

# cids returned: 1
6134
# sids returned (method 1): 169
# sids returned (method 2): 124


In [13]:
# retrieve json substance records for statins
def ex_1a(sub):
    """Print the JSON sid listing PUG REST returns for the compound name `sub`.

    The request goes to ``/compound/name/<sub>/sids/json`` with the
    ``name_type=word`` option set.

    Parameters
    ----------
    sub : str
        Name inserted into the request URL.

    Raises
    ------
    AssertionError
        If the response status code is not 200.
    """
    endpoint = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{sub}/sids/json?name_type=word'
    reply = requests.get(endpoint)

    request_ok = reply.status_code == 200
    assert request_ok, 'invalid request'

    print(reply.text)

In [14]:
# run the lookup for the name 'statin' (prints the JSON response)
ex_1a('statin')

{
  "InformationList": {
    "Information": [
      {
        "CID": 12560,
        "SID": [
          5020,
          602912,
          823926,
          855292,
          7847208,
          7887354,
          7979183,
          8149330,
          8159405,
          10321305,
          11335469,
          11335572,
          11335877,
          11360708,
          11360811,
          11361116,
          11362969,
          11363553,
          11365531,
          11366115,
          11368093,
          11368677,
          11374314,
          11376255,
          11376839,
          11461680,
          11461783,
          11462088,
          11484514,
          11484962,
          11488575,
          11489079,
          11492353,
          11493929,
          11494473,
          12012581,
          14720219,
          14840178,
          14864494,
          24894374,
          24894612,
          24894626,
          24894661,
          26611731,
          26681121,
          29280771,
  

In [15]:
# get the cids of mixtures that contain the input molecule (tylenol)

prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
url = f'{prolog}/compound/name/tylenol/cids/txt?cids_type=component'
res = requests.get(url)
cids = res.text.split()
print(len(cids), cids, sep='\n')

382
['145676293', '145397607', '144644119', '144644084', '144618563', '144618505', '144607898', '144564570', '144003985', '142107926', '140999210', '139507894', '139348435', '139205670', '139200747', '139197242', '139196990', '139196989', '139196988', '139196987', '139196487', '139196486', '139196260', '139194717', '139177862', '139146193', '139146192', '139146191', '139065479', '139063721', '139063720', '139063719', '137528085', '137524012', '136090259', '134821662', '134821661', '134539182', '132232560', '130454883', '129031778', '129031742', '129031741', '129010982', '122707137', '122640664', '122640662', '121470966', '118988662', '118437740', '118437739', '118437726', '118437724', '118437719', '118437718', '118096890', '110191848', '101532036', '91971975', '91844722', '91809174', '91799326', '91373323', '91304928', '91249665', '90190366', '89154873', '88829467', '88808486', '88375685', '88265592', '87793048', '87767189', '87689257', '87616094', '87616016', '87550936', '87316578', '

In [17]:
# get components of a compound: here the first cid from the previous cell

url = f'{prolog}/compound/cid/{cids[0]}/cids/txt?cids_type=component'
res = requests.get(url)
component_cids = res.text.split()
print(f'CID: {cids[0]}')
print(f'Number of components: {len(component_cids)}')
print(component_cids)

CID: 145676293
Number of components: 2
['1983', '297']


In [50]:
# get cids for each drug

drugs = ['aspirin', 'tylenol', 'advil']

def ex_2a(drugs):
    """Find the mixtures containing each drug and the components of one mixture.

    Each name in `drugs` is resolved to its cid(s). For every cid, a
    ``cids_type=component`` query returns the cids of the mixtures that
    contain it; the same query is then run on the first returned mixture to
    get that mixture's components.

    Parameters
    ----------
    drugs : iterable of str
        Compound names looked up through ``/compound/name/<name>/cids``.

    Returns
    -------
    mixture_cids : defaultdict(list)
        Maps each drug cid (str) to a list holding one list of mixture cids
        per lookup of that cid.
    component_cids : defaultdict(list)
        Maps the first mixture cid of each drug to a list holding one list of
        component cids per lookup.

    Raises
    ------
    AssertionError
        If any request does not return status code 200.
    """
    # loop-invariant, so defined once up front rather than inside the loop
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

    cids = []
    for drug in drugs:

        url = prolog + f'/compound/name/{drug}/cids/txt'
        r = requests.get(url)
        assert r.status_code == 200, f'invalid request for {drug}'

        # the txt response has one cid per line; keep every cid as its own
        # entry so a multi-cid answer never puts a newline into a later URL
        cids.extend(r.text.split())
        time.sleep(0.2)

    # cids of the mixture compounds that contain each drug
    # cids of the components that occur in any of the returned mixtures
    mixture_cids = defaultdict(list)
    component_cids = defaultdict(list)
    for cid in cids:

        mixture_url = prolog + f'/compound/cid/{cid}/cids/txt?cids_type=component'
        r = requests.get(mixture_url)
        assert r.status_code == 200, f'invalid request for {cid}'

        # appended as a nested list on purpose: callers see [[...]] per cid
        found_mixtures = r.text.split()
        mixture_cids[cid].append(found_mixtures)
        time.sleep(0.2)

        # nothing came back for this cid, so there is no mixture to expand
        if not found_mixtures:
            continue

        # get components for the first mixture only
        first_component = found_mixtures[0]
        component_url = prolog + f'/compound/cid/{first_component}/cids/txt?cids_type=component'
        r = requests.get(component_url)
        assert r.status_code == 200, f'invalid request for {first_component}'

        component_cids[first_component].append(r.text.split())
        time.sleep(0.2)

    return mixture_cids, component_cids

In [51]:
# run the lookup for the drugs list; ex_2a returns (mixture_cids, component_cids)
mixture, component = ex_2a(drugs)

In [52]:
# mixture cids returned for each drug cid (a nested list per key)
mixture

defaultdict(list,
            {'2244': [['145811440',
               '145731449',
               '145731443',
               '145267670',
               '145237809',
               '145237798',
               '145050868',
               '144416654',
               '144416652',
               '144416647',
               '143824540',
               '143824535',
               '143824519',
               '142968283',
               '142771600',
               '142028898',
               '141571817',
               '141552381',
               '141403477',
               '141382569',
               '141352026',
               '141352020',
               '141152740',
               '140564466',
               '139326835',
               '139205672',
               '139205671',
               '139201528',
               '139196823',
               '139183101',
               '139183100',
               '139179002',
               '139081618',
               '139067080',
               '139063

In [53]:
# component cids of the first mixture found for each drug
component

defaultdict(list,
            {'145811440': [['23954', '2244']],
             '145676293': [['1983', '297']],
             '145864119': [['4914', '3672']]})

In [54]:
# get the cids of all compounds tested in a given assay (aid 1207599)

url = f'{prolog}/assay/aid/1207599/cids/txt'
res = requests.get(url)
cids = res.text.split()
print(len(cids), cids, sep='\n')

791
['6175', '6197', '8547', '10219', '14169', '17558', '21389', '68050', '84677', '95783', '95996', '142779', '177894', '180548', '182792', '241056', '253602', '302770', '348623', '379338', '408190', '427456', '453048', '456183', '458959', '463795', '467892', '467895', '467898', '467900', '467902', '468692', '493035', '540335', '615754', '628093', '653020', '658095', '659146', '659572', '660337', '660996', '661700', '664853', '665381', '670727', '678644', '679624', '684193', '686636', '692799', '696459', '697239', '701785', '705510', '709466', '711950', '718105', '722343', '726776', '728907', '732311', '742641', '745456', '746602', '759319', '763219', '780973', '783532', '787413', '787416', '805487', '807557', '819039', '819041', '826058', '826108', '826140', '865238', '866779', '871153', '876820', '879749', '899915', '929152', '933766', '934186', '935739', '939076', '940283', '945743', '951335', '951809', '962627', '972880', '973099', '991453', '1000261', '1036940', '1042562', '10466

In [55]:
# get only the active compounds of the same assay (cids_type=active)

url = f'{prolog}/assay/aid/1207599/cids/txt?cids_type=active'
res = requests.get(url)
cids = res.text.split()
print(len(cids), cids, sep='\n')

435
['6197', '10219', '14169', '17558', '68050', '177894', '182792', '253602', '348623', '453048', '456183', '458959', '463795', '467892', '467895', '467898', '467900', '540335', '628093', '697239', '701785', '742641', '745456', '807557', '826140', '972880', '973099', '1092462', '1104215', '1104245', '1187199', '1253822', '1272562', '1330474', '1507416', '1591101', '1929483', '1931935', '2226126', '2229100', '2454286', '2526359', '2788193', '2826655', '2840340', '2840651', '2865851', '2871881', '2876588', '2877655', '2895488', '2897031', '2900550', '2917883', '2918568', '2923731', '2946841', '3010592', '3020289', '3098392', '3114195', '3124081', '3124283', '3304735', '3351585', '3732278', '4524296', '4827679', '4970781', '5065884', '5311382', '5322214', '5322341', '5328733', '6404647', '6603435', '7086352', '7292609', '7292627', '7292667', '7292689', '7294801', '7294819', '9549410', '9549480', '9802843', '10066728', '10173796', '10215271', '10237991', '10432767', '11237028', '11534555'

In [56]:
# retrieve compounds tested in any assay targeting human carbonic anhydrase (accession P00918)

url = f'{prolog}/assay/target/accession/P00918/cids/txt'
res = requests.get(url)
cids = res.text.split()
print(len(cids))

23951


In [68]:
# find compounds active against human acetylcholinesterase P08173

def ex_3a(accession, chunk_size):
    """Fetch isomeric SMILES for the cids active against a protein target.

    The active cids are requested once for the whole target, then their
    SMILES are requested in batches of at most `chunk_size` cids.

    Parameters
    ----------
    accession : str
        Protein accession inserted into ``/assay/target/accession/<accession>``.
    chunk_size : int
        Maximum number of cids sent per SMILES request; must be positive.

    Returns
    -------
    pandas.DataFrame
        Columns ``'cid'`` and ``'isomeric smiles'``, one row per active cid.
        Empty (same columns) when the target has no active cids.

    Raises
    ------
    AssertionError
        If any request does not return status code 200.
    ValueError
        If `chunk_size` is not positive, or if the number of SMILES returned
        differs from the number of cids requested.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    # get active against accession
    prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    url = prolog + f'/assay/target/accession/{accession}/cids/txt?cids_type=active'
    r = requests.get(url)
    assert r.status_code == 200, f'invalid request {accession}'

    cids = r.text.split()

    smiles = []
    # stepping by chunk_size yields only non-empty chunks, so no request is
    # ever sent with an empty cid list (exact multiples, or no cids at all)
    for i, start in enumerate(range(0, len(cids), chunk_size)):

        cids_str = ','.join(cids[start:start + chunk_size])

        # get smiles for active cids
        url = prolog + f'/compound/cid/{cids_str}/property/IsomericSMILES/txt'
        r = requests.get(url)
        assert r.status_code == 200, f'invalid request {accession}'

        smiles.extend(r.text.split())

        # throttle: pause one second after every fifth SMILES request
        if i % 5 == 4:
            time.sleep(1)

    # the txt output carries no cid column, so rows are paired by position;
    # refuse to build a misaligned table
    if len(smiles) != len(cids):
        raise ValueError(
            f'received {len(smiles)} SMILES for {len(cids)} cids of {accession}'
        )

    df = pd.DataFrame({'cid' : cids,
                       'isomeric smiles' : smiles})

    return df

In [69]:
# fetch SMILES for the cids active against accession P08173, 100 cids per request
ex_3a('P08173', 100)

Unnamed: 0,cid,isomeric smiles
0,5831,C[N+](C)(C)CCOC(=O)N.[Cl-]
1,44337760,CN1CC2C[C@H](CC1C2)OC(=O)C(C3=CC=CC=C3)C4=CC=C...
2,44337747,CC(C1=CC=CC=C1)(C2=CC=CC=C2)C(=O)O[C@H]3CC4CC(...
3,15599355,CN1CC2C[C@@H](CC1C2)OC(=O)C(C3=CC=CC=C3)(C4=CC...
4,44337761,CC(C1=CC=CC=C1)(C2=CC=CC=C2)C(=O)O[C@@H]3CC4CC...
...,...,...
6255,24768606,C1CN(CCN1C2=CC=NC=C2)C(=O)CCNS(=O)(=O)C3=CC=CC...
6256,441071,C1CN2CC3=CCO[C@H]4CC(=O)N5[C@H]6[C@H]4[C@H]3C[...
6257,1734,C[N+]1(CCC(CC1)OC(=O)C(C2=CC=CC=C2)C3=CC=CC=C3)C
6258,5491428,CO/N=C(/C#N)\[C@H]1CN2CCC1CC2
