In [37]:
import time
from io import StringIO
from urllib.parse import quote

import pandas as pd
import requests

In [66]:
# parse pubchem data

def chem_pug_search(name, prop, print_out=True, timeout=30):
    """Look up a single property of one compound, by name, via PubChem PUG REST.

    Parameters
    ----------
    name : str
        Compound name placed in the ``compound/name/<name>`` URL segment.
    prop : str
        Property identifier placed in the ``property/<prop>`` URL segment.
    print_out : bool, default True
        When true, print the raw response text before returning.
    timeout : float, default 30
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the notebook forever.

    Returns
    -------
    str
        The response body with leading/trailing whitespace stripped.

    Raises
    ------
    AssertionError
        If the HTTP status code is not 200.
    """
    # Percent-encode the name so characters such as '?', '#' or '/' cannot
    # change the structure of the request URL.
    safe_name = quote(str(name), safe='')
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{safe_name}/property/{prop}/txt'
    r = requests.get(url, timeout=timeout)

    # Raised explicitly instead of via `assert` so the check is not stripped
    # under `python -O`; the exception type and message are unchanged.
    if r.status_code != 200:
        raise AssertionError(f'invalid request {name}')
    if print_out:
        print(r.text)

    return r.text.strip()

def df_out(names, prop):
    """Collect one property for several compounds into a two-column table.

    Each entry of ``names`` is looked up with ``chem_pug_search`` (printing
    disabled). The result has one row per name, in input order, with the
    columns ``'name'`` and ``prop``.
    """
    rows = []
    for count, compound in enumerate(names, start=1):
        value = chem_pug_search(compound, prop, print_out=False)
        rows.append((compound, value))

        # throttle: rest for a second once every fifth lookup has been made
        if count % 5 == 0:
            time.sleep(1)

    return pd.DataFrame(rows, columns=['name', prop])

In [33]:
# https://chem.libretexts.org/Courses/Intercollegiate_Courses/Cheminformatics_OLCC_(2019)/1._Introduction/1.06%3A_Accessing_PubChem_through_a_Web_Interface#Compound_Properties

# Four single-property lookups. Each call prints the raw response itself
# (print_out defaults to True), so only a label is printed beforehand.

# formula for water
print('water formula')
chem_pug_search('water', 'MolecularFormula')

# number of heavy atoms in butadiene
print('butadiene heavy atoms')
chem_pug_search('butadiene', 'HeavyAtomCount')

# molecular weight of ethanol
print('ethanol molecular weight')
chem_pug_search('ethanol', 'MolecularWeight')

# number of h-bond acceptors in aspirin
print('aspirin h-bond acceptors')
chem_pug_search('aspirin', 'HBondAcceptorCount')

water formula
H2O

butadient heavy atoms
4

ethanol molecular weight
46.070000

aspirin h-bond acceptors
4



'4'

In [67]:
# look up the canonical SMILES string of each named compound
names = [
    'cytosine',
    'benzene',
    'motrin',
    'aspirin',
    'zolpidem',
]

df_out(names, 'CanonicalSMILES')

Unnamed: 0,name,CanonicalSMILES
0,cytosine,C1=C(NC(=O)N=C1)N
1,benzene,C1=CC=CC=C1
2,motrin,CC(C)CC1=CC=C(C=C1)C(C)C(=O)O
3,aspirin,CC(=O)OC1=CC=CC=C1C(=O)O
4,zolpidem,CC1=CC=C(C=C1)C2=C(N3C=C(C=CC3=N2)C)CC(=O)N(C)C


In [68]:
# XLogP of the linear alkanes from methane (C1) through decane (C10)
alkanes = [
    'methane',
    'ethane',
    'propane',
    'butane',
    'pentane',
    'hexane',
    'heptane',
    'octane',
    'nonane',
    'decane',
]

df_out(alkanes, 'XLogP')

Unnamed: 0,name,XLogP
0,methane,0.6
1,ethane,1.3
2,propane,1.8
3,butane,2.9
4,pentane,3.4
5,hexane,3.9
6,heptane,4.4
7,octane,3.9
8,nonane,4.5
9,decane,5.0


In [69]:
# isomeric SMILES for the 20 amino acids, looked up by name
aa_list = [
    'L-alanine', 'L-cysteine', 'L-aspartate', 'L-glutamate',
    'L-phenylalanine', 'L-glycine', 'L-histidine', 'L-isoleucine',
    'L-lysine', 'L-leucine', 'L-methionine', 'L-asparagine',
    'L-proline', 'L-glutamine', 'L-arginine', 'L-serine',
    'L-threonine', 'L-valine', 'L-tryptophan', 'L-tyrosine',
]

df_out(aa_list, 'IsomericSMILES')

Unnamed: 0,name,IsomericSMILES
0,L-alanine,C[C@@H](C(=O)O)N
1,L-cysteine,C([C@@H](C(=O)O)N)S
2,L-aspartate,C([C@@H](C(=O)O)N)C(=O)O
3,L-glutamate,C(CC(=O)O)[C@@H](C(=O)O)N\nC(CC(=O)[O-])[C@@H]...
4,L-phenylalanine,C1=CC=C(C=C1)C[C@@H](C(=O)O)N
5,L-glycine,C(C(=O)O)N
6,L-histidine,C1=C(NC=N1)C[C@@H](C(=O)O)N
7,L-isoleucine,CC[C@H](C)[C@@H](C(=O)O)N
8,L-lysine,C(CCN)C[C@@H](C(=O)O)N
9,L-leucine,CC(C)C[C@@H](C(=O)O)N


In [62]:
def multiple_prop_search(cids, props, chunk_size=10, timeout=30):
    """Fetch several PubChem properties for many CIDs as a single DataFrame.

    The CIDs are split into consecutive batches of at most ``chunk_size``.
    Each batch is requested from the PUG REST
    ``compound/cid/<cids>/property/<props>/csv`` endpoint and the per-batch
    CSV tables are concatenated in request order.

    Parameters
    ----------
    cids : iterable of int or str
        Compound identifiers; each is converted with ``str`` for the URL.
    props : iterable of str
        Property identifiers, joined with commas for the URL.
    chunk_size : int, default 10
        Maximum number of CIDs sent in one request.
    timeout : float, default 30
        Seconds passed to ``requests.get`` for every request.

    Returns
    -------
    pandas.DataFrame
        One table built from all batches with a fresh 0..n-1 index; an empty
        DataFrame when ``cids`` is empty (no request is made in that case).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    requests.HTTPError
        If any batch request returns an HTTP error status.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')

    cids = list(cids)

    # format properties into comma separated string
    prop_str = ','.join(props)

    # Step through the CIDs in strides of chunk_size: the slice bounds follow
    # the parameter, and no empty trailing batch is produced when the length
    # is an exact multiple of chunk_size.
    frames = []
    for i, start in enumerate(range(0, len(cids), chunk_size)):
        cids_str = ','.join(map(str, cids[start:start + chunk_size]))
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cids_str}/property/{prop_str}/csv'
        r = requests.get(url, timeout=timeout)

        # fail loudly instead of parsing an error page as if it were CSV
        r.raise_for_status()
        frames.append(pd.read_csv(StringIO(r.text)))

        # delay: pause for a second after every fifth request
        if i % 5 == 4:
            time.sleep(1)

    if not frames:
        return pd.DataFrame()

    # single concat at the end instead of growing a frame inside the loop
    return pd.concat(frames, ignore_index=True)

In [57]:
# four compounds (CIDs given as strings) and the descriptors to fetch for them;
# property names use PubChem's own capitalisation ('HBondAcceptorCount')
cids = ['4485', '4499', '5734', '8082']
props = ['HBondDonorCount', 'HBondAcceptorCount', 'XLogP', 'TPSA']
multiple_prop_search(cids, props)

Unnamed: 0,CID,HBondDonorCount,HBondAcceptorCount,XLogP,TPSA
0,4485,1,7,2.2,110.0
1,4499,1,7,3.3,110.0
2,5734,1,5,0.2,94.6
3,8082,1,1,0.8,12.0


In [63]:
# a longer CID list, fetched in batches by multiple_prop_search
# NOTE(review): `props` is not assigned in this cell; it is whatever an
# earlier cell last bound -- confirm that is the intended property list
cids = [
    443422, 72301, 8082, 4485, 5353740,
    5282230, 5282138, 1547484, 941361, 5734,
    5494, 5422, 5417, 5290, 5245,
    5026, 4746, 4507, 4499, 4497,
    4494, 4474, 4418, 4386, 4009,
    4008, 3949, 3926, 3878, 3784,
    3698, 3547, 3546, 3336, 3333,
    3236, 3076, 2585, 2520, 2351,
    2312, 2162, 1236, 1234, 292331,
    275182, 235244, 108144, 104972, 77157,
    5942250, 5311217, 4564402, 4715169, 5311501,
]
multiple_prop_search(cids, props)

Unnamed: 0,CID,HeavyAtomCount,RotatableBondCount,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount,TPSA,IsomericSMILES
0,443422,25,2,339.4,3.1,0,5,40.2,COC1=C(C2=C(C[C@@H]3C4=CC5=C(C=C4CCN3C2)OCO5)C...
1,72301,26,4,355.4,3.2,0,5,40.2,COC1=C(C2=C(C[C@H]3C4=CC(=C(C=C4CCN3C2)OC)OC)C...
2,8082,6,0,85.15,0.8,1,1,12.0,C1CCNCC1
3,4485,25,5,346.3,2.2,1,7,110.0,CC1=C(C(C(=C(N1)C)C(=O)OC)C2=CC=CC=C2[N+](=O)[...
4,5353740,30,7,416.5,3.5,2,5,76.0,CC1=C(C(CCC1)(C)C)CC/C(=C\CCC2=CCC(OC2O)C3=CC(...
5,5282230,24,6,327.3,3.2,2,5,84.9,COC1=C(C=C(C=C1)/C=C/C(=O)NC2=CC=CC=C2C(=O)O)OC
6,5282138,36,11,492.5,4.4,1,8,120.0,CC1=C(C(C(=C(N1)C)C(=O)OC/C=C/C2=CC=CC=C2)C3=C...
7,1547484,28,6,368.5,5.8,0,2,6.5,C1CN(CCN1C/C=C/C2=CC=CC=C2)C(C3=CC=CC=C3)C4=CC...
8,941361,30,6,404.5,6.0,0,4,6.5,C1CN(CCN1C/C=C/C2=CC=CC=C2)C(C3=CC=C(C=C3)F)C4...
9,5734,14,2,212.23,0.2,1,5,94.6,C1=CC=C2C(=C1)C(=NO2)CS(=O)(=O)N


In [64]:
# PubChem CIDs of anti-inflammatory agents, ten per row
cids = [
    471, 1981, 2005, 2097, 2151, 2198, 2206, 2214, 2244, 2307,
    2308, 2313, 2355, 2396, 2449, 2462, 2466, 2581, 2662, 2794,
    2863, 3000, 3003, 3033, 3056, 3059, 3111, 3177, 3194, 3230,
    3242, 3282, 3308, 3332, 3335, 3342, 3360, 3371, 3379, 3382,
    3384, 3394, 3495, 3553, 3612, 3672, 3715, 3716, 3718, 3778,
    3824, 3825, 3826, 3935, 3946, 3965, 4009, 4037, 4038, 4044,
    4075, 4159, 4237, 4386, 4409, 4413, 4487, 4488, 4495, 4534,
    4553, 4614, 4641, 4671, 4692, 4781, 4888, 4895, 4921, 5059,
    5090, 5147, 5161, 5208, 5228, 5339, 5352, 5359, 5362, 5468,
    5469, 5475, 5480, 5509, 5733, 5743, 5744, 5745, 5753, 5754,
    5755, 5834, 5865, 5875, 5876, 5877, 6094, 6213, 6215, 6247,
    6436, 6741, 7090, 7497, 8522, 9053, 9231, 9642, 9782, 9878,
    10114, 10154, 10170, 10185, 10206, 12555, 12938, 13802, 14982, 15209,
    16490, 16533, 16623, 16639, 16752, 16923, 17198, 19161, 20469, 21102,
    21700, 21800, 21826, 21975, 22419, 23205, 26098, 26248, 26318, 28718,
    28871, 30869, 30870, 30951, 31307, 31378, 31508, 31635, 31799, 31800,
    32153, 32327, 32798, 33958, 35375, 35455, 35935, 36833, 37425, 38081,
    38503, 39212, 39941, 40000, 40632, 41643, 43261, 44219, 47462, 47795,
    50294, 50295, 51717, 54445, 54585, 57782, 59757, 60164, 60490, 60542,
    60712, 60726, 60864, 61486, 62074, 62924, 63006, 63019, 64704, 64738,
    64746, 64747, 64927, 64945, 64971, 64982, 65394, 65464, 65655, 65679,
    65762, 66249, 67417, 68700, 68704, 68706, 68731, 68749, 68819, 68865,
    68869, 68917, 71246, 71354, 71364, 71386, 71398, 71414, 71415, 71771,
    72158, 72300, 73400, 82153, 84003, 84429, 90763, 91626, 91670, 100472,
    102011, 104762, 104943, 107641, 107738, 107793, 108068, 108130, 114753, 114840,
    114917, 114999, 115239, 119032, 119286, 119365, 119607, 119828, 119871, 121928,
    121957, 122139, 122179, 122182, 123619, 123673, 123723, 124978, 128191, 128229,
    128571, 133021, 134896, 146364, 151075, 151166, 152165, 155354, 155761, 156391,
    158103, 159557, 162666, 164676, 167928, 168928, 174093, 174277, 176155, 177976,
    180604, 183088, 189821, 192156, 196122, 196840, 196841, 200674, 201587, 219121,
    222786, 229860, 235244, 236702, 259846, 263373, 275182, 292331, 425990, 439503,
    439533, 441335, 441336, 442534, 442993, 443943, 443949, 443967, 444036, 445154,
    445858, 446925, 479503, 485711, 490428, 501254, 522325, 546807, 578771, 584547,
    610479, 633091, 633097, 636374, 636398, 656604, 656656, 656852, 657238, 667550,
    927704, 969510, 969516, 1548887, 1548910, 2737488, 3033890, 3033980, 3045402, 3051696,
    3055172, 4129359, 4306515, 4483645, 5018304, 5185849, 5280802, 5280914, 5280915, 5281004,
    5281071, 5281515, 5281522, 5281792, 5282183, 5282193, 5282230, 5282387, 5282402, 5282492,
    5283542, 5283734, 5284538, 5284539, 5311051, 5311052, 5311066, 5311067, 5311093, 5311101,
    5311108, 5311169, 5311180, 5318517, 5320420, 5322111, 5352624, 5353725, 5353726, 5353740,
    5353864, 5354499, 5377381, 5420804, 5420805, 5458396, 5472495, 5481958, 5701991, 5702036,
    5702148, 5702212, 5702252, 5702287, 5745214, 5942250, 6420050, 6429274, 6437368, 6437387,
    6438873, 6447131, 6453785, 6473881, 6509979, 6708733, 6710677, 6714002, 6917783, 6917852,
    6917894, 6918172, 6918173, 6918332, 6918445, 6918452, 6918612, 6925666, 7060958, 7251185,
    9554199, 9798098, 9799453, 9841438, 9843941, 9846332, 9865808, 9868219, 9869053, 9871508,
    9875547, 9883509, 9897518, 9897771, 9907157, 9913795, 9919776, 9926694, 9934547, 10363606,
    10918539, 11158972, 11513733, 11561674, 11616712, 11870423, 11949636, 11954221, 11954316, 11954353,
    11954369, 11957468, 11961431, 11972243, 11972532, 12300053, 12313906, 12313911, 12606303, 12634263,
    12714644, 12874922, 13018150, 13020033, 13041095, 14010989, 14515707, 14798494, 15895902, 16051947,
    16132369, 16213022, 16213698, 16218996, 16219353, 16220118, 16759566, 16760658, 17750985, 17753757,
    18526330, 18632363, 18647121, 18943026, 20054915, 21120116, 21637635, 21637642, 21893738, 21893804,
    21982135, 22141508, 22811280, 23509770, 23631982, 23653552, 23657872, 23663407, 23663409, 23663418,
    23663959, 23663989, 23665411, 23665999, 23667642, 23669636, 23674183, 23674255, 23674745, 23675763,
    23680530, 23681059, 23684814, 23688663, 23693301, 23694214, 23702389, 24181458, 24721429, 24761485,
    24799587, 24847961, 24847981, 24867460, 24867465, 24867475, 24883465, 24916955, 25077872, 25113755,
    25796773, 40469526, 44119558, 44202892, 44260118, 44266812, 44386560, 45006151, 45006158, 45039955,
    45356876, 45356931, 45357558, 45357932, 45358013, 45358120, 45358130, 45358140, 45358148, 45358149,
    45488525, 46174093, 46397498, 46780650, 46780910, 46783539, 46783786, 46783814, 46863906, 46878350,
    46882877, 50989825, 51026956, 51340230, 51398089, 53384387, 53394893, 53486221, 53486290, 53486322,
    54194814, 54605501, 54675840, 54676228, 54677470, 54677971, 54677972, 54677977, 54682045, 54684589,
    54690031, 54697648, 54708862, 54714524, 56841932, 56842111, 56845155, 57347755, 57486087, 67668959,
    67804972, 67986221, 70470286, 70678885, 71306882, 71587162, 72774967, 72941490, 72941625, 73758129,
    73759663, 73759808, 74787565, 77906397, 78577433, 90488794, 91711382, 91826463, 91873711, 91881846,
    92131836, 92462493, 102004404, 102601886, 117072385, 117072403, 117072410, 118701141, 118701402, 118984459,
    122130078, 122130111, 122130185, 122130213, 122130768, 122173054, 122173183, 122361610, 123134657, 124081055,
    124463365, 126968472, 126968501, 126968801, 126969212, 126969455, 129009998, 129010022, 129010033, 129010043,
    129316829, 129317859, 129317898, 129628207, 129628892, 129670532, 129735029, 131632430, 131635023, 131676243,
    131750284, 131954647, 131954667, 132399051, 132399058, 133112890, 133126366, 133126370, 133562807, 133659920,
    133687604, 134129698, 134159361, 134460917, 134612785, 134687786, 134688123, 134688323, 134688977, 134689786,
    134693106, 134693125, 134693234, 134694728, 134694860, 135413496, 135413505, 135414247, 135484078, 135515521,
    135565709, 136040192, 137177332, 137699687, 137705034, 137705717, 137705725, 137705994, 137706376, 137706400,
    137795135, 138059757, 138107776, 138113311, 138113507, 138113581, 138114182, 138114743,
]
len(cids)

708

In [65]:
# descriptors to retrieve for every CID currently held in `cids`
props = [
    'HeavyAtomCount',
    'RotatableBondCount',
    'MolecularWeight',
    'XLogP',
    'HBondDonorCount',
    'HBondAcceptorCount',
    'TPSA',
    'IsomericSMILES',
]
multiple_prop_search(cids, props)

Unnamed: 0,CID,HeavyAtomCount,RotatableBondCount,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount,TPSA,IsomericSMILES
0,471,22,1,304.25,1.5,5,7,127.0,C1=CC(=C(C=C1C2C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O
1,1981,29,7,415.80,4.2,1,6,94.8,CC1=C(C2=C(N1C(=O)C3=CC=C(C=C3)Cl)C=CC(=C2)OC)...
2,2005,12,6,173.21,0.1,2,3,66.4,CC(=O)NCCCCCC(=O)O
3,2097,16,5,219.28,3.2,2,3,49.3,CC(C1=CC=C(C=C1)NCC(=C)C)C(=O)O
4,2151,15,1,203.24,0.1,1,3,49.6,CC1=C(C(=O)N(N1C)C2=CC=CC=C2)N
...,...,...,...,...,...,...,...,...,...
703,138113311,96,30,1364.60,,16,20,589.0,CC(=O)O.C1CC[C@H]2[C@@H](C1)CC(N2C(=O)C3CC4=CC...
704,138113507,33,7,462.50,1.8,3,8,138.0,C[C@]12CCC(=O)C=C1CC[C@H]3[C@@H]2C(C[C@]4([C@@...
705,138113581,55,7,781.00,2.5,9,13,219.0,CC1C(C(C(C(O1)OC2CC[C@]3([C@@H]([C@]2(C)CO)CC[...
706,138114182,55,6,781.00,2.5,8,13,208.0,CC1C(C(C([C@@H](O1)OC2CC[C@]3([C@H]([C@]2(C)CO...
