In [21]:
import time
from urllib.parse import quote

import pandas as pd
import requests

In [39]:
# parse pubchem data

def chem_pug_search(name, prop, print_out=True, timeout=30):
    """Fetch one property of a compound, looked up by name, from PubChem PUG REST.

    Parameters
    ----------
    name : str
        Compound name. It is percent-encoded before being placed in the URL
        path, so characters such as ' ', '#' or '?' stay part of the name
        instead of changing the shape of the URL.
    prop : str
        Property token inserted verbatim into the URL
        (e.g. 'MolecularFormula', 'XLogP').
    print_out : bool, default True
        When True, print the raw (unstripped) response text.
    timeout : float, default 30
        Timeout in seconds forwarded to ``requests.get`` so an unresponsive
        server cannot block the kernel indefinitely.

    Returns
    -------
    str
        The response body with leading/trailing whitespace removed. If the
        service answers with several lines, they are returned together,
        separated by newlines.

    Raises
    ------
    AssertionError
        If the HTTP status code is not 200.
    """
    # safe='' also encodes '/', which would otherwise add a path segment.
    encoded_name = quote(str(name), safe='')
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/property/{prop}/txt'
    r = requests.get(url, timeout=timeout)

    # Explicit check instead of `assert`, which is removed under `python -O`.
    # AssertionError is kept so the exception type seen by callers is unchanged.
    if r.status_code != 200:
        raise AssertionError(f'invalid request {name}')
    if print_out:
        print(r.text)

    return r.text.strip()

def df_out(names, prop, batch_size=5, pause=1.0):
    """Query one property for several compounds and collect the results.

    Parameters
    ----------
    names : iterable of str
        Compound names, looked up one at a time with ``chem_pug_search``.
    prop : str
        Property token passed through to ``chem_pug_search``.
    batch_size : int, default 5
        Number of consecutive requests sent before pausing. A value of 0 or
        less disables pausing.
    pause : float, default 1.0
        Seconds to sleep between batches.

    Returns
    -------
    pandas.DataFrame
        One row per name, with columns 'name' and 'prop'. The 'prop' values
        are the strings returned by ``chem_pug_search``.
    """
    names = list(names)  # materialise so the last index is known
    last_index = len(names) - 1

    data = []
    for i, name in enumerate(names):
        prop_out = chem_pug_search(name, prop, print_out=False)
        data.append([name, prop_out])

        # Throttle between batches to avoid overloading the service, but do
        # not sleep after the final request: nothing follows it.
        batch_complete = batch_size > 0 and i % batch_size == batch_size - 1
        if batch_complete and i < last_index:
            time.sleep(pause)

    return pd.DataFrame(data, columns=['name', 'prop'])

In [33]:
# https://chem.libretexts.org/Courses/Intercollegiate_Courses/Cheminformatics_OLCC_(2019)/1._Introduction/1.06%3A_Accessing_PubChem_through_a_Web_Interface#Compound_Properties
# Four single-property lookups. Each call prints the raw response (print_out
# defaults to True); only the last call's return value is echoed as the cell
# output because it is the final expression in the cell.

# formula for water
print('water formula')
chem_pug_search('water', 'MolecularFormula')

# number of heavy atoms in butadiene
# NOTE(review): the printed label below misspells "butadiene"; it is left
# unchanged here so it still matches the recorded output.
print('butadient heavy atoms')
chem_pug_search('butadiene', 'HeavyAtomCount')

# molecular weight of ethanol
print('ethanol molecular weight')
chem_pug_search('ethanol', 'MolecularWeight')

# number of h-bond acceptors in aspirin
print('aspirin h-bond acceptors')
chem_pug_search('aspirin', 'HBondAcceptorCount')

water formula
H2O

butadient heavy atoms
4

ethanol molecular weight
46.070000

aspirin h-bond acceptors
4



'4'

In [34]:
# canonical SMILES for a small, mixed set of compounds (one lookup per name)
names = [
    'cytosine',
    'benzene',
    'motrin',
    'aspirin',
    'zolpidem',
]

df_out(names, 'CanonicalSMILES')

Unnamed: 0,name,prop
0,cytosine,C1=C(NC(=O)N=C1)N
1,benzene,C1=CC=CC=C1
2,motrin,CC(C)CC1=CC=C(C=C1)C(C)C(=O)O
3,aspirin,CC(=O)OC1=CC=CC=C1C(=O)O
4,zolpidem,CC1=CC=C(C=C1)C2=C(N3C=C(C=CC3=N2)C)CC(=O)N(C)C


In [36]:
# XLogP values for the linear alkanes listed here: methane (C1) through
# decane (C10)
alkanes = [
    'methane', 'ethane', 'propane', 'butane', 'pentane', 'hexane',
    'heptane', 'octane', 'nonane', 'decane',
]

df_out(alkanes, 'XLogP')

Unnamed: 0,name,prop
0,methane,0.6
1,ethane,1.3
2,propane,1.8
3,butane,2.9
4,pentane,3.4
5,hexane,3.9
6,heptane,4.4
7,octane,3.9
8,nonane,4.5
9,decane,5.0


In [41]:
# isomeric SMILES for the 20 amino acids, one name per entry.
# The 'prop' cell can hold several newline-separated values for one name
# (see the L-glutamate row in the recorded output).
aa_list = [
    'L-alanine',
    'L-cysteine',
    'L-aspartate',
    'L-glutamate',
    'L-phenylalanine',
    'L-glycine',
    'L-histidine',
    'L-isoleucine',
    'L-lysine',
    'L-leucine',
    'L-methionine',
    'L-asparagine',
    'L-proline',
    'L-glutamine',
    'L-arginine',
    'L-serine',
    'L-threonine',
    'L-valine',
    'L-tryptophan',
    'L-tyrosine',
]

df_out(aa_list, 'IsomericSMILES')

Unnamed: 0,name,prop
0,L-alanine,C[C@@H](C(=O)O)N
1,L-cysteine,C([C@@H](C(=O)O)N)S
2,L-aspartate,C([C@@H](C(=O)O)N)C(=O)O
3,L-glutamate,C(CC(=O)O)[C@@H](C(=O)O)N\nC(CC(=O)[O-])[C@@H]...
4,L-phenylalanine,C1=CC=C(C=C1)C[C@@H](C(=O)O)N
5,L-glycine,C(C(=O)O)N
6,L-histidine,C1=C(NC=N1)C[C@@H](C(=O)O)N
7,L-isoleucine,CC[C@H](C)[C@@H](C(=O)O)N
8,L-lysine,C(CCN)C[C@@H](C(=O)O)N
9,L-leucine,CC(C)C[C@@H](C(=O)O)N
