# Warning: untested and written for Python 2.7

This code was part of a larger codebase, but was used to produce `lincs_small_molecules.json.gz` and `lincs_small_molecules.tsv`.

In [2]:
import collections
import csv
import math
import json
import os
import gzip
import pprint
import sys

import requests
import pubchempy

In [5]:
# Settings for the LINCS Cloud REST API (http://api.lincscloud.org/).
api_version, service = 'a2', 'pertinfo'
api_url = 'http://api.lincscloud.org/{}/{}'.format(api_version, service)

# The personal API key is read from private/apikey.txt; trailing whitespace
# (such as the final newline) is stripped.
key_path = os.path.join('private', 'apikey.txt')
with open(key_path) as read_file:
    api_key = read_file.read().rstrip()

# Number of documents requested per API call (max is 1000).
block_size = 100

In [None]:
def json_from_url(base, params):
    """
    Fetch `base` with the query parameters `params` and return the decoded JSON.

    base -- URL to request
    params -- dict of query-string parameters, passed to `requests.get`

    Raises `requests.HTTPError` when the server answers with a 4xx/5xx
    status, so that an error payload is never handed to callers as if it
    were a query result.
    """
    response = requests.get(base, params=params)
    # Fail loudly on HTTP errors instead of decoding the error body.
    response.raise_for_status()
    return response.json()

def query_links_api(query, verbose=False):
    """
    Run `query` against the LINCS API and return every matching document.

    The function first requests the number of matches (by adding `c=true`
    to the parameters) and then downloads the documents in chunks of
    `block_size`, advancing the `sk` parameter by `block_size` per request.

    query -- value sent as the `q` parameter,
             for example '{"pert_type":"trt_cp"}'
    verbose -- when True, print the chunking plan and per-chunk progress

    Returns a list holding the documents of all chunks, in request order.
    Raises ValueError if a chunk response is not a JSON list.
    """
    page_params = {'q': query, 'l': block_size, 'sk': 0, 'user_key': api_key}

    # Same parameters plus c=true: the response is read for its 'count' entry.
    count_params = dict(page_params, c='true')
    num_docs = json_from_url(api_url, count_params)['count']
    num_blocks = int(math.ceil(float(num_docs) / block_size))
    if verbose:
        # Single-argument print(...) behaves identically on Python 2.7 and 3.
        print('{} results: splitting query into {} chunks of {}.'.format(
            num_docs, num_blocks, block_size))

    results = list()
    for i in range(num_blocks):
        if verbose:
            print('Chunk {}/{}'.format(i + 1, num_blocks))
        page_params['sk'] = i * block_size
        chunk = json_from_url(api_url, page_params)
        if not isinstance(chunk, list):
            # Extending a list with a dict would silently append its keys,
            # so a non-list payload is reported instead of being merged.
            raise ValueError(
                'Expected a list of documents at offset {}, got: {!r}'.format(
                    page_params['sk'], chunk))
        results.extend(chunk)

    return results

In [None]:
# Select every pertinfo document whose pert_type is "trt_cp" and download
# all of them (progress messages stay switched off).
query = '{"pert_type":"trt_cp"}'
compounds = query_links_api(query, verbose=False)

In [None]:
# Cache the downloaded documents as gzip-compressed JSON.
path = os.path.join('data', 'lincs_small_molecules.json.gz')
# Serialize first and write encoded bytes: the gzip handle is binary, and
# json.dump() would pass it text, which raises TypeError on Python 3.
# With the default ensure_ascii=True the JSON text is pure ASCII, so the
# bytes written are the same on Python 2.7 and Python 3.
with gzip.open(path, 'wb') as write_file:
    write_file.write(json.dumps(compounds).encode('utf-8'))

## Start here to read the LINCS catalog from JSON rather than the REST API
Query PubChem for parent compounds.

In [None]:
# Load the compound documents back from the gzip-compressed JSON cache.
path = os.path.join('data', 'lincs_small_molecules.json.gz')
with gzip.open(path) as read_file:
    raw_json = read_file.read()
    compounds = json.loads(raw_json)

In [None]:
# Blank out unusable pubchem_cid values: anything that is absent, cannot be
# converted to an integer, or converts to a non-positive integer is replaced
# by None. Usable values are left exactly as they were stored.
for compound in compounds:
    try:
        has_valid_cid = int(compound.get('pubchem_cid')) > 0
    except (ValueError, TypeError):
        has_valid_cid = False
    if not has_valid_cid:
        compound['pubchem_cid'] = None

In [None]:
def get_pubchem_parent(cid, orphans_as_self=True):
    """
    From a pubchem_cid, retrieve the parent compound's cid.

    If the function is unsuccessful in retrieving a single parent (the
    request is rejected with a BadRequestError, or the result does not
    contain exactly one cid), a message is printed and the fallback is
    returned: `cid` itself when `orphans_as_self = True`, otherwise None.

    cid -- PubChem compound identifier; must be truthy (AssertionError if not)
    orphans_as_self -- whether `cid` (True) or None (False) is the fallback

    According to PubChem:

    > A parent is conceptually the "important" part of the molecule
    > when the molecule has more than one covalent component.
    > Specifically, a parent component must have at least one carbon
    > and contain at least 70% of the heavy (non-hydrogen) atoms of
    > all the unique covalent units (ignoring stoichiometry).
    > Note that this is a very empirical definition and is subject to change.

    A parallel query can be executed using the REST PUG API:
    http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/11477084/cids/XML?cids_type=parent
    """
    assert cid, 'cid must be a non-empty PubChem compound identifier'

    fallback = cid if orphans_as_self else None

    try:
        parent_cids = pubchempy.get_cids(
            identifier=cid, namespace='cid', domain='compound', cids_type='parent')
    except pubchempy.BadRequestError as e:
        # Single-argument print(...) behaves identically on Python 2.7 and 3.
        print('Error getting parent of {}. {}'.format(cid, e))
        return fallback

    try:
        # Unpacking raises ValueError unless exactly one parent was returned.
        parent_cid, = parent_cids
    except ValueError:
        print('Error getting parent of {}. Parents retreived: {}'.format(cid, parent_cids))
        return fallback
    return parent_cid

In [None]:
# Add a 'parent_cid' entry to every compound. Compounds that already have the
# key are skipped, and compounds without a usable pubchem_cid get None.
for i, compound in enumerate(compounds):
    if i % 1000 == 0:
        # Single-argument print(...) behaves identically on Python 2.7 and 3.
        print('{} / {} compounds complete'.format(i, len(compounds)))
    if 'parent_cid' in compound:
        # Parent already stored for this compound; do not query again.
        continue
    pubchem_cid = compound.get('pubchem_cid')
    if not pubchem_cid:
        print('No pubchem_cid record for {}'.format(compound['pert_iname']))
        compound['parent_cid'] = None
        continue
    compound['parent_cid'] = get_pubchem_parent(pubchem_cid)
    # Push any buffered messages out after each lookup.
    sys.stdout.flush()

In [None]:
# Columns written to the TSV, in output order. Field reference:
# http://api.lincscloud.org/a2/docs/pertinfo
fields = [
    'pert_id',
    'pert_iname',
    'pubchem_cid',
    'parent_cid',
    'molecular_formula',
    'molecular_wt',
    'pert_vendor',
    'num_gold',
    'num_inst',
    'num_sig',
    'canonical_smiles',
    'inchi_key',
    'inchi_string',
]

# Write one tab-separated row per compound; keys of a compound that are not
# listed in `fields` are dropped (extrasaction='ignore').
path = os.path.join('data', 'lincs_small_molecules.tsv')
with open(path, 'w') as write_file:
    writer = csv.DictWriter(
        write_file,
        fieldnames=fields,
        delimiter='\t',
        extrasaction='ignore')
    writer.writeheader()
    writer.writerows(compounds)