In [1]:
import os
import csv
import collections
import json
import gzip
import io

import requests

In [2]:
# Load the DrugBank table: a tab-separated file whose header row supplies the
# dictionary keys for every record.
path = os.path.join('data', 'drugbank.tsv')
# newline='' is what the csv module asks for, so that any line endings inside
# quoted fields reach the reader untouched.
with open(path, newline='') as read_file:
    reader = csv.DictReader(read_file, delimiter='\t')
    drugbank = list(reader)

# Sanity check: no DrugBank id may appear on more than one row.
drugbank_ids = [drug['drugbank_id'] for drug in drugbank]
assert len(drugbank_ids) == len(set(drugbank_ids)), 'duplicate drugbank_id values in ' + path
# Tally the rows by drug type; as the last expression this is the cell's output.
collections.Counter(drug['type'] for drug in drugbank)

Counter({'small molecule': 7469, 'biotech': 290})

In [3]:
# Base URL that connectivity_query extends with the query path segments.
cpd_search_url = 'https://www.ebi.ac.uk/unichem/rest/cpd_search'

# Source short name -> integer source id, listed in ascending id order.
# None stands for "no particular source" and maps to 0.
# Ids 16, 19 and 30 have no entry in this table.
source_to_id = {
    None: 0, 'chembl': 1, 'drugbank': 2, 'pdb': 3, 'iuphar': 4,
    'pubchem_dotf': 5, 'kegg_ligand': 6, 'chebi': 7, 'nih_ncc': 8, 'zinc': 9,
    'emolecules': 10, 'ibm': 11, 'atlas': 12, 'ibm_patents': 13, 'fdasrs': 14,
    'surechembl': 15, 'pharmgkb': 17, 'hmdb': 18,
    'selleck': 20, 'pubchem_tpharma': 21, 'pubchem': 22, 'mcule': 23, 'nmrshiftdb2': 24,
    'lincs': 25, 'actor': 26, 'recon': 27, 'molport': 28, 'nikkaji': 29,
    'bindingdb': 31,
}

# Reverse lookup: integer source id -> source short name.
id_to_source = dict(zip(source_to_id.values(), source_to_id.keys()))

def connectivity_query(source, compound_id, target = None, timeout = None):
    """
    Run a UniChem connectivity search for one compound and yield its matches.

    Description of the URL parameters:
    https://www.ebi.ac.uk/unichem/info/widesearchInfo

    This is a generator function, so the HTTP request is only sent once the
    returned generator is first iterated.

    Parameters
    ----------
    source : key of source_to_id
        Name of the source that compound_id belongs to.
    compound_id : str
        Identifier of the query compound within source.
    target : key of source_to_id, optional
        Source placed in the "Sources" slot of the URL; the default None
        sends 0.
    timeout : float or (connect, read) tuple, optional
        Forwarded to requests.get. The default None places no limit on how
        long the request may take; pass a number of seconds to bound it.

    Yields
    ------
    collections.OrderedDict
        One per match row, pairing the header row of its assignment with the
        values of that row.

    If the decoded response contains an 'error' key, its value is printed and
    nothing is yielded.
    """
    url = '{base_url}/{src_compound_id}/{src_id}/{A}/{B}/{C}/{D}/{E}/{F}/{G}/{H}'.format(
        base_url = cpd_search_url,
        src_compound_id = compound_id,
        src_id = source_to_id[source],
        A = source_to_id[target], # Sources
        B = 0, # Pattern
        C = 0, # Component Mapping
        D = 0, # Frequency Block
        E = 0, # InChI Length Block
        F = 0, # UniChem Labels
        G = 0, # Assignment Status
        H = 1, # Data Structure
    )
    response = requests.get(url, timeout = timeout)
    payload = response.json()
    if 'error' in payload:
        print(payload['error'])
        return
    for assignment in payload.values():
        # The first row of each assignment holds the column names; an
        # assignment with no rows at all has nothing to report.
        rows = iter(assignment)
        header = next(rows, None)
        if header is None:
            continue
        for match in rows:
            yield collections.OrderedDict(zip(header, match))

In [None]:
# Spot-check the query on one DrugBank compound and pretty-print the matches.
source = 'drugbank'
compound_id = drugbank_ids[1000]
matches = [*connectivity_query(source, compound_id)]
print(json.dumps(matches, indent=2))

In [None]:
# Query every small-molecule DrugBank entry and record the results in two files:
#   data/mapping.tsv.gz     one row per returned match
#   data/mapping-counts.tsv one row per drug with the number of matches per source
mapping_path = os.path.join('data', 'mapping.tsv.gz')
mapping_fields = ['drugbank_id', 'drugbank_name', 'src_id', 'source_name', 'src_compound_id',
              'C', 'Query_InChIKey', 'CpdId_InChIKey', 'Full_Query_InChI', 'Full_CpdId_InChI',
              'Matching_Query_InChI', 'Matching_CpdId_InChI', 'b', 'i', 'm', 'p', 's', 't']

count_path = os.path.join('data', 'mapping-counts.tsv')
# One count column per named source, ordered by source id; id 0 (None) is left out.
source_names = [id_to_source[i] for i in sorted(set(id_to_source) - {0})]
count_fields = ['drugbank_id', 'drugbank_name'] + source_names

# The with-statement closes both outputs even if a query fails part-way through.
# newline='' lets the csv writers control line endings themselves, and
# line_buffering flushes the text layer after every written row.
with gzip.open(mapping_path, 'wb') as mapping_file, \
        io.TextIOWrapper(mapping_file, line_buffering = True, newline = '') as mapping_buffer, \
        open(count_path, 'w', newline = '') as count_file:

    # mapping writer: match keys that are not listed in mapping_fields are dropped
    mapping_writer = csv.DictWriter(mapping_buffer, delimiter = '\t', fieldnames = mapping_fields, extrasaction = 'ignore')
    mapping_writer.writeheader()

    # mapping counts writer: sources without any match are written as 0
    count_writer = csv.DictWriter(count_file, delimiter = '\t', fieldnames = count_fields, restval = 0)
    count_writer.writeheader()

    for drug in drugbank:
        if drug['type'] != 'small molecule':
            continue
        drugbank_id = drug['drugbank_id']
        drugbank_name = drug['name']
        print(drugbank_id, drugbank_name)
        # The ids come from DrugBank, so the source is given literally instead
        # of being read from a global that another cell happens to have set.
        query_matches = list(connectivity_query('drugbank', drugbank_id))
        for match in query_matches:
            match['drugbank_id'] = drugbank_id
            match['drugbank_name'] = drugbank_name
            # Raises KeyError for a src_id that id_to_source does not list.
            match['source_name'] = id_to_source[int(match['src_id'])]
            mapping_writer.writerow(match)

        # Number of matches per source for this drug, plus the identifying columns.
        count = dict(collections.Counter(match['source_name'] for match in query_matches))
        count['drugbank_id'] = drugbank_id
        count['drugbank_name'] = drugbank_name
        count_writer.writerow(count)