In [None]:
import os
import csv
import collections
import json
import gzip

import requests

In [62]:
# Load the DrugBank table (tab-separated) into a list of per-drug dicts.
path = os.path.join('data', 'drugbank.tsv')
# newline='' hands newline handling to the csv module, as its documentation
# requires for files passed to csv readers.
with open(path, newline='') as read_file:
    reader = csv.DictReader(read_file, delimiter='\t')
    drugbank = list(reader)

# Every row is expected to describe a distinct drug: fail early on duplicate IDs.
drugbank_ids = [drug['drugbank_id'] for drug in drugbank]
assert len(drugbank_ids) == len(set(drugbank_ids))
# Tally of drugs per type; as the last expression of the cell it is displayed.
collections.Counter(drug['type'] for drug in drugbank)

Counter({'small molecule': 7469, 'biotech': 290})

In [63]:
# UniChem compound-search REST endpoint that queries are sent to.
cpd_search_url = 'https://www.ebi.ac.uk/unichem/rest/cpd_search'

# Source name -> numeric UniChem source id, listed in ascending id order.
# None maps to 0, the value connectivity_query sends when no target source
# is given (presumably meaning "all sources" -- confirm against the API docs).
# Some ids (16, 19, 30) have no entry in this table.
source_to_id = dict([
    (None, 0),
    ('chembl', 1),
    ('drugbank', 2),
    ('pdb', 3),
    ('iuphar', 4),
    ('pubchem_dotf', 5),
    ('kegg_ligand', 6),
    ('chebi', 7),
    ('nih_ncc', 8),
    ('zinc', 9),
    ('emolecules', 10),
    ('ibm', 11),
    ('atlas', 12),
    ('ibm_patents', 13),
    ('fdasrs', 14),
    ('surechembl', 15),
    ('pharmgkb', 17),
    ('hmdb', 18),
    ('selleck', 20),
    ('pubchem_tpharma', 21),
    ('pubchem', 22),
    ('mcule', 23),
    ('nmrshiftdb2', 24),
    ('lincs', 25),
    ('actor', 26),
    ('recon', 27),
    ('molport', 28),
    ('nikkaji', 29),
    ('bindingdb', 31),
])

# Reverse lookup: numeric source id -> source name.
id_to_source = dict(zip(source_to_id.values(), source_to_id.keys()))

def connectivity_query(source, compound_id, target = None, timeout = 60):
    """
    Yield UniChem connectivity-search matches for a single compound.

    Endpoint and URL parameter documentation:
    https://www.ebi.ac.uk/unichem/info/widesearchInfo

    This is a generator: no request is sent until it is iterated.

    Parameters
    ----------
    source : str
        Key of `source_to_id` naming the database `compound_id` comes from.
    compound_id : str
        Identifier of the query compound within `source`.
    target : str or None
        Key of `source_to_id` sent as the "Sources" URL parameter.
        The default None is sent as 0.
    timeout : float or tuple
        Passed to `requests.get`. Without it a stalled connection would
        block the caller indefinitely.

    Yields
    ------
    collections.OrderedDict
        One mapping per matched row, pairing the column names in the first
        row of each assignment with the values of that row.

    Notes
    -----
    If the service replies with an 'error' entry, the message is printed and
    nothing is yielded. Network failures (including timeouts) propagate as
    `requests` exceptions, and an unknown `source` or `target` raises KeyError.
    """
    url = '{base_url}/{src_compound_id}/{src_id}/{A}/{B}/{C}/{D}/{E}/{F}/{G}/{H}'.format(
        base_url = cpd_search_url,
        src_compound_id = compound_id,
        src_id = source_to_id[source],
        A = source_to_id[target], # Sources
        B = 0, # Pattern
        C = 0, # Component Mapping
        D = 0, # Frequency Block
        E = 0, # InChI Length Block
        F = 0, # UniChem Labels
        G = 0, # Assignment Status
        H = 1, # Data Structure
    )
    # Always bound the wait: requests has no default timeout.
    response = requests.get(url, timeout = timeout)
    result = response.json()
    if 'error' in result:
        # Best effort: report the service's message and produce no matches.
        print(result['error'])
        return
    for assignment in result.values():
        # The first row of each assignment holds the column names of the rest.
        header, rows = assignment[0], assignment[1:]
        for match in rows:
            yield collections.OrderedDict(zip(header, match))

In [64]:
# Spot-check the query on one DrugBank compound and pretty-print its matches.
compound_id = drugbank_ids[1000]  # index 1000: presumably an arbitrary example
source = 'drugbank'
matches = [*connectivity_query(source, compound_id)]
print(json.dumps(matches, indent = 2))

[
  {
    "src_compound_id": "6C26C73DEEF3580AFA2799A579D058EC",
    "src_id": "11",
    "aux_src": null,
    "assignment": "1",
    "label": "",
    "C": "0",
    "Full_Query_InChI": "InChI=1S/C14H14N2O/c1-14(2,12-6-4-8-16-10-12)13(17)11-5-3-7-15-9-11/h3-10H,1-2H3",
    "Full_CpdId_InChI": "InChI=1S/C14H14N2O/c1-14(2,12-6-4-8-16-10-12)13(17)11-5-3-7-15-9-11/h3-10H,1-2H3",
    "Matching_Query_InChI": "InChI=1S/C14H14N2O/c1-14(2,12-6-4-8-16-10-12)13(17)11-5-3-7-15-9-11/h3-10H,1-2H3",
    "Matching_CpdId_InChI": "InChI=1S/C14H14N2O/c1-14(2,12-6-4-8-16-10-12)13(17)11-5-3-7-15-9-11/h3-10H,1-2H3",
    "p": "0",
    "b": "0",
    "t": "0",
    "m": "0",
    "s": "0",
    "i": "0",
    "Query_InChIKey": "FJLBFSROUSIWMA-UHFFFAOYSA-N",
    "CpdId_InChIKey": "FJLBFSROUSIWMA-UHFFFAOYSA-N"
  },
  {
    "src_compound_id": "14749227",
    "src_id": "21",
    "aux_src": null,
    "assignment": "1",
    "label": "",
    "C": "0",
    "Full_Query_InChI": "InChI=1S/C14H14N2O/c1-14(2,12-6-4-8-16-10-12)13

In [None]:
# Query UniChem for every small-molecule drug, collecting:
#   matches -- one dict per matched compound, annotated with the DrugBank drug
#   counts  -- one dict per drug holding the number of matches per source
counts = list()
matches = list()

for drug in drugbank:
    # Only small molecules are queried; every other drug type is skipped.
    if drug['type'] != 'small molecule':
        continue
    drugbank_id = drug['drugbank_id']
    drugbank_name = drug['name']
    print(drugbank_id, drugbank_name)
    # Name the source explicitly: the IDs queried here are always DrugBank IDs,
    # so this cell must not rely on a `source` variable left by another cell.
    query_matches = list(connectivity_query('drugbank', drugbank_id))
    for match in query_matches:
        match['drugbank_id'] = drugbank_id
        match['drugbank_name'] = drugbank_name
        # Raises KeyError if UniChem reports a src_id missing from id_to_source.
        match['source_name'] = id_to_source[int(match['src_id'])]
        matches.append(match)
    # Matches per source for this drug; sources without a match read as 0.
    count = collections.Counter(match['source_name'] for match in query_matches)
    count = collections.defaultdict(int, count)
    count['drugbank_id'] = drugbank_id
    count['drugbank_name'] = drugbank_name
    counts.append(count)

DB00014 Goserelin
DB00035 Desmopressin
DB00050 Cetrorelix
DB00091 Cyclosporine
DB00093 Felypressin
DB00104 Octreotide
DB00114 Pyridoxal Phosphate
DB00115 Cyanocobalamin
No currently assigned Standard InChIKey could be found for this src_comound_id in UniChem 
DB00116 Tetrahydrofolic acid
DB00117 L-Histidine
DB00118 S-Adenosylmethionine
DB00119 Pyruvic acid
DB00120 L-Phenylalanine
DB00121 Biotin
DB00122 Choline
DB00123 L-Lysine
DB00125 L-Arginine
DB00126 Vitamin C
DB00127 Spermine
DB00128 L-Aspartic Acid
DB00129 L-Ornithine
DB00130 L-Glutamine
DB00131 Adenosine monophosphate
DB00132 Alpha-Linolenic Acid
DB00133 L-Serine
DB00134 L-Methionine
DB00135 L-Tyrosine
DB00136 Calcitriol
DB00137 Xanthophyll
DB00138 L-Cystine
DB00139 Succinic acid
DB00140 Riboflavin
DB00141 N-Acetyl-D-glucosamine
DB00142 L-Glutamic Acid
DB00143 Glutathione
DB00144 Phosphatidylserine
DB00145 Glycine
DB00146 Calcidiol
DB00147 Pyridoxal
DB00148 Creatine
DB00149 L-Leucine
DB00150 L-Tryptophan
DB00151 L-Cysteine
DB0015

In [88]:
# Save mapping counts: one row per drug, one column per source
path = os.path.join('data', 'mapping-counts.tsv')
# newline='' lets the csv module control line endings itself, as its
# documentation requires (otherwise rows can gain an extra '\r' on Windows).
with open(path, 'w', newline = '') as write_file:
    # Source columns in ascending source-id order; id 0 maps to None rather
    # than to a source name, so it is left out.
    source_names = [id_to_source[i] for i in sorted(set(id_to_source) - {0})]
    fieldnames = ['drugbank_id', 'drugbank_name'] + source_names
    # restval = 0: a source with no match for a drug is written as 0.
    writer = csv.DictWriter(write_file, delimiter = '\t', fieldnames = fieldnames, restval = 0)
    writer.writeheader()
    writer.writerows(counts)

In [None]:
# Save mappings: one row per matched compound, gzip-compressed TSV
path = os.path.join('data', 'mapping.tsv.gz')
# Text mode ('wt') is required: csv writes str, whereas gzip's 'w' mode is
# binary and makes the writer fail with TypeError. newline='' leaves line
# endings to the csv module.
with gzip.open(path, 'wt', newline = '') as write_file:
    # Annotation columns first, then every other column in first-seen order.
    # Each match already carries the annotation keys, so they must not be
    # listed a second time (that would duplicate those columns).
    fieldnames = ['drugbank_id', 'drugbank_name', 'source_name']
    seen = set(fieldnames)
    # Scanning all matches (not just the first) also works when `matches` is
    # empty and when responses differ in their columns.
    for match in matches:
        for key in match:
            if key not in seen:
                seen.add(key)
                fieldnames.append(key)
    writer = csv.DictWriter(write_file, delimiter = '\t', fieldnames = fieldnames)
    writer.writeheader()
    writer.writerows(matches)