Run using Python 3 to avoid a non-ASCII character error when writing to a file with the csv module.

In [1]:
import os
import csv
import gzip

import xml.etree.ElementTree as ET

In [2]:
# Path to the gzip-compressed DrugBank XML file.
xml_path = os.path.join('download', 'drugbank.xml.gz')

# Decompress on the fly and parse the whole document into an element tree.
with gzip.open(xml_path, mode='rb') as xml_file:
    tree = ET.parse(xml_file)

# Top-level element of the parsed document.
root = tree.getroot()

In [14]:
# Namespace prefix, in the '{uri}' form ElementTree uses for qualified tags.
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

# Expand the path expressions once: they are identical for every drug, so
# there is no need to rebuild them on each loop iteration.
inchikey_path = inchikey_template.format(ns=ns)
inchi_path = inchi_template.format(ns=ns)
groups_path = ns + "groups/" + ns + "group"
atc_codes_path = ns + "atc-codes/" + ns + "atc-code"

# Build one dict per child element of the root.
rows = list()
for drug in root:
    # Every direct child of the root is expected to be a drug element.
    assert drug.tag == ns + 'drug'
    row = {
        'type': drug.get('type'),
        # findtext returns None when no matching element exists.
        'drugbank_id': drug.findtext(ns + "drugbank-id[@primary='true']"),
        'name': drug.findtext(ns + "name"),
        # Multi-valued fields are collected as lists.
        'groups': [group.text for group in drug.findall(groups_path)],
        'atc_codes': [code.get('code') for code in drug.findall(atc_codes_path)],
        'inchi': drug.findtext(inchi_path),
        'inchikey': drug.findtext(inchikey_path),
    }
    rows.append(row)

In [15]:
def collapse_list_values(row):
    """Join every list-valued entry of *row* into a '|'-delimited string.

    The dict is modified in place and the same object is returned, so the
    function can be used directly with ``map``. Values that are not lists
    are left untouched.
    """
    list_keys = [key for key, value in row.items() if isinstance(value, list)]
    for key in list_keys:
        row[key] = '|'.join(row[key])
    return row

# Flatten the list-valued fields of every row into '|'-joined strings.
rows = [collapse_list_values(row) for row in rows]

In [16]:
# Column order of the output table.
fieldnames = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'inchikey', 'inchi']

# Write every row as a tab-separated table.
path = os.path.join('data', 'drugbank.tsv')
# newline='' lets the csv module control line endings itself (otherwise an
# extra '\r' is emitted on platforms that translate '\n'); an explicit UTF-8
# encoding keeps non-ASCII values from depending on the platform default.
with open(path, 'w', newline='', encoding='utf-8') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

In [17]:
# write slim drugbank: only approved small molecules that have an InChI
fieldnames = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'inchikey', 'inchi']

path = os.path.join('data', 'drugbank-slim.tsv')
# newline='' lets the csv module control line endings itself (otherwise an
# extra '\r' is emitted on platforms that translate '\n'); an explicit UTF-8
# encoding keeps non-ASCII values from depending on the platform default.
with open(path, 'w', newline='', encoding='utf-8') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=fieldnames)
    writer.writeheader()
    for row in rows:
        # Keep small molecules only.
        if row['type'] != 'small molecule':
            continue
        # Skip rows with a missing or empty InChI.
        if not row['inchi']:
            continue
        # 'groups' is a '|'-joined string at this point; require 'approved'
        # to be one of its members.
        if 'approved' not in row['groups'].split('|'):
            continue
        writer.writerow(row)