In [1]:
import glob
from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd

In [97]:
def names(drug):
    """Collect the id, name and synonyms of every top-level drug entry.

    Parameters
    ----------
    drug : parsed XML node
        The ``<drugbank>`` element (or any node whose descendants include
        ``<drug>`` elements).

    Returns
    -------
    list of [str, str, str]
        One ``[dbid, name, synonyms]`` row per ``<drug>`` whose direct parent
        is ``<drugbank>``; ``synonyms`` is the drug's synonym texts joined
        with ``'; '`` (empty string when there are none).
    """
    lst = []
    for i in drug.select('drug'):
        # Only <drug> elements sitting directly under <drugbank> are records;
        # <drug> elements nested deeper in the tree are skipped.
        if i.parent.name != 'drugbank':
            continue
        name = i.find('name').text
        dbid = i.find('drugbank-id').text
        # Read only the <synonyms> list that is a direct child of the drug.
        # A descendant search would also pick up <synonyms> lists nested
        # deeper inside the entry (e.g. those attached to polypeptides in the
        # DrugBank schema) and report them as synonyms of the drug itself.
        synonyms_node = i.find('synonyms', recursive=False)
        if synonyms_node is None:
            tmp = []
        else:
            tmp = [k.text for k in synonyms_node.find_all('synonym', recursive=False)]
        lst.append([dbid, name, '; '.join(tmp)])
    return lst

def targets(drug):
    """List the targets recorded for every ``<drug>`` element under ``drug``.

    Parameters
    ----------
    drug : parsed XML node
        The ``<drugbank>`` element (or any node whose descendants include
        ``<drug>`` elements).

    Returns
    -------
    list of [str, str, str, str]
        One ``[dbid, gene_name, uniprot_id, action]`` row per ``<target>``.
        A field the target does not provide is reported as ``''``.
    """
    lst = []
    for i in drug.select('drug'):
        dbid = i.find('drugbank-id').text
        for j in i.select('targets'):
            for o in j.select('target'):
                gene_node = o.find('gene-name')
                target_geneName = gene_node.text if gene_node is not None else ''

                # Reset for every target: otherwise a target without a
                # UniProtKB identifier would silently inherit the accession
                # of the previous target of the same drug.
                target_UniProtKB = ''
                for k in o.select('external-identifier'):
                    resource = k.find('resource')
                    identifier = k.find('identifier')
                    if resource is None or identifier is None:
                        continue
                    if resource.text == 'UniProtKB':
                        target_UniProtKB = identifier.text

                # Same reset for the action: a target with no <actions>
                # element must yield '' rather than a stale value (or a
                # NameError when it is the very first target processed).
                target_action = ''
                for k in o.select('actions'):
                    # Only the first <action> of the list is kept.
                    action_node = k.find('action')
                    target_action = action_node.text if action_node is not None else ''

                lst.append([dbid, target_geneName, target_UniProtKB, target_action])
    return lst

def interactions(drug):
    """List the drug-drug interactions of every ``<drug>`` element under ``drug``.

    Parameters
    ----------
    drug : parsed XML node
        The ``<drugbank>`` element (or any node whose descendants include
        ``<drug>`` elements).

    Returns
    -------
    list of [str, str, str]
        One ``[dbid, interactor_dbid, description]`` row per
        ``<drug-interaction>`` element, in document order.
    """
    rows = []
    for entry in drug.select('drug'):
        source_id = entry.find('drugbank-id').text
        for group in entry.select('drug-interactions'):
            rows.extend(
                [source_id, item.find('drugbank-id').text, item.find('description').text]
                for item in group.select('drug-interaction')
            )
    return rows

# Sort the matches: glob returns paths in arbitrary order, and a fixed order
# keeps the row order of the resulting tables reproducible between runs.
filelist = sorted(glob.glob('./DrugBank_full_database_v5/*.xml'))
if not filelist:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" error after an empty loop.
    raise FileNotFoundError('No XML files found in ./DrugBank_full_database_v5/')

# One DataFrame per parsed <drugbank> root is collected in each list and the
# lists are concatenated once after the loop.
lst_names = []
lst_targets = []
lst_interactions = []

for file in tqdm(filelist):
    # The context manager closes each handle as soon as the file is read,
    # so handles do not pile up while looping over the whole directory.
    with open(file, 'r', encoding='utf-8') as infile:
        contents = infile.read()
    soup = BeautifulSoup(contents, 'lxml-xml')
    drugs = soup.select('drugbank')
    for drug in drugs:
        lst_names.append(pd.DataFrame.from_records(names(drug), columns=['DrugBankID','DrugName','DrugSynonyms']))
        lst_targets.append(pd.DataFrame.from_records(targets(drug), columns=['DrugBankID','TargetGeneSymbol','TargetUniProtID','Action']))
        lst_interactions.append(pd.DataFrame.from_records(interactions(drug), columns=['DrugBankID_1','DrugBankID_2','Description']))

# ignore_index=True gives each combined table a single continuous index
# instead of repeating the per-file 0..k labels, which would make label-based
# lookups on the combined frames ambiguous.
df_names = pd.concat(lst_names, ignore_index=True)
df_targets = pd.concat(lst_targets, ignore_index=True)
df_interactions = pd.concat(lst_interactions, ignore_index=True)

100%|██████████| 501/501 [24:44<00:00,  2.96s/it] 


In [99]:
# Write each combined table to a tab-separated file in the working directory,
# leaving the DataFrame index out of the output.
for frame, out_path in (
    (df_names, 'DrugBank_names.tsv'),
    (df_targets, 'DrugBank_targets.tsv'),
    (df_interactions, 'DrugBank_interactions.tsv'),
):
    frame.to_csv(out_path, index=False, sep='\t')