# Create a single dataset of Compound-Gene binding relationships from BindingDB and DrugBank

In [1]:
import sys
import itertools

import pandas

sys.path.insert(0, '../')
import utils

In [2]:
def split_and_clean_ids(id_str, sep='|'):
    """
    Split a delimited string of IDs into a set of distinct, non-empty IDs.

    A falsy or null `id_str` (for example '', None or NaN) yields an empty
    set. Empty fields produced by leading, trailing or repeated separators
    are dropped.
    """
    is_missing = not id_str or pandas.isnull(id_str)
    if is_missing:
        return set()
    return {token for token in id_str.split(sep) if token}

## Read datasets

In [3]:
# Read BindingDB bindings from a pinned commit of the dhimmel/bindingdb repository
commit = '28dc70275103a233a2f02024082adcea45102a96'
url = utils.rawgit('dhimmel', 'bindingdb', commit, 'data/bindings-drugbank-gene.tsv')
binding_df = pandas.read_table(url)
# Filter for micromolar binding affinity (affinity_nM of at most 1000).
# Take an explicit copy so the column assignments below operate on an
# independent frame instead of a slice of the unfiltered table, which would
# otherwise trigger pandas' SettingWithCopyWarning.
binding_df = binding_df[binding_df.affinity_nM <= 1000].copy()
# Parse the comma-delimited compound fields into sets
binding_df['sources'] = binding_df['sources'].map(lambda x: split_and_clean_ids(x, ','))
binding_df['pubmeds'] = binding_df['pubmeds'].map(lambda x: split_and_clean_ids(x, ','))
# Use the same column names as the DrugBank table
binding_df = binding_df.rename(columns={'entrez_gene': 'entrez_gene_id', 'pubmeds': 'pubmed_ids'})
binding_df.head(2)

Unnamed: 0,drugbank_id,entrez_gene_id,affinity_nM,n_pairs,sources,pubmed_ids,drugbank_name,drugbank_approved,gene_symbol
0,DB00035,552,62.4,1,set([ChEMBL]),set([15084136]),Desmopressin,1,AVPR1A
1,DB00035,553,5.8,1,set([ChEMBL]),set([15084136]),Desmopressin,1,AVPR1B


In [4]:
# Read DrugBank compound-gene interactions from a pinned commit of the
# dhimmel/drugbank repository
commit = '3e87872db5fca5ac427ce27464ab945c0ceb4ec6'
url = utils.rawgit('dhimmel', 'drugbank', commit, 'data/proteins.tsv')
drugbank_protein_df = pandas.read_table(url)
# Parse the pipe-delimited compound fields into sets ('|' is the default
# separator of split_and_clean_ids)
drugbank_protein_df['pubmed_ids'] = drugbank_protein_df['pubmed_ids'].map(split_and_clean_ids)
drugbank_protein_df['actions'] = drugbank_protein_df['actions'].map(split_and_clean_ids)
# Label every row with a single-element source set such as
# {'DrugBank (target)'}. Only the category column is needed, so map over that
# column rather than applying a function to every full row.
drugbank_protein_df['sources'] = drugbank_protein_df['category'].map(
    lambda category: {'DrugBank ({})'.format(category)})
drugbank_protein_df.head(2)

Unnamed: 0,drugbank_id,category,uniprot_id,entrez_gene_id,organism,known_action,actions,pubmed_ids,sources
0,DB00001,target,P00734,2147,Human,yes,set([inhibitor]),"set([11807012, 10912644, 11467439, 10505536, 1...",set([DrugBank (target)])
1,DB00006,target,P00734,2147,Human,yes,set([inhibitor]),"set([11060732, 11833835, 11504570, 11923794, 1...",set([DrugBank (target)])


In [5]:
# Number of DrugBank interaction rows in each category
drugbank_protein_df['category'].value_counts()

target         14282
enzyme          3547
transporter     1757
carrier          320
Name: category, dtype: int64

## Combine BindingDB and DrugBank

In [6]:
# Create a combined dataset of BindingDB and DrugBank by stacking all rows.
# pandas.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0. A column present in only one of the two frames
# (actions, affinity_nM) is filled with NaN for the other frame's rows.
long_df = pandas.concat([
    drugbank_protein_df[['drugbank_id', 'entrez_gene_id', 'sources', 'pubmed_ids', 'actions']],
    binding_df[['drugbank_id', 'entrez_gene_id', 'affinity_nM', 'sources', 'pubmed_ids']],
])
# Rows that came without an actions value hold NaN after the concatenation;
# replace anything that is not already a set with an empty set so the column
# is uniformly set-valued.
long_df['actions'] = long_df['actions'].map(lambda x: x if isinstance(x, set) else set())
long_df.head()

Unnamed: 0,actions,affinity_nM,drugbank_id,entrez_gene_id,pubmed_ids,sources
0,set([inhibitor]),,DB00001,2147,"set([11807012, 10912644, 11467439, 10505536, 1...",set([DrugBank (target)])
1,set([inhibitor]),,DB00006,2147,"set([11060732, 11833835, 11504570, 11923794, 1...",set([DrugBank (target)])
2,set([activator]),,DB00025,2147,"set([10688826, 2402772, 12463731, 1139038, 309...",set([DrugBank (enzyme)])
3,set([]),,DB00055,2147,"set([12062545, 12296618, 12070133, 12208873, 1...",set([DrugBank (target)])
4,set([]),,DB00100,2147,"set([11019961, 10648407, 10499905, 10498586, 1...",set([DrugBank (target)])


In [7]:
def get_license(sources):
    """
    Look up the license of a binding relationship from its sources.

    The license depends on the exact combination of sources. None is
    returned for any combination that is not listed below.
    """
    license_by_sources = {
        frozenset(['BindingDB']): 'CC-BY 3.0',
        frozenset(['ChEMBL']): 'CC-BY-SA 3.0',
        frozenset(['BindingDB', 'ChEMBL']): 'CC-BY-SA 3.0',
    }
    return license_by_sources.get(frozenset(sources))

def condense(df):
    """
    Combine all rows of one compound-gene pair into a single row.

    `df` holds the rows of one group and must provide the set-valued columns
    `sources`, `pubmed_ids` and `actions` plus the numeric column
    `affinity_nM`.

    Returns a Series with:
      sources, pubmed_ids, actions -- union of the respective sets over rows
      affinity_nM -- mean affinity over the rows, ignoring missing values
      license -- result of get_license for the combined sources
    """
    def union(column):
        """Return the union of all sets in a column."""
        return set(itertools.chain.from_iterable(column))

    sources = union(df.sources)
    values = [
        sources,
        union(df.pubmed_ids),
        union(df.actions),
        df.affinity_nM.mean(skipna=True),
        get_license(sources),
    ]
    index = ['sources', 'pubmed_ids', 'actions', 'affinity_nM', 'license']
    # Build the row in one step with an explicit object dtype: an empty
    # pandas.Series() without a dtype emits a DeprecationWarning on pandas 1.x,
    # and the row holds mixed values (sets, a float, a string or None) anyway.
    return pandas.Series(values, index=index, dtype=object)

# Collapse long_df to one row per (drugbank_id, entrez_gene_id) pair
condensed_df = (
    long_df
    .groupby(['drugbank_id', 'entrez_gene_id'])
    .apply(condense)
    .reset_index()
)

In [8]:
# Count the rows of the condensed bindings table
condensed_df.shape[0]

23191

In [9]:
# Serialize the set-valued columns as sorted, pipe-delimited strings
for column in ['sources', 'pubmed_ids', 'actions']:
    condensed_df[column] = ['|'.join(sorted(ids)) for ids in condensed_df[column]]

In [10]:
# Write the condensed bindings to a tab-separated file, omitting the index
condensed_df.to_csv('CbG-binding.tsv', index=False, sep='\t')

In [11]:
# Preview the first two condensed bindings
condensed_df.head(n=2)

Unnamed: 0,drugbank_id,entrez_gene_id,sources,pubmed_ids,actions,affinity_nM,license
0,DB00001,2147,DrugBank (target),10505536|10912644|11055889|11467439|11752352|1...,inhibitor,,
1,DB00002,712,DrugBank (target),17016423|17139284,,,
