In [70]:
import os
import csv
import gzip
import io
import pprint
import collections

In [71]:
def read_bindingdb(path):
    """
    Stream the rows of a gzip-compressed, tab-delimited BindingDB export.

    Field documentation: https://www.bindingdb.org/bind/chemsearch/marvin/BindingDB-TSV-Format.pdf

    Parameters
    ----------
    path : str
        Location of the gzipped TSV file.

    Yields
    ------
    mapping
        One mapping per data row, keyed by the names on the header line
        (as produced by csv.DictReader). When a name occurs more than once
        in the header, only the value of its last occurrence is kept, and
        columns missing from a short row are filled with None.
    """
    # The with-block guarantees the file is closed even when the caller stops
    # iterating early or an error interrupts the read; newline='' is the
    # setting the csv module documents for file objects it reads from.
    with gzip.open(path, 'rt', newline='') as text:
        reader = csv.DictReader(text, delimiter='\t')
        yield from reader

# Location of the gzipped BindingDB dump, relative to the working directory.
path = os.path.join(
    'download',
    'BindingDB_All_2015m3.tsv.gz',
)
# read_bindingdb is a generator function: nothing is read from disk until
# the returned generator is iterated.
bindingdb_generator = read_bindingdb(path)

In [73]:
# Tally the values found under the SwissProt primary-ID column of every row.
# NOTE(review): csv.DictReader keeps only the last value when a header name
# repeats and fills columns missing from a short row with None. If this
# column name occurs once per target chain in the header, the tally below
# reflects only the last chain slot -- confirm against the file header.
uniprot_key = 'UniProt (SwissProt) Primary ID of Target Chain'
uniprots = [row[uniprot_key] for row in bindingdb_generator]

collections.Counter(uniprots)

Counter({None: 1115637, '': 1})

In [90]:
# Names of the columns that describe a single target chain.
# read_bindingdb zips these names, in this order, with a slice of
# len(target_fields) values taken from each row, once per chain.
# NOTE(review): the double space in 'BindingDB Target Chain  Sequence' is
# presumably copied verbatim from the file header -- confirm before changing.
target_fields = [
    'BindingDB Target Chain  Sequence',
    'PDB ID(s) of Target Chain',
    'UniProt (SwissProt) Recommended Name of Target Chain',
    'UniProt (SwissProt) Entry Name of Target Chain',
    'UniProt (SwissProt) Primary ID of Target Chain',
    'UniProt (SwissProt) Secondary ID(s) of Target Chain',
    'UniProt (SwissProt) Alternative ID(s) of Target Chain',
    'UniProt (TrEMBL) Submitted Name of Target Chain',
    'UniProt (TrEMBL) Entry Name of Target Chain',
    'UniProt (TrEMBL) Primary ID of Target Chain',
    'UniProt (TrEMBL) Secondary ID(s) of Target Chain',
    'UniProt (TrEMBL) Alternative ID(s) of Target Chain',
]

# Header name of the chain-count column. read_bindingdb looks up its position
# in the header to separate the leading ligand columns from the per-chain
# columns, and converts its value to int to know how many chains to read.
chains_key = 'Number of Protein Chains in Target (>1 implies a multichain complex)'

def read_bindingdb(path):
    """
    Stream a gzip-compressed BindingDB TSV export, grouping per-chain columns.

    Field documentation: https://www.bindingdb.org/bind/chemsearch/marvin/BindingDB-TSV-Format.pdf

    The header is split at the column named by the module-level ``chains_key``:
    columns up to and including it are returned as-is, and the columns after
    it are read in consecutive groups of ``len(target_fields)``, one group per
    chain, labelled with the module-level ``target_fields`` names.

    Parameters
    ----------
    path : str
        Location of the gzipped TSV file.

    Yields
    ------
    collections.OrderedDict
        One mapping per data row holding the leading columns keyed by header
        name, plus a ``'chains'`` entry: a list with one OrderedDict per
        chain of that row. Empty strings are converted to None.

    Raises
    ------
    ValueError
        If the header has no ``chains_key`` column, or a row's chain count
        cannot be parsed as an integer.
    TypeError
        If a row's chain count is empty (it was converted to None).
    """
    n_target_fields = len(target_fields)
    # The with-block closes the file even if the consumer stops early.
    with gzip.open(path, 'rt', newline='') as text:
        reader = csv.reader(text, delimiter='\t')
        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to yield.
            return
        chains_index = header.index(chains_key)
        # Everything up to and including the chain-count column is per-row.
        n_leading_columns = chains_index + 1
        ligand_fields = header[:n_leading_columns]
        # The first chain's group starts right after the chain-count column.
        target0_index = n_leading_columns
        for row in reader:
            row = [x if x else None for x in row]
            ligand_values = row[:n_leading_columns]
            rowdict = collections.OrderedDict(zip(ligand_fields, ligand_values))
            # A fresh list per row, so rows do not share or accumulate chains.
            chains = []
            for i in range(int(rowdict[chains_key])):
                start = target0_index + i * n_target_fields
                target_values = row[start:start + n_target_fields]
                chains.append(collections.OrderedDict(zip(target_fields, target_values)))
            rowdict['chains'] = chains
            yield rowdict


In [93]:
path = os.path.join('download', 'BindingDB_All_2015m3.tsv.gz')
bindingdb_generator = read_bindingdb(path)

# Preview the first three parsed rows. zip stops as soon as the short range
# is exhausted, so no additional row is pulled from the generator.
for _, row in zip(range(3), bindingdb_generator):
    pprint.pprint(row)

# Shut the generator down explicitly instead of leaving it suspended mid-file.
bindingdb_generator.close()


{'BindingDB Reactant_set_id': '1',
 'Ligand SMILES': 'COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1',
 'Ligand InChI': 'InChI=1S/C22H24BrFN4O2/c1-28-7-5-14(6-8-28)12-30-21-11-19-16(10-20(21)29-2)22(26-13-25-19)27-18-4-3-15(23)9-17(18)24/h3-4,9-11,13-14H,5-8,12H2,1-2H3,(H,25,26,27)',
 'Ligand InChI Key': 'UHTHHESEBZOYNR-UHFFFAOYSA-N',
 'BindingDB MonomerID': '21',
 'BindingDB Ligand Name': 'VANDETANIB',
 'Target Name Assigned by Curator or DataSource': 'HIV-1 Protease',
 'Target Source Organism According to Curator or DataSource': 'Human '
                                                              'immunodeficiency '
                                                              'virus 1',
 'Ki (nM)': ' .24',
 'IC50 (nM)': None,
 'Kd (nM)': None,
 'EC50 (nM)': None,
 'kon (M-1-s-1)': None,
 'koff (s-1)': None,
 'pH': '5.5',
 'Temp (C)': '37.00 C',
 'Curation/DataSource': 'Curated from the literature by BindingDB',
 'Article DOI': '10.1021/jm9602571',
 'PMID': '8784449',
 'PubChem AID'

In [None]:
path = os.path.join('download', 'BindingDB_All_2015m3.tsv.gz')
bindingdb_generator = read_bindingdb(path)

# Count how many rows report each chain-count value across the whole file.
chain_counts = collections.Counter()
for row in bindingdb_generator:
    chain_counts[int(row[chains_key])] += 1
chain_counts