From the full BindingDB database, we extracted a smaller, more manageable dataset by keeping only the ligand-target pairs that involve viruses associated with sexually transmitted diseases (STDs), such as the Human Immunodeficiency Virus (HIV). This subset is saved in the same `.tsv` file format as the original database.

For more details on the extraction process, refer to [`./src/data/std_extraction.ipynb`](./src/data/std_extraction.ipynb).


In [9]:
#import libraries and scripts

import pandas as pd
import numpy as np
from src.utils import data_utils, evaluation_utils, general_utils

In [10]:
# Load the STD subset of BindingDB from its tab-separated file.
df = pd.read_csv(
    'BindingSTD.tsv',
    sep='\t',
    on_bad_lines='skip',  # rows that cannot be parsed are dropped instead of raising
    low_memory=False,     # read the file in one pass so column dtypes are inferred consistently
)

In [11]:
# Select the EC50 metric through the project helper in src.utils.data_utils.
# NOTE(review): presumably narrows the frame to entries reporting this metric;
# the exact filtering rule is not visible here — confirm in select_metric.
METRIC_COLUMN = 'EC50 (nM)'
df = data_utils.select_metric(df, METRIC_COLUMN)

In [12]:
# Dimensions (rows, columns) of the frame returned by select_metric.
df.shape

(2372, 194)

In [13]:
# Project helper from src.utils.data_utils. Per the shapes displayed around this
# cell, the frame goes from 194 to 26 columns while the row count is unchanged.
# NOTE(review): presumably drops columns that carry no data for this subset —
# confirm the exact rule in clean_na_columns.
df = data_utils.clean_na_columns(df)

In [14]:
# Dimensions (rows, columns) of the frame returned by clean_na_columns.
df.shape

(2372, 26)

In [15]:
# Display the cleaned frame; the notebook renders a truncated preview of its rows and columns.
df

Unnamed: 0,BindingDB Reactant_set_id,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB MonomerID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,EC50 (nM),Curation/DataSource,...,Link to Target in BindingDB,Link to Ligand-Target Pair in BindingDB,PubChem CID,PubChem SID,Number of Protein Chains in Target (>1 implies a multichain complex),BindingDB Target Chain Sequence,PDB ID(s) of Target Chain,UniProt (TrEMBL) Submitted Name of Target Chain,UniProt (TrEMBL) Entry Name of Target Chain,UniProt (TrEMBL) Primary ID of Target Chain
5273,50518.0,CC(C)(C)c1ccc(-c2cn[nH]c2OCC(=O)Nc2ccc(cc2Cl)-...,"InChI=1S/C29H27Cl2N3O4/c1-29(2,3)20-9-10-21(23...",JLJKYEKBNAPOGV-UHFFFAOYSA-N,27603.0,2-{4-[4-(2-{[4-(4-tert-butyl-2-chlorophenyl)-1...,Gag-Pol polyprotein [588-1027],Human immunodeficiency virus 1,4869.0,Curated from the literature by BindingDB,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,25218440.0,57560117.0,2.0,MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAV...,"1C0T,1C0U,1C1B,1C1C,1DTQ,1DTT,1E6J,1EP4,1ESK,1...",,,
5299,50544.0,CC(C)(C)c1ccc(-c2cn[nH]c2OCC(=O)Nc2ccc(cc2Cl)-...,"InChI=1S/C29H27Cl2N3O4/c1-29(2,3)20-9-10-21(23...",JLJKYEKBNAPOGV-UHFFFAOYSA-N,27603.0,2-{4-[4-(2-{[4-(4-tert-butyl-2-chlorophenyl)-1...,"Gag-Pol polyprotein [588-1027,K691N,Y769C]/[58...",Human immunodeficiency virus 1,444.0,Curated from the literature by BindingDB,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,25218440.0,57560117.0,2.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,"1EET,1HAR,1HYS,1IKW,1LWE,1O1W,1UWB,2YKM,2YKN,2...",,,
5300,50545.0,Cc1[nH]nc(OCC(=O)Nc2ccc(cc2Cl)C#CC(C)(C)CO)c1-...,InChI=1S/C28H31Cl2N3O3/c1-17-25(20-9-8-19(14-2...,IVPUMMMSBHDOHK-UHFFFAOYSA-N,27604.0,2-{[4-(4-tert-butyl-2-chlorophenyl)-5-methyl-1...,"Gag-Pol polyprotein [588-1027,K691N,Y769C]/[58...",Human immunodeficiency virus 1,177.0,Curated from the literature by BindingDB,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,25218441.0,57560118.0,2.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,"1EET,1HAR,1HYS,1IKW,1LWE,1O1W,1UWB,2YKM,2YKN,2...",,,
5301,50546.0,Cc1[nH]nc(OCC(=O)Nc2ccc(cc2Cl)C#CC(C)(C)C(O)=O...,InChI=1S/C28H29Cl2N3O4/c1-16-24(19-9-8-18(14-2...,VIKLGGPFTSLKCS-UHFFFAOYSA-N,27605.0,4-[4-(2-{[4-(4-tert-butyl-2-chlorophenyl)-5-me...,"Gag-Pol polyprotein [588-1027,K691N,Y769C]/[58...",Human immunodeficiency virus 1,73.0,Curated from the literature by BindingDB,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,25218442.0,57560119.0,2.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,"1EET,1HAR,1HYS,1IKW,1LWE,1O1W,1UWB,2YKM,2YKN,2...",,,
5304,50549.0,Fc1c(Cc2n[nH]c3c(cccc23)C#N)ccc(Br)c1Oc1cc(Cl)...,InChI=1S/C22H11BrClFN4O/c23-18-5-4-13(8-19-17-...,KHGOZPBXQQRDPS-UHFFFAOYSA-N,27609.0,3-{[4-bromo-3-(3-chloro-5-cyanophenoxy)-2-fluo...,Gag-Pol polyprotein [588-1027],Human immunodeficiency virus 1,5.0,Curated from the literature by BindingDB,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,24885899.0,57560123.0,2.0,MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAV...,"1C0T,1C0U,1C1B,1C1C,1DTQ,1DTT,1E6J,1EP4,1ESK,1...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29855,51483932.0,Cc1cc(\C=C\C#N)cc(C)c1-n1cnc2cnc(Nc3ccc(cc3)C#...,InChI=1S/C23H17N7/c1-15-10-18(4-3-9-24)11-16(2...,GBIZDABECURRFX-ONEGZZNKSA-N,50597919.0,CHEMBL5187795,Reverse transcriptase,Human immunodeficiency virus 1,1.5,ChEMBL,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,16074499.0,482608771.0,1.0,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,Reverse transcriptase,Q9WKE8_9HIV1,Q9WKE8
29856,51483933.0,Cc1cc(\C=C\C#N)cc(C)c1Nc1ccnc(Nc2ccc(cc2)C#N)n1,InChI=1S/C22H18N6/c1-15-12-18(4-3-10-23)13-16(...,YIBOMRUWOWDFLG-ONEGZZNKSA-N,222178.0,Rilpivirine,Reverse transcriptase,Human immunodeficiency virus 1,2.3,ChEMBL,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,6451164.0,335961047.0,1.0,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,Reverse transcriptase,Q9WKE8_9HIV1,Q9WKE8
29857,51483934.0,Cc1cc(\C=C\C#N)cc(C)c1Nc1ncc2ncn(-c3ccc(cc3)C#...,InChI=1S/C23H17N7/c1-15-10-18(4-3-9-24)11-16(2...,SSZIHPTWVQZXAO-ONEGZZNKSA-N,50597918.0,CHEMBL5206708,Reverse transcriptase,Human immunodeficiency virus 1,91.0,ChEMBL,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,129906818.0,482608770.0,1.0,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,Reverse transcriptase,Q9WKE8_9HIV1,Q9WKE8
29858,51483935.0,Cc1cc(\C=C\C#N)cc(C)c1-n1cnc2cnc(Nc3ccc(cc3)C#...,InChI=1S/C23H17N7/c1-15-10-18(4-3-9-24)11-16(2...,GBIZDABECURRFX-ONEGZZNKSA-N,50597919.0,CHEMBL5187795,Reverse transcriptase,Human immunodeficiency virus 1,9.6,ChEMBL,...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,16074499.0,482608771.0,1.0,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,Reverse transcriptase,Q9WKE8_9HIV1,Q9WKE8
