In [1]:
#import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#load the full dataset

df = pd.read_csv('BindingDB_All.tsv', sep = '\t', on_bad_lines='skip', low_memory=False)

In [3]:
#print all the columns to explore the database content

for idx, column in enumerate(df.columns):
    print(f"{idx}: {column}")

0: BindingDB Reactant_set_id
1: Ligand SMILES
2: Ligand InChI
3: Ligand InChI Key
4: BindingDB MonomerID
5: BindingDB Ligand Name
6: Target Name
7: Target Source Organism According to Curator or DataSource
8: Ki (nM)
9: IC50 (nM)
10: Kd (nM)
11: EC50 (nM)
12: kon (M-1-s-1)
13: koff (s-1)
14: pH
15: Temp (C)
16: Curation/DataSource
17: Article DOI
18: BindingDB Entry DOI
19: PMID
20: PubChem AID
21: Patent Number
22: Authors
23: Institution
24: Link to Ligand in BindingDB
25: Link to Target in BindingDB
26: Link to Ligand-Target Pair in BindingDB
27: Ligand HET ID in PDB
28: PDB ID(s) for Ligand-Target Complex
29: PubChem CID
30: PubChem SID
31: ChEBI ID of Ligand
32: ChEMBL ID of Ligand
33: DrugBank ID of Ligand
34: IUPHAR_GRAC ID of Ligand
35: KEGG ID of Ligand
36: ZINC ID of Ligand
37: Number of Protein Chains in Target (>1 implies a multichain complex)
38: BindingDB Target Chain Sequence
39: PDB ID(s) of Target Chain
40: UniProt (SwissProt) Recommended Name of Target Chain
41: UniPr

In [4]:
#print all the target organisms

df['Target Source Organism According to Curator or DataSource'].unique()

array(['Human immunodeficiency virus 1', 'Human immunodeficiency virus',
       'Homo sapiens', nan, 'Human immunodeficiency virus 2',
       'Rattus norvegicus', 'Mus musculus', 'Klebsiella pneumoniae',
       'Pseudomonas aeruginosa', 'Bos taurus', 'Oryctolagus cuniculus',
       'Abelson murine leukemia virus', 'Gallus gallus',
       'Avian sarcoma virus', 'Hepatitis C virus',
       'Influenza A virus (A/ruddy turnstone/NJ/60/1985(N9))',
       'Influenza B virus (strain B/Lee/1940)',
       'Influenza A virus (strain A/Singapore/1/1957 H2N2)',
       'Influenza B virus',
       'Influenza A virus (A/Puerto Rico/8/34/Mount Sinai/Wi(H1N1))',
       'Influenza B virus (B/Victoria/517/2005)',
       'Influenza B virus (B/Memphis/3/93)',
       'Influenza A virus (strain A/Tokyo/3/1967 H2N2)',
       'Influenza A virus (A/Shangdong/9/1993(H3N2))',
       'Influenza B virus (strain B/Memphis/3/1989)',
       'Marthasterias glacialis', 'Porcellium fiumanum',
       'Plasmodium falciparu

In [5]:
#we manually picked all the sexually transmittable pathogens from the list, only selecting viruses for further analyses

sexually_transmitted_pathogens = [
    'Human immunodeficiency virus 1',
    'Human immunodeficiency virus 2',
    'Human immunodeficiency virus type 1 group M subtype B (isolate MN)',
    'Hepatitis C virus',
    'Hepatitis C virus genotype 1a (isolate H)',
    'Hepatitis C virus genotype 1b (isolate Taiwan)',
    'Hepatitis C virus genotype 1b (isolate BK)',
    'Hepatitis C virus genotype 2b (isolate HC-J8)',
    'Hepatitis C virus genotype 2',
    'Hepatitis C virus genotype 3a (isolate NZL1)',
    'Hepatitis C virus genotype 4a (isolate ED43)',
    'Hepatitis C virus genotype 6a (isolate EUHK2)',
    'Human herpesvirus 1',
    'Human herpesvirus 2',
    'Human herpesvirus 4',
    'Human herpesvirus 8',
    'Human T-lymphotropic virus 1',
    'Human cytomegalovirus',
    #'Trichomonas vaginalis G3'   bacterial
]

#we create a new database of binding experiments conducted only on sexually transmittable viruses

std = df[df['Target Source Organism According to Curator or DataSource'].isin(sexually_transmitted_pathogens)]

In [6]:
#check the length of our new database

len(std)

30156

In [7]:
#check how many rows are available for each virus

std.value_counts('Target Source Organism According to Curator or DataSource')

Target Source Organism According to Curator or DataSource
Human immunodeficiency virus 1                                        21780
Hepatitis C virus                                                      4828
Hepatitis C virus genotype 1a (isolate H)                              1535
Human herpesvirus 8                                                    1419
Hepatitis C virus genotype 1b (isolate BK)                              233
Human herpesvirus 2                                                     114
Human herpesvirus 1                                                      80
Human immunodeficiency virus 2                                           55
Hepatitis C virus genotype 3a (isolate NZL1)                             45
Hepatitis C virus genotype 2b (isolate HC-J8)                            19
Human T-lymphotropic virus 1                                             18
Hepatitis C virus genotype 1b (isolate Taiwan)                           12
Human immunodeficiency virus t

In [8]:
#save the rows on STD in a separate database for further analyses

std.to_csv('BindingSTD.tsv', sep = '\t', index = False)