In [None]:
import os
import pandas as pd
import warnings

----------------------------
> ## Handling the dataset's size and selecting needed data
#### *1) Selecting CDKs as protein targets*
The initial database was too heavy to be entirely loaded so we decided to divide it into 6 dataframes. Then, for every dataframe (df1, df2, df3...), we selected the protein targets of interest: in our case the Cyclin-dependant Kinases (CDKs). 

In [None]:
# Sample 1
df1 = pd.read_csv('data_BD\data_BD.tsv', sep='\t', on_bad_lines='skip', nrows=500000) # selection of the first 500'000 rows of the initial data
df1_cleaned = df1[df1['Target Name'].str.contains('Cyclin-dependent kinase', na=False)] # selection of CDKs targets for our analysis

# Sample 2
df2 = pd.read_csv('data_BD\data_BD.tsv', sep='\t', on_bad_lines='skip', skiprows=(1,500000), nrows=500000)
df2_cleaned = df2[df2['Target Name'].str.contains('Cyclin-dependent kinase', na=False)]

# Sample 3
df3 = pd.read_csv('data_BD\data_BD.tsv', sep='\t', on_bad_lines='skip', skiprows=(1,1000000), nrows=500000)
df3_cleaned = df3[df3['Target Name'].str.contains('Cyclin-dependent kinase', na=False)]

# Sample 4
df4 = pd.read_csv('data_BD\data_BD.tsv', sep='\t', on_bad_lines='skip', skiprows=(1,1500000), nrows=500000)
df4_cleaned = df4[df4['Target Name'].str.contains('Cyclin-dependent kinase', na=False)]

# Sample 5
df5 = pd.read_csv('data_BD\data_BD.tsv', sep='\t', on_bad_lines='skip', skiprows=(1,2000000), nrows=500000)
df5_cleaned = df5[df5['Target Name'].str.contains('Cyclin-dependent kinase', na=False)]

# Sample 6
df6 = pd.read_csv('data_BD\data_BD.tsv', sep='\t', on_bad_lines='skip', skiprows=(1,2500000), nrows=500000)
df6_cleaned = df6[df5['Target Name'].str.contains('Cyclin-dependent kinase', na=False)]

In [None]:
# Concatenation of the 6 obtained dataframes
df_cleaned = pd.concat([df1_cleaned, df2_cleaned, df3_cleaned, df4_cleaned, df5_cleaned, df6_cleaned], axis=0)
print(f"Shape of the reduced-size dataframe: {df_cleaned.shape}")

# Creation of a tsv file from the dataframe 
df_cleaned.to_csv("BindingDB_cleaned.tsv", sep='\t', index=False) 

(52494, 194)

#### *2) Selecting columns necessary for our analysis*


In [18]:
df_cleaned.columns

Index(['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name',
       'Target Source Organism According to Curator or DataSource', 'Ki (nM)',
       'IC50 (nM)',
       ...
       'UniProt (SwissProt) Recommended Name of Target Chain.12',
       'UniProt (SwissProt) Entry Name of Target Chain.12',
       'UniProt (SwissProt) Primary ID of Target Chain.12',
       'UniProt (SwissProt) Secondary ID(s) of Target Chain.12',
       'UniProt (SwissProt) Alternative ID(s) of Target Chain.12',
       'UniProt (TrEMBL) Submitted Name of Target Chain.12',
       'UniProt (TrEMBL) Entry Name of Target Chain.12',
       'UniProt (TrEMBL) Primary ID of Target Chain.12',
       'UniProt (TrEMBL) Secondary ID(s) of Target Chain.12',
       'UniProt (TrEMBL) Alternative ID(s) of Target Chain.12'],
      dtype='object', length=194)

From all these columns, only a few will be needed to make our analysis. We therefore decided to choose: 
- `The ligand SMILE` in case we want to analyse the molecular structure further in the project
- `The ligand and target names` as identifiers to analyse them
- `All the binding affinity constants` to analyse the affinity between the ligands and targets
- `The pH and the temperature` to get the experimantal conditions
- `The sequence for each chain of the target` to analyse the amino acids sequences involved in the binding

In [21]:
df_col_cleaned = df_cleaned[['Ligand SMILES', 'BindingDB Ligand Name', 'Target Name', 'Target Source Organism According to Curator or DataSource', 
                            'Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)', 'pH', 'Temp (C)', 
                            'Number of Protein Chains in Target (>1 implies a multichain complex)', 'BindingDB Target Chain Sequence', 
                            'UniProt (SwissProt) Entry Name of Target Chain', 'UniProt (TrEMBL) Entry Name of Target Chain', 'BindingDB Target Chain Sequence.1', 
                            'UniProt (SwissProt) Entry Name of Target Chain.1', 'UniProt (TrEMBL) Entry Name of Target Chain.1', 'BindingDB Target Chain Sequence.2', 
                            'UniProt (SwissProt) Entry Name of Target Chain.2', 'UniProt (TrEMBL) Entry Name of Target Chain.2']]
df_col_cleaned.shape
df_col_cleaned.head(10)

Unnamed: 0,Ligand SMILES,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),kon (M-1-s-1),koff (s-1),...,Number of Protein Chains in Target (>1 implies a multichain complex),BindingDB Target Chain Sequence,UniProt (SwissProt) Entry Name of Target Chain,UniProt (TrEMBL) Entry Name of Target Chain,BindingDB Target Chain Sequence.1,UniProt (SwissProt) Entry Name of Target Chain.1,UniProt (TrEMBL) Entry Name of Target Chain.1,BindingDB Target Chain Sequence.2,UniProt (SwissProt) Entry Name of Target Chain.2,UniProt (TrEMBL) Entry Name of Target Chain.2
3817,COc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1,3-((4-Methoxyphenyl)-amino)-4-((3-chlorophenyl...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,5800,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3818,Oc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1,3-((4-Hydroxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,>1000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3835,COc1cccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1,3-((3-Methoxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,4300,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3836,Oc1cccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1,3-((3-Hydroxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,3000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3837,CC(C)(C)OC(=O)Nc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl...,3-((4-(N-BOC-amino)phenyl)amino)-4-((3-chlorop...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,57000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3838,Nc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1,3-((4-Aminophenyl)amino)-4-((3-chlorophenyl)am...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,<10000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3855,CN(C)c1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1,3-((4-(Dimethylamino)phenyl)amino)-4-((3-chlor...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,350,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3856,Clc1cccc(Nc2ncnc3n[nH]c(NCc4ccccc4)c23)c1,3-(Benzylamino)-4-((3-chlorophenyl)amino)-1H-p...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,7400,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3857,Clc1cccc(CNc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1,3-((3-Chlorobenzyl)amino)-4-((3-chlorophenyl)a...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,16000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3858,COc1cccc(CNc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1,3-((3-Methoxybenzyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,3100,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,


In [None]:
# Creation of a tsv file from the dataframe that we will use in our analysis
df_col_cleaned.to_csv("BindingDB_col_cleaned.tsv", sep='\t', index=False)