In [None]:
from rdkit.Chem import Descriptors
import rdkit.Chem as Chem
import pandas as pd
import numpy as np
import re

In [None]:
# Input is read as a tab-separated file in chunks; the cleaned chunks are
# written to (and appended onto) the output CSV.
file = "data/preprocessing/bindingdb.tsv"
outputfile = "data/preprocessing/cleaned_bindingdb.csv"

In [None]:
# Target-chain annotation column names, listed in order.
extra_columns = [
    'BindingDB Target Chain  Sequence',
    'PDB ID(s) of Target Chain',
    'UniProt (SwissProt) Recommended Name of Target Chain',
    'UniProt (SwissProt) Entry Name of Target Chain',
    'UniProt (SwissProt) Primary ID of Target Chain',
    'UniProt (SwissProt) Secondary ID(s) of Target Chain',
    'UniProt (SwissProt) Alternative ID(s) of Target Chain',
    'UniProt (TrEMBL) Submitted Name of Target Chain',
    'UniProt (TrEMBL) Entry Name of Target Chain',
    'UniProt (TrEMBL) Primary ID of Target Chain',
    'UniProt (TrEMBL) Secondary ID(s) of Target Chain',
    'UniProt (TrEMBL) Alternative ID(s) of Target Chain',
]
# The same names as one tab-delimited string.
extra_columns_string = '\t'.join(extra_columns)

# Columns selected from every chunk before it is written out. The order here
# is the column order of the selection.
columns_keep = [
    "BindingDB Reactant_set_id",
    "Ligand SMILES",
    "Target Name Assigned by Curator or DataSource",
    "Ki (nM)",
    "Ki (nM)>or<",
    "IC50 (nM)",
    "IC50 (nM)>or<",
    "Kd (nM)>or<",
    "Kd (nM)",
    "EC50 (nM)",
    "EC50 (nM)>or<",
    "pH",
    "Temp (C)",
    "Number of Protein Chains in Target (>1 implies a multichain complex)",
    'BindingDB Target Chain  Sequence',
]
# Plus the numbered sequence columns with suffixes ".1" through ".18".
columns_keep += [f'BindingDB Target Chain  Sequence.{n}' for n in range(1, 19)]

# Measurement columns whose '<' / '>' symbols get stripped.
columns_to_replace = ['Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)']
# Every column that is converted with pd.to_numeric.
numeric_cols = [
    *columns_to_replace,
    'BindingDB Reactant_set_id',
    'Number of Protein Chains in Target (>1 implies a multichain complex)',
]

In [None]:
def add_column(df, columns):
    """Add a ``'<column>>or<'`` qualifier column to ``df`` for each given column.

    For every name in ``columns`` a new integer column named
    ``f'{column}>or<'`` is written into ``df`` in place:

    * ``-1`` where the value contains ``'<'``
    * ``1``  where the value contains ``'>'`` (and no ``'<'``)
    * ``0``  otherwise, including missing values

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    columns : iterable of str
        Names of the columns to inspect.

    Returns
    -------
    None
    """
    for column in columns:
        # Cast to str first: a column holding only numbers or only NaN has a
        # numeric dtype, and the `.str` accessor would raise AttributeError on
        # it. The text forms of numbers / NaN contain neither symbol, so such
        # rows correctly end up as 0.
        values = df[column].astype(str)
        conditions = [
            # '<' and '>' are literal characters, so no regex is needed.
            values.str.contains('<', regex=False, na=False),
            values.str.contains('>', regex=False, na=False),
        ]
        choices = [-1, 1]
        # np.select uses the first matching condition, so '<' wins over '>'.
        df[f'{column}>or<'] = np.select(conditions, choices, default=0)


In [None]:
def convert_to_weight(smiles):
    """Return the molecular weight for a SMILES string, or ``None``.

    Parameters
    ----------
    smiles : str
        SMILES representation of the ligand. Non-string input (for example the
        float NaN pandas uses for missing values) yields ``None``.

    Returns
    -------
    float or None
        ``Descriptors.MolWt`` of the parsed molecule, or ``None`` when the
        input is not a string, cannot be parsed, or the computation fails.
    """
    # Nothing to parse for missing / non-text values.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals an unparsable SMILES by returning None, not by raising.
        if mol is None:
            return None
        return Descriptors.MolWt(mol)
    except Exception:
        # Stay best-effort for unexpected RDKit failures, but unlike a bare
        # `except:` do not swallow KeyboardInterrupt / SystemExit, so a long
        # run can still be interrupted.
        return None

In [None]:
# Neighbouring columns that `shift` moves one position to the left by default.
columns_for_shifting = [
    'Number of Protein Chains in Target (>1 implies a multichain complex)',
    'BindingDB Target Chain  Sequence',
    'PDB ID(s) of Target Chain',
]


def shift(df, indexes, columns = columns_for_shifting):
    """Move values one column to the left for the given rows, in place.

    For each row label in ``indexes`` every column in ``columns`` except the
    last receives the value of the column that follows it in ``columns``. The
    last column keeps its value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    indexes : iterable
        Row labels (as used with ``df.loc``) to shift.
    columns : sequence of str, optional
        Ordered column names taking part in the shift.

    Returns
    -------
    None
    """
    # Pair each column with its right-hand neighbour as (target, source).
    neighbour_pairs = list(zip(columns[:-1], columns[1:]))
    for row_label in indexes:
        # Left to right, so every source is read before it gets overwritten.
        for target, source in neighbour_pairs:
            df.loc[row_label, target] = df.loc[row_label, source]
        

In [None]:
# Stream the input file in chunks, clean each chunk and append it to
# `outputfile`, so the whole file never has to be held in memory at once.
header = True  # True until the first chunk (which carries the CSV header) is written
chain_count_column = "Number of Protein Chains in Target (>1 implies a multichain complex)"

# Read the measurement columns as text. They can carry '<' / '>' qualifiers,
# and a chunk that happens to contain only plain numbers (or only NaN) would
# otherwise be inferred as numeric and break the `.str` accessor used below.
df = pd.read_csv(
    file,
    sep='\t',
    chunksize=100000,
    dtype={column: str for column in columns_to_replace},
)
for chunk in df:
    # add -1 for '<', 1 for '>' and 0 otherwise in a new column for each of
    # the columns to replace (must run before the symbols are stripped)
    add_column(chunk, columns_to_replace)

    # strip the symbols from the measurement columns
    for column in columns_to_replace:
        # regex=True has to be explicit: from pandas 2.0 on, str.replace treats
        # the pattern as a literal string by default and '[><]' would never match.
        chunk[column] = chunk[column].str.replace('[><]', '', regex=True)
        # blank / whitespace-only cells become missing values
        chunk[column] = chunk[column].replace(r'^\s*$', np.nan, regex=True)

    # Convert column by column: far cheaper than a per-row apply, and integer
    # columns without missing values keep an integer dtype.
    chunk[numeric_cols] = chunk[numeric_cols].apply(pd.to_numeric)

    # rows without a chain count get their neighbouring columns shifted left
    missing_chain_count = chunk[chain_count_column].isnull()
    if missing_chain_count.any():
        shift(chunk, chunk.index[missing_chain_count])

    # drop unwanted columns; copy so the assignment below writes to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning)
    chunk = chunk[columns_keep].copy()

    # calculate the molecular weight and drop rows where it is unavailable
    chunk['MolWt'] = chunk['Ligand SMILES'].apply(convert_to_weight)
    chunk = chunk.dropna(subset=['MolWt'])

    # output to csv: the first chunk creates the file with a header row, later
    # chunks are appended without one (the index column is written every time)
    chunk.to_csv(outputfile, mode='w' if header else 'a', header=header)
    header = False

    #melt the file using extra columns
#     chunk = pd.melt(chunk, id_vars = extra_columns)
