In [65]:
###
# File name: Preprocess.ipynb (jupyter notebook)
# Description: load raw data and preprocess
# Created on: 2024-05-01
# Modification History
#   - 2024-05-20 (edited by Gyumin Lee): Integrating several preprocessing steps
# Version: 0.1
###

## Load data

In [66]:
import pubchempy as pcp
import requests
import json
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',200,'display.max_columns',50)
import csv
import time
import pickle
import os
import sys
import pubchempy as pcp

from tqdm import tqdm
sys.path.append(os.path.join(RDConfig.RDContribDir,'SA_Score'))
from sascorer import calculateScore
from sklearn.model_selection import train_test_split
from scipy import sparse as sps

In [67]:
root = "/home2/glee/Drug_Discovery_Research"
data_path = os.path.join(root, "data")

In [None]:
# rawdata = pd.read_csv(os.path.join(data_path, "preprocessed/golden_set_20240520.csv"))

In [68]:
cols_required = ["PubChem CID", "PubChem SID", "BindingDB Ligand Name", "Ligand SMILES", "Ligand InChI", "UniProt (SwissProt) Primary ID of Target Chain", "Target Name", "UniProt (SwissProt) Entry Name of Target Chain", "BindingDB Target Chain Sequence", "PubChem AID", "Ki (nM)", "Kd (nM)", "IC50 (nM)", "EC50 (nM)"]
dict_to_convert_colnames = {"PubChem CID": "C_cid",
                            "PubChem SID": "C_sid",
                            "BindingDB Ligand Name": "C_name",
                            "Ligand SMILES": "C_seq_lig_smiles",
                            "Ligand InChI": "C_seq_inchi",
                            "UniProt (SwissProt) Primary ID of Target Chain": "T_id",
                            "Target Name": "T_name",
                            "UniProt (SwissProt) Entry Name of Target Chain": "T_name_uniprot",
                            "BindingDB Target Chain Sequence": "T_seq",
                            "PubChem AID": "assay_id",
                            "Ki (nM)": "Ki",
                            "Kd (nM)": "Kd",
                            "IC50 (nM)": "IC50",
                            "EC50 (nM)": "EC50"
                           }

In [69]:
## Total binding affinity data released 2024-04
nrows = None
rawdata = pd.read_csv(os.path.join(data_path, "BindingDB_All_202404.tsv"), sep="\t", usecols=cols_required, error_bad_lines=False, nrows=nrows)[cols_required].rename(columns=dict_to_convert_colnames)

  interactivity=interactivity, compiler=compiler, result=result)


## Basic preprocessing

In [349]:
## Remove missing value
data_notna = rawdata.loc[rawdata[(rawdata["C_cid"].notna() & rawdata["T_id"].notna() & rawdata["IC50"].notna())].index]

In [350]:
## Convert dtypes
data_notna.update(data_notna["IC50"].apply(lambda x: x.replace(">","").replace("<","") if (("<" in str(x)) | (">" in str(x))) else x))
data_notna["IC50"] = data_notna["IC50"].astype(np.float)
data_notna["C_cid"] = data_notna["C_cid"].astype(np.float).astype(np.int).astype(np.str)
## Convert protein sequence upper cases
data_notna.update(data_notna["T_seq"].apply(lambda x: x.upper()))

data = data_notna

In [74]:
## Collect Canonical and Isomeric SMILES from PubChem (*Done on 2024-05-10)
# chunk_size = 1500
# chunk_index = np.split(data_golden.index, np.arange(chunk_size, len(data_golden), chunk_size))
# SMILES_cols = ["C_seq_can_smiles", "C_seq_iso_smiles"]
# SMILES_container = pd.DataFrame([], columns=SMILES_cols)
# for i in tqdm(chunk_index):
#     success = False
#     while success != True:
#         try:
#             compounds = pcp.get_compounds(list(data_golden.loc[chunk_index[i]]["C_cid"]), as_dataframe=True)
#             success = True
#         except Exception as e:
#             print("ERROR occurred",e,"\nTry again")
#     SMILES_container = pd.concat([SMILES_container, pd.DataFrame(compounds[["canonical_smiles", "isomeric_smiles"]].values, columns=["C_seq_can_smiles", "C_seq_iso_smiles"], index=chunk_index[i])], axis=0)

In [76]:
## Add Canonical and Isomeric SMILES columns (*Done on 2024-05-10)
# data_smiles_collected = pd.concat([data_notna, SMILES_container], axis=1) #[cols_arranged]
# with open(os.path.join(data_path, "golden_set_"+datetime.datetime.today().strftime("%Y%m%d")+".pickle"), "wb") as f:
#     pickle.dump(data_smiles_collected, f)

In [105]:
## When start after collecting SMILES
# data_smiles_collected = pd.read_csv(os.path.join(data_path, "preprocessed", "golden_set_20240514.csv"))
data = data_smiles_collected

In [141]:
data_smiles_collected

Unnamed: 0_level_0,C_seq_can_smiles,T_id,T_seq,IC50
C_cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000,C1=CC=C(C=C1)C(CN)O,P03470,MNPNQKIITIGSICMVVGIISLILQIGNIISIWISHSIQTGNQNHT...,870000.0
10000015,CC(=O)NC1CCC2=CC3=C(C(=C2C4=CC=C(C(=O)C=C14)OC...,Q6B856,MREIVHIQAGQCGNQIGAKFWEVISDEHGIDPTGSYHGDSDLQLER...,2100.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P53708,MSPGASRGPRGSQAPLIAPLCCAAAALGMLLWSPACQAFNLDVEKL...,383.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P08648,MGSRTPESPLHAVQLRWGPRRRPPLLPLLLLLLPPPPRVGGFNLDA...,17.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P05556,MNLQPIFWIGLISSVCCVFAQTDENRCLKANAKSCGECIQAGPNCG...,24.0
...,...,...,...,...
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P08253,MEALMARGALTGPLRALCLLGCLLSHAAAAPSPIIKFPGDVAPKTD...,120.0
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P14780,MSLWQPLVLVLLVLGCCFAAPRQRQSTLVLFPGDLRTNLTDRQLAE...,414.0
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P22894,MFSLKTLPFLLLLHVQISKAFPVSSKEKNTKTVQDYLEKFYQLPSN...,228.0
9999994,C1CN(CCC1CCC(=O)C2=CC3=C(C=C2)NCC3)CC4=CC(=CC=...,P22303,MRPPQCLLHTPSLASPLLLLLLWLLGGGVGAEGREDAELLVTVRGG...,59.0


In [142]:
## Drop duplicates of C-T pairs
data_drop_dups = data_smiles_collected.drop_duplicates(subset=("C_seq_can_smiles", "T_seq"))
display(data_drop_dups)

data = data_drop_dups

Unnamed: 0_level_0,C_seq_can_smiles,T_id,T_seq,IC50
C_cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000,C1=CC=C(C=C1)C(CN)O,P03470,MNPNQKIITIGSICMVVGIISLILQIGNIISIWISHSIQTGNQNHT...,870000.0
10000015,CC(=O)NC1CCC2=CC3=C(C(=C2C4=CC=C(C(=O)C=C14)OC...,Q6B856,MREIVHIQAGQCGNQIGAKFWEVISDEHGIDPTGSYHGDSDLQLER...,2100.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P53708,MSPGASRGPRGSQAPLIAPLCCAAAALGMLLWSPACQAFNLDVEKL...,383.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P08648,MGSRTPESPLHAVQLRWGPRRRPPLLPLLLLLLPPPPRVGGFNLDA...,17.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P05556,MNLQPIFWIGLISSVCCVFAQTDENRCLKANAKSCGECIQAGPNCG...,24.0
...,...,...,...,...
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P08253,MEALMARGALTGPLRALCLLGCLLSHAAAAPSPIIKFPGDVAPKTD...,120.0
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P14780,MSLWQPLVLVLLVLGCCFAAPRQRQSTLVLFPGDLRTNLTDRQLAE...,414.0
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P22894,MFSLKTLPFLLLLHVQISKAFPVSSKEKNTKTVQDYLEKFYQLPSN...,228.0
9999994,C1CN(CCC1CCC(=O)C2=CC3=C(C=C2)NCC3)CC4=CC(=CC=...,P22303,MRPPQCLLHTPSLASPLLLLLLWLLGGGVGAEGREDAELLVTVRGG...,59.0


In [346]:
## Sort out Compounds with under 100 binding Targets --> max 100 targets
seqlen_per_C = data_drop_dups.groupby("C_cid").count()["T_seq"]
display(seqlen_per_C.describe())
index_T_seq_under100 = np.setdiff1d(data_drop_dups.index, seqlen_per_C[seqlen_per_C>100].index)
data_under100Ts = data_drop_dups.loc[index_T_seq_under100]

data = data_under100Ts

count    816596.000000
mean          1.603361
std           1.392733
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          99.000000
Name: T_seq, dtype: float64

In [None]:
# ## Save data for Compounds with under 100 binding Targets
# data_under100Ts.to_csv(os.path.join(data_path, "preprocessed", "golden_set_20240514.csv"))
# display(data_under100Ts)

In [154]:
## Remove duplicated T_id for each T_seq
T_seq_unq = data_under100Ts["T_seq"].unique()
data_T_seq = data_under100Ts.set_index("T_seq")

tid_dups = []
for tseq in tqdm(T_seq_unq):
    curr_tids = data_T_seq.loc[tseq]["T_id"]
    if isinstance(curr_tids, str): continue
    if len(np.unique(curr_tids.values)) > 1:
        tid_dups.append((tseq, np.unique(curr_tids.values, return_counts=True)))

100%|██████████| 5878/5878 [07:43<00:00, 12.68it/s]


In [157]:
## Find index of minor T_id to be removed
minor_tid_indexes = []
for i in tqdm(range(len(tid_dups))):
    dup_tseq = tid_dups[i][0]
    dup_tids, dup_cnts = tid_dups[i][1]
    curr_tids = data_under100Ts["T_id"][data_under100Ts["T_seq"]==dup_tseq]
    minor_tids = curr_tids[curr_tids==dup_tids[np.argmin(dup_cnts)]]
    minor_tid_indexes.append(minor_tids.index)
minor_tid_indexes = pd.Index(np.concatenate(minor_tid_indexes))

100%|██████████| 14/14 [00:02<00:00,  4.97it/s]


In [158]:
## Save preprocessed data
data_minor_tid_removed = data_under100Ts.loc[pd.Index(np.setdiff1d(data_under100Ts.index, minor_tid_indexes))]
data_minor_tid_removed.to_csv(os.path.join(data_path, "preprocessed", "golden_set_20240520.csv"))
display(data_minor_tid_removed)

data = data_minor_tid_removed

In [356]:
data_minor_tid_removed

Unnamed: 0_level_0,C_seq_can_smiles,T_id,T_seq,IC50
C_cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000,C1=CC=C(C=C1)C(CN)O,P03470,MNPNQKIITIGSICMVVGIISLILQIGNIISIWISHSIQTGNQNHT...,870000.0
10000015,CC(=O)NC1CCC2=CC3=C(C(=C2C4=CC=C(C(=O)C=C14)OC...,Q6B856,MREIVHIQAGQCGNQIGAKFWEVISDEHGIDPTGSYHGDSDLQLER...,2100.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P53708,MSPGASRGPRGSQAPLIAPLCCAAAALGMLLWSPACQAFNLDVEKL...,383.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P08648,MGSRTPESPLHAVQLRWGPRRRPPLLPLLLLLLPPPPRVGGFNLDA...,17.0
10000019,C1=CC=C(C=C1)C(CC(=O)O)NC(=O)CNC(=O)C2=CC(=CC=...,P05556,MNLQPIFWIGLISSVCCVFAQTDENRCLKANAKSCGECIQAGPNCG...,24.0
...,...,...,...,...
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P08253,MEALMARGALTGPLRALCLLGCLLSHAAAAPSPIIKFPGDVAPKTD...,120.0
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P14780,MSLWQPLVLVLLVLGCCFAAPRQRQSTLVLFPGDLRTNLTDRQLAE...,414.0
9999879,CC(C)C(C(=O)NO)N(CC1=CN=CC=C1)S(=O)(=O)C2=CC=C...,P22894,MFSLKTLPFLLLLHVQISKAFPVSSKEKNTKTVQDYLEKFYQLPSN...,228.0
9999994,C1CN(CCC1CCC(=O)C2=CC3=C(C=C2)NCC3)CC4=CC(=CC=...,P22303,MRPPQCLLHTPSLASPLLLLLLWLLGGGVGAEGREDAELLVTVRGG...,59.0


## Construct datasets for training

### Leave small set

In [303]:
def insert_CT_code(data):

    pair_code = data.apply(lambda x : str(x['C_cid']) + 'x' + str(x['T_id']), axis=1)
    data.insert(loc=0, column='(C-T)_id', value=list(pair_code))

    return data

def filter_C_by_num_T(data=None, N=30):

    # C (ompound) 별 T (arget)의 갯수로 C를 필터링
    unique_c_ids_n_counts = np.unique(data['C_cid'], return_counts=True)
    unique_c_ids = unique_c_ids_n_counts[0]
    unique_c_counts = unique_c_ids_n_counts[1]

    c_ids_filtering_idx = np.where(unique_c_counts >= N)[0]
    filtered_c_ids = unique_c_ids[c_ids_filtering_idx]
    filtered_c_ids_onehot = data['C_cid'].isin(filtered_c_ids)
    row_index = np.arange(data['C_cid'].shape[0])
    filtered_c_ids_onehot_index = filtered_c_ids_onehot * row_index
    final_filtered_c_ids_index = np.where(filtered_c_ids_onehot_index != 0)[0]

    filtered_data = data.iloc[final_filtered_c_ids_index]

    # CT_code 삽입
    filtered_data = insert_CT_code(filtered_data)

    # 갯수 확인
    num_rows = filtered_data.shape[0]
    num_C = len(np.unique(filtered_data['C_cid']))
    num_T = len(np.unique(filtered_data['T_id']))
    num_CT = len(np.unique(filtered_data['(C-T)_id']))

    # IC50 등에서 값이 달라 (C-T)_id 컬럼에 2개 이상 존재할 경우 가장 첫번째 값만 사용
    filtered_data = filtered_data.groupby('(C-T)_id').first()

    print('Number of T per C >={}\n--> Number of rows : {},\n--> Number of C : {},\n--> Number of T : {}\n--> Number of (C-T) pair : {}'.format(N, num_rows, num_C, num_T, num_CT))
    
    return filtered_data


In [371]:
filtered_data = filter_C_by_num_T(data_smiles_collected.reset_index(), N=7)

Number of T per C >=7
--> Number of rows : 314836,
--> Number of C : 27378,
--> Number of T : 3684
--> Number of (C-T) pair : 144422


In [372]:
## Filter Compounds with under N binding Targets
n_T_filtered = 7
n_T_per_C = data_minor_tid_removed.reset_index().groupby("C_cid").count()["T_seq"]
data_T_filtered = data_minor_tid_removed.loc[n_T_per_C[n_T_per_C>=n_T_filtered].index]
display(data_T_filtered)

data = data_T_filtered

Unnamed: 0_level_0,C_seq_can_smiles,T_id,T_seq,IC50
C_cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q99705,MSVGAMKKGVGRAVGLGGGSGCQATEEDPLPNCGACAPGQGGRRWR...,0.600000
10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q12809,MPVRRGHVAPQNTFLDTIIRKFEGQSRKFIIANARVENCAVIYCND...,5000.000000
10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q8JZL2,MDLQASLLSTGPNASNISDGQDNFTLAGPPPRTRSVSYINIIMPSV...,0.600000
10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,P97639,MDLQTSLLSTGPNASNISDGQDNLTLPGSPPRTGSVSYINIIMPSV...,0.800000
10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q8MJ89,ACAPGQGGRRWRLPQPAWVEGSSAWLWEPATGTGWMDLEASLLPTG...,0.800000
...,...,...,...,...
998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q9HC16,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,8150.000000
998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q00G26,MSEEEAAQIPRSSVWEQDQQNVVQRVVALPLVRATCTAVCDVYSAA...,5718.000000
998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q07820,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,4746.602893
998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,O60240,MAVNKGLTLLDGDLPEQENVLQRVLQLPVVSGTCECFQKTYTSTKE...,4375.000000


In [373]:
## Truncate Targets with over L length
maxlen_T = 1024
len_Ts = data_T_filtered.reset_index()["T_seq"].apply(lambda x: len(x))
data_T_truncated = data_T_filtered.reset_index().loc[len_Ts[len_Ts<=maxlen_T].index]
display(data_T_truncated)

data_T_truncated.to_csv(os.path.join(data_path, "preprocessed", "data_T_truncated_20240520.csv"))

data = data_T_truncated

Unnamed: 0,C_cid,C_seq_can_smiles,T_id,T_seq,IC50
0,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q99705,MSVGAMKKGVGRAVGLGGGSGCQATEEDPLPNCGACAPGQGGRRWR...,0.600000
2,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q8JZL2,MDLQASLLSTGPNASNISDGQDNFTLAGPPPRTRSVSYINIIMPSV...,0.600000
3,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,P97639,MDLQTSLLSTGPNASNISDGQDNLTLPGSPPRTGSVSYINIIMPSV...,0.800000
4,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q8MJ89,ACAPGQGGRRWRLPQPAWVEGSSAWLWEPATGTGWMDLEASLLPTG...,0.800000
5,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q969V1,MNPFHASCWNTSAELLNKSWNKEFAYQTASVVDTVILPSMIGIICS...,10000.000000
...,...,...,...,...,...
81139,998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q9HC16,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,8150.000000
81140,998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q00G26,MSEEEAAQIPRSSVWEQDQQNVVQRVVALPLVRATCTAVCDVYSAA...,5718.000000
81141,998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q07820,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,4746.602893
81142,998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,O60240,MAVNKGLTLLDGDLPEQENVLQRVLQLPVVSGTCECFQKTYTSTKE...,4375.000000


In [374]:
## Construct dictionaries
cid_to_seq = data_T_truncated[["C_cid", "C_seq_can_smiles"]].set_index("C_cid").drop_duplicates()["C_seq_can_smiles"]
i_to_C_seq, i_to_C_cid = cid_to_seq.values, cid_to_seq.index

tid_to_seq = data_T_truncated[["T_id","T_seq"]].drop_duplicates(subset=("T_seq"))
new_tids = pd.Series("", index=tid_to_seq.index)
for tid in tqdm(tid_to_seq["T_id"].unique()):
    curr_targets = tid_to_seq[tid_to_seq["T_id"]==tid]
    if len(curr_targets) > 1:
        new_tids.loc[curr_targets.index[0]] = tid
        for j in range(1,len(curr_targets)):
            new_tids.loc[curr_targets.index[j]] = tid+"-"+str(j)
    else:
        new_tids.loc[curr_targets.index[0]] = tid
        
i_to_T_id = new_tids.values
i_to_T_seq = tid_to_seq["T_seq"].values

C_seq_to_i = {i_to_C_seq[i]: i for i in range(len(i_to_C_seq))}
C_cid_to_i = {i_to_C_cid[i]: i for i in range(len(i_to_C_cid))}
T_seq_to_i = {i_to_T_seq[i]: i for i in range(len(i_to_T_seq))}
T_id_to_i = {i_to_T_id[i]: i for i in range(len(i_to_T_id))}

new_tid_col = data_T_truncated["T_seq"].apply(lambda x: i_to_T_id[T_seq_to_i[x]])
data_new_tid = data_T_truncated.copy(deep=True)
data_new_tid.loc[:,"T_id_unq"] = new_tid_col

display(data_new_tid)

data = data_new_tid

100%|██████████| 2717/2717 [00:02<00:00, 1137.68it/s]


Unnamed: 0,C_cid,C_seq_can_smiles,T_id,T_seq,IC50,T_id_unq
0,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q99705,MSVGAMKKGVGRAVGLGGGSGCQATEEDPLPNCGACAPGQGGRRWR...,0.600000,Q99705
2,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q8JZL2,MDLQASLLSTGPNASNISDGQDNFTLAGPPPRTRSVSYINIIMPSV...,0.600000,Q8JZL2
3,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,P97639,MDLQTSLLSTGPNASNISDGQDNLTLPGSPPRTGSVSYINIIMPSV...,0.800000,P97639
4,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q8MJ89,ACAPGQGGRRWRLPQPAWVEGSSAWLWEPATGTGWMDLEASLLPTG...,0.800000,Q8MJ89
5,10004545,CC1C2CN(CCC2CC3=C1C4=C(N3)C=CC(=C4)C(F)(F)F)CC...,Q969V1,MNPFHASCWNTSAELLNKSWNKEFAYQTASVVDTVILPSMIGIICS...,10000.000000,Q969V1
...,...,...,...,...,...,...
81139,998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q9HC16,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,8150.000000,Q9HC16
81140,998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q00G26,MSEEEAAQIPRSSVWEQDQQNVVQRVVALPLVRATCTAVCDVYSAA...,5718.000000,Q00G26
81141,998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,Q07820,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,4746.602893,Q07820
81142,998883,C1CC(=C2C(=O)N(CC(=O)N2C1)CCC3=CC=CC=C3)C4=CC(...,O60240,MAVNKGLTLLDGDLPEQENVLQRVLQLPVVSGTCECFQKTYTSTKE...,4375.000000,O60240


In [375]:
len(i_to_C_seq)

7803

In [376]:
len(i_to_T_seq)

3006

In [219]:
## Save constructed dictionaries
with open(os.path.join(data_path, "dictionaries", "i_to_C_cid.pickle"), "wb") as f:
    pickle.dump(i_to_C_cid, f)
with open(os.path.join(data_path, "dictionaries", "i_to_T_id.pickle"), "wb") as f:
    pickle.dump(i_to_T_id, f)
with open(os.path.join(data_path, "dictionaries", "C_cid_to_i.pickle"), "wb") as f:
    pickle.dump(C_cid_to_i, f)
with open(os.path.join(data_path, "dictionaries", "T_id_to_i.pickle"), "wb") as f:
    pickle.dump(T_id_to_i, f)
with open(os.path.join(data_path, "dictionaries", "i_to_C_seq.pickle"), "wb") as f:
    pickle.dump(i_to_C_seq, f)
with open(os.path.join(data_path, "dictionaries", "i_to_T_seq.pickle"), "wb") as f:
    pickle.dump(i_to_T_seq, f)
with open(os.path.join(data_path, "dictionaries", "C_seq_to_i.pickle"), "wb") as f:
    pickle.dump(C_seq_to_i, f)
with open(os.path.join(data_path, "dictionaries", "T_seq_to_i.pickle"), "wb") as f:
    pickle.dump(T_seq_to_i, f)
print("Dictionaries saved")

Dictionaries saved


In [291]:
## Construct C-T adjacency matrix
CT_adj = np.zeros((len(i_to_C_cid), len(i_to_T_id)), dtype=np.bool)
print("Shape of C-T adjacency matrix: ", CT_adj.shape)
index_binding = data_new_tid.apply(lambda x: (C_seq_to_i[x["C_seq_can_smiles"]], T_seq_to_i[x["T_seq"]]), axis=1)
CT_adj[tuple(np.array(list(zip(*index_binding.values))))] = True

Shape of C-T adjacency matrix:  (21507, 3460)


In [292]:
## Chunked dot product for sparse matrix
def chunked_spsdot(mat1, mat2, n_chunks=10, dtype=np.bool):
    if not isinstance(mat1, sps.lil.lil_matrix): mat1 = sps.lil_matrix(mat1)
    if not isinstance(mat2, sps.lil.lil_matrix): mat2 = sps.lil_matrix(mat2)
    chunk_size = int(mat1.shape[0] / n_chunks)
    sps_container = sps.csr_matrix((mat1.shape[0], mat2.shape[1]), dtype=dtype)
    print("Chunk size:",chunk_size)
    for i in tqdm(range(n_chunks)):
        if i < n_chunks-1:
            curr_mat = mat1[i*chunk_size:(i+1)*chunk_size]
            dot_res = curr_mat.dot(mat2)
            sps_container[i*chunk_size:(i+1)*chunk_size,:] = dot_res
        else:
            curr_mat = mat1[i*chunk_size:]
            dot_res = curr_mat.dot(mat2)
            sps_container[i*chunk_size:,:] = dot_res

    return sps_container

In [293]:
## Make sparse
sps_CT_adj = sps.csr_matrix(CT_adj)

## Calculate C-T-C adjacency matrix
sps_CTC_adj = chunked_spsdot(sps_CT_adj, sps_CT_adj.T, n_chunks=20)
display(sps_CTC_adj)

  0%|          | 0/20 [00:00<?, ?it/s]

Chunk size: 1075


  self._set_arrayXarray_sparse(i, j, x)
100%|██████████| 20/20 [00:27<00:00,  1.39s/it]


<21507x21507 sparse matrix of type '<class 'numpy.bool_'>'
	with 27097367 stored elements in Compressed Sparse Row format>

In [294]:
## Calculate C-T-C-T adjacency matrix
sps_CTCT_adj = sps_CTC_adj.dot(sps_CT_adj)
display(sps_CTCT_adj)
CTCT_adj = sps_CTCT_adj.toarray()

<21507x3460 sparse matrix of type '<class 'numpy.bool_'>'
	with 11331404 stored elements in Compressed Sparse Row format>

In [295]:
## Make T-C, T-C-T-C adjacency matrix; Leave 2-hop elements only
only_CTCT = (CTCT_adj.astype(np.int8) - CT_adj.astype(np.int8)).astype(np.bool)
TC_adj = CT_adj.T
TCTC_adj = CTCT_adj.T
only_TCTC = only_CTCT.T

In [271]:
def save_hdf5(data_to_save, data_path):
    import h5py
    from scipy.sparse import csr_matrix, lil_matrix
    with h5py.File(data_path, "w") as f:
        if isinstance(data_to_save, (csr_matrix, lil_matrix)):
            f.create_dataset("data", data=data_to_save.data)
            f.create_dataset("indices", data=data_to_save.indices)
            f.create_dataset("indptr", data=data_to_save.indptr)
            f.attrs["shape"] = data_to_save.shape
            f.attrs["type"] = "sparse"
        else:
            f.create_dataset("data", data=data_to_save)
            f.attrs["type"] = "dense"
    
def load_hdf5(data_path):
    import h5py
    from scipy.sparse import csr_matrix
    with h5py.File(data_path, "r") as f:
        if f.attrs["type"]=="sparse":
            data = f["data"][:]
            indices = f["indices"][:]
            indptr = f["indptr"][:]
            shape = f.attrs["shape"]
            return csr_matrix((data, indices, indptr), shape=shape)
        else:
            return f["data"][:]

In [287]:
save_hdf5(sps_CT_adj, os.path.join(data_path, "adjacency_matrices", "sps_CT_adj.h5"))
save_hdf5(sps_CTC_adj, os.path.join(data_path, "adjacency_matrices", "sps_CTC_adj.h5"))
save_hdf5(sps_CTCT_adj, os.path.join(data_path, "adjacency_matrices", "sps_CTCT_adj.h5"))

### Positive/Negative sampling

In [232]:
def extract_targets(data, cid):
    targets = data.loc[cid]["T_seq"]
    out = targets.values if isinstance(targets, pd.Series) else np.array([targets])
    return out

def split_per_C(cid, tids, split_ratio={"train": 8, "val": 1, "test": 1}):
    n_train_per_C = np.int(np.round((len(tids) / 10) * split_ratio["train"]))
    n_val_per_C = np.int(np.floor((len(tids) - (np.int(np.round((len(tids) / 10) * split_ratio["train"])))) / 2))
    n_test_per_C = len(tids) - n_train_per_C - n_val_per_C
    
    train_index = np.random.choice(np.arange(len(tids)), n_train_per_C, replace=False)
    train_remainder = np.setdiff1d(np.arange(len(tids)), train_index)
    val_index = np.random.choice(train_remainder, n_val_per_C, replace=False)
    test_index = np.setdiff1d(train_remainder, val_index)    
    
    return (tids[train_index], tids[val_index], tids[test_index])

In [302]:
## Make stratified train/val/test indexes according to split ratio
train_indexes, val_indexes, test_indexes = np.empty((0,2), dtype=np.int), np.empty((0,2), dtype=np.int), np.empty((0,2), dtype=np.int)
for i in tqdm(range(CT_adj.shape[0])): # for every C_cid
    target_indexes = np.nonzero(CT_adj[i,:])[0]
    curr_train, curr_val, curr_test = split_per_C(i, target_indexes)
    train_indexes = np.concatenate([train_indexes, np.hstack((curr_train[:,np.newaxis], np.tile(i, len(curr_train))[:,np.newaxis]))])
    val_indexes = np.concatenate([val_indexes, np.hstack((curr_val[:,np.newaxis], np.tile(i, len(curr_val))[:,np.newaxis]))])
    test_indexes = np.concatenate([test_indexes, np.hstack((curr_test[:,np.newaxis], np.tile(i, len(curr_test))[:,np.newaxis]))])

    
n_whole_indexes = sum((train_indexes.shape[0], val_indexes.shape[0], test_indexes.shape[0]))
print("TRAIN: {} ({}), VAL: {} ({}), TEST: {} ({})".format(train_indexes.shape[0], np.round(train_indexes.shape[0]/n_whole_indexes, 2), val_indexes.shape[0], np.round(val_indexes.shape[0]/n_whole_indexes, 2), test_indexes.shape[0], np.round(test_indexes.shape[0]/n_whole_indexes, 2)))

100%|██████████| 21507/21507 [00:09<00:00, 2243.38it/s]

TRAIN: 96167 (0.81), VAL: 3607 (0.03), TEST: 19519 (0.16)





In [297]:
## Make train/val/test set with positive/negative samples
cols_taranc = ["Target_index", "Anchor_index"]
cols_posneg = ["Positive_index", "Negative_index"]

train_index_set = pd.DataFrame(train_indexes, columns=cols_taranc).sort_values("Target_index").set_index("Target_index")
train_container = pd.DataFrame(columns=cols_posneg)
for curr_index in tqdm(train_index_set.index.unique()):
    n_anchors_per_T = train_index_set.loc[curr_index].shape[0]
    pos_per_T = np.nonzero(TCTC_adj[curr_index,:])[0]
    neg_per_T = np.nonzero(TCTC_adj[curr_index,:]==0)[0]
    curr_pos = np.random.choice(pos_per_T, n_anchors_per_T)
    curr_neg = np.random.choice(neg_per_T, n_anchors_per_T)    
    curr_posneg = pd.DataFrame(np.hstack((curr_pos[:,np.newaxis], curr_neg[:,np.newaxis])), index=np.tile(curr_index, n_anchors_per_T), columns=cols_posneg)
    train_container = pd.concat([train_container, curr_posneg], axis=0)
    
train_index_set.loc[:,cols_posneg] = train_container
train_index_set = train_index_set.reset_index()

train_set = pd.DataFrame(index=train_index_set.index)
train_set.loc[:,"T_seq"] = train_index_set["Target_index"].apply(lambda x: i_to_T_seq[x])
train_set.loc[:,"C_anc"] = train_index_set["Anchor_index"].apply(lambda x: i_to_C_seq[x])
train_set.loc[:,"C_pos"] = train_index_set["Positive_index"].apply(lambda x: i_to_C_seq[x])
train_set.loc[:,"C_neg"] = train_index_set["Negative_index"].apply(lambda x: i_to_C_seq[x])

val_index_set = pd.DataFrame(val_indexes, columns=cols_taranc).sort_values("Target_index")
val_set = pd.DataFrame(index=val_index_set.index)
val_set.loc[:,"T_seq"] = val_index_set["Target_index"].apply(lambda x: i_to_T_seq[x])
val_set.loc[:,"C_anc"] = val_index_set["Anchor_index"].apply(lambda x: i_to_C_seq[x])

test_index_set = pd.DataFrame(test_indexes, columns=cols_taranc).sort_values("Target_index")
test_set = pd.DataFrame(index=test_index_set.index)
test_set.loc[:,"T_seq"] = test_index_set["Target_index"].apply(lambda x: i_to_T_seq[x])
test_set.loc[:,"C_anc"] = test_index_set["Anchor_index"].apply(lambda x: i_to_C_seq[x])

100%|██████████| 3267/3267 [00:11<00:00, 279.80it/s]


In [243]:
## Save train/va/test datasets
train_set.to_csv(data_path+"/splitted/train.csv")
val_set.to_csv(data_path+"/splitted/val.csv")
test_set.to_csv(data_path+"/splitted/test.csv")