In [1]:
from methods.widedta import _WideDTADataHandler
from modules.encoders import WideCNN
from tdc.multi_pred import DTI
from pathlib import Path


In [27]:
ds = "bindingdb_ki"

ds = ds.lower()
df = DTI(ds, path=Path("..", "data", ds), print_stats=True).get_data()
ori_size = df.shape[0]

ori_size

Found local copy...
Loading...
--- Dataset Statistics ---
174547 unique drugs.
3068 unique targets.
374820 drug-target pairs.
--------------------------
Done!


374820

In [28]:
df = df.drop_duplicates(subset=['Drug_ID', 'Target_ID'])

ori_size = df.shape[0]

ori_size

324347

In [29]:
df = df.dropna()

print("NaN dropped:", df.shape[0] - ori_size)

NaN dropped: -28678


In [30]:
df.groupby('Drug_ID')['Target_ID'].nunique()

drug_target_counts = df.groupby('Drug_ID')['Target_ID'].nunique().reset_index(name='Target_Count')

print("Min interactions: ", drug_target_counts["Target_Count"].min())
print("Max interactions: ", drug_target_counts["Target_Count"].max())


Min interactions:  1
Max interactions:  104


In [31]:
drug_target_counts[drug_target_counts['Target_Count'] < 10]

Unnamed: 0,Drug_ID,Target_Count
0,7.0,6
1,16.0,2
2,19.0,1
3,49.0,1
4,70.0,1
...,...,...
159453,145866189.0,2
159454,145866190.0,2
159455,145866191.0,2
159456,145866192.0,2


In [32]:
drug_ids_to_remove = drug_target_counts[drug_target_counts['Target_Count'] < 10]['Drug_ID']

df = df[~df['Drug_ID'].isin(drug_ids_to_remove)]

df.shape[0]

16159

In [33]:
df

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y
1028,637542.0,O=C(O)/C=C/c1ccc(O)cc1,P00533,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,1000000.0
1029,689043.0,O=C(O)/C=C/c1ccc(O)c(O)c1,P00533,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,400000.0
1292,774.0,NCCc1cnc[nH]1,Q16769,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,850000.0
1481,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P17252,MADVFPGNDSTASQDVANRFARKGALRQKNVHEVKDHKFIARFFKQ...,1000.0
1482,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P68400,MSGPVPSRARVYTDVNTHRPREYWDYESHVVEWGNQDDYQLVRKLG...,1000.0
...,...,...,...,...,...
373904,73213196.0,CN1CCc2c(c3cccc4c3n2CCc2ccccc2-4)C1,P14416,MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIA...,10000.0
373911,5280443.0,O=c1cc(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,178300.0
373912,5281607.0,O=c1cc(-c2ccccc2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,19400.0
373915,5280343.0,O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,106700.0


In [39]:
print("Original size:", ori_size)
print("Final size:    ", df.shape[0])
print("Difference:  ", df.shape[0] - ori_size)

Original size: 324347
Final size:     16159
Difference:   -308188


Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y
1481,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P17252,MADVFPGNDSTASQDVANRFARKGALRQKNVHEVKDHKFIARFFKQ...,1000.0
1482,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P68400,MSGPVPSRARVYTDVNTHRPREYWDYESHVVEWGNQDDYQLVRKLG...,1000.0
1483,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P49841,MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQ...,20.0
1646,1935.0,Nc1c2c(nc3ccccc13)CCCC2,P22303,MRPPQCLLHTPSLASPLLLLLLWLLGGGVGAEGREDAELLVTVRGG...,40.0
2344,72894.0,Nc1ccccc1S(N)(=O)=O,P00915,MASPDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLK...,45400.0
...,...,...,...,...,...
373903,73213196.0,CN1CCc2c(c3cccc4c3n2CCc2ccccc2-4)C1,P35367,MSLPNSSCLLEDKMCEGNKTTMASPQLMPLVVVLSTICLVTVGLNL...,10000.0
373904,73213196.0,CN1CCc2c(c3cccc4c3n2CCc2ccccc2-4)C1,P14416,MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIA...,10000.0
373912,5281607.0,O=c1cc(-c2ccccc2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,19400.0
373915,5280343.0,O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,106700.0


In [44]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 2. Standardizing Numerical Values
scaler = StandardScaler()
df[['Y']] = scaler.fit_transform(df[['Y']])

df

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y
1481,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P17252,MADVFPGNDSTASQDVANRFARKGALRQKNVHEVKDHKFIARFFKQ...,-0.373241
1482,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P68400,MSGPVPSRARVYTDVNTHRPREYWDYESHVVEWGNQDDYQLVRKLG...,-0.373241
1483,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P49841,MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQ...,-0.440872
1646,1935.0,Nc1c2c(nc3ccccc13)CCCC2,P22303,MRPPQCLLHTPSLASPLLLLLLWLLGGGVGAEGREDAELLVTVRGG...,-0.439492
2344,72894.0,Nc1ccccc1S(N)(=O)=O,P00915,MASPDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLK...,2.690890
...,...,...,...,...,...
373903,73213196.0,CN1CCc2c(c3cccc4c3n2CCc2ccccc2-4)C1,P35367,MSLPNSSCLLEDKMCEGNKTTMASPQLMPLVVVLSTICLVTVGLNL...,0.247867
373904,73213196.0,CN1CCc2c(c3cccc4c3n2CCc2ccccc2-4)C1,P14416,MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIA...,0.247867
373912,5281607.0,O=c1cc(-c2ccccc2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,0.896579
373915,5280343.0,O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,6.921323


In [45]:
scaler = MinMaxScaler()

df[['Y']] = scaler.fit_transform(df[['Y']])

df

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y
1481,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P17252,MADVFPGNDSTASQDVANRFARKGALRQKNVHEVKDHKFIARFFKQ...,0.006250
1482,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P68400,MSGPVPSRARVYTDVNTHRPREYWDYESHVVEWGNQDDYQLVRKLG...,0.006250
1483,447961.0,Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1,P49841,MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQ...,0.000125
1646,1935.0,Nc1c2c(nc3ccccc13)CCCC2,P22303,MRPPQCLLHTPSLASPLLLLLLWLLGGGVGAEGREDAELLVTVRGG...,0.000250
2344,72894.0,Nc1ccccc1S(N)(=O)=O,P00915,MASPDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLK...,0.283750
...,...,...,...,...,...
373903,73213196.0,CN1CCc2c(c3cccc4c3n2CCc2ccccc2-4)C1,P35367,MSLPNSSCLLEDKMCEGNKTTMASPQLMPLVVVLSTICLVTVGLNL...,0.062500
373904,73213196.0,CN1CCc2c(c3cccc4c3n2CCc2ccccc2-4)C1,P14416,MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIA...,0.062500
373912,5281607.0,O=c1cc(-c2ccccc2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,0.121250
373915,5280343.0,O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,P08170,MFSAGHKIKGTVVLMPKNELEVNPDGSAVDNLNAFLGRSVSLQLIS...,0.666875


In [None]:
# 4. Removing Outliers (example using Z-score, assuming 'numerical_feature' is your target feature)
from scipy import stats
import numpy as np

df = df[(np.abs(stats.zscore(df['Y'])) < 3)]

df