In [11]:
import pandas as pd

from typing import Tuple, List
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

df = pd.read_csv ('protac.csv')
df = df.rename(columns={"E3 ligase":"E3ligase"})

In [12]:
target_encoded = pd.get_dummies(df['Target'],prefix='Target')
target_encoded

Unnamed: 0,Target_AAK1,Target_ABL1,Target_ABL2,Target_ADRA1A,Target_AKT1,Target_AKT2,Target_AKT3,Target_ALK,Target_ALK G1202R,Target_AR,...,Target_VEGFR-2,Target_VHL,Target_Wee1,Target_YES1,Target_c-KIT,Target_c-Met,Target_eIF4E,Target_p38alpha,Target_p38beta,Target_p38delta
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3934,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3935,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3937,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
ligase_encoded = pd.get_dummies(df['E3ligase'],prefix='E3ligase')
ligase_encoded

Unnamed: 0,E3ligase_AhR,E3ligase_CRBN,E3ligase_DCAF11,E3ligase_DCAF15,E3ligase_DCAF16,E3ligase_IAP,E3ligase_MDM2,E3ligase_RNF114,E3ligase_RNF4,E3ligase_VHL,E3ligase_XIAP,E3ligase_cIAP1
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3934,0,1,0,0,0,0,0,0,0,0,0,0
3935,0,1,0,0,0,0,0,0,0,0,0,0
3936,0,0,1,0,0,0,0,0,0,0,0,0
3937,0,0,1,0,0,0,0,0,0,0,0,0


In [14]:
def get_morgan_fp(smiles: str, nbits: int = 256):
    """get morgan fingeprprint"""
    m = Chem.MolFromSmiles(smiles)
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=nbits)
    array = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fingerprint, array)
    return array

In [15]:
#test
get_morgan_fp("COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN1CCN(CCOCCOCC(=O)N[C@H](C(=O)N2C[C@H](O)C[C@H]2C(=O)NCC2=CC=C(C3=C(C)N=CS3)C=C2)C(C)(C)C)CC1")

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=int8)

In [16]:
fingerprints = []
i = 0
for x in df['Smiles']:
    fp = get_morgan_fp(x)
    fingerprints.append(fp)
    i+=1

print(i)

3939


In [17]:
names = ["f"+str(i) for i in range(1,257)]
fp_encoded=pd.DataFrame([fingerprints[0]],columns=names)
for x in fingerprints[1:]:
    row =pd.DataFrame([x],columns=names)
    fp_encoded = pd.concat([fp_encoded,row])
fp_encoded

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256
0,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
0,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
0,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
0,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,1
0,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,1
0,1,1,0,1,1,0,0,0,0,0,...,0,0,0,1,0,1,1,0,0,1
0,1,1,0,1,1,0,0,0,0,0,...,0,0,0,1,0,1,1,0,0,1


In [18]:
dft = df.select_dtypes(include=[object])
rows = dft.apply(lambda x: x.str.contains("degrad",case=False)).any(axis=1)
for i,v in rows.items():
    if v:
        rows[i]=1
    else:
        rows[i]=0

rows

0       0
1       0
2       0
3       0
4       0
       ..
3934    0
3935    0
3936    0
3937    0
3938    0
Length: 3939, dtype: object

In [19]:
protacdb = pd.concat([target_encoded.reset_index(drop=True), ligase_encoded.reset_index(drop=True), fp_encoded.reset_index(drop=True)], axis =1)

In [20]:
protacdb.insert(0,"Degradation",rows)
protacdb

Unnamed: 0,Degradation,Target_AAK1,Target_ABL1,Target_ABL2,Target_ADRA1A,Target_AKT1,Target_AKT2,Target_AKT3,Target_ALK,Target_ALK G1202R,...,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3934,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,1
3935,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,1
3936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,0,1
3937,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,0,1
