### Import

In [1]:
import pandas as pd
import numpy as np
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost
import catboost


from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit import RDConfig
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools as PandasTools
from rdkit import DataStructs
from rdkit.Chem.Subshape import SubshapeBuilder,SubshapeAligner,SubshapeObjects

In [2]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)


In [3]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

### DataLoad

In [5]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

In [6]:
# 학습 ChEMBL 데이터 로드
df = pd.read_csv('train.csv')  # 예시 파일 이름
df.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


### Data Pre-processing

In [7]:
#제거 변수

delete = ['Molecule ChEMBL ID','Standard Type','Standard Relation','Standard Value','Standard Units','pChEMBL Value','Assay ChEMBL ID','Target ChEMBL ID','Target Name','Target Organism',
          'Target Type','Document ChEMBL ID','pIC50','IC50_nM']

In [8]:
y_train = df['pIC50'].values
x_train = df.drop(delete ,axis=1)


In [9]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [10]:
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train, test_size=0.3)

### Train & Validation

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense,Input,Dropout,LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1_l2

In [16]:
x_train = x_train['Smiles'].tolist()
x_val = x_val['Smiles'].tolist()
# Tokenize the SMILES strings
tokenizer = Tokenizer(char_level=True, filters='')
tokenizer.fit_on_texts(x_train)
train_sequences = tokenizer.texts_to_sequences(x_train)
val_sequences = tokenizer.texts_to_sequences(x_val)

# Padding sequences to have the same length (adjust maxlen as necessary)
maxlen = 10  # This should be set according to your data
x_train_padded = pad_sequences(train_sequences, maxlen=maxlen, padding='post')
x_val_padded = pad_sequences(val_sequences, maxlen=maxlen, padding='post')

In [17]:
print(x_train_padded.shape) 

(1366, 10)


In [19]:
input_shape = (x_train_padded.shape[1], 1)  # (timesteps, features)
es = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)


model = Sequential()
    
model.add(LSTM(units=64, input_shape=input_shape, dropout=0.19, recurrent_dropout=0.0))
model.add(Dense(32, activation='relu',kernel_regularizer = l1_l2(l1 = 0.005,l2 = 0.01)))
model.add(Dense(16, activation='relu',kernel_regularizer = l1_l2(l1 = 0.005,l2 = 0.01)))
model.add(Dense(8, activation='relu',kernel_regularizer = l1_l2(l1 = 0.005,l2 = 0.01)))
model.add(Dense(1, activation='linear'))


optimizer = Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])

model.summary()

model.fit(x_train_padded, y_train,epochs=10000, batch_size=16,validation_data = [x_val_padded,y_val] ,callbacks=[es])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                16896     
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 8)                 136       
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 19,649
Trainable params: 19,649
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10

<keras.callbacks.History at 0x235535975e0>

In [20]:
test = pd.read_csv('./test.csv')
test = test.drop(['ID'],axis=1)


Unnamed: 0,Smiles
0,O=C(C1=CSC(C2=CC=CN=C2)=N1)NC3=CC(NC4CCN(C)CC4...
1,N#CC1=CC(C=C2)=C(C=C1)N2C(N=C3)=NC(NC4CCCCC4)=...
2,N#CC(C=C1)=C(N[C@@H]2CCNC2)C=C1NC(N=C3)=NC=C3C...
3,N#CC(C=C1)=CC=C1NC(N=C2)=NC(NC3CC(NC(C=C)=O)CC...
4,N#CC(C=C1)=CC=C1NC(N=C2)=NC(NC3CC(N)CC3)=C2C(C...
...,...
108,N#CC1=CC(C=C2)=C(C=C1)N2C(N=C3)=NC(N4CCOCC4)=C...
109,O=C(C1=CSC(C2=CC=NC=C2)=N1)NC3=CC(NC4CCN(C(C)C...
110,N#Cc1ccc(Nc2ncc(cn2)c3cnn(c3)C4CCNCC4)cc1N[C@@...
111,O=C(C)N(CC1)CCC1N2N=CC(C3=CN=C(N4C(C=CC(C#N)=C...


In [21]:
test = test['Smiles'].tolist()

# Tokenize the SMILES strings
tokenizer = Tokenizer(char_level=True, filters='')
tokenizer.fit_on_texts(test)
test_sequences = tokenizer.texts_to_sequences(test)

# Padding sequences to have the same length (adjust maxlen as necessary)
maxlen = 10  # This should be set according to your data
test_padded = pad_sequences(test_sequences, maxlen=maxlen, padding='post')


In [23]:
y_pred = model.predict(test_padded)

In [24]:
submit = pd.read_csv('./sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(y_pred)
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,131.468048
1,TEST_001,18.565424
2,TEST_002,22.130863
3,TEST_003,8.976595
4,TEST_004,8.976595


In [25]:
submit.to_csv('./baseline_submit.csv', index=False)