### Import

In [23]:
import pandas as pd
import numpy as np
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost
import catboost


from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit import RDConfig
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools as PandasTools
from rdkit import DataStructs
from rdkit.Chem.Subshape import SubshapeBuilder,SubshapeAligner,SubshapeObjects

In [24]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)


### DataLoad

In [25]:
# 학습 ChEMBL 데이터 로드
df = pd.read_csv('train.csv')  # 예시 파일 이름
df.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


### Data Pre-processing

In [26]:
#제거 변수

delete = ['Molecule ChEMBL ID','Standard Type','Standard Relation','Standard Value','Standard Units','pChEMBL Value','Assay ChEMBL ID','Target ChEMBL ID','Target Name','Target Organism',
          'Target Type','Document ChEMBL ID','pIC50','IC50_nM']

In [27]:
y_train = df['pIC50'].values
x_train = df.drop(delete ,axis=1)


In [28]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Train & Validation

In [29]:
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train, test_size=0.2, random_state=42)

In [30]:
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense,Dropout,LSTM,Bidirectional, LSTM, Flatten,Embedding,Conv1D,TimeDistributed,GRU,MaxPool1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.backend import clear_session
import re

In [31]:
# 원자 기호를 추출하는 함수 정의
def get_atom_tokens(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        atom_tokens = [atom.GetSymbol() for atom in mol.GetAtoms()]
        return atom_tokens
    else:
        return []

# x_train 데이터프레임에 적용하여 새로운 'AtomTokens' 열 생성
x_train['Smiles'] = x_train['Smiles'].apply(get_atom_tokens)
x_val['Smiles'] = x_val['Smiles'].apply(get_atom_tokens)

In [32]:
# REGEXPS = {
#         "brackets": re.compile(r"(\[[^\]]*\])"),
#         "2_ring_nums": re.compile(r"(%\d{2})"),
#         "brcl": re.compile(r"(Br|Cl)")
#     }
# REGEXP_ORDER = ["brackets", "2_ring_nums", "brcl"]

# def tokenize(data, with_begin_and_end=True):
#     """Tokenizes a SMILES string."""
#     def split_by(data, regexps):
#         if not regexps:
#             return list(data)
#         regexp = REGEXPS[regexps[0]]
#         splitted = regexp.split(data)
#         tokens = []
#         for i, split in enumerate(splitted):
#             if i % 2 == 0:
#                 tokens += split_by(split, regexps[1:])
#             else:
#                 tokens.append(split)
#         return tokens

#     tokens = split_by(data, REGEXP_ORDER)
    
#     # if true, the token set is enriched by "^" and "$" denoting the start and end of the embedding, respectively
#     if with_begin_and_end:
#         tokens = ["^"] + tokens + ["$"]
#     return tokens

In [33]:
# x_train['Smiles'] = x_train['Smiles'].apply(tokenize)
# x_val['Smiles'] = x_val['Smiles'].apply(tokenize)

In [34]:
# x_train과 x_val이 이미 정의되어 있다고 가정합니다.
x_train = x_train['Smiles'].tolist()
x_val = x_val['Smiles'].tolist()

# Tokenize the SMILES strings
tokenizer = Tokenizer(char_level=True, filters='')
tokenizer.fit_on_texts(x_train)

# 어휘 사전의 크기 확인
vocab_size = len(tokenizer.word_index) + 1  # +1은 0을 포함시키기 위한 것
print("Vocabulary size:", vocab_size)

# 텍스트를 시퀀스로 변환
train_sequences = tokenizer.texts_to_sequences(x_train)
val_sequences = tokenizer.texts_to_sequences(x_val)

# Padding sequences to have the same length (adjust maxlen as necessary)
maxlen = max(len(seq) for seq in train_sequences)
x_train_padded = pad_sequences(train_sequences, maxlen=maxlen, padding='post')
x_val_padded = pad_sequences(val_sequences, maxlen=maxlen, padding='post')

Vocabulary size: 10


In [35]:
embedding_dim = 72
max_words = 150


In [36]:
print("Character counts in the training data:")
print(tokenizer.word_counts)
print("Unique characters and their indices in the training data:")
print(tokenizer.word_index)

Character counts in the training data:
OrderedDict([('c', 34276), ('n', 9965), ('o', 3597), ('s', 512), ('f', 828), ('cl', 172), ('br', 4), ('p', 2), ('h', 2)])
Unique characters and their indices in the training data:
{'c': 1, 'n': 2, 'o': 3, 'f': 4, 's': 5, 'cl': 6, 'br': 7, 'p': 8, 'h': 9}


In [37]:
print(x_train_padded.shape) 

(1561, 72)


In [38]:
input_shape = (x_train_padded.shape[1], 1)  # (timesteps, features)
es = EarlyStopping(monitor='val_loss', patience=5
                   , restore_best_weights=True)

clear_session()


model = Sequential()

# Ensure vocab_size is set correctly
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=x_train_padded.shape[1]))
model.add(LSTM(units=16, return_sequences=True, dropout=0))
model.add(Bidirectional(LSTM(16, dropout=0.19, recurrent_dropout=0.0, return_sequences=True)))
model.add(Dense(16, activation='relu'))
model.add(Bidirectional(LSTM(16, dropout=0.19, recurrent_dropout=0.0, go_backwards=True, return_sequences=True)))
model.add(Dense(16, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.02)))
model.add(Flatten())
model.add(Dense(8, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.02)))
model.add(Dense(1, activation='linear'))  # Change activation if needed

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])

model.summary()

history = model.fit(x_train_padded, y_train,epochs=1000, batch_size=32,validation_data = (x_val_padded,y_val) ,callbacks=[es])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 72, 72)            720       
                                                                 
 lstm (LSTM)                 (None, 72, 16)            5696      
                                                                 
 bidirectional (Bidirectiona  (None, 72, 32)           4224      
 l)                                                              
                                                                 
 dense (Dense)               (None, 72, 16)            528       
                                                                 
 bidirectional_1 (Bidirectio  (None, 72, 32)           4224      
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 72, 16)            5

In [39]:
# input_shape = (x_train_padded.shape[1], 1)  # (timesteps, features)
# es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)


# model = Sequential()

# model.add(LSTM(units=64, input_shape=input_shape, dropout=0.19, recurrent_dropout=0.0, return_sequences=False))
# model.add(Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=0.005, l2=0.01)))
# model.add(Dropout(0.1)) 
# model.add(Dense(16, activation='relu', kernel_regularizer=l1_l2(l1=0.005, l2=0.01)))
# model.add(Dense(1, activation='linear'))

# optimizer = Adam(learning_rate=0.005)
# model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])

# model.summary()

# history = model.fit(x_train_padded, y_train,epochs=10000, batch_size=32,validation_data = (x_val_padded,y_val) ,callbacks=[es])

In [40]:
test = pd.read_csv('./test.csv')
test = test.drop(['ID'],axis=1)


In [41]:
test['Smiles'] = test['Smiles'].apply(get_atom_tokens)
# test['Smiles'] = test['Smiles'].apply(tokenize)
test = test['Smiles'].tolist()

# Tokenize the SMILES strings
tokenizer = Tokenizer(char_level=True, filters='')
tokenizer.fit_on_texts(test)
test_sequences = tokenizer.texts_to_sequences(test)

# Padding sequences to have the same length (adjust maxlen as necessary)
test_padded = pad_sequences(test_sequences, maxlen=maxlen, padding='post')


In [42]:
y_pred = model.predict(test_padded)

In [43]:
submit = pd.read_csv('./sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(y_pred)
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,5.024796
1,TEST_001,9.875721
2,TEST_002,10.185475
3,TEST_003,13.001334
4,TEST_004,7.940677


In [44]:
submit.to_csv('./baseline_submit.csv', index=False)