In [9]:
!pip install pycaret
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import os
import pandas as pd
import numpy as np
import random

from pycaret.regression import *

In [11]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [14]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
import numpy as np

# RDKit을 사용해 SMILES 문자열을 피처로 변환하는 함수
def featurize(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(2048)  # 오류가 있을 경우 0 벡터 반환
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    return np.array(fp)
# 데이터 로드
train_df = pd.read_csv('/content/drive/MyDrive/신약개발/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/신약개발/test.csv')

train_df['features'] = train_df['Smiles'].apply(featurize)
test_df['features'] = test_df['Smiles'].apply(featurize)

# 피처를 개별 컬럼으로 확장
X_train = np.vstack(train_df['features'])
X_test = np.vstack(test_df['features'])

y_train = train_df['IC50_nM']



In [15]:
from pycaret.regression import *

# 데이터 준비
train_data = pd.DataFrame(X_train)
train_data['IC50_nM'] = y_train.values

# PyCaret 설정
regression_setup = setup(data=train_data, target='IC50_nM', session_id=42)

# 모델 학습 및 선택
best_model = compare_models()

# 모델 튜닝
tuned_model = tune_model(best_model)

# 모델 평가
evaluate_model(tuned_model)

# 테스트 데이터에 대한 예측 생성
test_predictions = predict_model(tuned_model, data=test_df)

test_predictions.head()

Unnamed: 0,Description,Value
0,Session id,42
1,Target,IC50_nM
2,Target type,Regression
3,Original data shape,"(1952, 2049)"
4,Transformed data shape,"(1952, 2049)"
5,Transformed train set shape,"(1366, 2049)"
6,Transformed test set shape,"(586, 2049)"
7,Numeric features,2048
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,671.4599,6777807.5069,2197.185,0.1822,2.6432,61.6474,2.346
par,Passive Aggressive Regressor,688.9683,6731073.4676,2203.4681,0.1595,2.6344,58.8982,4.301
en,Elastic Net,836.0467,6947457.964,2247.4621,0.1233,3.1973,138.4766,0.657
br,Bayesian Ridge,880.7873,6860352.9451,2274.4264,0.0499,3.1926,132.644,4.0
dummy,Dummy Regressor,957.5291,7666446.3125,2390.7225,-0.0192,3.7256,207.4425,0.494
lightgbm,Light Gradient Boosting Machine,808.2381,6734940.5067,2291.9965,-0.0241,2.8633,249.5249,3.266
gbr,Gradient Boosting Regressor,795.6765,7803680.7696,2518.5229,-0.3453,2.6514,176.6779,3.375
knn,K Neighbors Regressor,714.5009,7821932.4375,2553.0587,-0.4807,1.6588,59.6831,0.705
xgboost,Extreme Gradient Boosting,790.8919,10376430.3875,2772.5574,-0.5581,2.3572,69.7598,3.769
rf,Random Forest Regressor,762.8394,8423090.6316,2641.0302,-0.6978,2.1985,166.5604,27.707


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,709.1536,6852864.2281,2617.7976,0.2134,2.2759,28.9824
1,470.3536,1166932.6233,1080.2466,0.2667,2.3692,26.4994
2,439.5419,1051132.3392,1025.2475,0.3347,2.3399,32.0344
3,372.8193,857358.6509,925.9366,0.2115,2.6494,55.9684
4,606.2119,6129705.9713,2475.8243,-0.0004,2.3325,39.3318
5,892.3848,15689541.778,3961.0026,0.0238,2.2598,24.2907
6,1080.4657,28474938.039,5336.1913,0.0488,2.3666,26.6206
7,582.5312,1954228.1868,1397.9371,0.1937,2.3055,29.288
8,447.5085,801031.8799,895.0038,0.2031,2.1549,59.2803
9,537.8952,6327281.5474,2515.4088,0.1618,2.3973,28.698


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

TypeError: unhashable type: 'numpy.ndarray'

In [None]:
# 테스트 데이터에 대한 예측 생성
test_predictions = predict_model(tuned_model, data=test_df)

# 결과를 제출 파일 형식으로 저장
submission_df = pd.DataFrame({
    'ID': test_df['ID'],  # ID 칼럼이 있는 경우
    'IC50_nM': test_predictions['Label']  # 예측된 IC50 값
})

submission_df.to_csv('pycaret_submission.csv', index=False)

from google.colab import files
files.download('pycaret_submission.csv')