# ㅇ

In [6]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [44]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [45]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [46]:
# 평가 산식 정의 (로그 변환 시 음수/0 방지)
def normalized_rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    normalized_rmse_value = rmse / np.mean(y_true)
    return normalized_rmse_value

def correct_ratio(y_true, y_pred):
    # 0 또는 음수 값을 방지하기 위해 작은 양수로 대체
    y_true = np.where(y_true <= 0, 1e-9, y_true)
    y_pred = np.where(y_pred <= 0, 1e-9, y_pred)

    pIC50_true = -np.log10(y_true * 1e-9)
    pIC50_pred = -np.log10(y_pred * 1e-9)

    absolute_error = np.abs(pIC50_true - pIC50_pred)
    correct_ratio_value = np.mean(absolute_error <= 0.5)
    return correct_ratio_value

def custom_score(y_true, y_pred):
    A = normalized_rmse(y_true, y_pred)
    B = correct_ratio(y_true, y_pred)
    score = 0.5 * (1 - min(A, 1)) + 0.5 * B
    return score


In [47]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

# RDKit을 사용해 SMILES 문자열을 피처로 변환하는 함수
def featurize(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(2048)  # 오류가 있을 경우 0 벡터 반환
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    return np.array(fp)

# 데이터 로드
train_df = pd.read_csv('/content/drive/MyDrive/신약개발/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/신약개발/test.csv')

# 훈련 및 테스트 데이터에서 SMILES 피처 추출
train_df['smiles_features'] = train_df['Smiles'].apply(featurize)
test_df['smiles_features'] = test_df['Smiles'].apply(featurize)

# SMILES 피처를 개별 컬럼으로 확장
X_train = np.vstack(train_df['smiles_features'])
X_test = np.vstack(test_df['smiles_features'])
y_train = train_df['IC50_nM'].values

In [48]:
# K-Fold 교차 검증 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 모델 초기화 및 평가
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1차 모델 정의 (개별 모델)
level0 = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('xgb', XGBRegressor(random_state=42)),
    ('lgbm', LGBMRegressor(random_state=42))
]

# 2차 메타 모델 정의
level1 = LinearRegression()

# 스태킹 모델 정의
stacking_model = StackingRegressor(estimators=level0, final_estimator=level1, cv=kf)

# K-Fold 교차 검증을 통한 평가
best_score = float('-inf')
for train_index, val_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    stacking_model.fit(X_train_fold, y_train_fold)

    val_predictions = stacking_model.predict(X_val_fold)
    score = custom_score(y_val_fold, val_predictions)
    if score > best_score:
        best_score = score

print(f"Best Model Score: {best_score}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1830
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 610
[LightGBM] [Info] Start training from score 650.147773
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008843 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1614
[LightGBM] [Info] Number of data points in the train set: 1248, number of used features: 538
[LightGBM] [Info] Start training from score 685.935326
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

In [49]:
# 전체 데이터로 최종 모델 학습 및 테스트 데이터 예측
stacking_model.fit(X_train_scaled, y_train)
test_predictions = stacking_model.predict(X_test_scaled)

# 결과 저장
submission_df = pd.DataFrame({
    'ID': test_df['ID'],
    'IC50_nM': test_predictions
})
submission_df.to_csv('stacking_submission.csv', index=False)

from google.colab import files
files.download('stacking_submission.csv')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2118
[LightGBM] [Info] Number of data points in the train set: 1952, number of used features: 706
[LightGBM] [Info] Start training from score 649.001365
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010620 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1830
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 610
[LightGBM] [Info] Start training from score 650.147773
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>