# Boost up AI 2025: 신약 개발 경진대회 (h2o method)
- 참가팀: dalcw
- 참가자: 문성수
- 소속: 전남대학교(휴학)

## Install Library
- h2o
- pandas
- numpy
- tqdm
- rdkit

In [24]:
# automl library
import h2o
from h2o.automl import H2OAutoML

# chemical feature extractor
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, rdMolDescriptors, MACCSkeys, AllChem, Crippen
from rdkit import RDLogger
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

# basic library
import numpy as np
import pandas as pd
import tqdm

# Turn off all RDKit log messages (every logger matching 'rdApp.*').
RDLogger.DisableLog('rdApp.*')

## Dataset load
- using external dataset
    - [PubChem AID 1851 dataset](https://pubchem.ncbi.nlm.nih.gov/bioassay/1851)
    - [PubChem AID 884 dataset](https://pubchem.ncbi.nlm.nih.gov/bioassay/884)

In [19]:
# Column names shared by the two PubChem bioassay data tables.
PUBCHEM_SMILES_COLUMN = "PUBCHEM_EXT_DATASOURCE_SMILES"
PUBCHEM_ACTIVITY_COLUMN = "Activity at 11.43 uM"

# Main (competition) dataset.
main_data = pd.read_csv("./main_data/train.csv")

# --- AID 1851 ---------------------------------------------------------------
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the warning previously printed for this read looks like a
# mixed-dtype DtypeWarning -- confirm.
aid_1851_data = pd.read_csv(
    "./main_data/AID_1851_datatable_all.csv", low_memory=False
)
# Keep only the rows measured against the CYP3A4 panel.
aid_1851_data = aid_1851_data[aid_1851_data["Panel Name"] == "p450-cyp3a4"]
# Keep the SMILES and the activity measured at 11.43 uM. The explicit copy
# ensures the assignment below writes to an independent frame rather than to
# a slice of the filtered table.
aid_1851_data = aid_1851_data[
    [PUBCHEM_SMILES_COLUMN, PUBCHEM_ACTIVITY_COLUMN]
].copy()
# Negate the activity (the author treats the negated activity as the
# inhibition rate). Missing values stay NaN here and are discarded by the
# aggregation / filter further down.
aid_1851_data[PUBCHEM_ACTIVITY_COLUMN] = -aid_1851_data[
    PUBCHEM_ACTIVITY_COLUMN
].astype(float)
# Rename the columns to match the main dataset.
aid_1851_data.columns = ["Canonical_Smiles", "Inhibition"]

# --- AID 884 ----------------------------------------------------------------
# Same preprocessing as AID 1851, except there is no panel filter and rows
# with a missing SMILES or activity are dropped *before* the float conversion.
aid_884_data = pd.read_csv("./main_data/AID_884_datatable_all.csv")
aid_884_data = aid_884_data[
    [PUBCHEM_SMILES_COLUMN, PUBCHEM_ACTIVITY_COLUMN]
].dropna()
aid_884_data[PUBCHEM_ACTIVITY_COLUMN] = -aid_884_data[
    PUBCHEM_ACTIVITY_COLUMN
].astype(float)
aid_884_data.columns = ["Canonical_Smiles", "Inhibition"]

# --- Full training table ----------------------------------------------------
train = pd.concat([main_data, aid_1851_data, aid_884_data], axis=0)

# When the same SMILES appears more than once, aggregate to the mean value.
train = train.groupby("Canonical_Smiles", as_index=False).agg({"Inhibition": "mean"})
# Only rows with Inhibition >= 0 are considered valid (this also removes NaN).
train = train[train["Inhibition"] >= 0]

# Preview the first rows of the merged training table.
train.head()

  aid_1851_data = pd.read_csv("./main_data/AID_1851_datatable_all.csv")


Unnamed: 0,Canonical_Smiles,Inhibition
0,B(C1=C(C=CS1)C(=O)C)(O)O,13.4805
1,B(C1=C(C=CS1)C=O)(O)O,2.5505
2,B(C1=CC(=CC=C1)/C=C/C(=O)O)(O)O,17.4322
3,B(C1=CC(=CC=C1)N)(O)O,9.252
4,B(C1=CC(=CC=C1)[N+](=O)[O-])(O)O,6.1117


# Machine learning method
- using h2o automl library

## Feature extraction
1. Physicochemical descriptors
1. Morgan fingerprint
1. MACCS fingerprint

In [8]:
# SMARTS substructure patterns (chosen from the literature, per the author).
# NOTE(review): "[NX3]([C])[C]" only requires two carbon neighbours on a
# three-connected nitrogen, so it also matches secondary amines -- confirm
# whether a stricter tertiary-amine pattern was intended.
SMARTS_PATTERNS = {
    "has_imidazole": Chem.MolFromSmarts("n1cncc1"),
    "has_tertiary_amine": Chem.MolFromSmarts("[NX3]([C])[C]"),
    "has_furan": Chem.MolFromSmarts("c1ccoc1"),
    "has_acetylene": Chem.MolFromSmarts("C#C"),
    "has_pyridine": Chem.MolFromSmarts("n1ccccc1"),
    "has_thiophene": Chem.MolFromSmarts("c1ccsc1"),
}

# Built once and reused: the generator's settings never change between calls.
_MORGAN_GENERATOR = GetMorganGenerator(radius=2, fpSize=2048)


def feature_extractor(smiles):
    """Encode a SMILES string as a flat numeric feature vector.

    The returned list is laid out as:
      * 15 physicochemical descriptors,
      * 2048 Morgan fingerprint bits (radius 2),
      * 167 MACCS key bits,
      * 6 SMARTS substructure flags, in ``SMARTS_PATTERNS`` order,
    for 2236 values in total.

    NOTE: a later cell in this file defines another ``feature_extractor``
    (2233 features, no SMARTS flags); when the file is run top to bottom that
    later definition replaces this one.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        list of numbers of length 2236.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of inside the first descriptor call.
        raise ValueError(f"could not parse SMILES: {smiles!r}")

    # 15 basic physicochemical descriptors.
    physical_features = [
        Descriptors.MolWt(mol),
        Crippen.MolLogP(mol),
        Descriptors.TPSA(mol),
        Lipinski.NumRotatableBonds(mol),
        Lipinski.NumHDonors(mol),
        Lipinski.NumHAcceptors(mol),
        rdMolDescriptors.CalcNumAromaticRings(mol),
        rdMolDescriptors.CalcNumRings(mol),
        rdMolDescriptors.CalcFractionCSP3(mol),
        Descriptors.HeavyAtomCount(mol),
        rdMolDescriptors.CalcLabuteASA(mol),        # accessible surface area
        Descriptors.MolMR(mol),                     # molar refractivity
        rdMolDescriptors.CalcExactMolWt(mol),       # exact molecular weight
        Descriptors.NumValenceElectrons(mol),       # valence electron count
        sum(1 for atom in mol.GetAtoms() if atom.GetSymbol() == "P"),  # phosphorus atoms
    ]

    # Morgan fingerprint (2048 bits).
    morgan_features = list(_MORGAN_GENERATOR.GetFingerprint(mol))

    # MACCS fingerprint (167 bits).
    maccs_fp = MACCSkeys.GenMACCSKeys(mol)
    maccs_features = [int(bit) for bit in maccs_fp.ToBitString()]

    # SMARTS substructure flags (6 values).
    smarts_flags = [
        int(mol.HasSubstructMatch(pattern)) for pattern in SMARTS_PATTERNS.values()
    ]

    # Full feature vector.
    return physical_features + morgan_features + maccs_features + smarts_flags


# testing
print(len(feature_extractor("CCC")))

2236


In [23]:
# Feature extractor: SMILES string -> numeric vector.

# Built once and reused: the generator's settings never change between calls.
_MORGAN_GENERATOR = GetMorganGenerator(radius=2, fpSize=2048)


def feature_extractor(smiles):
    """Encode a SMILES string as a flat numeric feature vector.

    The returned list is laid out as:
      * 18 physicochemical descriptors,
      * 2048 Morgan fingerprint bits (radius 2),
      * 167 MACCS key bits,
    for 2233 values in total.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        list of numbers of length 2233.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of inside the first descriptor call.
        raise ValueError(f"could not parse SMILES: {smiles!r}")

    # 1. Physicochemical descriptors.
    physical_features = [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.TPSA(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.NumAromaticRings(mol),
        Descriptors.NumHeteroatoms(mol),
        Descriptors.FractionCSP3(mol),
        Descriptors.NumAliphaticRings(mol),
        Lipinski.NumAromaticHeterocycles(mol),
        Lipinski.NumSaturatedHeterocycles(mol),
        Lipinski.NumAliphaticHeterocycles(mol),
        Descriptors.HeavyAtomCount(mol),
        Descriptors.RingCount(mol),
        Descriptors.NOCount(mol),
        Descriptors.NHOHCount(mol),
        Descriptors.NumRadicalElectrons(mol),
    ]

    # 2. Morgan fingerprint (generator-based API).
    morgan_features = list(_MORGAN_GENERATOR.GetFingerprint(mol))

    # 3. MACCS fingerprint.
    maccs_fp = MACCSkeys.GenMACCSKeys(mol)
    maccs_features = [int(bit) for bit in maccs_fp.ToBitString()]

    return physical_features + morgan_features + maccs_features


# testing
print(len(feature_extractor("CCC")))

2233


## Train data encoding
- 생성한 특징 추출기를 기반하여 데이터 전체를 인코딩함

In [4]:
# Encode every training SMILES into its numeric feature vector, showing a
# progress bar over the whole table.
train_encodings = [
    feature_extractor(smiles)
    for smiles in tqdm.tqdm(train["Canonical_Smiles"], total=len(train))
]

# Feature matrix (one row per molecule) and the matching target vector.
train_X = np.array(train_encodings)
train_y = np.array(train["Inhibition"])

# Rebuild `train` as a purely numeric table: features first, target as the
# final column.
train = pd.DataFrame(np.column_stack([train_X, train_y]))

100%|██████████| 17996/17996 [00:59<00:00, 300.66it/s]


## Model training
- h2o automl을 이용하여 모델 학습 진행

In [5]:
# Start (or connect to) the local H2O cluster.
h2o.init()

# Upload the training table to H2O.
train_h2o = h2o.H2OFrame(train)

# The target is the last column of the frame. Taking its name from the frame
# keeps this cell correct whatever feature count the active feature_extractor
# produces (a hard-coded index only fits one extractor), and because the name
# is a string the comparison below really does exclude it from the predictors.
target = train_h2o.columns[-1]
features = [col for col in train_h2o.columns if col != target]

# Make sure the response is numeric so AutoML treats this as regression.
train_h2o[target] = train_h2o[target].asnumeric()

# Run AutoML, ranking models by RMSE.
aml = H2OAutoML(max_runtime_secs=20000, sort_metric="RMSE")
aml.train(x=features, y=target, training_frame=train_h2o)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "21.0.8" 2025-07-15; OpenJDK Runtime Environment (build 21.0.8+9-Ubuntu-0ubuntu124.04.1); OpenJDK 64-Bit Server VM (build 21.0.8+9-Ubuntu-0ubuntu124.04.1, mixed mode, sharing)
  Starting server from /home/srcadmin/anaconda3/lib/python3.12/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpc1hsfc3g
  JVM stdout: /tmp/tmpc1hsfc3g/h2o_srcadmin_started_from_python.out
  JVM stderr: /tmp/tmpc1hsfc3g/h2o_srcadmin_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 7 days
H2O_cluster_name:,H2O_from_python_srcadmin_s0sz9n
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,29.48 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
17:07:42.409: _train param, Dropping bad and constant columns: [2067, 2065, 2064, 2063, 2069, 2068]

██
17:07:56.259: _train param, Dropping bad and constant columns: [2067, 2065, 2064, 2063, 2069, 2068]

████████████████
17:08:11.713: _train param, Dropping bad and constant columns: [2067, 2065, 2064, 2063, 2069, 2068]

███
17:13:19.677: _train param, Dropping unused columns: [2067, 2065, 2064, 2063, 2069, 2068]


17:13:26.678: _train param, Dropping bad and constant columns: [2067, 2065, 2064, 2063, 2069, 2068]


17:13:35.719: _train param, Dropping bad and constant columns: [2067, 2065, 2064, 2063, 2069, 2068]

████████
17:23:33.994: _train param, Dropping bad and constant columns: [2067, 2065, 2064, 2063, 2069, 2068]

██
17:26:28.809: _train param, Dropping bad and constant columns: [2067, 2065, 2064, 2063, 2069, 2068]

███
17:30:05.654: _train param, Dropping bad and c

key,value
Stacking strategy,blending
Number of base models (used / total),28/79
# GBM base models (used / total),1/8
# XGBoost base models (used / total),26/64
# DRF base models (used / total),0/2
# GLM base models (used / total),0/1
# DeepLearning base models (used / total),1/4
Metalearner algorithm,GLM
Metalearner fold assignment scheme,AUTO
Metalearner nfolds,0


## Submission file generation
- 학습된 모델을 이용하여 submission 파일을 생성함

In [30]:
import os

# Encode the test set with the same feature extractor used for training.
test = pd.read_csv("./main_data/test.csv")
test_encodings = [
    feature_extractor(smiles)
    for smiles in tqdm.tqdm(test["Canonical_Smiles"], total=len(test))
]

test_X = np.array(test_encodings)
test = pd.DataFrame(test_X)

# Predict with the AutoML leader model.
test_h2o = h2o.H2OFrame(test)
preds = aml.leader.predict(test_h2o)

# Fill the sample submission with the predictions. Assigning a plain array
# (rather than a one-column DataFrame) makes a row-count mismatch raise
# instead of being silently aligned on the index.
sub = pd.read_csv("./main_data/sample_submission.csv")
sub["Inhibition"] = preds["predict"].as_data_frame()["predict"].to_numpy()

# Create the output directory if needed so the write cannot fail after the
# long training run.
os.makedirs("./submission", exist_ok=True)
sub.to_csv("./submission/submission_h2o.csv", index=False)

100%|██████████| 100/100 [00:00<00:00, 304.88it/s]


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%



