In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cancer-prediction/labeled_cancer_drugs.csv
/kaggle/input/cancer-prediction/__results__.html
/kaggle/input/cancer-prediction/__notebook__.ipynb
/kaggle/input/cancer-prediction/__output__.json
/kaggle/input/cancer-prediction/custom.css


In [2]:
df = pd.read_csv('/kaggle/input/cancer-prediction/labeled_cancer_drugs.csv')

In [3]:
df

Unnamed: 0,SMILES,anti_cancer
0,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl,0
1,C(CCl)Cl,0
2,C(CCl)O,0
3,CC(C)(CO)C(=O)C(=O)O,0
4,C1C(C(C(OC1O)CO)O)O,0
...,...,...
17922,CN1CCC[C@H]1COC2=NC3=C(CCN(C3)C4=CC=CC5=C4C(=C...,1
17923,CC#CC(=O)N1CCC[C@H]1C2=NC(=C3N2C=CN=C3N)C4=CC=...,1
17924,[OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O...,1
17925,CCN(C(C)C)C(=O)C1=C(C=CC(=C1)F)OC2=CN=CN=C2N3C...,1


In [4]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp311-cp311-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [5]:
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator as rfg
from rdkit.Chem.MolStandardize import rdMolStandardize
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [6]:
morgan_gen = rfg.GetMorganGenerator(
    radius=2,
    fpSize=2048,
    includeChirality=False,
    useBondTypes=True,
    countSimulation=False 
)

def load_and_clean_mol(smiles: str):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    params = rdMolStandardize.CleanupParameters()
    params.removeHs = False
    mol = rdMolStandardize.Cleanup(mol, params)

    patt = Chem.MolFromSmarts('[H;X0]') 
    if patt is not None:
        mol = Chem.DeleteSubstructs(mol, patt, onlyFrags=False)

    if mol.GetNumAtoms() == 0:
        return None

    return mol

def smiles_to_morgan_array_clean(smiles: str) -> np.ndarray | None:
    mol = load_and_clean_mol(smiles)
    if mol is None:
        return None
    fp = morgan_gen.GetFingerprint(mol)
    arr = np.zeros((fp.GetNumBits(),), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr


In [7]:
n_bits = 2048
fps = [smiles_to_morgan_array_clean(s) for s in df['SMILES'].astype(str)]
zero = np.zeros(n_bits, dtype=np.uint8)
X = np.vstack([x if x is not None else zero for x in fps])

[23:23:26] Initializing MetalDisconnector
[23:23:26] Running MetalDisconnector
[23:23:26] Initializing Normalizer
[23:23:26] Running Normalizer
[23:23:26] Initializing MetalDisconnector
[23:23:26] Running MetalDisconnector
[23:23:26] Initializing Normalizer
[23:23:26] Running Normalizer
[23:23:26] Initializing MetalDisconnector
[23:23:26] Running MetalDisconnector
[23:23:26] Initializing Normalizer
[23:23:26] Running Normalizer
[23:23:26] Initializing MetalDisconnector
[23:23:26] Running MetalDisconnector
[23:23:26] Initializing Normalizer
[23:23:26] Running Normalizer
[23:23:26] Initializing MetalDisconnector
[23:23:26] Running MetalDisconnector
[23:23:26] Initializing Normalizer
[23:23:26] Running Normalizer
[23:23:26] Initializing MetalDisconnector
[23:23:26] Running MetalDisconnector
[23:23:26] Initializing Normalizer
[23:23:26] Running Normalizer
[23:23:26] Initializing MetalDisconnector
[23:23:26] Running MetalDisconnector
[23:23:26] Initializing Normalizer
[23:23:26] Running Nor

In [8]:
X.shape

(17927, 2048)

In [9]:
y = df['anti_cancer']

In [10]:
def run_lgb_skf(
    X, y,
    n_splits=5,
    random_state=42,
    lgb_params=None,
    num_boost_round=10000,
    early_stopping_rounds=200,
    verbose_eval=False
):
    y = np.asarray(y).reshape(-1)

    if lgb_params is None:
        lgb_params = dict(
            objective="binary",
            boosting_type="gbdt",
            learning_rate=0.03,
            num_leaves=64,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            reg_alpha=0.0,
            min_child_samples=20,
            n_estimators=num_boost_round,
            random_state=random_state,
            n_jobs=-1,
        )

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof = np.zeros(len(y), dtype=float)
    models = []
    fold_aucs = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="auc",
            callbacks=[
                lgb.early_stopping(early_stopping_rounds, verbose=verbose_eval),
            ],
        )

        p_va = clf.predict_proba(X_va)[:, 1]
        oof[va_idx] = p_va

        auc = roc_auc_score(y_va, p_va)
        fold_aucs.append(auc)
        models.append(clf)
        if verbose_eval:
            print(f"[Fold {fold}] AUC = {auc:.6f}  best_iter={clf.best_iteration_}")

    oof_auc = roc_auc_score(y, oof)
    if verbose_eval:
        print(f"OOF AUC = {oof_auc:.6f}  (mean {np.mean(fold_aucs):.6f} ± {np.std(fold_aucs):.6f})")

    return {
        "oof_preds": oof,
        "oof_auc": oof_auc,
        "fold_aucs": fold_aucs,
        "models": models,
    }

In [11]:
result = run_lgb_skf(X, y, n_splits=5, verbose_eval=True)
print("OOF AUC:", result["oof_auc"])

[LightGBM] [Info] Number of positive: 215, number of negative: 14126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.145353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4096
[LightGBM] [Info] Number of data points in the train set: 14341, number of used features: 2048
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014992 -> initscore=-4.185134
[LightGBM] [Info] Start training from score -4.185134
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[90]	valid_0's auc: 0.82819	valid_0's binary_logloss: 0.0630004
[Fold 1] AUC = 0.828190  best_iter=90
[LightGBM] [Info] Number of positive: 215, number of negative: 14126
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [