In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cancer-prediction/labeled_cancer_drugs.csv
/kaggle/input/cancer-prediction/__results__.html
/kaggle/input/cancer-prediction/__notebook__.ipynb
/kaggle/input/cancer-prediction/__output__.json
/kaggle/input/cancer-prediction/custom.css


In [2]:
df = pd.read_csv('/kaggle/input/cancer-prediction/labeled_cancer_drugs.csv')

In [3]:
df

Unnamed: 0,SMILES,anti_cancer
0,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl,0
1,C(CCl)Cl,0
2,C(CCl)O,0
3,CC(C)(CO)C(=O)C(=O)O,0
4,C1C(C(C(OC1O)CO)O)O,0
...,...,...
17922,CN1CCC[C@H]1COC2=NC3=C(CCN(C3)C4=CC=CC5=C4C(=C...,1
17923,CC#CC(=O)N1CCC[C@H]1C2=NC(=C3N2C=CN=C3N)C4=CC=...,1
17924,[OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O...,1
17925,CCN(C(C)C)C(=O)C1=C(C=CC(=C1)F)OC2=CN=CN=C2N3C...,1


In [4]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp311-cp311-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [5]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [6]:
import math, random
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GINConv, global_mean_pool

from rdkit import Chem
from rdkit.Chem import rdchem
from rdkit.Chem.rdchem import HybridizationType
from rdkit import Chem

In [7]:
def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
seed_everything(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [8]:
ATOM_LIST = list(range(1, 119))  
HYB_SET = [
    HybridizationType.SP, HybridizationType.SP2, HybridizationType.SP3,
    HybridizationType.SP3D, HybridizationType.SP3D2
]

In [9]:
def one_hot(value, choices, allow_unknown=True):
    vec = [0]*len(choices)
    try:
        idx = choices.index(value)
        vec[idx] = 1
    except ValueError:
        if not allow_unknown:
            raise
    return vec

def atom_features(atom: rdchem.Atom):
    z = atom.GetAtomicNum()
    feats = []
    feats += one_hot(z, ATOM_LIST)                      # 原子番号 one-hot
    feats += one_hot(atom.GetHybridization(), HYB_SET)  # 混成軌道
    feats += [
        atom.GetTotalDegree(),                          # 次数
        atom.GetFormalCharge(),                         # 形式電荷
        int(atom.GetIsAromatic()),                      # 芳香族
        atom.GetTotalNumHs()                            # H数
    ]
    return torch.tensor(feats, dtype=torch.float)

def bond_features(bond: rdchem.Bond):
    bt = bond.GetBondType()
    feats = [
        int(bt == rdchem.BondType.SINGLE),
        int(bt == rdchem.BondType.DOUBLE),
        int(bt == rdchem.BondType.TRIPLE),
        int(bt == rdchem.BondType.AROMATIC),
        int(bond.GetIsConjugated()),
        int(bond.IsInRing()),
    ]
    return torch.tensor(feats, dtype=torch.float)



In [10]:
def safe_mol_from_smiles(smi: str):
    if not isinstance(smi, str) or not smi:
        return None
    try:
        m = Chem.MolFromSmiles(smi, sanitize=False)
        if m is None:
            return None
        flags = Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_ADJUSTHS
        Chem.SanitizeMol(m, sanitizeOps=flags)
        return m
    except Exception:
        return None

In [11]:
def smiles_to_data(smiles: str, y: int | float | None = None):
    mol = safe_mol_from_smiles(smiles)
    if mol is None:
        return None

    #ノード特徴量
    xs = [atom_features(a) for a in mol.GetAtoms()]
    x = torch.stack(xs, dim=0) if xs else torch.zeros((0, 8), dtype=torch.float)

    #エッジ
    edge_index_list = []
    edge_attr_list = []
    for b in mol.GetBonds():
        i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
        bf = bond_features(b)
        edge_index_list += [[i, j], [j, i]]
        edge_attr_list += [bf, bf]
    if edge_index_list:
        edge_index = torch.tensor(edge_index_list, dtype=torch.long).t().contiguous()
        edge_attr  = torch.stack(edge_attr_list, dim=0)
    else:
        edge_index = torch.zeros((2,0), dtype=torch.long)
        edge_attr  = torch.zeros((0,6), dtype=torch.float)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    if y is not None:
        data.y = torch.tensor([float(y)], dtype=torch.float)
    data.smiles = smiles
    return data


In [12]:
class SmilesDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.cache = {}

    def len(self):
        return len(self.df)

    def get(self, idx):
        if idx in self.cache:
            return self.cache[idx]
        row = self.df.iloc[idx]
        d = smiles_to_data(row['SMILES'], row.get('anti_cancer', None))
        if d is None:
            d = Data(x=torch.zeros((1, len(ATOM_LIST)+len(HYB_SET)+3), dtype=torch.float),
                     edge_index=torch.zeros((2,0), dtype=torch.long),
                     edge_attr=torch.zeros((0,6), dtype=torch.float),
                     y=torch.tensor([0.0], dtype=torch.float))
            d.invalid = True
        self.cache[idx] = d
        return d

In [13]:
class GINNet(nn.Module):
    def __init__(self, in_dim, hidden=128, num_layers=3, dropout=0.2):
        super().__init__()
        self.dropout = dropout

        mlps = []
        last = in_dim
        for _ in range(num_layers):
            mlp = nn.Sequential(
                nn.Linear(last, hidden),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
            )
            mlps.append(GINConv(mlp))
            last = hidden
        self.gin_layers = nn.ModuleList(mlps)
        self.lin_out = nn.Linear(hidden, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        for conv in self.gin_layers:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = global_mean_pool(x, data.batch)
        logit = self.lin_out(x).squeeze(-1)
        return logit


In [14]:
assert 'SMILES' in df.columns and 'anti_cancer' in df.columns
df = df.dropna(subset=['SMILES', 'anti_cancer']).reset_index(drop=True)
df['anti_cancer'] = df['anti_cancer'].astype(int)

y_np = df['anti_cancer'].values
pos_ratio = y_np.mean()
print(f"Samples: {len(df)}  PosRatio: {pos_ratio:.3f}")

Samples: 17927  PosRatio: 0.015


In [15]:
tr_idx, va_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    stratify=df['anti_cancer'],
    random_state=42
)
df_tr, df_va = df.iloc[tr_idx].copy(), df.iloc[va_idx].copy()

train_ds = SmilesDataset(df_tr)
valid_ds = SmilesDataset(df_va)

tmp = train_ds.get(0)
in_dim = tmp.x.shape[1]

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=128, shuffle=False)

model = GINNet(in_dim=in_dim, hidden=128, num_layers=3, dropout=0.2).to(device)

pos_weight = None
pos_frac = df_tr['anti_cancer'].mean()
if 0 < pos_frac < 1:
    pw = (1 - pos_frac) / max(pos_frac, 1e-6)
    pos_weight = torch.tensor([pw], dtype=torch.float, device=device)
    print(f"Using pos_weight={pw:.2f}")

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight) if pos_weight is not None \
            else nn.BCEWithLogitsLoss()

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3, verbose=True)


Using pos_weight=65.70




In [16]:
def evaluate(loader):
    model.eval()
    logits_all, ys_all = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            out = model(batch)
            logits_all.append(out.detach().cpu())
            ys_all.append(batch.y.detach().cpu().view(-1))
    logits = torch.cat(logits_all).numpy()
    ys = torch.cat(ys_all).numpy()
    probs = 1 / (1 + np.exp(-logits))
    try:
        auc = roc_auc_score(ys, probs)
    except ValueError:
        auc = float('nan')
    preds = (probs >= 0.5).astype(int)
    return {'auc': auc}



In [17]:
best_auc = -1
best_state = None
patience, wait = 10, 0
EPOCHS = 50

In [18]:
for epoch in range(1, EPOCHS+1):
    model.train()
    losses = []
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        logits = model(batch)
        loss = criterion(logits, batch.y.view(-1))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        losses.append(loss.item())

    val_metrics = evaluate(valid_loader)
    scheduler.step(val_metrics['auc'])
    print(f"Epoch {epoch:02d} | loss {np.mean(losses):.4f} | "
          f"val AUC {val_metrics['auc']:.4f} ")

    if val_metrics['auc'] > best_auc:
        best_auc = val_metrics['auc']
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping.")
            break

if best_state is not None:
    model.load_state_dict(best_state)
print("Best val AUC:", best_auc)

Epoch 01 | loss 1.3676 | val AUC 0.5197 


  probs = 1 / (1 + np.exp(-logits))


Epoch 02 | loss 1.9086 | val AUC 0.5611 
Epoch 03 | loss 1.5696 | val AUC 0.5465 
Epoch 04 | loss 1.5592 | val AUC 0.5492 


  probs = 1 / (1 + np.exp(-logits))


Epoch 05 | loss 1.4917 | val AUC 0.5572 


  probs = 1 / (1 + np.exp(-logits))


Epoch 06 | loss 3.4030 | val AUC 0.5054 


  probs = 1 / (1 + np.exp(-logits))


Epoch 07 | loss 1.5672 | val AUC 0.5069 


  probs = 1 / (1 + np.exp(-logits))


Epoch 08 | loss 1.5403 | val AUC 0.5115 


  probs = 1 / (1 + np.exp(-logits))


Epoch 09 | loss 1.9490 | val AUC 0.5441 


  probs = 1 / (1 + np.exp(-logits))


Epoch 10 | loss 1.9711 | val AUC 0.5404 


  probs = 1 / (1 + np.exp(-logits))


Epoch 11 | loss 1.5631 | val AUC 0.5279 
Epoch 12 | loss 1.6000 | val AUC 0.5540 
Early stopping.
Best val AUC: 0.5611105868042449


  probs = 1 / (1 + np.exp(-logits))
