# ChemBERT_MTR

## 라이브러리(restart)

In [None]:
# Restart the runtime (this cell pins the NumPy version via pip)
!pip install -U "numpy==1.26.4"

## 라이브러리2

In [None]:
# Mount Google Drive (Colab only).
# The mount call is wrapped in a string literal below, so it is currently disabled.
'''from google.colab import drive
drive.mount('/content/drive')'''

In [None]:
# (A) Remove packages that this work does not need and that cause version conflicts
#     (`|| true` keeps the cell from failing when a package is not installed)
!pip uninstall -y bigframes peft diffusers gradio gcsfs cuml-cu12 umap-learn || true

# (B) Install only the core stack ChemBERTa needs, pinned to a mutually compatible combination
!pip install --no-cache-dir \
  "transformers==4.44.2" \
  "tokenizers==0.19.1" \
  "accelerate==0.34.2" \
  "huggingface_hub==0.24.6" \
  "datasets==2.20.0" \
  "scikit-learn==1.6.1" \
  "torchmetrics==1.4.0" \
  "sentencepiece==0.1.99"

!pip install -U rdkit

In [None]:
# 라이브러리
import os, json, random, math, time
import numpy as np
import pandas as pd
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import torch
from torch import nn
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, set_seed
)
from rdkit import Chem
from transformers import EarlyStoppingCallback

In [None]:
# ===== Basic settings =====
# One seed is applied to transformers, Python's `random`, and NumPy.
SEED = 42
set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
N_FOLDS = 5
OUTPUT_ROOT = "./chemberta_5fold"

# ===== Scheduler & early-stopping settings =====
# NOTE(review): LR_SCHEDULER is passed to TrainingArguments in the training
# loop; ES_PATIENCE / ES_DELTA are not referenced by any cell visible in this
# notebook — confirm whether early stopping is meant to be enabled.
LR_SCHEDULER = "cosine_with_restarts"
ES_PATIENCE = 5
ES_DELTA = 1e-4

## 데이터

In [None]:
# ===== Utilities =====
def IC50_to_pIC50(ic50_nM):
    """Convert an IC50 given in nM to pIC50 (``9 - log10(IC50_nM)``).

    Values below ``1e-10`` (including zero and negatives) are raised to
    ``1e-10`` first so the logarithm is always defined.
    """
    floor_nM = 1e-10
    safe_ic50 = np.clip(ic50_nM, floor_nM, None)
    log_ic50 = np.log10(safe_ic50)
    return 9 - log_ic50

def pIC50_to_IC50(pIC50):
    """Convert pIC50 back to IC50 in nM (``10 ** (9 - pIC50)``)."""
    exponent = 9 - pIC50
    return np.power(10.0, exponent)

def canonicalize_smiles(smi: str):
    """Return the canonical isomeric SMILES for ``smi``, or ``None``.

    ``None`` is returned when ``smi`` is missing (``pd.isna``) or when
    ``Chem.MolFromSmiles`` does not produce a usable molecule.
    """
    if pd.isna(smi):
        return None

    parsed = Chem.MolFromSmiles(str(smi))
    if not parsed:
        return None
    return Chem.MolToSmiles(parsed, canonical=True, isomericSmiles=True)

In [None]:
# ===== Data preprocessing =====
# Load the three raw IC50 sources (ChEMBL, PubChem, CAS workbook) and keep
# only rows that carry an exact ('=') IC50 measurement.
chembl_ask = pd.read_csv('./ChEMBL_ASK1(IC50).csv', sep=";")
pubchem_ask = pd.read_csv('./Pubchem_ASK1.csv')

# ChEMBL: drop rows without a value; the relation column stores the quotes
# as part of the string, hence the "'='" comparison.
chembl_ask = chembl_ask.dropna(subset=['Standard Value'])
chembl_ask = chembl_ask[chembl_ask['Standard Relation'] == "'='"].reset_index(drop=True)

# PubChem: keep IC50 rows; a missing qualifier is treated as '='.
# The explicit .copy() makes the filtered frame independent, so the column
# assignment below does not raise SettingWithCopyWarning.
pubchem_ask = pubchem_ask.dropna(subset=['Activity_Value'])
pubchem_ask = pubchem_ask[pubchem_ask['Activity_Type'] == 'IC50'].copy()
pubchem_ask['Activity_Qualifier'] = pubchem_ask['Activity_Qualifier'].fillna('=')
pubchem_ask = pubchem_ask[pubchem_ask['Activity_Qualifier'] == '='].reset_index(drop=True)

# CAS workbook: read both sheets in a single pass (a list of sheet names
# returns a dict of DataFrames), then join them on the substance name.
cas_sheets = pd.read_excel(
    './CAS_KPBMA_MAP3K5_IC50s.xlsx',
    sheet_name=['Ligand Number Names SMILES', 'MAP3K5 Ligand IC50s'],
    header=1,
)
df_names = cas_sheets['Ligand Number Names SMILES']
df_ic50 = cas_sheets['MAP3K5 Ligand IC50s']
cas_df = pd.merge(df_names, df_ic50, on="Substance Name", how="inner")

# A missing measurement prefix is treated as '='; keep exact IC50 rows only.
cas_df['Measurement Prefix (Parsed)'] = cas_df['Measurement Prefix (Parsed)'].fillna('=')
cas_df = cas_df[cas_df['Measurement Prefix (Parsed)'] == '=']
cas_df = cas_df[cas_df['Assay Parameter'] == 'IC50'].reset_index(drop=True)

In [None]:
# ===== Data processing =====
# Bring every source to the same two columns: 'Smiles' and 'ic50_nM'.

# ChEMBL: values are used as-is.
chem_df = (
    chembl_ask[['Smiles', 'Standard Value']]
    .rename(columns={'Standard Value': 'ic50_nM'})
    .dropna()
    .assign(ic50_nM=lambda d: pd.to_numeric(d['ic50_nM'], errors='coerce'))
)

# PubChem.
# Assumption (carried over from the original notebook): values are in µM,
# so they are multiplied by 1000 to obtain nM.
pub_df = (
    pubchem_ask[['SMILES', 'Activity_Value']]
    .rename(columns={'SMILES': 'Smiles', 'Activity_Value': 'ic50_nM'})
    .dropna()
    .assign(ic50_nM=lambda d: pd.to_numeric(d['ic50_nM'], errors='coerce') * 1000)
)

# CAS.
# Assumption (carried over from the original notebook): µM -> nM.
cas_df = (
    cas_df[['SMILES_x', 'Single Value (Parsed)']]
    .rename(columns={'SMILES_x': 'Smiles', 'Single Value (Parsed)': 'ic50_nM'})
    .dropna()
    .assign(ic50_nM=lambda d: pd.to_numeric(d['ic50_nM'], errors='coerce') * 1000)
    .reset_index(drop=True)
)

# Merge: stack the sources, drop incomplete rows (including values that
# failed numeric coercion), canonicalize SMILES, and drop unparsable ones.
train_df = (
    pd.concat([chem_df, pub_df, cas_df], axis=0, ignore_index=True)
    .dropna(subset=['Smiles', 'ic50_nM'])
    .assign(Smiles=lambda d: d['Smiles'].map(canonicalize_smiles))
    .dropna(subset=['Smiles'])
    .reset_index(drop=True)
)

print("Train size:", len(train_df))

In [None]:
# Convert to pIC50 -> keep the 6..10 window (inclusive) -> collapse duplicate
# SMILES to their median pIC50.
train_df['pIC50'] = train_df['ic50_nM'].apply(IC50_to_pIC50)

in_window = train_df['pIC50'].between(6, 10)
train_df = train_df.loc[in_window]

per_smiles = train_df.groupby('Smiles', as_index=False)['pIC50']
train_df = per_smiles.median().reset_index(drop=True)

print("Train size:", len(train_df))

## 모델링

In [None]:
# ======= From here on: '1) Scaffold-based StratifiedGroupKFold' =======

# 1) Scaffold generation helper
def smiles_to_scaffold(smi: str):
    """Return the Murcko scaffold SMILES (chirality kept) for ``smi``.

    The literal string ``"NONE"`` is returned when ``Chem.MolFromSmiles``
    does not produce a usable molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        return "NONE"
    return MurckoScaffoldSmiles(mol=mol, includeChirality=True)

# 2) pIC50 bins for stratification + scaffolds for grouping
#   - q=10 is requested; with duplicates="drop" the number of bins shrinks
#     automatically when there are too few distinct values.
train_df["bin"] = pd.qcut(train_df["pIC50"], q=10, labels=False, duplicates="drop")
train_df["scaffold"] = train_df["Smiles"].map(smiles_to_scaffold)

# 3) Build the splits with StratifiedGroupKFold:
#    stratified on the pIC50 bin, grouped by scaffold, shuffled with SEED.
#    `splits` is materialised as a list so it can be iterated more than once.
cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
splits = list(cv.split(train_df, y=train_df["bin"], groups=train_df["scaffold"]))

In [None]:
# ===== Model / tokenizer =====
model_name = "DeepChem/ChemBERTa-77M-MTR"
max_len = 256  # token length used for truncation/padding in tok_with_label

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Single-output regression head (num_labels=1, problem_type="regression").
config = AutoConfig.from_pretrained(model_name, num_labels=1, problem_type="regression")
# NOTE(review): the training loop in this notebook loads a fresh model for
# every fold, so this instance looks unused by the visible cells — confirm
# before removing.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

def tok_with_label(ex):
    """Tokenize one example and attach its regression target.

    ``ex["Smiles"]`` is tokenized with truncation and padding to ``max_len``;
    ``ex["pIC50"]`` is stored as a float under the ``"labels"`` key.
    """
    encoded = tokenizer(
        ex["Smiles"],
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )
    encoded["labels"] = float(ex["pIC50"])
    return encoded

In [None]:
# ===== Metrics (used during training) =====
def pIC50_to_IC50(pIC50):
    """Convert pIC50 back to IC50 in nM (``10 ** (9 - pIC50)``)."""
    return np.power(10.0, 9 - pIC50)


def compute_metrics(eval_pred):
    """Compute the evaluation metrics from ``(predictions, labels)`` in pIC50.

    Both inputs are flattened to 1-D float64 arrays before any arithmetic, so
    column-vector predictions or labels cannot broadcast into an (N, N) matrix.

    Returns a dict with:
        score     -- ``0.4 * (1 - min(A_nrmse, 1)) + 0.6 * B_r2``
        A_nrmse   -- RMSE in IC50 space divided by the range of the true IC50
                     values (divided by 1.0 when that range is <= 1e-12)
        B_r2      -- squared Pearson correlation between labels and
                     predictions in pIC50 space (0.0 when undefined)
        rmse_ic50 -- RMSE in IC50 space
        rmse, mae -- errors in pIC50 space
        r2        -- coefficient of determination in pIC50 space
                     (0.0 when the labels have zero variance)
    """
    preds, labels = eval_pred
    preds = np.asarray(preds, dtype=np.float64).reshape(-1)
    labels = np.asarray(labels, dtype=np.float64).reshape(-1)

    # A: range-normalised RMSE in IC50 space.
    y_true = pIC50_to_IC50(labels)
    y_pred = pIC50_to_IC50(preds)
    rmse_ic50 = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
    rng = float(np.max(y_true) - np.min(y_true))
    A = float(rmse_ic50 / (rng if rng > 1e-12 else 1.0))

    # B: squared Pearson correlation in pIC50 space.
    x = labels - labels.mean()
    y = preds - preds.mean()
    denom = float(np.sqrt((x ** 2).sum()) * np.sqrt((y ** 2).sum()))
    r = float((x * y).sum() / denom) if denom > 0 else 0.0
    B = float(r * r)

    # Plain regression errors in pIC50 space.
    rmse = float(np.sqrt(np.mean((preds - labels) ** 2)))
    mae = float(np.mean(np.abs(preds - labels)))
    ss_res = float(np.sum((labels - preds) ** 2))
    ss_tot = float(np.sum((labels - labels.mean()) ** 2))
    r2 = float(1.0 - ss_res / ss_tot) if ss_tot > 0 else 0.0

    # A is capped at 1 so the first term stays within [0, 0.4].
    score = 0.4 * (1.0 - min(A, 1.0)) + 0.6 * B
    return {
        "score": score,
        "A_nrmse": A,
        "B_r2": B,
        "rmse_ic50": rmse_ic50,
        "rmse": rmse,
        "mae": mae,
        "r2": r2,
    }

In [None]:
# ===== Training loop =====
# For every CV fold: tokenize the split, fine-tune a freshly loaded model,
# record the best epoch / checkpoint / score, and store the out-of-fold
# (OOF) predictions for the validation rows.
oof_pred_pic50 = np.zeros(len(train_df), dtype=np.float64)
fold_metrics = []

best_epochs, best_ckpts, best_scores = [], [], []

# fp16 mixed precision needs a CUDA device; TrainingArguments rejects
# fp16=True otherwise, so enable it only when CUDA is actually available.
use_fp16 = torch.cuda.is_available()

# NOTE(review): EarlyStoppingCallback is imported and ES_PATIENCE / ES_DELTA
# are defined in this notebook, but no callback is registered on the Trainer,
# so each fold runs the full num_train_epochs. Confirm this is intended
# before wiring it in.

for fold, (trn_idx, val_idx) in enumerate(splits, 1):
    print(f"\n===== FOLD {fold}/{N_FOLDS} =====")
    df_trn = train_df.iloc[trn_idx].reset_index(drop=True)
    df_val = train_df.iloc[val_idx].reset_index(drop=True)

    ds_trn = Dataset.from_pandas(df_trn[['Smiles', 'pIC50']].copy()).map(tok_with_label)
    ds_val = Dataset.from_pandas(df_val[['Smiles', 'pIC50']].copy()).map(tok_with_label)
    for ds in (ds_trn, ds_val):
        ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Start every fold from the pretrained weights, not from the previous fold.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    out_dir = os.path.join(OUTPUT_ROOT, f"fold{fold}")
    args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=50,
        save_total_limit=2,
        weight_decay=0.01,
        warmup_ratio=0.1,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Reload the checkpoint with the highest eval "score" after training.
        load_best_model_at_end=True,
        metric_for_best_model="score",
        greater_is_better=True,
        fp16=use_fp16,
        logging_steps=50,
        lr_scheduler_type=LR_SCHEDULER,
        report_to="none",
        overwrite_output_dir=True,
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_trn,
        eval_dataset=ds_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Extract the best epoch / checkpoint / score for this fold from the
    # per-epoch evaluation logs (first occurrence wins on ties).
    eval_logs = [log for log in trainer.state.log_history if "eval_score" in log]
    best_metric = -float("inf")
    best_epoch = None
    for log in eval_logs:
        if log["eval_score"] > best_metric:
            best_metric = float(log["eval_score"])
            best_epoch = log.get("epoch", None)

    # Fallback: no log beat the initial value (e.g. all scores were NaN),
    # so report the epoch of the last evaluation instead.
    if best_epoch is None and eval_logs:
        best_epoch = eval_logs[-1].get("epoch", None)

    best_ckpt_path = trainer.state.best_model_checkpoint
    best_epochs.append(best_epoch)
    best_ckpts.append(best_ckpt_path)
    best_scores.append(best_metric)

    print(f"[FOLD {fold}] ✅ Best eval_score={best_metric:.6f} @ epoch={best_epoch} | ckpt={best_ckpt_path}")

    # Fill the OOF predictions for this fold's validation rows.
    val_pred = trainer.predict(ds_val).predictions.reshape(-1)
    oof_pred_pic50[val_idx] = val_pred

    # Record the fold metrics.
    m = trainer.evaluate(ds_val)
    fold_metrics.append(m)
    print({k: round(float(v), 5) for k, v in m.items()})


===== FOLD 1/5 =====


Map:   0%|          | 0/1131 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Score,A Nrmse,B R2,Rmse Ic50,Rmse,Mae,R2
1,No log,57.882729,0.047563,959993.689173,0.079271,949913401.225838,7.60807,7.570441,-99.166729
2,59.425700,57.074448,0.094406,851029.191454,0.157343,842093070.937124,7.554763,7.517348,-97.76798
3,58.077100,55.261631,0.10797,651900.18778,0.17995,645054995.275129,7.433816,7.396833,-94.630878
4,58.077100,50.746826,0.100983,344456.372934,0.168304,340839453.923286,7.123681,7.087166,-86.817974
5,55.132700,36.432831,0.063105,90872.533958,0.105174,89918338.821657,6.035961,5.991433,-62.047421
6,40.624800,9.519953,0.031191,25693.008589,0.051986,25423222.519105,3.085442,2.923986,-15.474387
7,11.698100,3.088046,0.018076,4294.993235,0.030126,4249894.221061,1.757284,1.44671,-4.3439
8,11.698100,1.075627,0.008476,70.960939,0.014127,70215.822991,1.037124,0.811857,-0.861385
9,2.460600,0.660934,0.006511,1.528627,0.010852,1512.575716,0.812979,0.673861,-0.143755
10,0.722100,0.707101,0.039024,0.923149,0.013806,913.455761,0.840893,0.680291,-0.223646


[FOLD 1] ✅ Best eval_score=0.473709 @ epoch=30.0 | ckpt=/content/drive/MyDrive/jump_ai/chemberta_5fold/fold1/checkpoint-1080


{'eval_loss': 0.3496, 'eval_score': 0.47371, 'eval_A_nrmse': 0.41453, 'eval_B_r2': 0.3992, 'eval_rmse_ic50': 410.17331, 'eval_rmse': 0.59127, 'eval_mae': 0.47291, 'eval_r2': 0.39501, 'eval_runtime': 0.1514, 'eval_samples_per_second': 1248.077, 'eval_steps_per_second': 19.811, 'epoch': 50.0}

===== FOLD 2/5 =====


Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Score,A Nrmse,B R2,Rmse Ic50,Rmse,Mae,R2
1,No log,59.185001,0.05985,943931.196956,0.099749,943610260.3755,7.693178,7.657786,-106.287087
2,59.145200,58.391586,0.078099,841305.792842,0.130165,841019748.895363,7.641438,7.606284,-104.848822
3,57.611800,56.726795,0.080257,665107.844541,0.133762,664881707.891811,7.531719,7.496981,-101.831002
4,57.611800,52.895119,0.076202,403997.069204,0.127003,403859710.211115,7.272903,7.238598,-94.885169
5,53.994800,42.835468,0.070079,160781.837541,0.116799,160727171.720901,6.544881,6.506423,-76.649622
6,39.523500,19.715015,0.048506,40302.161711,0.080843,40288458.977106,4.440159,4.310612,-34.738225
7,39.523500,5.279256,0.02746,4095.200999,0.045766,4093808.630826,2.297663,1.85539,-8.569925
8,9.858500,1.686528,0.022201,281.166121,0.037002,281070.52415,1.298664,0.938578,-2.057239
9,1.406900,0.758269,0.019449,13.190995,0.032415,13186.510533,0.870786,0.691735,-0.374546
10,1.406900,0.721766,0.026797,7.517996,0.044662,7515.439473,0.849568,0.663983,-0.308375


[FOLD 2] ✅ Best eval_score=0.533009 @ epoch=48.0 | ckpt=/content/drive/MyDrive/jump_ai/chemberta_5fold/fold2/checkpoint-1632


{'eval_loss': 0.39056, 'eval_score': 0.53301, 'eval_A_nrmse': 0.17397, 'eval_B_r2': 0.33766, 'eval_rmse_ic50': 173.90603, 'eval_rmse': 0.62495, 'eval_mae': 0.44269, 'eval_r2': 0.29201, 'eval_runtime': 0.1937, 'eval_samples_per_second': 1239.02, 'eval_steps_per_second': 20.65, 'epoch': 50.0}

===== FOLD 3/5 =====


Map:   0%|          | 0/1013 [00:00<?, ? examples/s]

Map:   0%|          | 0/307 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Score,A Nrmse,B R2,Rmse Ic50,Rmse,Mae,R2
1,No log,59.579525,0.035842,950171.370312,0.059737,949791301.477533,7.718777,7.676215,-89.172563
2,58.807000,58.815655,0.082618,848875.629094,0.137697,848536078.586618,7.669136,7.626727,-88.016458
3,58.807000,57.243027,0.095631,674599.351619,0.159386,674329511.674937,7.565912,7.523778,-85.63631
4,57.562500,53.775398,0.091041,412922.718533,0.151735,412757549.321432,7.333171,7.291413,-80.388119
5,53.497600,45.414494,0.083935,152289.50862,0.139892,152228592.770965,6.739028,6.696454,-67.734043
6,53.497600,24.520708,0.077141,32676.603413,0.128568,32663532.761652,4.951839,4.871799,-36.111661
7,36.908900,5.908438,0.078366,3032.334885,0.130611,3031121.949725,2.430728,2.192608,-7.942316
8,8.612500,1.442078,0.059913,104.408231,0.099856,104366.467497,1.200866,0.999131,-1.18256
9,8.612500,0.628904,0.039597,2.339592,0.065995,2338.656351,0.793035,0.699698,0.048165
10,1.214700,0.658093,0.324059,0.290242,0.066927,290.125497,0.811229,0.657809,0.003988


[FOLD 3] ✅ Best eval_score=0.511587 @ epoch=40.0 | ckpt=/content/drive/MyDrive/jump_ai/chemberta_5fold/fold3/checkpoint-1280


{'eval_loss': 0.44218, 'eval_score': 0.51159, 'eval_A_nrmse': 0.24423, 'eval_B_r2': 0.3488, 'eval_rmse_ic50': 244.13369, 'eval_rmse': 0.66496, 'eval_mae': 0.5146, 'eval_r2': 0.33077, 'eval_runtime': 0.2603, 'eval_samples_per_second': 1179.224, 'eval_steps_per_second': 19.206, 'epoch': 50.0}

===== FOLD 4/5 =====


Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

Map:   0%|          | 0/295 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Score,A Nrmse,B R2,Rmse Ic50,Rmse,Mae,R2
1,No log,58.007561,0.107298,948665.291304,0.178829,948380691.652588,7.61627,7.577511,-95.906026
2,59.596400,57.293121,0.170341,854072.050243,0.283902,853815828.57021,7.569222,7.530916,-94.712503
3,59.596400,55.820755,0.18087,691089.549416,0.30145,690882222.503798,7.471329,7.43392,-92.252807
4,57.615000,52.576511,0.178814,446264.060242,0.298023,446130180.993122,7.250966,7.215356,-86.833043
5,54.789400,44.056328,0.179775,189672.571623,0.299625,189615669.838411,6.637494,6.604729,-72.599434
6,54.789400,23.482937,0.159988,61358.755888,0.266647,61340348.257441,4.84592,4.776766,-38.230024
7,38.940700,6.204239,0.140543,11289.50386,0.234239,11286117.007788,2.490831,2.193433,-9.364651
8,10.273600,1.766365,0.133295,1516.843335,0.222159,1516388.281778,1.329047,1.020254,-1.950847
9,10.273600,0.565983,0.104527,32.913982,0.174212,32904.107846,0.752318,0.589734,0.054482
10,1.515900,0.545229,0.110674,2.050587,0.184457,2049.971929,0.738396,0.571906,0.089153


[FOLD 4] ✅ Best eval_score=0.573571 @ epoch=49.0 | ckpt=/content/drive/MyDrive/jump_ai/chemberta_5fold/fold4/checkpoint-1617


{'eval_loss': 0.33777, 'eval_score': 0.57357, 'eval_A_nrmse': 0.27041, 'eval_B_r2': 0.46956, 'eval_rmse_ic50': 270.32913, 'eval_rmse': 0.58118, 'eval_mae': 0.41358, 'eval_r2': 0.43574, 'eval_runtime': 0.2098, 'eval_samples_per_second': 1406.32, 'eval_steps_per_second': 23.836, 'epoch': 50.0}

===== FOLD 5/5 =====


Map:   0%|          | 0/1031 [00:00<?, ? examples/s]

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Score,A Nrmse,B R2,Rmse Ic50,Rmse,Mae,R2
1,No log,59.654087,0.062344,949644.029672,0.103907,949169207.835716,7.723606,7.679894,-86.565475
2,58.520600,58.920605,0.146298,853401.794193,0.24383,852975093.456031,7.675976,7.632701,-85.488787
3,58.520600,57.391861,0.182381,685701.493412,0.303968,685358642.794649,7.575742,7.533383,-83.244782
4,57.850700,53.96508,0.189007,429956.725252,0.315011,429741746.97006,7.346093,7.305599,-78.214646
5,53.725900,45.246403,0.188891,170186.604146,0.314818,170101510.875459,6.726545,6.689404,-65.416614
6,53.725900,22.864944,0.189875,46772.558769,0.316458,46749172.498505,4.78173,4.716554,-32.563154
7,38.120900,5.201656,0.197245,3472.776377,0.328742,3471039.98994,2.280714,2.048954,-6.635443
8,9.748800,1.226453,0.171196,37.240948,0.285327,37222.327556,1.107453,0.941816,-0.800294
9,9.748800,0.492723,0.458662,0.273008,0.279776,272.871426,0.701942,0.590646,0.276738
10,1.330300,0.484963,0.492251,0.211127,0.294503,211.021459,0.696393,0.552414,0.28813


[FOLD 5] ✅ Best eval_score=0.533364 @ epoch=12.0 | ckpt=/content/drive/MyDrive/jump_ai/chemberta_5fold/fold5/checkpoint-396


{'eval_loss': 0.43909, 'eval_score': 0.53336, 'eval_A_nrmse': 0.21619, 'eval_B_r2': 0.3664, 'eval_rmse_ic50': 216.07896, 'eval_rmse': 0.66264, 'eval_mae': 0.54429, 'eval_r2': 0.35546, 'eval_runtime': 0.2033, 'eval_samples_per_second': 1421.6, 'eval_steps_per_second': 24.595, 'epoch': 50.0}


In [None]:
# ===== Save results (OOF predictions, fold metrics) =====
y_true_pic50 = train_df["pIC50"].values.astype(np.float64)

# Out-of-fold predictions next to the ground truth, one row per molecule.
oof_path = os.path.join(OUTPUT_ROOT, "oof_5fold.csv")
oof_columns = {
    "Smiles": train_df["Smiles"],
    "pIC50_true": y_true_pic50,
    "pIC50_oof": oof_pred_pic50,
}
oof_df = pd.DataFrame(oof_columns)
oof_df.to_csv(oof_path, index=False)
print("Saved OOF:", oof_path)

# One row of evaluation metrics per fold.
metrics_csv = os.path.join(OUTPUT_ROOT, "fold_metrics.csv")
metrics_df = pd.DataFrame(fold_metrics)
metrics_df.to_csv(metrics_csv, index=False)
print("Saved fold metrics:", metrics_csv)

In [None]:
# manifest.json: information the inference step needs (checkpoint / epoch / score per fold)
fold_entries = []
for fold_no, (ep, ck, sc) in enumerate(zip(best_epochs, best_ckpts, best_scores), start=1):
    # Store whole-number epochs as int; keep anything else (None, fractional) as-is.
    if ep is not None and abs(ep - int(ep)) < 1e-9:
        epoch_value = int(ep)
    else:
        epoch_value = ep
    fold_entries.append(
        {"fold": fold_no, "checkpoint": ck, "best_epoch": epoch_value, "best_score": float(sc)}
    )

manifest = {
    "created_at": int(time.time()),
    "model_name": model_name,
    "max_len": max_len,
    "n_folds": N_FOLDS,
    "folds": fold_entries,
    # Default paths for the inference script to pick up (edit as needed)
    "test_csv_path": "./test.csv",
    "submission_name": "submission_5fold.csv",
}

manifest_path = os.path.join(OUTPUT_ROOT, "manifest.json")
with open(manifest_path, "w") as fh:
    json.dump(manifest, fh, indent=2)
print("Saved manifest:", manifest_path)

print("\n=== Per-fold best checkpoints (for inference) ===")
for entry in manifest["folds"]:
    print(f"Fold {entry['fold']}: epoch={entry['best_epoch']}, score={entry['best_score']:.6f}")
    print(f"           ckpt={entry['checkpoint']}")