# ChemBERT_MTR

## 라이브러리(restart)

In [None]:
# Restart the runtime (after this install)
!pip install -U "numpy==1.26.4"

## 라이브러리2

In [None]:
# Mount Google Drive (Colab only); currently disabled by wrapping it in a string literal
'''from google.colab import drive
drive.mount('/content/drive')'''

In [None]:
# (A) Remove packages that are unnecessary for this work and cause version conflicts
!pip uninstall -y bigframes peft diffusers gradio gcsfs cuml-cu12 umap-learn || true

# (B) Pin-install only the core stack needed for ChemBERTa (a mutually compatible combination)
!pip install --no-cache-dir \
  "transformers==4.44.2" \
  "tokenizers==0.19.1" \
  "accelerate==0.34.2" \
  "huggingface_hub==0.24.6" \
  "datasets==2.20.0" \
  "scikit-learn==1.6.1" \
  "torchmetrics==1.4.0" \
  "sentencepiece==0.1.99"

!pip install -U rdkit

In [None]:
# 라이브러리
import os, json, random, math, time
import numpy as np
import pandas as pd
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import torch
from torch import nn
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, set_seed
)
from rdkit import Chem
from transformers import EarlyStoppingCallback

In [None]:
# ===== Basic settings =====
SEED = 42
N_FOLDS = 5
OUTPUT_ROOT = "./chemberta_5fold"

# Seed the random number generators with the same value for reproducibility.
set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# ===== Scheduler & early-stopping settings =====
LR_SCHEDULER = "cosine_with_restarts"
ES_PATIENCE = 5      # early-stopping patience
ES_DELTA = 1e-4      # early-stopping delta

## 추론

In [None]:
# =========================
# infer_chemberta.py
# Read manifest.json, predict TEST with each fold's best checkpoint → average (soft ensemble) → save the submission
# Also print which checkpoints were used
# =========================

MANIFEST_PATH = os.path.join(OUTPUT_ROOT, "manifest.json")

# ===== 유틸 =====
def canonicalize_smiles(smi: str):
    """Return the canonical isomeric SMILES for *smi*, or None.

    None is returned when *smi* is missing (``pd.isna``) or when RDKit does
    not produce a molecule from it.
    """
    if pd.isna(smi):
        return None

    molecule = Chem.MolFromSmiles(str(smi))
    if not molecule:
        return None
    return Chem.MolToSmiles(molecule, canonical=True, isomericSmiles=True)

def pIC50_to_IC50(pIC50):
    """Convert pIC50 to IC50 as ``10 ** (9 - pIC50)``.

    The result is written to the ``ASK1_IC50_nM`` submission column, i.e. it
    is treated as nanomolar. Accepts a scalar or a NumPy array.
    """
    exponent = 9 - pIC50
    return np.power(10.0, exponent)

# ===== Load manifest =====
# Read the manifest as UTF-8 explicitly instead of relying on the platform's
# locale encoding.
with open(MANIFEST_PATH, "r", encoding="utf-8") as manifest_file:
    manifest = json.load(manifest_file)

model_name = manifest["model_name"]
max_len = int(manifest["max_len"])
fold_entries = manifest["folds"]
if not fold_entries:
    # Fail here with a clear message rather than later when the (empty) list of
    # fold predictions is stacked.
    raise ValueError(f"Manifest lists no folds to ensemble: {MANIFEST_PATH}")

# Optional keys: fall back to defaults when the manifest does not provide them.
test_csv_path = manifest.get("test_csv_path", "./test.csv")
submission_name = manifest.get("submission_name", "submission_5fold.csv")

print("Loaded manifest:", MANIFEST_PATH)
print(f"model_name={model_name}, max_len={max_len}, n_folds={len(fold_entries)}")
print("\n=== Using checkpoints for ensembling ===")
for entry in fold_entries:
    print(f"Fold {entry['fold']}: epoch={entry['best_epoch']}, score={float(entry['best_score']):.6f}")
    print(f"           ckpt={entry['checkpoint']}")

# ===== Tokenizer =====
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ===== Load / preprocess test set =====
test_df = pd.read_csv(test_csv_path)

# canonicalize_smiles returns None for missing or unparseable SMILES, and a None
# entry makes the tokenizer call fail later. Every test row still needs a
# prediction, so fall back to the raw string for rows that cannot be
# canonicalized (an empty string when the raw value itself is missing).
raw_smiles = test_df['Smiles'].fillna("").astype(str)
canon_smiles = test_df['Smiles'].map(canonicalize_smiles)
n_failed = int(canon_smiles.isna().sum())
if n_failed:
    print(f"[warn] {n_failed} test SMILES could not be canonicalized; using raw strings for those rows")
test_df['Smiles'] = canon_smiles.where(canon_smiles.notna(), raw_smiles)

def tok_test_only(ex):
    """Tokenize the ``Smiles`` field of *ex*, truncated and padded to ``max_len``."""
    encode_kwargs = dict(max_length=max_len, truncation=True, padding="max_length")
    return tokenizer(ex["Smiles"], **encode_kwargs)

# Build the test dataset from the SMILES column only and tokenize it.
# batched=True hands the tokenizer many SMILES per call instead of one row at a
# time; because every sequence is truncated/padded to a fixed length, the
# encoded output is the same as with per-row mapping.
ds_test = Dataset.from_pandas(test_df[['Smiles']].copy())
ds_test = ds_test.map(tok_test_only, batched=True)
# Expose only the model inputs, as torch tensors.
ds_test.set_format(type='torch', columns=['input_ids','attention_mask'])

# ===== Load each fold's checkpoint and predict =====
all_fold_preds = []
n_test_rows = len(ds_test)

for entry in fold_entries:
    ckpt_dir = entry["checkpoint"]
    if not os.path.isdir(ckpt_dir):
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_dir}")

    config = AutoConfig.from_pretrained(ckpt_dir)
    model  = AutoModelForSequenceClassification.from_pretrained(ckpt_dir, config=config)

    # Reuse Trainer.predict (inference only, no training)
    tmp_out = os.path.join(OUTPUT_ROOT, f"infer_tmp_fold{entry['fold']}")
    args = TrainingArguments(
        output_dir=tmp_out,
        per_device_eval_batch_size=64,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
        report_to="none",
    )
    trainer = Trainer(model=model, args=args, tokenizer=tokenizer)

    preds = trainer.predict(ds_test).predictions.reshape(-1)
    # reshape(-1) flattens whatever the model returns, so a checkpoint whose
    # head emits more than one value per row would silently misalign the
    # predictions. Fail fast here instead of after every fold has been run.
    if preds.shape[0] != n_test_rows:
        raise ValueError(
            f"Fold {entry['fold']} produced {preds.shape[0]} values for "
            f"{n_test_rows} test rows; expected one prediction per row "
            f"(checkpoint: {ckpt_dir})"
        )
    all_fold_preds.append(preds)

# ===== Soft ensemble & save submission =====
# Stack the per-fold predictions as rows and average across folds.
fold_matrix = np.stack(all_fold_preds, axis=0)
test_pic50_mean = fold_matrix.mean(axis=0)

# Convert the averaged pIC50 values back to IC50 for the submission column.
ic50_values = pIC50_to_IC50(test_pic50_mean)
submission = pd.DataFrame({"ID": test_df["ID"], "ASK1_IC50_nM": ic50_values})

sub_path = os.path.join(OUTPUT_ROOT, submission_name)
submission.to_csv(sub_path, index=False)
print("\nSaved submission:", sub_path)