In [9]:
# cfg_dict = {
#     "data": {
#         "raw_data_root": "/home/jovyan/CHJ/Mol-LLM_Custom/dataset/real_train",
#         "data_tag": "3.3M_0415",
#         "tasks": ["reagent_prediction"],
#     },
#     "gnn": {
#         "gnn_type": "gine_tokengt",
#         "gnn_hidden_dim": 1024,
#         "gine": {
#             "gnn_hidden_dim": 1024,
#             "gin_num_layers": 5,
#             "drop_ratio": 0.0,
#             "used_gnn_layer": -1,
#             "gnn_jk": "last",
#             "graph_encoder_ckpt": "/home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom/mol-llm.ckpt",
#             "gnn_type": "gine",
#         },
#         "tokengt": {
#             "input_feat_dim": 9,
#             "gnn_hidden_dim": 1024,
#             "num_layers": 5,
#             "num_heads": 8,
#             "method": "laplacian",
#             "d_p": 64,
#             "d_e": 64,
#             "use_graph_token": True,
#             "max_position_embeddings": 102,
#             "graph_encoder_ckpt": "/home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom/mol-llm.ckpt",
#             "gnn_type": "tokengt",
#         },
#     },
#     "trainer": {
#         "bert_hidden_dim": 768,
#         "bert_name": "scibert",
#         "cross_attention_freq": 2,
#         "num_query_token": 32,
#         "bert_num_hidden_layers": 5,
#         "projector_type": "qformer",
#         "llm_model": "mistralai/Mistral-7B-Instruct-v0.3",
#         "tune_llm": "lora",
#         "peft_config": None,
#         "peft_dir": "",
#         "load_in_8bit": False,
#         "lora_r": 64,
#         "lora_alpha": 32,
#         "lora_dropout": 0.1,
#         "selfies_token_path": "Mol-LLM_Custom/model/selfies_dict.txt",
#         "add_selfies_tokens": True,
#         "prompt": "[START_I_SMILES]{}[END_I_SMILES]",
#         "num_beams": 1,
#         "strategy_name": None,
#         "accelerator": "gpu",
#         "devices": "0,1,2,3,4,5,6,7",
#         "precision": "bf16-mixed",
#         "max_steps": -1,
#         "max_epochs": 12,
#         "every_n_epochs": 1,
#         "task": None,
#         "logging_dir": "/home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom",
#         "llava_pretraining": 0,
#         "second_stage_start_epoch": 4,
#         "num_workers": 0,
#         "skip_sanity_check": False,
#         "total_batch_size": 512,
#         "batch_size": 8,
#         "inference_batch_size": 11,
#         "truncation": 1,
#         "padding": "max_length",
#         "max_length": 512,
#         "inference_max_length": 512,
#         "gen_max_len": 256,
#         "min_len": 8,
#         "apply_sequence_packing": False,
#         "max_packing_size": -1,
#         "weight_decay": 0.05,
#         "min_lr": 1e-05,
#         "init_lr": 0.0001,
#         "warmup_lr": 1e-05,
#         "warmup_epochs": 0.25,
#         "scheduler": "linear_warmup_cosine_lr",
#         "optimizer": "adamw",
#         "log_every_n_steps": 50,
#         "gradient_clip_val": 0.5,
#         "val_check_interval": 0.5,
#         "test_on_trainset": False,
#         "mol_representation": "string+graph",
#         "log_attn_score": True,
#         "eval_modality_util": None,
#         "tune_gnn": True,
#         "train_molpo": False,
#         "eval_molpo": False,
#         "find_unused_parameters": False,
#         "selfies_enumeration": False,
#         "isomericSmiles": False,
#         "canonical": False,
#         "allHsExplicit": False,
#     },
#     "filename": "debugging",
#     "seed": 42,
#     "mode": "ft",
#     "wandb_entity": "hj_ai",
#     "wandb_project": "mol-llm",
#     "wandb_log_freq": 100,
#     "wandb_id": None,
#     "debug": False,
#     "ckpt_path": None,
#     "pretrained_ckpt_path": None,
#     "shuffle_selfies": False,
#     "shuffle_graph": False,
#     "process_disjoint": True,
# }

In [1]:
import torch
import hydra
from hydra import initialize, compose
from omegaconf import OmegaConf
import os
import selfies as sf
from torch_geometric.data import Data, Batch
from model.blip2_stage3 import Blip2Stage3
from ogb.utils import smiles2graph

# ==============================================================================
# 1. Config & Setup (stage3.py와 동일한 환경 구성)
# ==============================================================================
hydra.core.global_hydra.GlobalHydra.instance().clear()

# config 경로는 실제 환경에 맞게 수정해주세요
with initialize(version_base=None, config_path="configs"):
    cfg = compose(config_name="test_CHJ.yaml")

# Config 구조 맞추기 (stage3.py의 main 함수 로직 반영)
OmegaConf.set_struct(cfg, False)
if "trainer" in cfg: cfg = OmegaConf.merge(cfg, cfg.trainer)
if "gnn" in cfg: cfg = OmegaConf.merge(cfg, cfg.gnn)

# 경로 설정 (사용자 환경에 맞게 수정)
root_dir = "/home/jovyan/CHJ/Mol-LLM_Custom"
ckpt_path = os.path.join(root_dir, "checkpoint/Custom/mol-llm.ckpt") 
token_path = os.path.join(root_dir, "model/selfies_dict.txt")

# GNN 체크포인트 경로 설정 (중요)
if hasattr(cfg, "gine"): cfg.gine.graph_encoder_ckpt = ckpt_path
if hasattr(cfg, "tokengt"): cfg.tokengt.graph_encoder_ckpt = ckpt_path
cfg.selfies_token_path = token_path
OmegaConf.set_struct(cfg, True)

# ==============================================================================
# 2. 모델 초기화 및 로드 (여기가 핵심입니다!)
# ==============================================================================
print("Initializing Model...")
# 1. 모델 껍데기 생성 (stage3.py와 동일)
model = Blip2Stage3(cfg)

print(f"Loading weights from {ckpt_path}...")
# 2. 체크포인트 파일 읽기
checkpoint = torch.load(ckpt_path, map_location="cpu")
state_dict = checkpoint["state_dict"] if "state_dict" in checkpoint else checkpoint

# 3. [중요] 키 변경 없이 그대로 로드합니다.
# Blip2Stage3는 내부에 self.blip2model을 가지고 있으므로, 
# 체크포인트의 'blip2model.xxx' 키와 정확히 매칭됩니다.
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

print(f"Missing keys: {len(missing_keys)}")
print(f"Unexpected keys: {len(unexpected_keys)}")
# missing_keys에 'blip2model.llm_model.lm_head.weight' 등이 없어야 정상입니다.

# 4. 장치 이동 및 정밀도 설정
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device)
# 학습 때 bfloat16을 썼다면 여기서도 맞춰줍니다.
model.to(torch.bfloat16) 

print("Model loaded successfully!")

# ==============================================================================
# 3. 데이터 전처리 (Input 준비)
# ==============================================================================
tokenizer = model.blip2model.llm_tokenizer
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

mol_token_id = tokenizer.mol_token_id
mol_token = "<mol>"
num_query_tokens = 32

# 사용자 입력 예시
input_selfies = "[C][O][C@H1][C][Branch1][C][C][=C][C@H1][C][C@H1][Ring1][Ring1][Ring1][#Branch1]"
instruction_template = "Please provide the HOMO energy value for this molecule: <INPUT>"
system_prompt = "You are a helpful assistant for molecular chemistry, to address tasks including molecular property classification, molecular property regression, chemical reaction prediction, molecule captioning, molecule generation."

# Graph 변환 함수
def smiles2data(smiles):
    try:
        graph = smiles2graph(smiles)
        x = torch.from_numpy(graph["node_feat"]).long()
        edge_index = torch.from_numpy(graph["edge_index"]).long()
        edge_attr = torch.from_numpy(graph["edge_feat"]).long()
        return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    except Exception:
        return None

# 데이터 생성
smiles = sf.decoder(input_selfies)
graph = smiles2data(smiles)

# 배치 차원 추가 및 이동
graph_batch = Batch.from_data_list([graph]).to(device)
# 모델이 두 개의 그래프 입력을 요구하는 경우 (Stage3 구조에 따라 다름, 보통 하나면 됨)
# 만약 에러가 나면 model.blip2model.generate의 graphs 인자를 확인해야 함. 
# 여기서는 safe하게 튜플로 묶어서 전달 준비
graphs_input = (graph_batch, graph_batch) 

# 프롬프트 구성
formatted_selfies = f"<SELFIES> {input_selfies} </SELFIES>"
# <mol> 토큰을 32개 채워넣어 그래프 정보를 위한 자리를 만듭니다.
graph_placeholder = "<GRAPH>" + mol_token * num_query_tokens + "</GRAPH>"
input_mol_string = formatted_selfies + graph_placeholder
final_instruction = instruction_template.replace("<INPUT>", input_mol_string)

full_prompt = f"<s>[INST] {system_prompt} \n\n{final_instruction} [/INST]"
print(f"Prompt: {full_prompt}")

# 토크나이징
inputs = tokenizer(
    [full_prompt], 
    return_tensors="pt", 
    add_special_tokens=False, 
    padding=True
)
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)
is_mol_token = (input_ids == mol_token_id)

# ==============================================================================
# 4. Inference 실행
# ==============================================================================
print("Generating...")

with torch.no_grad():
    # autocast를 사용하여 dtype 불일치 문제 방지
    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        outputs = model.blip2model.generate(
            graphs=graphs_input,  # 모델 정의에 따라 graphs가 리스트나 튜플일 수 있음
            input_ids=input_ids,
            attention_mask=attention_mask,
            is_mol_token=is_mol_token,
            num_beams=5,
            max_length=256,
            min_length=1,
            do_sample=False,
            repetition_penalty=1.0
        )

# ==============================================================================
# 5. 결과 확인
# ==============================================================================
raw_prediction = outputs.predictions[0]
clean_prediction = raw_prediction.replace(tokenizer.pad_token, "").replace("</s>", "").strip()

print("\n" + "="*50)
print(f"Input SELFIES: {input_selfies}")
print(f"Prediction: {clean_prediction}")
print("="*50)

  import pynvml  # type: ignore[import]
  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import parse_version
  return torch.cuda.amp.custom_fwd(orig_func)  # type: ignore
  return torch.cuda.amp.custom_bwd(orig_func)  # type: ignore


Initializing Model...
{'data': {'raw_data_root': '/home/jovyan/CHJ/Mol-LLM_Custom/dataset/real_train', 'data_tag': '3.3M_0415', 'tasks': ['reagent_prediction']}, 'gnn': {'gnn_type': 'gine_tokengt', 'gnn_hidden_dim': 1024, 'gine': {'gnn_hidden_dim': 1024, 'gin_num_layers': 5, 'drop_ratio': 0.0, 'used_gnn_layer': -1, 'gnn_jk': 'last', 'graph_encoder_ckpt': '/home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom/mol-llm.ckpt', 'gnn_type': 'gine'}, 'tokengt': {'input_feat_dim': 9, 'gnn_hidden_dim': 1024, 'num_layers': 5, 'num_heads': 8, 'method': 'laplacian', 'd_p': 64, 'd_e': 64, 'use_graph_token': True, 'max_position_embeddings': 102, 'graph_encoder_ckpt': '/home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom/mol-llm.ckpt', 'gnn_type': 'tokengt'}}, 'trainer': {'bert_hidden_dim': 768, 'bert_name': 'scibert', 'cross_attention_freq': 2, 'num_query_token': 32, 'bert_num_hidden_layers': 5, 'projector_type': 'qformer', 'llm_model': 'mistralai/Mistral-7B-Instruct-v0.3', 'tune_llm': 'lora', 'peft_config

loading file tokenizer.model from cache at /home/jovyan/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/c170c708c41dac9275d15a8fff4eca08d52bab71/tokenizer.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/jovyan/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/c170c708c41dac9275d15a8fff4eca08d52bab71/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/jovyan/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/c170c708c41dac9275d15a8fff4eca08d52bab71/tokenizer_config.json
loading file tokenizer.json from cache at /home/jovyan/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/c170c708c41dac9275d15a8fff4eca08d52bab71/tokenizer.json


Added 2944 selfies tokens to the tokenizer


loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/c170c708c41dac9275d15a8fff4eca08d52bab71/config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 32768
}

loading weights file model.safetensors from cache at /home/jovyan/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/c170c708c41dac9275

trainable params: 167,772,160 || all params: 7,440,183,296 || trainable%: 2.2549
/home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom/mol-llm.ckpt -args.gine.graph_encoder_ckpt
load graph encoder from /home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom/mol-llm.ckpt
load graph encoder from /home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom/mol-llm.ckpt
bert load scibert


loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/config.json
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

loading weights file pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/pytorch_model.bin
Generate config GenerationConfig {
  "pad_token_id": 0
}

Some weigh

Loading weights from /home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/Custom/mol-llm.ckpt...
Missing keys: 291
Unexpected keys: 0
Model loaded successfully!
Prompt: <s>[INST] You are a helpful assistant for molecular chemistry, to address tasks including molecular property classification, molecular property regression, chemical reaction prediction, molecule captioning, molecule generation. 

Please provide the HOMO energy value for this molecule: <SELFIES> [C][O][C@H1][C][Branch1][C][C][=C][C@H1][C][C@H1][Ring1][Ring1][Ring1][#Branch1] </SELFIES><GRAPH><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol></GRAPH> [/INST]
Generating...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Input SELFIES: [C][O][C@H1][C][Branch1][C][C][=C][C@H1][C][C@H1][Ring1][Ring1][Ring1][#Branch1]
Prediction: [=As] [Ge] [86Zr]


In [11]:
from datasets import load_from_disk

data = load_from_disk('/home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/mol-llm_testset')

data[0]

{'task': 'alchemy_homo',
 'x': [[5, 0, 4, 5, 3, 0, 2, 0, 0],
  [7, 0, 2, 5, 0, 0, 2, 0, 0],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 1],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 3, 5, 1, 0, 1, 0, 1],
  [5, 0, 3, 5, 0, 0, 1, 0, 1],
  [5, 0, 4, 5, 3, 0, 2, 0, 0]],
 'edge_index': [[0, 1, 1, 2, 2, 3, 2, 7, 3, 4, 3, 5, 4, 5, 5, 6, 6, 7, 7, 8],
  [1, 0, 2, 1, 3, 2, 7, 2, 4, 3, 5, 3, 5, 4, 6, 5, 7, 6, 8, 7]],
 'edge_attr': [[0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [0, 0, 0],
  [0, 0, 0]],
 'additional_x': [[5, 0, 4, 5, 3, 0, 2, 0, 0],
  [7, 0, 2, 5, 0, 0, 2, 0, 0],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 1],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 3, 5, 1, 0, 1, 0, 1],
  [5, 0, 3, 5, 0, 0, 1, 0, 1],
  [5, 0, 