In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from model_wrapper_with_mlp_adapter import FeaturePrefixAdapter, PrefixLLaMAModel
import os

# === Prefix-adapter dimensions
# These are passed to FeaturePrefixAdapter as num_tokens, output_dim and
# input_dim respectively.
PREFIX_TOKEN_COUNT = 5
EMBEDDING_DIM = 4096
FEATURE_DIM = 9

# === Artifact locations
# The Drive folder name contains spaces and a "+", so it is kept as one
# string literal and only ever extended through os.path.join.
BASE_DIR = "/content/drive/MyDrive/LLM + Fatigue Handling"
MODEL_DIR = os.path.join(BASE_DIR, "llama_prefix_final_model")
ADAPTER_PATH = os.path.join(
    BASE_DIR, "llama_prefix_final_model", "prefix_adapter.pth"
)

# === Quantization settings
# 4-bit bitsandbytes loading; this object is handed to the base-model loader
# below. Only needed when the saved model is meant to be loaded quantized.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

# === Tokenizer, read from MODEL_DIR on disk
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_DIR,
    local_files_only=True,
)

# === Base causal LM, read from MODEL_DIR on disk
# local_files_only=True keeps the lookup on the local filesystem instead of
# falling back to the Hugging Face Hub.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    local_files_only=True,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

# === LoRA adapter, loaded from the same directory on top of the base model
# is_trainable=False: the adapter is loaded for inference, not fine-tuning.
llama_model = PeftModel.from_pretrained(
    base_model,
    MODEL_DIR,
    adapter_name="default",
    is_trainable=False,
    local_files_only=True,
)

# === Load your MLP adapter
# Build the feature-to-prefix adapter with the dimensions configured in the
# constants section, then restore its trained weights from ADAPTER_PATH.
adapter = FeaturePrefixAdapter(
    input_dim=FEATURE_DIM,
    hidden_dim=256,
    output_dim=EMBEDDING_DIM,
    num_tokens=PREFIX_TOKEN_COUNT,
)
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while loading. The
# result is fed straight into load_state_dict, so nothing beyond a state dict
# is needed. map_location="cpu" loads the tensors onto the CPU first.
adapter.load_state_dict(
    torch.load(ADAPTER_PATH, map_location="cpu", weights_only=True)
)
adapter.eval()

# === Combine everything
# The wrapper is moved to CUDA when available, otherwise CPU; this is what
# takes the CPU-loaded adapter weights onto the GPU.
# NOTE(review): llama_model was loaded 4-bit with device_map="auto"; moving the
# whole wrapper also touches its already-placed weights. Presumably fine on a
# single GPU -- confirm before running multi-GPU or with CPU offload.
model = PrefixLLaMAModel(llama_model, adapter).to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
