In [13]:
# %pip install transformers>=4.53.2 mlflow==3.2.0 accelerate qwen_vl_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, time, io, json, tempfile, requests
from PIL import Image
import torch, mlflow
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import accelerate
from importlib import import_module

os.environ["TRANSFORMERS_NO_TF"] = "1"   # HF가 TensorFlow 백엔드 불러오지 않도록
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # TF C++ 로그 숨김 (0~3: info~error)
os.environ["JAX_PLATFORMS"] = "cpu"      # (설치되어 있다면) JAX도 CPU만 사용

In [3]:
MLFLOW_URI = "http://mlflow.mlflow.svc.cluster.local:5000"
os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_URI

mlflow.set_tracking_uri(MLFLOW_URI)
print("Tracking URI =", mlflow.get_tracking_uri())

Tracking URI = http://mlflow.mlflow.svc.cluster.local:5000


In [4]:
EXP_NAME = "qwen25vl_demo-eun2ce-20250814-01"
RUN_NAME = "qwen25vl_smoketest-eun2ce-20250814-01"
# HF_MODEL_ID = ""
MODEL_NAME = "/models/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5"

mlflow.set_experiment(EXP_NAME)

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1755135394244, experiment_id='2', last_update_time=1755135394244, lifecycle_stage='active', name='qwen25vl_demo-eun2ce-20250814-01', tags={}>

In [5]:
# IMAGE_URL = "https://huggingface.co/datasets/hf-internal-testing/fixtures_images/resolve/main/COCO/000000039769.png"
IMAGE_PATH = "./testimg.png"

PROMPT = "이 이미지를 한 문장으로 설명해줘. 그리고 주요 객체 3개도 bullet로 적어줘."

def load_image(image_url=None, image_path=None):
    if image_path and os.path.exists(image_path):
        return Image.open(image_path).convert("RGB")
    if image_url:
        r = requests.get(image_url, timeout=15)
        r.raise_for_status()
        return Image.open(io.BytesIO(r.content)).convert("RGB")
    raise ValueError("image_url 또는 image_path 중 하나는 반드시 지정해야 합니다.")

image = load_image(image_path=IMAGE_PATH)

In [6]:
MODEL_ID = os.getenv("HF_MODEL_ID", MODEL_NAME)
use_gpu = torch.cuda.is_available()
dtype = torch.bfloat16 if use_gpu else torch.float32

tokenizer  = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
processor  = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model      = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=dtype,
    device_map="auto" if use_gpu else None
)
# Qwen이 배포한 멀티모달 헬퍼 로드
qwen_utils = import_module("qwen_vl_utils")

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
def qwen_vl_generate(image: Image.Image, prompt: str, max_new_tokens=128, do_sample=False):
    # Qwen 포맷의 chat 메시지
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text",  "text": prompt}
        ]
    }]
    # text 프롬프트 생성
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # vision 입력 분리
    image_inputs, video_inputs = qwen_utils.process_vision_info(messages)
    # processor로 텐서화
    inputs = processor(
        text=[text], images=image_inputs, videos=video_inputs,
        return_tensors="pt"
    )
    if use_gpu:
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

    t0 = time.time()
    gen_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample
    )
    latency = time.time() - t0

    # 첫 토큰부터 전체 디코딩 (Qwen의 special token은 스킵)
    out = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0]
    # apply_chat_template가 앞부분까지 포함하므로, 마지막 응답만 잘라내기(간단 파싱)
    # 대개 마지막 "\nassistant\n" 이후가 모델 응답입니다. 안전하게 뒤쪽 1200자만 사용.
    completion = out[-1200:].strip()
    return completion, latency, len(gen_ids[0]) if hasattr(gen_ids, "__len__") else None

completion, latency, gen_tokens = qwen_vl_generate(image, PROMPT)
print("completion:", completion.splitlines()[0][:120], "...")
print(f"latency={latency:.2f}s  tokens={gen_tokens}")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


completion: system ...
latency=3.08s  tokens=198


In [8]:
from mlflow.models.signature import infer_signature
import pandas as pd
import base64

with open(IMAGE_PATH, "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")
    
# 입력/출력 예시 생성 (PyFunc용)
input_example = {
    "prompt": "이미지 속 장면을 요약해줘.",
    "image_b64": img_b64,
    "max_new_tokens": 128
}
signature = infer_signature(
    model_input=pd.DataFrame([input_example]),
    model_output=pd.DataFrame([{"completion":"text"}])
)

from transformers import AutoTokenizer, AutoProcessor, Qwen2_5_VLForConditionalGeneration
import torch, gc, time, pandas as pd

class QwenVL_Pyfunc(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        self.model_id = context.model_config.get("model_id", "Qwen/Qwen2.5-VL-3B-Instruct")
        # 가벼운 것만 준비(모두 CPU)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
        self.processor = AutoProcessor.from_pretrained(self.model_id, trust_remote_code=True)
        # 모델 로드 옵션은 저장만 해두고, 진짜 로드는 predict에서 매 호출마다!
        self._torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        self._device_map = "auto" if torch.cuda.is_available() else None

    def _load_image(self, row):
        from PIL import Image
        import io, requests, base64, os
        if "image_path" in row and isinstance(row["image_path"], str) and os.path.exists(row["image_path"]):
            return Image.open(row["image_path"]).convert("RGB")
        if "image_b64" in row and row["image_b64"]:
            return Image.open(io.BytesIO(base64.b64decode(row["image_b64"]))).convert("RGB")
        if "image_url" in row and row["image_url"]:
            r = requests.get(row["image_url"], timeout=15)
            r.raise_for_status()
            return Image.open(io.BytesIO(r.content)).convert("RGB")
        raise ValueError("image_path / image_url / image_b64 중 하나가 필요합니다.")

    def _build_model(self):
        # 매 호출마다 “잠깐” GPU로 로드
        return Qwen2_5_VLForConditionalGeneration.from_pretrained(
            self.model_id,
            trust_remote_code=True,
            torch_dtype=self._torch_dtype,
            device_map=self._device_map,
        )

    def predict(self, context, model_input):
        # 입력 정규화
        if isinstance(model_input, dict):
            rows = [model_input]
        elif isinstance(model_input, pd.DataFrame):
            rows = model_input.to_dict(orient="records")
        else:
            raise TypeError("predict input은 dict 또는 pandas.DataFrame 이어야 합니다.")

        model = self._build_model()
        outs = []
        try:
            for row in rows:
                prompt = row.get("prompt", "")
                image  = self._load_image(row)

                # Qwen 멀티모달 입력 준비
                from importlib import import_module
                qwen_utils = import_module("qwen_vl_utils")

                messages = [{
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text",  "text": prompt}
                    ]
                }]
                text = self.tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                image_inputs, video_inputs = qwen_utils.process_vision_info(messages)
                inputs = self.processor(text=[text], images=image_inputs, videos=video_inputs,
                                        return_tensors="pt")

                # 필요하면만 디바이스로 이동
                if torch.cuda.is_available():
                    inputs = {k: v.to(model.device) for k, v in inputs.items()}

                t0 = time.time()
                with torch.inference_mode():
                    gen_ids = model.generate(**inputs, max_new_tokens=int(row.get("max_new_tokens", 128)))
                latency = time.time() - t0

                # 출력 후단 파싱(간단)
                completion = self.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0]
                completion = completion[-1200:].strip()
                outs.append({"completion": completion, "latency": latency})

        finally:
            # 🔻 매 호출 끝날 때 VRAM 해제
            try:
                del model
            except Exception:
                pass
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()

        return pd.DataFrame(outs)



In [9]:
with mlflow.start_run(run_name=RUN_NAME) as run:
    mlflow.log_params({
        "model_id": MODEL_ID,
        "dtype": str(dtype),
        "max_new_tokens": 128,
    })
    mlflow.log_metrics({
        "latency_s": latency,
        "generated_tokens": gen_tokens or 0,
    })

    # 프롬프트/응답/레코드 저장(프롬프트 “보존”)
    mlflow.log_text(PROMPT, "prompt.txt")
    mlflow.log_text(completion, "completion.txt")
    mlflow.log_text(json.dumps({
        "prompt": PROMPT,
        "image_path": IMAGE_PATH,
        "completion": completion
    }, ensure_ascii=False) + "\n", "records.jsonl")

    # 입력 이미지도 아티팩트로 보존
    tmp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    image.save(tmp_img.name)
    mlflow.log_artifact(tmp_img.name, artifact_path="inputs")

    # PyFunc 모델 저장(가중치는 HF에서 로드)
    model_cfg = {"model_id": MODEL_ID}
    mlflow.pyfunc.log_model(
        name="model",
        python_model=QwenVL_Pyfunc(),
        input_example=input_example,
        signature=signature,
        pip_requirements=[
            "mlflow>=3.2",
            "transformers>=4.45",
            "torch>=2.2",
            "pillow",
            "accelerate",
            "sentencepiece",
            # qwen 유틸은 모델 리포 내 remote_code로 제공
        ],
        model_config=model_cfg
    )

    model_uri = mlflow.get_artifact_uri("model")
    print("Run ID   :", run.info.run_id)
    print("Model URI:", model_uri)

2025/08/14 03:15:43 INFO mlflow.pyfunc: Validating input example against model signature
 - sentencepiece (current: uninstalled, required: sentencepiece)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - sentencepiece (current: uninstalled, required: sentencepiece)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Run ID   : 87c49955790d47dab31d4919bef18822
Model URI: mlflow-artifacts:/2/87c49955790d47dab31d4919bef18822/artifacts/model
🏃 View run qwen25vl_smoketest-eun2ce-20250814-01 at: http://mlflow.mlflow.svc.cluster.local:5000/#/experiments/2/runs/87c49955790d47dab31d4919bef18822
🧪 View experiment at: http://mlflow.mlflow.svc.cluster.local:5000/#/experiments/2


In [13]:
EXP_ID = "2"
M_ID   = "m-d5f2340df5ee4591ac3efe498aa66eea"  # mlflow에서 찾을 수 있음
model_uri = f"mlflow-artifacts:/{EXP_ID}/models/{M_ID}/artifacts"
print("model_uri =", model_uri)

model_uri = mlflow-artifacts:/2/models/m-d5f2340df5ee4591ac3efe498aa66eea/artifacts


In [14]:
# 2) 로드
loaded = mlflow.pyfunc.load_model(model_uri)

# 3) 입력(검증/서빙 모두 안전한 base64 형태 권장)
img = Image.new("RGB", (64,64), (200,220,240))
buf = io.BytesIO(); img.save(buf, format="PNG")
img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

# 4) (옵션) 호출마다 VRAM 반납
os.environ["PYFUNC_RELEASE_CUDA"] = "1"   # "1"이면 predict 끝나고 VRAM 정리
os.environ["PYFUNC_USE_GPU"]      = "auto" # "1"=강제GPU, "0"=CPU, "auto"=가능하면 GPU

# 5) 추론
pred = loaded.predict([{
    "prompt": "이미지를 한 문장으로 요약해줘",
    "image_b64": img_b64,
    "max_new_tokens": 32
}])
print(pred.iloc[0]["completion"][:120], "...")
print("latency:", pred.iloc[0]["latency"])

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

 - sentencepiece (current: uninstalled, required: sentencepiece)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

system
You are a helpful assistant.
user
이미지를 한 문장으로 요약해줘
assistant
"나는 당신을 사랑합니다." ...
latency: 1.1055660247802734


In [15]:
%pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Package                            Version
---------------------------------- --------------
absl-py                            2.1.0
accelerate                         1.10.0
alembic                            1.14.1
altair                             5.5.0
annotated-types                    0.7.0
anyio                              4.8.0
archspec                           0.2.5
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
arrow                              1.3.0
asttokens                          3.0.0
astunparse                         1.6.3
async_generator                    1.10
async-lru                          2.0.4
attrs                              25.1.0
av                                 15.0.0
babel                              2.17.0
beautifulsoup4                     4.13.3
bleach                             6.2.0
blinker                            1.9.0
bokeh                              3.6.3
boltons                            24.0