# DeepSeek-OCR-2

## 기초 실행

In [None]:
# 기초 실행
from transformers import AutoModel, AutoTokenizer
import torch
import os
# Basic DeepSeek-OCR-2 run: load the tokenizer and model, then OCR a single
# scanned page into markdown and save the results under `output_path`.

# Expose only GPU 0 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR-2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# The original note lists 'sdpa' as an alternative attention implementation.
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation='flash_attention_2',
    trust_remote_code=True,
    use_safetensors=True,
)
# Inference mode, moved to the GPU, weights cast to bfloat16.
model = model.eval()
model = model.cuda()
model = model.to(torch.bfloat16)

# Alternative prompt for plain OCR without grounding: "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = 'data/New_sample/원천데이터/인.허가/5350109/1994/5350109-1994-0001-0001.jpg'
output_path = '/workspace/output/deepseek-ocr-2/1994/5350109-1994-0001-0001'

# Keyword options forwarded unchanged to the model's remote-code `infer` method.
infer_options = {
    "prompt": prompt,
    "image_file": image_file,
    "output_path": output_path,
    "base_size": 1024,
    "image_size": 768,
    "crop_mode": True,
    "save_results": True,
}
res = model.infer(tokenizer, **infer_options)

## vram, time 확인 버전

In [None]:
import torch
import time
import os
from transformers import AutoModel, AutoTokenizer

# Profiled DeepSeek-OCR-2 run: loads the model, OCRs one page, and writes
# load/inference timings plus peak VRAM usage to performance_log.txt inside
# `output_path`.

# 1. Initial setup and environment
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR-2'
output_path = '/workspace/output/deepseek-ocr-2/1994/5350109-1994-0001-0001'
os.makedirs(output_path, exist_ok=True)

# Reset the profiling state so the peak-memory counters cover only this run.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
# perf_counter() is monotonic and high resolution, so it is the right clock
# for measuring durations (time.time() can jump if the system clock changes).
start_time = time.perf_counter()

try:
    # 2. Load the model (timed from start_time)
    print("--- 모델 로딩 시작 ---")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # 'eager' attention is used here; the original notes mention
    # 'flash_attention_2' and 'sdpa' as alternatives when they are available.
    model = AutoModel.from_pretrained(
        model_name,
        _attn_implementation='eager',
        trust_remote_code=True,
        use_safetensors=True,
    )
    model = model.eval().cuda().to(torch.bfloat16)

    # CUDA work is queued asynchronously; wait for it before reading the clock
    # so the load time includes the device transfer and dtype cast.
    torch.cuda.synchronize()
    load_time = time.perf_counter() - start_time
    print(f"모델 로드 완료: {load_time:.2f}초")

    # 3. Run inference
    prompt = "<image>\n<|grounding|>Convert the document to markdown. "
    image_file = 'data/New_sample/원천데이터/인.허가/5350109/1994/5350109-1994-0001-0001.jpg'

    infer_start_time = time.perf_counter()
    res = model.infer(
        tokenizer,
        prompt=prompt,
        image_file=image_file,
        output_path=output_path,
        base_size=1024,
        image_size=768,
        crop_mode=True,
        save_results=True,
    )
    # Make sure any GPU work still in flight is counted in the inference time.
    torch.cuda.synchronize()
    infer_end_time = time.perf_counter()

    # 4. Compute performance metrics
    total_time = infer_end_time - start_time
    pure_infer_time = infer_end_time - infer_start_time

    # Peak VRAM since the reset above (bytes -> GiB)
    max_vram = torch.cuda.max_memory_allocated() / (1024 ** 3)
    reserved_vram = torch.cuda.max_memory_reserved() / (1024 ** 3)

    # 5. Save the performance log
    log_file_path = os.path.join(output_path, "performance_log.txt")
    # Explicit encoding keeps the log independent of the platform locale.
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write("--- DeepSeek-OCR-2 Performance Log ---\n")
        f.write(f"Model Name: {model_name}\n")
        f.write(f"Total Time (Load + Infer): {total_time:.2f} sec\n")
        f.write(f"Pure Inference Time: {pure_infer_time:.2f} sec\n")
        f.write(f"Peak VRAM Allocated: {max_vram:.2f} GB\n")
        f.write(f"Peak VRAM Reserved: {reserved_vram:.2f} GB\n")

    print(f"\n[성공] 로그가 저장되었습니다: {log_file_path}")
    print(f"최대 VRAM 사용량: {max_vram:.2f} GB")

except Exception as e:
    # Best-effort cell: report the failure without aborting, but keep the
    # traceback so the failing step (load, inference, or logging) is visible.
    import traceback

    print(f"[오류 발생] {e}")
    traceback.print_exc()

# PaddleOCR-VL

## 기초 실행

In [None]:
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# Basic PaddleOCR-VL run: load the model and processor, send one image with a
# task-specific prompt through the chat template, and print the decoded text.

# ---- Settings ----
model_path = "PaddlePaddle/PaddleOCR-VL"
image_path = "data/New_sample/원천데이터/인.허가/5350109/1994/5350109-1994-0001-0001.jpg"
task = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
# ------------------

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Prompt text sent to the model for each supported task.
PROMPTS = {
    "ocr": "OCR:",
    "table": "Table Recognition:",
    "formula": "Formula Recognition:",
    "chart": "Chart Recognition:",
}

# Load the page as RGB. No explicit resize happens here; the pixel budget is
# configured through the processor's min_pixels / max_pixels below.
image = Image.open(image_path).convert("RGB")

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = model.to(DEVICE)
model = model.eval()

processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True,
    min_pixels=256 * 28 * 28,   # minimum resolution
    max_pixels=1024 * 28 * 28,  # maximum resolution (tune this value while checking for OOM)
)

# Single user turn: the image followed by the task prompt.
image_part = {"type": "image", "image": image}
text_part = {"type": "text", "text": PROMPTS[task]}
messages = [{"role": "user", "content": [image_part, text_part]}]

inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(DEVICE)

# Generate up to 1024 new tokens, then decode the first (only) sequence.
generated_ids = model.generate(**inputs, max_new_tokens=1024)
decoded_batch = processor.batch_decode(generated_ids, skip_special_tokens=True)
outputs = decoded_batch[0]
print(outputs)
