In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR-2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True) # _attn_implementation='sdpa'
model = model.eval().cuda().to(torch.bfloat16)

# prompt = "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = 'data/New_sample/원천데이터/인.허가/5350109/1994/5350109-1994-0001-0001.jpg'
output_path = '/workspace/output/deepseek-ocr-2/1994/5350109-1994-0001-0001'


res = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 768, crop_mode=True, save_results = True)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

modeling_deepseekocr2.py: 0.00B [00:00, ?B/s]

deepencoderv2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR-2:
- deepencoderv2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_deepseekv2.py: 0.00B [00:00, ?B/s]

configuration_deepseek_v2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR-2:
- configuration_deepseek_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR-2:
- modeling_deepseekv2.py
- configuration_deepseek_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


conversation.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR-2:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR-2:
- modeling_deepseekocr2.py
- deepencoderv2.py
- modeling_deepseekv2.py
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR2. This is not supported for all configurations of models and can yield errors.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-000001.safetensors:   0%|          | 0.00/6.78G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


KeyboardInterrupt: 

In [1]:
import torch
import time
import os
from transformers import AutoModel, AutoTokenizer

# 1. 초기 설정 및 환경 확인
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR-2'
output_path = '/workspace/output/deepseek-ocr-2/1994/5350109-1994-0001-0001'
os.makedirs(output_path, exist_ok=True)

# 성능 기록을 위한 초기화
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats() # 최대 VRAM 사용량 초기화
start_time = time.time() # 시작 시간 기록

try:
    # 2. 모델 로드 (시간 측정 시작)
    print("--- 모델 로딩 시작 ---")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    
    # flash_attention_2 빌드 실패 시를 대비해 'sdpa'를 예비로 두는 것을 추천합니다.
    model = AutoModel.from_pretrained(
        model_name, 
        _attn_implementation='flash_attention_2', # 빌드 실패했다면 'sdpa' 사용
        trust_remote_code=True, 
        use_safetensors=True
    )
    model = model.eval().cuda().to(torch.bfloat16)
    
    load_time = time.time() - start_time
    print(f"모델 로드 완료: {load_time:.2f}초")

    # 3. 인퍼런스 수행
    prompt = "<image>\n<|grounding|>Convert the document to markdown. "
    image_file = 'data/New_sample/원천데이터/인.허가/5350109/1994/5350109-1994-0001-0001.jpg'
    
    infer_start_time = time.time()
    res = model.infer(
        tokenizer, 
        prompt=prompt, 
        image_file=image_file, 
        output_path=output_path, 
        base_size=1024, 
        image_size=768, 
        crop_mode=True, 
        save_results=True
    )
    infer_end_time = time.time()

    # 4. 성능 지표 계산
    total_time = infer_end_time - start_time
    pure_infer_time = infer_end_time - infer_start_time
    
    # VRAM 사용량 측정 (Bytes -> GB 변환)
    max_vram = torch.cuda.max_memory_allocated() / (1024 ** 3) 
    reserved_vram = torch.cuda.max_memory_reserved() / (1024 ** 3)

    # 5. 결과 파일 저장
    log_file_path = os.path.join(output_path, "performance_log.txt")
    with open(log_file_path, "w") as f:
        f.write(f"--- DeepSeek-OCR-2 Performance Log ---\n")
        f.write(f"Model Name: {model_name}\n")
        f.write(f"Total Time (Load + Infer): {total_time:.2f} sec\n")
        f.write(f"Pure Inference Time: {pure_infer_time:.2f} sec\n")
        f.write(f"Peak VRAM Allocated: {max_vram:.2f} GB\n")
        f.write(f"Peak VRAM Reserved: {reserved_vram:.2f} GB\n")
    
    print(f"\n[성공] 로그가 저장되었습니다: {log_file_path}")
    print(f"최대 VRAM 사용량: {max_vram:.2f} GB")

except Exception as e:
    print(f"[오류 발생] {e}")

--- 모델 로딩 시작 ---


You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR2. This is not supported for all configurations of models and can yield errors.
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


모델 로드 완료: 266.18초


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


BASE:  torch.Size([1, 256, 1280])
PATCHES:  torch.Size([6, 144, 1280])


The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.


<|ref|>text<|/ref|><|det|>[[75, 119, 360, 270]]<|/det|>
문서번호 지정 55142 - 1
시행일자 94. 7. 2.
경 유 [제 1 안]
수 신 내 부 결 재
참 조

<|ref|>text<|/ref|><|det|>[[75, 321, 480, 341]]<|/det|>
제 목 공장설립 변경 신고수리 "건의"

<|ref|>text<|/ref|><|det|>[[75, 379, 896, 483]]<|/det|>
1. 장유면 유하리 864번지 (주) 서 롱 대 표 으로 부터 공업배치및공장설립에관한법률 제 13조 1항의 규정에 의거 공장설립 변경 신고가 있어 동법시행령 제 19조 2항의 규정에 의거 다음과 같이 수리하고 벌첩 공장설립 변경 신고 확인서를 교부코자 합니다

<|ref|>text<|/ref|><|det|>[[75, 493, 895, 540]]<|/det|>
2. 사후관리에 철저를 기하고자 담당계장 이 봉 구 와 담당자 배 병 감 를 책임담당자로 지정코자 합니다

<|ref|>figure_title<|/ref|><|det|>[[203, 550, 420, 567]]<|/det|>
0 변경 신고수리 사항

<|ref|>table<|/ref|><|det|>[[85, 571, 913, 644]]<|/det|>
<table><tr><td>업 체 명</td><td>소 재 지</td><td>대 표 자</td><td>업 종</td><td>대지면적(㎡)</td><td>건축면적(㎡)</td></tr><tr><td>(주)서룡</td><td>장유면 유하리 864번지외8필</td><td></td><td>타이어코트및그직물제조업</td><td>10,655</td><td>6801.62</td></tr></table>

<|ref|>text<|/ref|><|det|>[[75, 663, 580, 682]]<|/det|>
첨 부 : 1. 공장설립 변경 신고 확인서 (안) 1부.

<|ref|>text<|/ref|><|det|>[[181, 692, 3

image: 0it [00:00, ?it/s]
other: 100%|██████████| 12/12 [00:00<00:00, 43881.12it/s]


[성공] 로그가 저장되었습니다: /workspace/output/deepseek-ocr-2/1994/5350109-1994-0001-0001/performance_log.txt
최대 VRAM 사용량: 12.95 GB



