In [None]:
!git clone https://github.com/deepseek-ai/DeepSeek-VL2

In [None]:
%cd DeepSeek-VL2

In [None]:
!pip install -e .

In [None]:
!pip install numpy==1.26.4
# After changing the NumPy version, restart the runtime and re-run the cells.

In [None]:
!pip install git+https://github.com/huggingface/transformers

In [None]:
!pip install xformers==0.0.21

In [None]:
!pip3 install torch==2.0.1+cu118 torchaudio==2.0.2+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorch.org/whl/cu118

In [None]:
import os
import json
import time
import torch
import gc
from tqdm import tqdm
from PIL import Image
from transformers import AutoModelForCausalLM
from google.colab import drive
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images

# Mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive are reachable from this Colab runtime.
drive.mount('/content/drive')

def is_image_file(filename):
    """Tell whether *filename* has a PNG or JPEG extension.

    The comparison is case-insensitive; only the suffixes ``.png``, ``.jpg``
    and ``.jpeg`` are accepted.
    """
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in ('.png', '.jpg', '.jpeg'))

def _index_image_paths(image_root, wanted):
    """Map each wanted file name to its first matching path under *image_root*.

    Directories whose path contains ``'text_category'`` are ignored. When the
    same file name occurs in several directories, the first one met in
    ``os.walk`` order wins.
    """
    index = {}
    for root, _, files in os.walk(image_root):
        if 'text_category' in root:
            continue
        for name in files:
            if name in wanted:
                # setdefault keeps the first hit, later duplicates are ignored.
                index.setdefault(name, os.path.join(root, name))
        if len(index) == len(wanted):
            # Everything has been located; no need to walk the rest of the tree.
            break
    return index


def run_deepseek_on_existing_json(image_root, input_json_path, model_path="deepseek-ai/deepseek-vl2-tiny"):
    """Run DeepSeek-VL2 OCR on the images listed in a JSON file and store the results in it.

    Every top-level key of the JSON that does not start with ``"_"`` is treated
    as an image file name and looked up under ``image_root``. For each image
    that is found, the decoded model output is written to
    ``data[filename]["models"][model_name]`` as
    ``{"prediction": ..., "cer": None, "wer": None}``, where ``model_name`` is
    the lower-cased last path component of ``model_path``. The elapsed time of
    the whole pass is stored in ``data["_meta"]["processing_times"][model_name]``
    and the JSON file is overwritten in place.

    Args:
        image_root: Directory that is searched recursively for the images.
            Directories whose path contains ``'text_category'`` are skipped.
        input_json_path: Path of the JSON file to read and then overwrite.
        model_path: Model identifier or path passed to ``from_pretrained``.

    Returns:
        The updated dictionary that was written back to ``input_json_path``.

    Raises:
        Errors from loading the model or from reading/writing the JSON file
        propagate. Errors raised while processing a single image are caught,
        printed, and that image is skipped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    print("Initializing models...")
    model_name = model_path.split("/")[-1].lower()
    processor = DeepseekVLV2Processor.from_pretrained(model_path)
    vl_gpt = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
    model = vl_gpt.to(torch.bfloat16).to(device).eval()

    # Load current JSON
    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Keys starting with "_" (e.g. "_meta") hold metadata, not images.
    filenames = [k for k in data if not k.startswith("_")]

    start_time = time.time()

    # Walk the image tree once instead of once per file name; a directory walk
    # per image is very slow on large trees and network-backed mounts.
    image_paths = _index_image_paths(image_root, set(filenames))

    for filename in tqdm(filenames, desc="OCR process"):
        img_path = image_paths.get(filename)
        if img_path is None:
            # Report instead of skipping silently so missing results can be traced.
            print(f"{filename} error: image not found under {image_root}")
            continue

        try:
            conversation = [
                {
                    "role": "<|User|>",
                    "content": "<image>\n<|ref|>Extract the Turkish text exactly.<|/ref|>.",
                    "images": [img_path],
                },
                {"role": "<|Assistant|>", "content": ""},
            ]

            pil_images = load_pil_images(conversation)

            prepare_inputs = processor(
                conversations=conversation,
                images=pil_images,
                force_batchify=True,
                system_prompt=""
            ).to(model.device)

            # Inference only: disable autograd so no graph is kept for the
            # embedding pass or for generation.
            with torch.no_grad():
                inputs_embeds = model.prepare_inputs_embeds(**prepare_inputs)

                outputs = model.generate(
                    inputs_embeds=inputs_embeds,
                    attention_mask=prepare_inputs.attention_mask,
                    pad_token_id=processor.tokenizer.eos_token_id,
                    bos_token_id=processor.tokenizer.bos_token_id,
                    eos_token_id=processor.tokenizer.eos_token_id,
                    max_new_tokens=512,
                    do_sample=False,
                    use_cache=True,
                )

            extracted_text = processor.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)

            # Write OCR results; create the "models" dict if this entry has none yet.
            data[filename].setdefault("models", {})[model_name] = {
                "prediction": extracted_text,
                "cer": None,
                "wer": None
            }

        except Exception as e:
            # Best-effort batch run: log the failure and move on to the next image.
            print(f"{filename} error: {str(e)}")

    elapsed = round(time.time() - start_time, 2)
    print(f"\n OCR completed: {elapsed:.2f} seconds")

    meta = data.get("_meta", {})
    processing_times = meta.get("processing_times", {})
    processing_times[model_name] = elapsed
    meta["processing_times"] = processing_times
    data["_meta"] = meta

    # Save updated JSON
    with open(input_json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f" Updated JSON saved: {input_json_path}")
    return data




# Benchmark directory on the mounted Drive that is searched for the images,
# and the JSON file inside it that receives the OCR results.
image_root = '/content/drive/MyDrive/nutuk/benchmark/'
input_json_path = '/content/drive/MyDrive/nutuk/benchmark/converted_data.json'

# Run the OCR pass and keep the updated JSON contents for later inspection.
updated = run_deepseek_on_existing_json(
    image_root=image_root,
    input_json_path=input_json_path,
)