In [None]:
!pip install --upgrade vllm


In [None]:
!pip install --upgrade mistral_common


In [None]:
from io import BytesIO
from PIL import Image
import base64
import os
import json
import time
import gc
from tqdm import tqdm
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login
from vllm import LLM
from vllm.sampling_params import SamplingParams

# Set the PyTorch CUDA allocator option before any model is loaded.
# NOTE(review): presumably meant to reduce GPU memory fragmentation; confirm
# the variable is still picked up when set after the vllm import above.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Log in to the Hugging Face Hub with a token read from Colab's secret store.
# Every failure path only prints a message, so the notebook keeps running
# without authentication.
# NOTE(review): the secret is looked up as 'hf_token' while the messages below
# refer to 'HF_TOKEN' -- confirm which name is actually stored in Colab Secrets.
try:
    hf_token = userdata.get('hf_token')
    if hf_token:
        login(token=hf_token)
        print("Hugging Face girişi başarılı! Token kullanılacak.")
    else:
        # The lookup succeeded but returned an empty/falsy value.
        print("HF_TOKEN gizli anahtarı bulunamadı. Lütfen Colab Secrets'a ekleyin.")
except userdata.SecretNotFoundError:
    # The secret does not exist in this Colab environment.
    print("HF_TOKEN adlı gizli anahtar bulunamadı.")
except Exception as e:
    # Any other failure during lookup or login is reported but not re-raised.
    print(f"Hugging Face girişi sırasında bir hata oluştu: {e}")

# Mount Google Drive at /content/drive; the benchmark paths used further down
# in this file live under that mount point.
drive.mount('/content/drive')

def is_image_file(filename):
    """Tell whether *filename* ends in a supported image extension.

    The comparison ignores case; PNG, JPG and JPEG suffixes are recognised.
    """
    supported_suffixes = ('.png', '.jpg', '.jpeg')
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in supported_suffixes)

def encode_image_to_base64(image_path):
    """Return the image at *image_path* as a base64-encoded PNG string.

    The image is converted to RGB and re-encoded as PNG in memory before
    being base64-encoded, whatever the format of the source file.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The base64 text as ``str``, or ``None`` if the image could not be
        opened, converted or re-encoded (the error is printed in that case).
    """
    try:
        # Open the image as a context manager so the underlying file handle
        # is released deterministically, even when conversion fails.
        # ``convert`` returns an independent, fully loaded image, so it stays
        # usable after the source has been closed.
        with Image.open(image_path) as source:
            rgb_image = source.convert("RGB")
        buffered = BytesIO()
        rgb_image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    except Exception as e:
        # Best-effort helper: callers treat None as "no image available".
        print(f"Base64 encoding failed for {image_path}: {e}")
        return None

def _index_image_paths(image_root):
    """Map each file name under *image_root* to the first path where it occurs."""
    paths_by_name = {}
    for root, _, files in os.walk(image_root):
        # Directories whose path mentions 'text_category' are not searched.
        if 'text_category' in root:
            continue
        for name in files:
            # Keep the first hit in walk order when a name occurs more than once.
            paths_by_name.setdefault(name, os.path.join(root, name))
    return paths_by_name


def run_pixtral_on_existing_json(image_root, input_json_path, model_path="mistralai/Pixtral-12B-2409"):
    """Run Pixtral OCR on the images listed in a JSON file and store the results.

    Every top-level key of the JSON document that does not start with ``"_"``
    is treated as an image file name and looked up under *image_root*
    (directories whose path contains ``text_category`` are ignored). The
    model's output is written to ``data[filename]["models"][<model name>]``
    with ``cer`` and ``wer`` set to ``None``. The elapsed time is recorded in
    ``data["_meta"]["processing_times"][<model name>]`` and the JSON file at
    *input_json_path* is overwritten with the updated document.

    Args:
        image_root: Directory tree that is searched for the image files.
        input_json_path: Path of the JSON file to read and overwrite.
        model_path: Model identifier passed to vLLM; its last path component,
            lower-cased, is used as the model name in the JSON document.

    Returns:
        The updated JSON document as a ``dict``.
    """
    print("Using device: VLLM managed")

    print("Initializing models...")
    model_name = model_path.split("/")[-1].lower()
    sampling_params = SamplingParams(max_tokens=8192)

    # The LLM object must be created only once; it is reused for every image.
    llm = LLM(model=model_path, tokenizer_mode="mistral", max_model_len=8192, trust_remote_code=True)

    # Load the JSON document that lists the images and holds earlier results.
    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The instruction is identical for every image, so build it once.
    prompt = (
        "Extract the Turkish text from the image exactly as it appears. "
        "Do not repeat, comment, translate, or add any text. "
        "Return only the raw text. If no text is detected, return an empty response."
    )

    start_time = time.time()

    # Walk the image tree once instead of once per JSON entry. The walk stays
    # inside the timed region so the recorded time still covers file discovery
    # as well as inference.
    image_paths = _index_image_paths(image_root)

    for filename in tqdm([k for k in data if not k.startswith("_")], desc="OCR process"):
        img_path = image_paths.get(filename)
        if img_path is None:
            # Report missing images instead of skipping them silently.
            print(f"{filename} skipped: image not found under {image_root}")
        else:
            try:
                image_base64 = encode_image_to_base64(img_path)
                if image_base64 is None:
                    # Unreadable image: record an empty prediction.
                    extracted_text = ""
                else:
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/png;base64,{image_base64}"
                                    }
                                }
                            ],
                        }
                    ]

                    outputs = llm.chat(messages, sampling_params=sampling_params)
                    extracted_text = outputs[0].outputs[0].text.strip()

                # Write the OCR result; create the "models" dict on demand so a
                # finished prediction is not lost to a missing key.
                data[filename].setdefault("models", {})[model_name] = {
                    "prediction": extracted_text,
                    "cer": None,
                    "wer": None
                }

            except Exception as e:
                # Best effort: one failing image must not abort the whole run.
                print(f"{filename} error: {str(e)}")

        gc.collect()

    elapsed = round(time.time() - start_time, 2)
    print(f"\n OCR completed: {elapsed:.2f} seconds")

    # Record this model's processing time next to any existing entries.
    meta = data.get("_meta", {})
    processing_times = meta.get("processing_times", {})
    processing_times[model_name] = elapsed
    meta["processing_times"] = processing_times

    data["_meta"] = meta

    # Save the updated JSON over the input file.
    with open(input_json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Updated JSON saved: {input_json_path}")
    return data



# Benchmark JSON file that is read and then overwritten in place with the
# OCR results.
input_json_path = '/content/drive/MyDrive/nutuk/benchmark/converted_data.json'
# Directory tree on the mounted Drive that is searched for the image files.
image_root = '/content/drive/MyDrive/nutuk/benchmark/'


# Start the OCR process with the default model; `updated` holds the returned
# JSON document.
updated = run_pixtral_on_existing_json(image_root, input_json_path)