In [None]:
!pip install flash-attn --no-build-isolation

In [None]:
!pip install transformers==4.47.0

In [None]:
import gc
import os
import json
import time
from PIL import Image
from tqdm import tqdm
import torch
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoProcessor

# Memory-management tuning: ask the CUDA allocator for expandable segments.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

# Mount Google Drive so the benchmark files are reachable under /content/drive.
drive.mount('/content/drive')

def is_image_file(filename):
    """Return True when *filename* ends with a PNG or JPEG extension.

    The comparison is case-insensitive: the name is lower-cased before its
    suffix is checked against ``.png``, ``.jpg`` and ``.jpeg``.
    """
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in ('.png', '.jpg', '.jpeg'))

def _build_image_index(image_root):
    """Map every file name under *image_root* to the first path where it occurs.

    Directories whose path contains ``'text_category'`` are skipped. When the
    same file name exists in several directories, the first one in
    ``os.walk`` order wins.
    """
    index = {}
    for root, _, files in os.walk(image_root):
        if 'text_category' in root:
            continue
        for name in files:
            index.setdefault(name, os.path.join(root, name))
    return index


def _ocr_image(model, processor, img_path, prompt):
    """Run greedy generation on one image and return the decoded new tokens."""
    # Close the file handle as soon as the pixels are copied into an RGB image.
    with Image.open(img_path) as raw_image:
        image = raw_image.convert('RGB')

    inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

    with torch.inference_mode():
        # Greedy decoding: no `temperature` is passed because it is only used
        # when sampling and otherwise just triggers a validation warning.
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            eos_token_id=processor.tokenizer.eos_token_id
        )

    # Drop the prompt tokens so that only the newly generated text is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    return processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]


def run_phi35_on_existing_json(image_root, input_json_path, model_path="microsoft/Phi-3.5-vision-instruct"):
    """Add Phi-3.5 vision OCR predictions to an existing JSON file.

    Every top-level key of the JSON that does not start with ``"_"`` is treated
    as an image file name and looked up under *image_root* (directories whose
    path contains ``'text_category'`` are ignored). For each image found, the
    model output is stored at ``data[filename]["models"][<model name>]`` as
    ``{"prediction": <text>, "cer": None, "wer": None}``. The elapsed time of
    the whole pass is stored at ``data["_meta"]["processing_times"][<model name>]``
    and the JSON file is overwritten in place.

    Args:
        image_root: Directory that is searched recursively for the images.
        input_json_path: Path of the JSON file to read and then overwrite.
        model_path: Model identifier passed to ``from_pretrained``; its last
            path component, lower-cased, is used as the model name in the JSON.

    Returns:
        The updated JSON content as a dict.

    Raises:
        OSError: If the JSON file cannot be read or written.
        json.JSONDecodeError: If the JSON file is not valid JSON.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    print("Initializing models...")
    # rstrip keeps the name non-empty when model_path ends with a slash.
    model_name = model_path.rstrip("/").split("/")[-1].lower()

    processor = AutoProcessor.from_pretrained(
        model_path,
        trust_remote_code=True,
        num_crops=16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype="auto",
        attn_implementation='flash_attention_2'
    )

    # Load the current JSON.
    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    start_time = time.time()

    # Walk the image tree once instead of once per JSON entry.
    image_index = _build_image_index(image_root)

    # The prompt does not depend on the image, so it is built a single time.
    prompt_message = [
        {"role": "user", "content": "<|image_1|>\nExtract the Turkish text from the image exactly as it appears. Do not repeat, comment, translate, or add any text. Return only the raw text. If no text is detected, return an empty response."}
    ]
    prompt = processor.tokenizer.apply_chat_template(
        prompt_message,
        tokenize=False,
        add_generation_prompt=True
    )

    cuda_available = torch.cuda.is_available()
    missing = []

    for filename in tqdm([k for k in data if not k.startswith("_")], desc="OCR process"):
        img_path = image_index.get(filename)
        if img_path is None:
            missing.append(filename)
            continue

        try:
            extracted_text = _ocr_image(model, processor, img_path, prompt)

            # Write the OCR result; create "models" if the entry lacks it.
            data[filename].setdefault("models", {})[model_name] = {
                "prediction": extracted_text,
                "cer": None,
                "wer": None
            }
        except Exception as e:
            # Best effort: one failing image must not abort the whole run.
            print(f"{filename} error: {str(e)}")
        finally:
            # Release cached GPU memory after every image, also on failure.
            if cuda_available:
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

    if missing:
        print(f"\n {len(missing)} image(s) not found under {image_root}: {missing}")

    elapsed = round(time.time() - start_time, 2)
    print(f"\n OCR completed: {elapsed:.2f} seconds")

    meta = data.get("_meta", {})
    processing_times = meta.get("processing_times", {})
    processing_times[model_name] = elapsed
    meta["processing_times"] = processing_times
    data["_meta"] = meta

    # Save the updated JSON.
    with open(input_json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Updated JSON saved: {input_json_path}")
    return data




# Benchmark locations on the mounted Drive.
image_root = '/content/drive/MyDrive/nutuk/benchmark/'
input_json_path = '/content/drive/MyDrive/nutuk/benchmark/converted_data.json'


# Run the OCR pass and keep the updated JSON content.
updated = run_phi35_on_existing_json(
    image_root=image_root,
    input_json_path=input_json_path,
)