In [None]:
import torch
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import cv2

# Text detector: YOLO weights read from "best.pt" (ideally trained for text detection).
yolo_model = YOLO("best.pt")  # Replace with your text detection model

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Text recognizer: the TrOCR processor prepares image inputs and decodes the
# generated token ids; the encoder-decoder model is moved to the chosen device.
# NOTE(review): "trocr-base-stage1" appears to be the pre-trained-only
# checkpoint -- confirm whether a fine-tuned variant is intended here.
processor = TrOCRProcessor.from_pretrained(
    "microsoft/trocr-base-stage1",
    use_fast=True,
)
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
ocr_model.to(device)

# Function to read text with TrOCR
def read_text_with_trocr(image):
    """Recognize the text contained in a single image using TrOCR.

    Args:
        image: Image handed to the module-level TrOCR ``processor``; the
            caller in this file passes an RGB ``PIL.Image`` crop.

    Returns:
        The first decoded sequence as a string, with special tokens removed.
    """
    # Preprocess into a "pt" tensor batch and move it next to the model.
    encoded = processor(images=image, return_tensors="pt")
    inputs = encoded.pixel_values.to(device)

    # Generate token ids, then turn them back into text.
    token_ids = ocr_model.generate(inputs)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# Load a single image
image_path = "goal.png"  # Your image file path
frame = cv2.imread(image_path)
if frame is None:
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising, so stop here with a message that names the path.
    raise FileNotFoundError(f"Could not read image: {image_path!r}")

# YOLO detection
results = yolo_model(frame)

# Crops are taken from an untouched copy so that rectangles and labels drawn
# for earlier detections never leak into the pixels sent to TrOCR for later
# ones. `frame` itself still receives the annotations, as before.
clean_frame = frame.copy()
frame_h, frame_w = clean_frame.shape[:2]

# Process detections
for box in results[0].boxes.xyxy:
    x1, y1, x2, y2 = (int(v) for v in box)

    # Clamp to the image: negative indices would wrap around in NumPy slicing.
    x1, y1 = max(x1, 0), max(y1, 0)
    x2, y2 = min(x2, frame_w), min(y2, frame_h)

    # A zero-area box gives an empty crop, which cv2.cvtColor rejects.
    if x2 <= x1 or y2 <= y1:
        continue

    crop = clean_frame[y1:y2, x1:x2]

    # Convert crop to PIL Image for TrOCR (OpenCV is BGR, PIL expects RGB)
    pil_img = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))

    # OCR with TrOCR
    text = read_text_with_trocr(pil_img)

    # Draw results on the image; keep the label baseline inside the image
    # when the box touches the top edge.
    label_y = max(y1 - 10, 20)
    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(frame, text, (x1, label_y),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

# Show output (blocks until a key is pressed in the window)
cv2.imshow("YOLO + TrOCR", frame)
cv2.waitKey(0)
cv2.destroyAllWindows()
