# In [None]:
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
from paddleocr import PaddleOCR
from rapidfuzz import process
from PIL import ImageFont, ImageDraw, Image

# TrueType font loaded at size 20 (not referenced elsewhere in the visible code).
font_path = "/content/NanumGothic.ttf"
font = ImageFont.truetype(font_path, size=20)

# Detector, tracker and OCR engine shared by the processing loop below.
model = YOLO("/content/best.pt")
tracker = DeepSort(n_init=1, max_age=30)
ocr_model = PaddleOCR(lang="korean")

# Candidate words for fuzzy matching: one entry per line of the file,
# with surrounding whitespace stripped.
dictionary_path = "/content/dictionary.txt"
with open(dictionary_path, mode="r", encoding="utf-8") as dict_file:
    dictionary = [entry.strip() for entry in dict_file]

def find_best_match(text, dictionary, min_score=60):
    """Return the dictionary entry that best matches ``text``, or ``None``.

    Args:
        text: Recognised string to look up.
        dictionary: Sequence of candidate strings.
        min_score: The best candidate is accepted only when its similarity
            score is strictly greater than this value.

    Returns:
        The best-matching entry, or ``None`` when ``text`` or ``dictionary``
        is empty, when no candidate is returned, or when the best score does
        not exceed ``min_score``.
    """
    # Nothing to compare: avoid calling the matcher at all.
    if not text or not dictionary:
        return None

    # extractOne yields None (not a tuple) when it has no result, so the
    # unpacking below must be guarded.
    result = process.extractOne(text, dictionary)
    if result is None:
        return None

    match, score, _ = result
    return match if score > min_score else None

####### future work: YOLO & OCR IoU #######
def box_iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        boxA: Box as ``(x1, y1, x2, y2)`` with ``x1 <= x2`` and ``y1 <= y2``.
        boxB: Box in the same format as ``boxA``.

    Returns:
        Intersection area divided by union area, as a float. Boxes that do
        not overlap give ``0.0``. A small epsilon in the denominator keeps
        zero-area inputs from dividing by zero.
    """
    # Overlap extents; clamped to zero when the boxes are disjoint.
    inter_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    inter_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    inter_area = inter_w * inter_h

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])

    # Union counts the shared region once.
    union_area = area_a + area_b - inter_area
    return inter_area / float(union_area + 1e-6)
############################################

# Per-track OCR memory: track_id -> (time in seconds, text) of the last
# entry written to the OCR log for that track.
ocr_last_seen = {}

# Input video and the properties needed to create a matching writer.
cap = cv2.VideoCapture("/content/test18.mp4")
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Fall back to 30 when the reported frame rate is not positive.
    fps = 30

# Annotated output video, same frame size and rate as the input.
output_path = "/content/output18.mp4"
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

# Text logs and loop state.
ocr_output_file = "/content/ocr18_2.txt"
yolo_output_file = "/content/yolo18_2.txt"
seen_ids = set()  # track ids already written to the YOLO log
ocr_interval = 0.6  # seconds compared against the stored per-track time

def _extract_ocr_texts(ocr_result):
    """Flatten a PaddleOCR result into a list of ``(text, score)`` pairs.

    Only the first element of ``ocr_result`` is inspected. Two layouts are
    handled:

    * a list of ``(box, (text, score))`` entries, and
    * a dict with parallel ``"rec_texts"`` / ``"rec_scores"`` lists.

    Any other shape (including an empty list or a ``None`` first element)
    yields an empty list.
    """
    if not isinstance(ocr_result, list) or not ocr_result:
        return []

    first_item = ocr_result[0]
    if isinstance(first_item, list):
        return [(text, score) for _box, (text, score) in first_item]
    if isinstance(first_item, dict):
        texts = first_item.get("rec_texts", [])
        scores = first_item.get("rec_scores", [])
        return list(zip(texts, scores))
    return []


def _track_class_label(track, class_names):
    """Return a printable class label for ``track``.

    The class id is taken from the track itself (the class passed in with
    its detection) rather than from whatever detection happened to be
    processed last in the frame. Falls back to ``str(class_id)`` when the id
    cannot be looked up in ``class_names``.
    """
    det_class = track.get_det_class()
    try:
        return class_names[det_class]
    except (KeyError, IndexError, TypeError):
        return str(det_class)


# Class-id -> name mapping does not change between frames.
class_names = model.names

try:
    with open(ocr_output_file, "w", encoding="utf-8") as ocr_file, \
            open(yolo_output_file, "w", encoding="utf-8") as yolo_file:

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            current_time_sec = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

            # --- Detection: keep only the most confident box per class (0, 1).
            results = model(frame, conf=0.2)
            class_best_detections = {}

            for result in results:
                for row in result.boxes.data.cpu().numpy():
                    x1, y1, x2, y2, conf, det_cls = map(float, row[:6])
                    det_cls = int(det_cls)
                    if det_cls not in [0, 1]:
                        continue
                    best = class_best_detections.get(det_cls)
                    if best is None or conf > best[1]:
                        # DeepSort expects ([left, top, width, height], conf, class).
                        class_best_detections[det_cls] = (
                            [x1, y1, x2 - x1, y2 - y1], conf, det_cls
                        )

            detections = list(class_best_detections.values())
            tracks = tracker.update_tracks(detections, frame=frame)

            # --- Drawing is done on a PIL copy of the frame.
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            draw = ImageDraw.Draw(frame_pil)

            detected_track_ids = []

            for track in tracks:
                if not track.is_confirmed():
                    continue

                track_id = track.track_id
                x1, y1, x2, y2 = map(int, track.to_ltrb())
                # Normalise corner order before drawing.
                x1, x2 = min(x1, x2), max(x1, x2)
                y1, y2 = min(y1, y2), max(y1, y2)

                draw.rectangle([(x1, y1), (x2, y2)], outline=(0, 255, 0), width=3)
                detected_track_ids.append(track_id)

                # Log each track id once, with the class of that track.
                if track_id not in seen_ids:
                    seen_ids.add(track_id)
                    class_label = _track_class_label(track, class_names)
                    yolo_file.write(f"(time: {current_time_sec:.2f}s) object ID: {track_id}, class: {class_label}\n")

            # --- OCR execution decision: run when any visible track has no
            # stored entry yet, or its stored time is at least ocr_interval old.
            # NOTE(review): the stored time only advances when a new text is
            # logged, so after the interval elapses OCR runs on every frame
            # until the text changes - confirm this is the intended throttle.
            should_ocr = any(
                track_id not in ocr_last_seen
                or current_time_sec - ocr_last_seen[track_id][0] >= ocr_interval
                for track_id in detected_track_ids
            )

            if should_ocr:
                # OCR runs on the whole frame; every dictionary match is
                # attributed to every currently visible track.
                for text, score in _extract_ocr_texts(ocr_model.ocr(frame)):
                    matched_text = find_best_match(text, dictionary)
                    if not matched_text:
                        continue
                    for track_id in detected_track_ids:
                        _, last_text = ocr_last_seen.get(track_id, (-100, ""))
                        # Only log when the text differs from the last one logged.
                        if matched_text != last_text:
                            ocr_file.write(f"(time: {current_time_sec:.2f}s) ID {track_id}: {matched_text} (confidence: {score:.2f})\n")
                            ocr_last_seen[track_id] = (current_time_sec, matched_text)

            out.write(cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR))
finally:
    # Release the capture and finalise the output video even if a frame fails.
    cap.release()
    out.release()