# Setup

In [None]:
# Optional overrides. Leave a value as None to let the next cell pick a default:
# a runtime-dependent keyframes location, and './object_extraction' for output.
keyframes_dir = None  # directory holding one sub-folder of .jpg keyframes per video
save_dir = None  # root directory for the detection / count JSON files

In [None]:
import os
from collections import defaultdict

# Defaults are resolved relative to the notebook's current working directory.
dir_path = os.getcwd()

if not keyframes_dir:
    # The repr of the IPython shell object is used to recognise hosted runtimes.
    # NOTE(review): confirm that the repr really contains 'kaggle' on Kaggle
    # kernels; if it does not, Kaggle runs fall through to the local layout.
    shell_repr = str(get_ipython())
    if 'google.colab' in shell_repr or 'kaggle' in shell_repr:
        # Update this path as necessary
        keyframes_dir = f'{dir_path}/keyframes'
    else:
        # Local layout: keyframes live in a sibling 'transnet' folder.
        parent_dir_path = os.path.dirname(dir_path)
        keyframes_dir = f'{parent_dir_path}/transnet/keyframes'

# Fall back to a folder next to the notebook when no output location was given.
save_dir = save_dir or './object_extraction'

In [None]:
# %pip (unlike `! pip`) installs into the environment of the running kernel.
%pip install ultralytics pillow tqdm



In [None]:
import os
import glob
import json
import torch
from typing import Dict, List
from ultralytics import YOLO
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Parse data path

In [None]:
def parse_keyframe_info(keyframes_dir):
    """
    Collect the keyframe image paths of every video folder.

    :param keyframes_dir: Directory that contains one sub-directory per video.
    :return: Dict mapping each sub-directory name to the sorted list of
        ``.jpg`` paths found directly inside it (an empty list if there are none).
        Entries of ``keyframes_dir`` that are not directories are ignored.
    """
    all_keyframe_paths = {}
    for part in sorted(os.listdir(keyframes_dir)):
        data_part_path = os.path.join(keyframes_dir, part)
        # Stray files next to the video folders (e.g. .DS_Store) are not videos
        # and must not show up as video ids downstream.
        if not os.path.isdir(data_part_path):
            continue
        # Escape the folder path so glob metacharacters in it are taken literally.
        pattern = os.path.join(glob.escape(data_part_path), '*.jpg')
        all_keyframe_paths[part] = sorted(glob.glob(pattern))
    return all_keyframe_paths

# Process Image

In [None]:
def create_directory(path):
    """
    Create a directory (including missing parents) if it does not exist.

    :param path: Directory path to create.
    :raises FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(path, exist_ok=True)


class ObjectDetector:
    """Wraps a YOLO model and keeps only detections at or above a confidence threshold."""

    def __init__(self, model_path: str, threshold: float = 0.5):
        """
        Load the model, move it to the GPU when one is available (CPU otherwise)
        and remember the confidence threshold.

        :param model_path: Path to the YOLO model.
        :param threshold: Minimum confidence score a detection needs to be kept.
        """
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        print(f"Using device: {self.device}")
        self.model = YOLO(model_path).to(self.device)
        self.threshold = threshold

    def detect(self, image_path: str) -> List[Dict]:
        """
        Run the model on one image and return the detections that pass the threshold.

        :param image_path: Path to the image file.
        :return: A list of dicts with the keys ``"label"`` (class name looked up in
            the model's ``names``), ``"score"`` (confidence as a float) and
            ``"box"`` (the box's ``xyxy`` values as a list).
        """
        detections = []
        for result in self.model(image_path):
            for box in result.boxes:
                confidence = float(box.conf[0])
                # Keep the detection only when it reaches the configured threshold.
                if confidence >= self.threshold:
                    detections.append({
                        "label": self.model.names[int(box.cls[0])],
                        "score": confidence,
                        "box": box.xyxy[0].tolist(),
                    })
        return detections


def count_objects_by_class(detected_objects: List[Dict]) -> Dict[str, int]:
    """
    Count how many detections carry each label.

    :param detected_objects: Detection dicts, each with a ``"label"`` key.
    :return: Dict mapping label -> number of detections with that label,
        ordered by first appearance. Empty input yields an empty dict.
    """
    class_counts: Dict[str, int] = {}
    # Single pass; the previous version re-scanned the whole list per detection.
    for obj in detected_objects:
        label = obj["label"]
        class_counts[label] = class_counts.get(label, 0) + 1
    return class_counts


def save_results(save_dir: str, video_id: str, results: Dict[str, Dict]):
    """
    Write one detection JSON and one counts JSON per processed image.

    Files are written to
    ``<save_dir>/object_detection/<video_id>/<image>_detection.json`` and
    ``<save_dir>/object_counts/<video_id>/<image>_counts.json``, where
    ``<image>`` is the image file name without its extension. Both folders are
    created even when ``results`` is empty.

    :param save_dir: Root output directory.
    :param video_id: Name of the per-video sub-folder.
    :param results: Dict keyed by image path; each value holds the keys
        ``'detected_objects'`` and ``'class_counts'``.
    """
    outputs = [('detection', 'detected_objects'), ('counts', 'class_counts')]

    # Create each output folder once up front instead of once per written file.
    output_dirs = {}
    for data_type, _ in outputs:
        output_dirs[data_type] = os.path.join(save_dir, f'object_{data_type}', video_id)
        os.makedirs(output_dirs[data_type], exist_ok=True)

    for image_path, data in results.items():
        # splitext removes only the real extension; str.replace('.jpg', '')
        # would also strip a '.jpg' occurring in the middle of the file name.
        base_name = os.path.splitext(os.path.basename(image_path))[0]
        for data_type, content in outputs:
            json_path = os.path.join(output_dirs[data_type], f'{base_name}_{data_type}.json')
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(data[content], f)


def process_video(detector: ObjectDetector, image_paths: List[str]) -> Dict[str, Dict]:
    """
    Run the detector over the keyframes of one video.

    Images for which the detector returns no detections are left out of the result.

    :param detector: Detector whose ``detect`` method is called for each image.
    :param image_paths: Paths of the images to process.
    :return: Dict keyed by image path; each value holds ``'detected_objects'``
        (the detector output) and ``'class_counts'`` (per-label counts).
    """
    per_frame = {}
    for frame_path in image_paths:
        detections = detector.detect(frame_path)
        if not detections:
            continue
        per_frame[frame_path] = {
            'detected_objects': detections,
            'class_counts': count_objects_by_class(detections),
        }
    return per_frame


def process_and_save_results(detector: ObjectDetector, all_keyframe_paths: Dict[str, List[str]], save_dir: str):
    """
    Detect objects for every video and write the results to disk.

    :param detector: Detector passed on to ``process_video``.
    :param all_keyframe_paths: Dict mapping a video id to the list of its
        keyframe image paths (each list is handed to ``process_video``).
    :param save_dir: Root output directory passed on to ``save_results``.
    """
    # tqdm shows one progress step per video, not per image.
    for video_id, image_paths in tqdm(all_keyframe_paths.items()):
        video_results = process_video(detector, image_paths)
        save_results(save_dir, video_id, video_results)

In [None]:
# Model checkpoint and the minimum confidence a detection needs to be kept.
model_path = 'yolov10x.pt'
confidence_threshold = 0.7

# Build the detector, list the keyframes per video, then write the JSON results.
detector = ObjectDetector(model_path=model_path, threshold=confidence_threshold)
all_keyframe_paths = parse_keyframe_info(keyframes_dir=keyframes_dir)
process_and_save_results(
    detector=detector,
    all_keyframe_paths=all_keyframe_paths,
    save_dir=save_dir,
)

Using device: cuda


  0%|          | 0/4 [00:00<?, ?it/s]


image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V001/001.jpg: 384x640 (no detections), 75.6ms
Speed: 1.0ms preprocess, 75.6ms inference, 5.0ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V001/002.jpg: 384x640 3 persons, 2 ties, 1 tv, 16.4ms
Speed: 1.0ms preprocess, 16.4ms inference, 9.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V001/003.jpg: 384x640 3 persons, 2 ties, 16.4ms
Speed: 0.9ms preprocess, 16.4ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V001/004.jpg: 384x640 (no detections), 16.2ms
Speed: 0.9ms preprocess, 16

 25%|██▌       | 1/4 [00:07<00:23,  8.00s/it]


image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V001_extra/000000.jpg: 640x640 (no detections), 24.4ms
Speed: 0.9ms preprocess, 24.4ms inference, 0.2ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V001_extra/000002.jpg: 640x640 2 boats, 23.4ms
Speed: 0.9ms preprocess, 23.4ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V001_extra/000005.jpg: 640x640 (no detections), 23.2ms
Speed: 0.9ms preprocess, 23.2ms inference, 0.2ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V001_extra/000006.jpg: 640x640 1 stop sign, 23.2ms
Speed: 1.0ms 

 50%|█████     | 2/4 [00:37<00:41, 20.74s/it]


image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V002/001.jpg: 384x640 (no detections), 14.7ms
Speed: 1.5ms preprocess, 14.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V002/002.jpg: 384x640 2 persons, 2 ties, 1 tv, 14.6ms
Speed: 0.9ms preprocess, 14.6ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V002/003.jpg: 384x640 3 persons, 1 tie, 14.5ms
Speed: 1.1ms preprocess, 14.5ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V002/004.jpg: 384x640 2 ties, 15.0ms
Speed: 2.3ms preprocess, 15.0ms infer

 75%|███████▌  | 3/4 [00:43<00:13, 13.84s/it]


image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V002_extra/000000.jpg: 640x640 (no detections), 24.2ms
Speed: 0.9ms preprocess, 24.2ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V002_extra/000003.jpg: 640x640 1 boat, 23.3ms
Speed: 0.9ms preprocess, 23.3ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V002_extra/000006.jpg: 640x640 (no detections), 23.7ms
Speed: 1.2ms preprocess, 23.7ms inference, 0.2ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/keyframes/L01_V002_extra/000007.jpg: 640x640 1 stop sign, 23.6ms
Speed: 1.1ms p

100%|██████████| 4/4 [01:05<00:00, 16.45s/it]
