# Setup

In [None]:
# Directory holding the extracted keyframes. Leave as None to let the following
# cell choose a default based on the runtime environment.
keyframes_dir = None
# Directory where results are written. None falls back to './object_extraction'
# in the following cell.
save_dir = None

In [None]:
import os
from collections import defaultdict

dir_path = os.getcwd()

if not keyframes_dir:
    # Inspect the interactive shell once; both hosted-environment checks use its repr.
    shell_repr = str(get_ipython())
    on_colab = 'google.colab' in shell_repr
    # NOTE(review): Kaggle kernels appear to run a stock ipykernel shell whose repr
    # does not contain "kaggle", so the string check alone would never match there.
    # The environment variable below is what Kaggle is understood to set -- confirm.
    on_kaggle = 'kaggle' in shell_repr or 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

    if on_colab or on_kaggle:
        # Hosted runtimes: keyframes are expected next to the working directory.
        # Update this path as necessary
        keyframes_dir = f'{dir_path}/Keyframes'
    else:
        # Local layout: keyframes live in the sibling `transnet` directory.
        parent_dir_path = os.path.dirname(dir_path)
        keyframes_dir = f'{parent_dir_path}/transnet/Keyframes'

if not save_dir:
    # Default output location, relative to the working directory.
    save_dir = './object_extraction'

In [None]:
! pip install ultralytics pillow tqdm

[0m

In [None]:
import glob
import json
import os
from collections import Counter
from typing import Dict, List

import torch
from tqdm.auto import tqdm
from ultralytics import YOLO

# Parse data path

In [None]:
def parse_keyframe_info(keyframes_dir: str = '../transnet/Keyframes') -> Dict[str, Dict[str, List[str]]]:
    """
    Index the keyframe images stored under ``keyframes_dir``.

    The directory is read as ``<keyframes_dir>/<data_part>/<video_dir>/*.jpg``.
    The video id is the text after the last underscore of ``video_dir`` (the
    whole name when it contains no underscore).

    Entries that are not directories (stray files such as a README) are skipped
    at both levels; listing them as directories would raise or yield empty,
    meaningless video entries.

    :param keyframes_dir: Root directory that contains one sub-directory per data part.
    :return: ``{data_part: {video_id: [sorted .jpg paths]}}`` with data parts and
             video directories visited in sorted order.
    """
    all_keyframe_paths = {}
    for data_part in sorted(os.listdir(keyframes_dir)):
        data_part_path = f'{keyframes_dir}/{data_part}'
        if not os.path.isdir(data_part_path):
            continue

        video_keyframes = {}
        for video_dir in sorted(os.listdir(data_part_path)):
            video_path = f'{data_part_path}/{video_dir}'
            if not os.path.isdir(video_path):
                continue
            video_id = video_dir.split('_')[-1]
            video_keyframes[video_id] = sorted(glob.glob(f'{video_path}/*.jpg'))

        all_keyframe_paths[data_part] = video_keyframes

    return all_keyframe_paths

# Process Image

In [None]:
def create_directory(path):
    """
    Create a directory (and any missing parents) if it does not exist.

    Uses ``exist_ok=True`` instead of a separate existence check, so a directory
    created by someone else between the check and the creation does not raise.

    :param path: Directory path to create.
    """
    os.makedirs(path, exist_ok=True)


class ObjectDetector:
    """Wraps a YOLO model and keeps only detections at or above a confidence threshold."""

    def __init__(self, model_path: str, threshold: float = 0.5, verbose: bool = False):
        """
        Initializes the ObjectDetector with a model and a confidence threshold.

        :param model_path: Path to the YOLO model.
        :param threshold: Confidence score threshold for filtering detections.
                          It is also forwarded to the model as its ``conf``
                          argument, so it is expected to lie in ``[0, 1]``.
        :param verbose: Forwarded to the model on every call. Defaults to False
                        so the per-image log lines do not flood the notebook
                        output; pass True to get them back.
        """
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Using device: {self.device}")
        self.model = YOLO(model_path).to(self.device)
        self.threshold = threshold
        self.verbose = verbose

    def detect(self, image_path: str) -> List[Dict]:
        """
        Detect objects in the given image and filter them by the confidence threshold.

        :param image_path: Path to the image file.
        :return: A list of dictionaries, one per kept detection, with the keys
                 ``"label"`` (class name), ``"score"`` (confidence as float) and
                 ``"box"`` (the ``xyxy`` coordinates as a list).
        """
        # Forward the threshold to the model: otherwise the model applies its own
        # default confidence cut-off and a lower `threshold` would silently have
        # no effect below that default.
        results = self.model(image_path, conf=self.threshold, verbose=self.verbose)

        detections = []
        for result in results:
            for box in result.boxes:
                score = float(box.conf[0])
                # Keep the explicit check so the boundary (>=) is decided here
                # and does not depend on the model's own comparison.
                if score >= self.threshold:
                    detections.append({
                        "label": self.model.names[int(box.cls[0])],
                        "score": score,
                        "box": box.xyxy[0].tolist()
                    })
        return detections


def count_objects_by_class(detected_objects: List[Dict]) -> Dict[str, int]:
    """
    Count how many detections there are for each class label.

    :param detected_objects: Detection dictionaries, each with a ``"label"`` key.
    :return: A plain dict mapping each label to its number of occurrences, with
             labels ordered by first appearance in ``detected_objects``.
    """
    # A single pass with Counter replaces re-scanning the whole list per element.
    return dict(Counter(obj["label"] for obj in detected_objects))


def save_results(save_dir: str, key: str, video_id: str, results: Dict[str, Dict]):
    """
    Write the per-image detection results of one video as JSON files.

    For every image two files are written:
    ``<save_dir>/object_detection/<key>/<video_id>/<name>_detection.json`` and
    ``<save_dir>/object_counts/<key>/<video_id>/<name>_counts.json``, where
    ``<name>`` is the image file name without its extension. Both output
    directories are created even when ``results`` is empty.

    :param save_dir: Root output directory.
    :param key: Data-part name used as the first sub-directory level.
    :param video_id: Video identifier used as the second sub-directory level.
    :param results: Maps an image path to a dict with the keys
                    ``'detected_objects'`` and ``'class_counts'``.
    """
    # (file-name suffix / directory suffix, key inside each result dict)
    outputs = (('detection', 'detected_objects'), ('counts', 'class_counts'))

    # Create each output directory once instead of once per written file.
    output_dirs = {}
    for data_type, _ in outputs:
        output_dirs[data_type] = os.path.join(save_dir, f'object_{data_type}', key, video_id)
        create_directory(output_dirs[data_type])

    for image_path, data in results.items():
        # splitext drops only the trailing extension; str.replace('.jpg', '')
        # would also eat a '.jpg' that occurs in the middle of the name.
        base_name = os.path.splitext(os.path.basename(image_path))[0]
        for data_type, content in outputs:
            json_path = os.path.join(output_dirs[data_type], f'{base_name}_{data_type}.json')
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(data[content], f, indent=4)


def process_video(detector: ObjectDetector, image_paths: List[str]) -> Dict[str, Dict]:
    """
    Run detection on every keyframe of one video.

    Images for which the detector returns nothing are left out of the result.

    :param detector: Detector whose ``detect`` method is called once per image.
    :param image_paths: Paths of the keyframe images, processed in the given order.
    :return: Maps each image path with detections to a dict holding
             ``'detected_objects'`` and the matching ``'class_counts'``.
    """
    results_by_image = {}
    for path in image_paths:
        detections = detector.detect(path)
        if not detections:
            # Nothing passed the detector for this frame; record no entry.
            continue
        results_by_image[path] = {
            'detected_objects': detections,
            'class_counts': count_objects_by_class(detections),
        }
    return results_by_image


def process_and_save_results(detector: ObjectDetector, all_keyframe_paths: Dict[str, Dict[str, List[str]]], save_dir: str):
    """
    Detect objects in every video of every data part and save the results.

    One progress bar is shown per data part, advancing once per video.

    :param detector: Detector used for all images.
    :param all_keyframe_paths: ``{data_part: {video_id: [image paths]}}``.
    :param save_dir: Root directory passed on to ``save_results``.
    """
    for key in all_keyframe_paths:
        videos = all_keyframe_paths[key]
        progress = tqdm(videos.items(), desc=f"Processing {key}")
        for video_id, image_paths in progress:
            # Results are written per video, right after that video is processed.
            per_image_results = process_video(detector, image_paths)
            save_results(save_dir, key, video_id, per_image_results)

In [None]:
# YOLO checkpoint handed to ObjectDetector.
model_path = 'yolov10x.pt'
# Detections scoring below this confidence are dropped by ObjectDetector.detect.
confidence_threshold = 0.6

# Build the detector, index the keyframes on disk, then run detection on every
# keyframe and write the JSON results under save_dir.
detector = ObjectDetector(model_path, threshold= confidence_threshold)
all_keyframe_paths = parse_keyframe_info(keyframes_dir)
process_and_save_results(detector, all_keyframe_paths, save_dir)

Using device: cuda


Processing L01:   0%|          | 0/3 [00:00<?, ?it/s]


image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V001/000000.jpg: 384x640 (no detections), 77.4ms
Speed: 1.0ms preprocess, 77.4ms inference, 9.1ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V001/000002.jpg: 384x640 1 boat, 16.3ms
Speed: 0.9ms preprocess, 16.3ms inference, 20.6ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V001/000005.jpg: 384x640 (no detections), 16.3ms
Speed: 0.8ms preprocess, 16.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V001/000006.jpg: 384x640 (no detections), 16.3ms
Speed: 0.9ms preprocess, 16.3ms i

Processing L01:  33%|███▎      | 1/3 [00:25<00:50, 25.03s/it]


image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V002/000000.jpg: 384x640 (no detections), 14.3ms
Speed: 1.0ms preprocess, 14.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V002/000003.jpg: 384x640 2 boats, 14.3ms
Speed: 1.0ms preprocess, 14.3ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V002/000006.jpg: 384x640 (no detections), 14.3ms
Speed: 0.8ms preprocess, 14.3ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V002/000007.jpg: 384x640 (no detections), 14.2ms
Speed: 1.0ms preprocess, 14.2ms i

Processing L01:  67%|██████▋   | 2/3 [00:43<00:21, 21.02s/it]


image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V003/000000.jpg: 384x640 1 boat, 14.2ms
Speed: 0.9ms preprocess, 14.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V003/000004.jpg: 384x640 1 boat, 14.0ms
Speed: 0.9ms preprocess, 14.0ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V003/000008.jpg: 384x640 (no detections), 14.3ms
Speed: 0.8ms preprocess, 14.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes/L01/V003/000009.jpg: 384x640 (no detections), 14.0ms
Speed: 1.2ms preprocess, 14.0ms inference, 

Processing L01: 100%|██████████| 3/3 [01:09<00:00, 23.12s/it]
