# Task 3 - Data Enrichment with Object Detection (YOLO)

In [1]:
# import libraries
import sys
import os
import pandas as pd
from glob import glob

In [2]:
from pathlib import Path
from importlib import reload
# Make the repository root importable so that `src.*` modules can be loaded
# from this notebook. ".." is resolved against the current working directory.
project_root = Path("..").resolve()
_root_entry = str(project_root)
if _root_entry not in sys.path:
    # Prepend so the project's packages are found before any other entry.
    sys.path.insert(0, _root_entry)

### Database connection

In [3]:
# Import the connection module and reload it so that edits made to
# Connection.py since the kernel started are picked up.
import src.database.Connection
reload(src.database.Connection)
# Bind `Database` only AFTER the reload: a name obtained with
# `from module import name` before reload() keeps pointing at the old class
# object, so importing first would silently keep the stale implementation.
from src.database.Connection import Database

<module 'src.database.Connection' from '/home/chalasimon/Documents/10academy/week 7/challenge/Telegram-Medical-Insights/src/database/Connection.py'>

In [4]:
# Database credentials come from environment variables, which can be set
# directly in the environment or in a .env file loaded by python-dotenv.
from dotenv import load_dotenv
load_dotenv()

# Fall back to local defaults when the variable is unset or empty.
# os.getenv() returns None for a missing variable, so assigning its result
# unconditionally would discard the defaults.
host = os.getenv("POSTGRES_HOST") or "localhost"
port = os.getenv("POSTGRES_PORT") or 5432
# These have no sensible default; they are passed through as-is (None if unset).
dbname = os.getenv("POSTGRES_DB")
user = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")

# create a database connection
db = Database(host=host, database=dbname, user=user, password=password, port=port)
# connect to the database; `con` is the handle used by the enrichment cells below
con = db.connect()

Connection to the database established successfully.


### Data Enrichment with YOLO

In [6]:
# import libraries
import os, time, logging
from dotenv import load_dotenv
from src.enrichment.yolo_detect import select_images_to_process, run_yolo_on_image, insert_detections
from ultralytics import YOLO

In [7]:
def main(limit=None, model_name=None, rerun=False):
    """Run YOLO over the selected images and store every detection.

    Uses the notebook-level connection ``con`` both to select the images and
    to insert the resulting rows. Progress is logged every 50 images and a
    summary (row count and elapsed seconds) is logged at the end.

    Args:
        limit: Forwarded to ``select_images_to_process`` as ``limit``.
        model_name: Model identifier handed to ``YOLO``. When falsy, the
            ``YOLO_MODEL`` environment variable is used, defaulting to
            ``"yolov8n.pt"``.
        rerun: Forwarded to ``select_images_to_process`` as ``rerun``.

    Returns:
        None.
    """
    load_dotenv()
    if not model_name:
        model_name = os.getenv("YOLO_MODEL", "yolov8n.pt")
    logging.info(f"Loading YOLO model: {model_name}")
    detector = YOLO(model_name)

    targets = select_images_to_process(con, limit=limit, rerun=rerun)
    total_images = len(targets)
    logging.info(f"Images to process: {total_images}")

    rows_written = 0
    # The timer starts after model loading and image selection.
    started_at = time.time()

    for position, (message_id, image_path) in enumerate(targets, start=1):
        # One row per detection, tagged with the source message, image and model.
        rows = []
        for det in run_yolo_on_image(detector, image_path):
            rows.append(
                (
                    message_id,
                    image_path,
                    det["detected_class"],
                    det["confidence"],
                    det["bbox_xmin"],
                    det["bbox_ymin"],
                    det["bbox_xmax"],
                    det["bbox_ymax"],
                    model_name,
                )
            )
        # insert_detections is called for every image, including ones with no rows.
        rows_written += insert_detections(con, rows)
        if position % 50 == 0:
            logging.info(f"Processed {position}/{total_images} images...")

    logging.info(f"Done. Inserted {rows_written} detection rows in {time.time()-started_at:.1f}s.")


In [9]:
import sys

# Script-style entry point: parse optional command-line flags and start the
# enrichment run. The guard is true when this cell is executed in the notebook
# (the captured log output below comes from this call).
if __name__ == "__main__":
    import argparse
    # The three options map one-to-one onto main()'s keyword arguments and all
    # default to "not set" (None / False).
    p = argparse.ArgumentParser(description="YOLOv8 detections into raw.image_detections")
    p.add_argument("--limit", type=int, default=None, help="Max images to process")
    p.add_argument("--model", type=str, default=None, help="YOLO model (e.g., yolov8n.pt)")
    p.add_argument("--rerun", action="store_true", help="Recompute even if detections exist")
    # parse_known_args() returns (namespace, leftover_args) rather than exiting on
    # unrecognised options; `unknown` holds the leftovers and is not used further.
    a, unknown = p.parse_known_args()   # tolerate extra arguments passed to the notebook kernel
    main(limit=a.limit, model_name=a.model, rerun=a.rerun)


2025-08-19 10:11:22,980 [INFO] Loading YOLO model: yolov8n.pt
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt': 100%|██████████| 6.25M/6.25M [00:51<00:00, 126kB/s] 
2025-08-19 10:12:18,226 [INFO] Images to process: 3896
2025-08-19 10:12:39,243 [INFO] Processed 50/3896 images...
2025-08-19 10:12:47,344 [INFO] Processed 100/3896 images...
2025-08-19 10:12:54,871 [INFO] Processed 150/3896 images...
2025-08-19 10:13:02,395 [INFO] Processed 200/3896 images...
2025-08-19 10:13:11,246 [INFO] Processed 250/3896 images...
2025-08-19 10:13:20,621 [INFO] Processed 300/3896 images...
2025-08-19 10:13:30,756 [INFO] Processed 350/3896 images...
2025-08-19 10:13:40,838 [INFO] Processed 400/3896 images...
2025-08-19 10:13:50,296 [INFO] Processed 450/3896 images...
2025-08-19 10:13:59,388 [INFO] Processed 500/3896 images...
2025-08-19 10:14:07,492 [INFO] Processed 550/3896 images...
2025-08-19 10:14:15,247 [INFO] Processed 600/3896 images...
2025-08-