Note: This notebook is inspired from the kaggle notebook at: 
https://www.kaggle.com/code/awsaf49/vinbigdata-cxr-ad-yolov5-14-class-train?kernelSessionId=52422980

In [13]:
from pathlib import Path
import shutil

import pandas as pd
from sklearn.model_selection import GroupKFold

from ultralytics import YOLO

import torch


In [14]:

BASE_DIR = Path(r"c:/Users/jsayed/Downloads/DHBW/lung-disease-detection")
DATASET_DIR = BASE_DIR / "dataset"

IMAGES_ROOT = DATASET_DIR / "images"
LABELS_ROOT = DATASET_DIR / "labels"
YAML_PATH = BASE_DIR / "YOLO/yolov12/my-yolov12.yaml"

TRAIN_IMAGES_DIR = IMAGES_ROOT / "train"
VAL_IMAGES_DIR = IMAGES_ROOT / "val"
TEST_IMAGES_DIR = IMAGES_ROOT / "test"

TRAIN_LABELS_DIR = LABELS_ROOT / "train"
VAL_LABELS_DIR = LABELS_ROOT / "val"

SOURCE_IMAGES_DIR = TRAIN_IMAGES_DIR if TRAIN_IMAGES_DIR.exists() else IMAGES_ROOT
SOURCE_LABELS_DIR = TRAIN_LABELS_DIR if TRAIN_LABELS_DIR.exists() else LABELS_ROOT

IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
YAML_PATH.parent.mkdir(parents=True, exist_ok=True)



In [15]:

for split in ["train", "val"]:
    (IMAGES_ROOT / split).mkdir(parents=True, exist_ok=True)
    (LABELS_ROOT / split).mkdir(parents=True, exist_ok=True)

TEST_IMAGES_DIR.mkdir(parents=True, exist_ok=True)

if SOURCE_IMAGES_DIR == IMAGES_ROOT:
    TRAIN_IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    for image_path in IMAGES_ROOT.glob("*"):
        if image_path.is_file() and image_path.suffix.lower() in IMAGE_EXTENSIONS:
            destination = TRAIN_IMAGES_DIR / image_path.name
            if not destination.exists():
                shutil.move(image_path, destination)
    SOURCE_IMAGES_DIR = TRAIN_IMAGES_DIR

if SOURCE_LABELS_DIR == LABELS_ROOT:
    TRAIN_LABELS_DIR.mkdir(parents=True, exist_ok=True)
    for label_path in LABELS_ROOT.glob("*.txt"):
        destination = TRAIN_LABELS_DIR / label_path.name
        if not destination.exists():
            shutil.move(label_path, destination)
    SOURCE_LABELS_DIR = TRAIN_LABELS_DIR


In [16]:

csv_path = DATASET_DIR / "yolo_aggregated.csv"
df = pd.read_csv(csv_path)

def resolve_image_path(image_id: str) -> str:
    for extension in IMAGE_EXTENSIONS:
        candidates = (
            TRAIN_IMAGES_DIR / f"{image_id}{extension}",
            VAL_IMAGES_DIR / f"{image_id}{extension}",
            SOURCE_IMAGES_DIR / f"{image_id}{extension}",
        )
        for candidate in candidates:
            if candidate.exists():
                return str(candidate)
    return str(TRAIN_IMAGES_DIR / f"{image_id}.png")

df["image_path"] = df["image_id"].apply(resolve_image_path)


In [17]:
df.head(2)

Unnamed: 0,image_id,class_name,x_min,y_min,x_max,y_max,image_path,height,width,class_id,x_mid,y_mid,w,h,area
0,0005e8e3701dfb1dd93d53e2ff537b6e,Lung Opacity,0.29467,0.191344,0.391689,0.289294,c:\Users\jsayed\Downloads\DHBW\lung-disease-de...,3072,3072,11,0.34318,0.240319,0.09702,0.09795,0.009503
1,0007d316f756b3fa0baea2ff514ce945,Pleural thickening,0.355324,0.235103,0.428676,0.295741,c:\Users\jsayed\Downloads\DHBW\lung-disease-de...,2880,2304,17,0.392,0.265422,0.073352,0.060637,0.004448


In [18]:

df["class_id"] = df["class_name"].astype("category").cat.codes
class_map = dict(enumerate(df["class_name"].astype("category").cat.categories))


In [19]:

gkf = GroupKFold(n_splits=5)
df["fold"] = -1
df = df.reset_index(drop=True)

for fold, (_, val_idx) in enumerate(gkf.split(df, groups=df["image_id"])):
    df.loc[val_idx, "fold"] = fold

val_fold = 0  # 20% validation (1 out of 5 folds)

df_ids = set(df["image_id"])
train_ids = {
    Path(image_id).stem for image_id in df[df["fold"] != val_fold]["image_id"]
}
val_ids = {
    Path(image_id).stem for image_id in df[df["fold"] == val_fold]["image_id"]
}


In [20]:

def move_label_to_val(image_id: str) -> None:
    label_name = f"{image_id}.txt"
    for candidate in (
        VAL_LABELS_DIR / label_name,
        TRAIN_LABELS_DIR / label_name,
        SOURCE_LABELS_DIR / label_name,
        LABELS_ROOT / label_name,
    ):
        if candidate.exists():
            if candidate.parent != VAL_LABELS_DIR:
                VAL_LABELS_DIR.mkdir(parents=True, exist_ok=True)
                shutil.move(candidate, VAL_LABELS_DIR / label_name)
            return

def move_image_to_val(image_id: str) -> None:
    for extension in IMAGE_EXTENSIONS:
        image_name = f"{image_id}{extension}"
        for candidate in (
            VAL_IMAGES_DIR / image_name,
            TRAIN_IMAGES_DIR / image_name,
            SOURCE_IMAGES_DIR / image_name,
            IMAGES_ROOT / image_name,
        ):
            if candidate.exists():
                if candidate.parent != VAL_IMAGES_DIR:
                    VAL_IMAGES_DIR.mkdir(parents=True, exist_ok=True)
                    shutil.move(candidate, VAL_IMAGES_DIR / candidate.name)
                return

for image_id in val_ids:
    move_label_to_val(image_id)
    move_image_to_val(image_id)


In [21]:

val_label_count = len([path for path in VAL_LABELS_DIR.glob("*.txt") if path.is_file()])
val_image_count = len([
    path
    for path in VAL_IMAGES_DIR.iterdir()
    if path.is_file() and path.suffix.lower() in IMAGE_EXTENSIONS
])

print(f"labels/val contains {val_label_count} label files")
print(f"images/val contains {val_image_count} image files")

missing_labels = [
    image_id
    for image_id in val_ids
    if not any(
        path.exists()
        for path in (
            VAL_LABELS_DIR / f"{image_id}.txt",
            TRAIN_LABELS_DIR / f"{image_id}.txt",
            LABELS_ROOT / f"{image_id}.txt",
        )
    )
]
if missing_labels:
    print(f"Missing labels for {len(missing_labels)} validation images: {missing_labels[:5]} ...")

missing_images = []
for image_id in val_ids:
    candidates = []
    for extension in IMAGE_EXTENSIONS:
        candidates.extend([
            VAL_IMAGES_DIR / f"{image_id}{extension}",
            TRAIN_IMAGES_DIR / f"{image_id}{extension}",
            SOURCE_IMAGES_DIR / f"{image_id}{extension}",
            IMAGES_ROOT / f"{image_id}{extension}",
        ])
    if not any(path.exists() for path in candidates):
        missing_images.append(image_id)

if missing_images:
    print(f"Missing images for {len(missing_images)} validation IDs: {missing_images[:5]} ...")

print(f"Train set size: {len(train_ids)} | Validation set size: {len(val_ids)}")


labels/val contains 2149 label files
images/val contains 2149 image files
Train set size: 3450 | Validation set size: 862


In [22]:

yaml_content = f"""# Lung Disease Dataset
path: {DATASET_DIR}
train: images/train
val: images/val
test: images/test
nc: {len(class_map)}
names: {list(class_map.values())}
"""
print(yaml_content)
YAML_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(YAML_PATH, "w") as f:
    f.write(yaml_content)


# Lung Disease Dataset
path: c:\Users\jsayed\Downloads\DHBW\lung-disease-detection\dataset
train: images/train
val: images/val
test: images/test
nc: 22
names: ['Aortic enlargement', 'Atelectasis', 'Calcification', 'Cardiomegaly', 'Clavicle fracture', 'Consolidation', 'Edema', 'Emphysema', 'Enlarged PA', 'ILD', 'Infiltration', 'Lung Opacity', 'Lung cavity', 'Lung cyst', 'Mediastinal shift', 'Nodule/Mass', 'Other lesion', 'Pleural effusion', 'Pleural thickening', 'Pneumothorax', 'Pulmonary fibrosis', 'Rib fracture']



In [23]:

run = BASE_DIR / "YOLO/yolov12/runs/detect/latest"
run.parent.mkdir(parents=True, exist_ok=True)


In [None]:

model = YOLO("yolo12m.pt")
print(f"Using model: {model.model_name}")
results = model.train(
    data=YAML_PATH,
    epochs=10,
    imgsz=640,
    batch=16,
    device=DEVICE,
    save=True,
    cache=True,
    project=run,
    name="train",
)
print(f"Training complete. Results saved in: {results.save_dir}")


In [None]:

last_run = run / "train"
weights_path = last_run / "weights" / "best.pt"
if not weights_path.exists():
    raise FileNotFoundError(f"Weights not found at {weights_path}")

print(f"Running inference on first 100 PNG images using weights: {weights_path}")

test_images = sorted(TEST_IMAGES_DIR.glob("*.png"))[:100]
model = YOLO(str(weights_path))
results = model(
    [str(path) for path in test_images],
    save=True,
    conf=0.25,
    save_dir=f"{run}/first_100_inference",
)
