Note: This notebook is inspired by the Kaggle notebook at:
https://www.kaggle.com/code/awsaf49/vinbigdata-cxr-ad-yolov5-14-class-train?kernelSessionId=52422980

In [None]:
import os
from pathlib import Path
import pandas as pd
from sklearn.model_selection import GroupKFold
import logging
import sys
from ultralytics import YOLO
import albumentations as A
from ultralytics.data.augment import LetterBox
import random, shutil
# from ultralytics.engine.callbacks import Callbacks

: 

In [None]:
# Filesystem layout: every location used by the notebook is derived from `base`.
base = Path("/root")  # alternative location: Path("/mnt/shared_dataset")
root = base.joinpath("YOLO")

# VinDr-CXR download (physionet.org directory layout), one folder per subset.
dicom_dir = base.joinpath("physionet.org/files/vindr-cxr/1.0.0/train")
dicom_test_dir = base.joinpath("physionet.org/files/vindr-cxr/1.0.0/test")

# YOLO workspace: images and labels are sibling trees under `root`.
png_dir = root.joinpath("images")
label_dir = root.joinpath("labels")
test_dir = png_dir.joinpath("test")
yaml_path = root.joinpath("yolov8/my-yolov8.yaml")

In [None]:
# Create the train/val folders for both images and labels, plus the test
# image folder. Existing directories are left as they are.
split_dirs = [
    os.path.join(parent, sub)
    for sub in ('train', 'val')
    for parent in (png_dir, label_dir)
]
split_dirs.append(test_dir)
for directory in split_dirs:
    os.makedirs(directory, exist_ok=True)

In [None]:
# Load the aggregated annotation table and attach a PNG path to every row.
csv_path = root / "yolo_aggregated.csv"
df = pd.read_csv(csv_path)
# NOTE(review): the path is relative ("YOLO/images/train/..."), not built from
# `root`; adjust path/format if the images live elsewhere.
df["image_path"] = df["image_id"].map(lambda x: f"YOLO/images/train/{x}.png")

In [None]:
# Ids (file stems) of every PNG that is actually present in images/train.
png_train = png_dir / "train"
actual_pngs = set(png_file.stem for png_file in png_train.glob("*.png"))

# Image ids referenced by the dataframe (after No finding drop) that have no
# PNG on disk.
df_ids = set(df["image_id"])
missing_png_ids = df_ids.difference(actual_pngs)
log_dir = "log_dir"
log_file = os.path.join(log_dir, "missing_from_disk_after_drop.txt")

if len(missing_png_ids) > 0:
    os.makedirs(log_dir, exist_ok=True)
    # The id list is written only once: an already existing log file is left
    # untouched, while the warning below is emitted on every run.
    if not os.path.exists(log_file):
        with open(log_file, "w") as f:
            f.write("".join(mid + "\n" for mid in sorted(missing_png_ids)))
    logging.warning(f"{len(missing_png_ids)} image_ids missing as PNGs on disk.")

In [None]:
# Encode class names as integer ids via a pandas categorical, and keep the
# id -> name lookup for later use.
# NOTE(review): ids follow the categorical's category order; confirm they
# match the class ids already stored in the label .txt files.
class_categorical = df['class_name'].astype('category')
df['class_id'] = class_categorical.cat.codes
class_map = dict(enumerate(class_categorical.cat.categories))

In [None]:
# Assign every row to one of 5 folds, grouped by image_id so that all rows of
# one image share a fold.
df = df.reset_index(drop=True)  # split() yields positions; keep index labels equal to them
df['fold'] = -1
gkf = GroupKFold(n_splits=5)

for fold, (_, val_idx) in enumerate(gkf.split(df, groups=df['image_id'])):
    df.loc[val_idx, 'fold'] = fold

val_fold = 0  # 20% validation (1 out of 5 folds)

df_ids = set(df["image_id"])
# Ids are reduced to their file stem (any directory part / extension is dropped).
in_val = df['fold'] == val_fold
train_ids = {Path(image_id).stem for image_id in df.loc[~in_val, 'image_id']}
val_ids = {Path(image_id).stem for image_id in df.loc[in_val, 'image_id']}

In [None]:
# val_fold = 0
# df_ids = set(df["image_id"])
# train_ids = set(df[df['fold'] != val_fold]['image_id'])
# val_ids = set(df[df['fold'] == val_fold]['image_id'])

# print(f"length of train set: {len(train_ids)} & val set: {len(val_ids)}")
# print(f"📊 TRAIN SIZE: {len(train_ids)} | VAL SIZE: {len(val_ids)}", file=sys.stderr)
# summary_path = "log_dir/train_val_summary.txt"
# with open(summary_path, "w") as f:
#     f.write(f"TRAIN SIZE: {len(train_ids)}\nVAL SIZE: {len(val_ids)}\n")

# print(df.head(10))

In [None]:
# Move the validation split out of the train folders so the on-disk layout
# matches the train/val id sets.
labels_train = Path(f"{label_dir}/train")
labels_val = Path(f"{label_dir}/val")
labels_val.mkdir(parents=True, exist_ok=True)

# The dataset YAML points validation at images/val, so the PNG belonging to
# each validation label has to move as well; otherwise it would stay in
# images/train without a label. Images live in the "images" tree that sits
# next to the labels tree.
images_root = Path(label_dir).parent / "images"
images_train = images_root / "train"
images_val = images_root / "val"
images_val.mkdir(parents=True, exist_ok=True)

moved = 0  # label files moved during THIS run (0 on a re-run is expected)
for img_id in val_ids:
    src = labels_train / f"{img_id}.txt"
    dst = labels_val / f"{img_id}.txt"
    if src.exists():
        shutil.move(src, dst)
        moved += 1

    img_src = images_train / f"{img_id}.png"
    if img_src.exists():
        shutil.move(img_src, images_val / img_src.name)

# === Reporting ===
print(f"✅ Moved {moved} label files to labels/val/")

# A validation id is missing when its label is absent from labels/val AFTER
# the move (checking labels/train here would flag every file just moved).
missing = sorted(img_id for img_id in val_ids if not (labels_val / f"{img_id}.txt").exists())
if missing:
    print(f"⚠️ Missing {len(missing)} label files for val set: {missing[:5]} ...")

print(f"length of train set: {len(train_ids)} & val set: {len(val_ids)}")
print(f"📊 TRAIN SIZE: {len(train_ids)} | VAL SIZE: {len(val_ids)}", file=sys.stderr)

# === Log ===
summary_path = "log_dir/train_val_summary.txt"
# The log directory is not guaranteed to exist yet, so create it here.
Path(summary_path).parent.mkdir(parents=True, exist_ok=True)
with open(summary_path, "w") as f:
    f.write(f"TRAIN SIZE: {len(train_ids)}\nVAL SIZE: {len(val_ids)}\n")

print(df.head(10))

In [None]:
# albumentations_transform = A.Compose([
#     A.Rotate(limit=5, p=0.7),
#     A.HorizontalFlip(p=0.5),
#     A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=0, p=0.5),
#     A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.7),
#     A.GaussNoise(var_limit=(10.0, 50.0), p=0.4),
#     A.MotionBlur(blur_limit=3, p=0.2),
# ],
#     bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels'], min_visibility=0.3)
# )

# class AlbumentationsCallback(Callbacks):
#     def on_preprocess_batch(self, trainer):
#         for sample in trainer.batch:
#             if random.random() < 0.1:  #  10% chance to skip augmentation
#                 continue  # keep image and labels as-is

#             img = sample['img']
#             bboxes = sample['bboxes']
#             cls = sample['cls']

#             bboxes_list = [bbox.tolist() for bbox in bboxes]
#             cls_list = cls.tolist()

#             aug = albumentations_transform(image=img, bboxes=bboxes_list, class_labels=cls_list)

#             sample['img'] = aug['image']
#             sample['bboxes'] = aug['bboxes']
#             sample['cls'] = aug['class_labels']


# callbacks = AlbumentationsCallback()


In [None]:
# Dataset description file handed to YOLO training: dataset root, the
# train/val/test image folders relative to it, and the class list.
yaml_content = f"""# Lung Disease Dataset
path: {root}
train: images/train
val: images/val
test: images/test
nc: {len(class_map)}
names: {list(class_map.values())}
"""
print(yaml_content)
# Make sure the folder holding the YAML exists before writing; opening the
# file for writing does not create missing parent directories.
Path(yaml_path).parent.mkdir(parents=True, exist_ok=True)
with open(yaml_path, "w", encoding="utf-8") as f:
    f.write(yaml_content)

In [None]:
# Project directory under which training and inference outputs are stored.
run = Path(root, "yolov8/runs/detect/latest")

In [None]:
model = YOLO("yolov8l.pt")  # load a pretrained model (recommended for training)
print(f"1 using {model}")
# Paths are passed as plain strings, matching how the weights path is handed
# to YOLO elsewhere in this notebook.
# NOTE(review): device is pinned to 'cpu'; confirm this is intended for
# training a large model.
results = model.train(
    data=str(yaml_path),
    epochs=20,
    imgsz=640,
    batch=16,
    device='cpu',
    save=True,
    cache=True,
    project=str(run),
    name="train",
)
# Prefer the save_dir reported on the returned object; fall back to the
# trainer's own output directory when the returned object does not expose one.
save_dir = getattr(results, "save_dir", None) or model.trainer.save_dir
print(f"2 done training. Results saved in: {save_dir}")

In [None]:
# Locate the most recent training output. Ultralytics appends a numeric suffix
# (train2, train3, ...) when the run name already exists, so the newest
# best.pt is not necessarily under "train".
trained_weights = sorted(
    run.glob("train*/weights/best.pt"),
    key=lambda p: p.stat().st_mtime,
)
weights_path = trained_weights[-1] if trained_weights else run / "train" / "weights" / "best.pt"
last_run = weights_path.parent.parent
if not weights_path.exists():
    raise FileNotFoundError(f"❌ Weights not found at {weights_path}")

print(f"🧪 Testing first 100 .png images from: {weights_path}")

test_images = sorted(test_dir.glob("*.png"))[:100]  # replace with your real test_dir
if not test_images:
    # Fail with a clear message instead of handing an empty list to the model.
    raise FileNotFoundError(f"❌ No .png test images found in {test_dir}")

model = YOLO(str(weights_path))
# The output folder is selected with project/name (there is no `save_dir`
# prediction argument); exist_ok keeps re-runs writing to the same folder.
results = model(
    [str(p) for p in test_images],
    save=True,
    conf=0.25,
    project=str(run),
    name="first_100_inference",
    exist_ok=True,
)


In [None]:
# results = model(test_dir, save=True, conf=0.25, stream = True)
# print(f"3 done testing")