In [1]:
!apt-get update -qq && apt-get install -y libgl1
import ray
import logging
ray.init(address="auto")  # Connect to the cluster
@ray.remote
def debug_ray_cv2():
    try:
        import cv2
        version = cv2.__version__
        build_info = cv2.getBuildInformation()
        return f"✅ OpenCV {version} loaded in Ray.\n{build_info}"
    except Exception as e:
        return f"❌ Failed in Ray worker: {e}"

logging.info(ray.get(debug_ray_cv2.remote()))

Reading package lists... 0%Reading package lists... 100%Reading package lists... Done
Building dependency tree... 0%Building dependency tree... 0%

Building dependency tree... 50%Building dependency tree... 50%

Building dependency tree... Done
Reading state information... 0% Reading state information... 0%Reading state information... Done
libgl1 is already the newest version (1.7.0-1build1).


0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


  from .autonotebook import tqdm as notebook_tqdm
2025-04-05 09:58:26,156	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


2025-04-05 09:58:26,175	INFO worker.py:1654 -- Connecting to existing Ray cluster at address: 212.227.135.240:6379...


2025-04-05 09:58:26,194	INFO worker.py:1841 -- Connected to Ray cluster.


In [2]:
# Step 1: Clone YOLOv5 if not already present
!if [ ! -d "yolov5" ]; then git clone https://github.com/ultralytics/yolov5.git; fi

In [3]:
# Step 2: Install required packages
!pip install -q -r yolov5/requirements.txt
!pip install -q pydicom pillow tqdm pandas scikit-learn
!pip uninstall -y opencv-python opencv-contrib-python opencv-python-headless
!pip install opencv-python-headless
logging.info(ray.get(debug_ray_cv2.remote()))

Found existing installation: opencv-python 4.11.0.86
Uninstalling opencv-python-4.11.0.86:
  Successfully uninstalled opencv-python-4.11.0.86


[0m

Found existing installation: opencv-python-headless 4.11.0.86
Uninstalling opencv-python-headless-4.11.0.86:
  Successfully uninstalled opencv-python-headless-4.11.0.86


Collecting opencv-python-headless
  Using cached opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)


Using cached opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (50.0 MB)


Installing collected packages: opencv-python-headless


Successfully installed opencv-python-headless-4.11.0.86


In [4]:
# Step 3: Imports
import os
from pathlib import Path
import pandas as pd
import pydicom
import cv2
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from time import sleep  
import subprocess


In [5]:
# Step 4: Paths using Pathlib
base = Path("/mnt/shared_dataset")
root = base / "YOLO"
dicom_dir = base / "physionet.org/files/vindr-cxr/1.0.0/train"
dicom_test_dir = base / "physionet.org/files/vindr-cxr/1.0.0/test"

png_dir = root / "images"
label_dir = root / "labels"
test_dir = png_dir / "test"
yaml_path = root / "my-yolov5.yaml"

In [6]:
# Step 5: Create required folders if not exist
for sub in ['train', 'val']:
    os.makedirs(os.path.join(png_dir,sub), exist_ok=True)
    os.makedirs(os.path.join(label_dir, sub), exist_ok=True)
os.makedirs(test_dir, exist_ok=True)


In [7]:
MAX_RETRIES = 3
log_file = base / "log_dir/ray_conversion_results.log"

# Setup root logger
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()  # optional: shows up in terminal
    ]
)

@ray.remote
def convert_dicom_to_png_remote(dicom_path_str, png_path_str):
    dicom_path = Path(dicom_path_str)
    png_path = Path(png_path_str)

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            ds = pydicom.dcmread(dicom_path, force=True)
            img = ds.pixel_array

            if img.size == 0:
                raise ValueError("Empty pixel data")

            img_normalized = ((img - img.min()) / (img.max() - img.min()) * 255).astype('uint8')

            png_path.parent.mkdir(parents=True, exist_ok=True)
            success = cv2.imwrite(str(png_path), img_normalized)

            if not success:
                raise IOError(f"cv2.imwrite() failed for {png_path}")

            logging.info(f"✅ Conversion succeeded: {dicom_path} -> {png_path}")
            return {"dicom": str(dicom_path), "status": "success"}

        except Exception as e:
            logging.error(f"❌ Attempt {attempt} failed for {dicom_path}: {e}")
            sleep(1)

    logging.error(f"❌ FAILED after {MAX_RETRIES} retries: {dicom_path}")
    return {"dicom": str(dicom_path), "status": "failed", "error": str(e)}


# Convert train & val
for split in ['train', 'val']:
    dicom_subdir = Path(dicom_dir)
    png_subdir = Path(png_dir) / split
    tasks = []
    for dcm_file in dicom_subdir.glob("*.dicom"):
        out_path = png_subdir / f"{dcm_file.stem}.png"
        if not out_path.exists():
            tasks.append(convert_dicom_to_png_remote.remote(str(dcm_file), str(out_path)))

    results = ray.get(tasks)
    failed = [r for r in results if r.get("status") != "success"]
    logging.info(f"Conversion complete for {split}. Failed: {len(failed)}/{len(results)}")


# Convert test
tasks = []
for dcm_file in Path(dicom_test_dir).glob("*.dicom"):
    out_path = Path(test_dir) / f"{dcm_file.stem}.png"
    if not out_path.exists():
        tasks.append(convert_dicom_to_png_remote.remote(str(dcm_file), str(out_path)))

results = ray.get(tasks)
failed = [r for r in results if r["status"] != "success"]

with open("log_dir/failed_conversions.txt", "w") as f:
    for r in failed:
        f.write(f"{r['dicom']} - {r.get('error')}\n")

logging.info(f"Done: {len(results) - len(failed)} succeeded, {len(failed)} failed.")



In [8]:
# Step 7: Load your CSV annotation
csv_path = base / "physionet.org/files/vindr-cxr/1.0.0/annotations/annotations_train.csv"
df = pd.read_csv(csv_path)

In [9]:
# Step 8: Drop rows with no findings
df = df[df['class_name'].notna() & (df['class_name'] != 'No finding')].copy()
df["image_path"] = df["image_id"].apply(lambda x: f"YOLO/images/train/{x}.png")  # adjust path/format
df['width'] = df['x_max'] - df['x_min']
df['height'] = df['y_max'] - df['y_min']

In [10]:
png_train = png_dir/"train"
actual_pngs = {p.stem for p in (png_train).glob("*.png")}

# Compare with image IDs in dataframe (after No finding drop)
df_ids = set(df["image_id"])
missing_png_ids = df_ids - actual_pngs

if missing_png_ids:
    with open("log_dir/missing_from_disk_after_drop.txt", "w") as f:
        for mid in sorted(missing_png_ids):
            f.write(mid + "\n")
    logging.warning(f"{len(missing_png_ids)} image_ids missing as PNGs on disk.")

In [11]:
# Step 10: Map classes
df['class_id'] = df['class_name'].astype('category').cat.codes
class_map = dict(enumerate(df['class_name'].astype('category').cat.categories))

In [12]:
# Step 11: Add fold column using GroupKFold
gkf = GroupKFold(n_splits=5)
df['fold'] = -1
df = df.reset_index(drop=True)
for fold, (_, val_idx) in enumerate(gkf.split(df, groups=df['image_id'])):
    df.loc[val_idx, 'fold'] = fold

In [13]:
# Step 12: Normalize bbox + save to YOLO .txt
def save_labels(df_subset, label_split):
    for image_id, group in df_subset.groupby("image_id"):
        label_file = label_dir / label_split / f"{image_id}.txt"
        with open(label_file, "w") as f:
            for _, row in group.iterrows():
                x_center = ((row.x_min / row.width) + (row.x_max / row.width)) / 2
                y_center = ((row.y_min / row.height) + (row.y_max / row.height)) / 2
                w = (row.x_max - row.x_min) / row.width
                h = (row.y_max - row.y_min) / row.height
                f.write(f"{row.class_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}\n")

In [14]:
# Step 13: Train/val split using folds
val_fold = 0
val_ids = df[df['fold'] == val_fold]['image_id'].unique()
train_ids = df[df['fold'] != val_fold]['image_id'].unique()

val_ids_set = set(val_ids)
train_ids_set = set(train_ids)

# Delete files not belonging to the correct split
for f in ['train', 'val']:
    target_dir = png_dir / f
    valid_ids = train_ids_set if f == 'train' else val_ids_set
    for file in target_dir.iterdir():
        if file.is_file() and file.stem not in valid_ids:
            file.unlink()


save_labels(df[df.image_id.isin(train_ids)], "train")
save_labels(df[df.image_id.isin(val_ids)], "val")
print(df.head(10))

                           image_id rad_id          class_name        x_min  \
0  0005e8e3701dfb1dd93d53e2ff537b6e    R10        Infiltration   900.956970   
1  0005e8e3701dfb1dd93d53e2ff537b6e    R10        Lung Opacity   900.956970   
2  0005e8e3701dfb1dd93d53e2ff537b6e     R8       Consolidation   932.471985   
3  0005e8e3701dfb1dd93d53e2ff537b6e     R8         Nodule/Mass   932.471985   
4  0005e8e3701dfb1dd93d53e2ff537b6e     R9        Lung Opacity   905.224976   
5  0007d316f756b3fa0baea2ff514ce945    R10  Pulmonary fibrosis   818.666016   
6  0007d316f756b3fa0baea2ff514ce945    R10  Pleural thickening   818.666016   
7  0007d316f756b3fa0baea2ff514ce945    R10  Aortic enlargement  1275.290039   
8  0007d316f756b3fa0baea2ff514ce945    R10        Cardiomegaly   902.039978   
9  0007d316f756b3fa0baea2ff514ce945     R8  Pleural thickening   621.364990   

         y_min        x_max        y_max  \
0   587.809021  1205.359985   888.710999   
1   587.809021  1205.359985   888.710999  

In [15]:
# Step 14: Create YAML file
yaml_content = f"""# Lung Disease Dataset
path: {root}
train: images/train
val: images/val
nc: {len(class_map)}
names: {list(class_map.values())}
"""
with open(yaml_path, "w") as f:
    f.write(yaml_content)

In [16]:
# from ray import train
# from ray.train import ScalingConfig
# from ray.train.torch import TorchTrainer

# print("Current working dir:", os.getcwd())
# print("Files:", os.listdir("."))

# def train_yolo(config):
#     command = [
#         "python", "yolov5/train.py",
#         "--img", "1280",
#         "--batch", str(config["batch"]),
#         "--epochs", str(config["epochs"]),
#         "--data", config["data_yaml"],
#         "--weights", "yolov5l.pt",
#         "--cache", "disk"
#     ]

#     result = subprocess.run(command, capture_output=True, text=True)

#     # Save logs for inspection
#     log_path = "log_dir/yolo_train.log"
#     with open(log_path, "w") as f:
#         f.write(result.stdout)
#         f.write(result.stderr)

#     print("=== STDOUT ===")
#     print(result.stdout)
#     print("=== STDERR ===")
#     print(result.stderr)

#     # Do not raise error yet — just return log info
#     weights_path = "yolov5/runs/train/exp/weights/best.pt"
#     exists = os.path.exists(weights_path)

#     return {
#         "status": "done",
#         "weights_found": exists,
#         "weights_path": weights_path if exists else None,
#         "stdout_tail": result.stdout[-500:],
#         "stderr_tail": result.stderr[-500:],
#         "log_file": log_path
#     }


In [17]:

command = [
    "python", "yolov5/train.py",
    "--img", "1280",
    "--batch", "8",
    "--epochs", "2",  # keep small for test
    "--data", "/mnt/shared_dataset/YOLO/my-yolov5.yaml",
    "--weights", "yolov5s.pt",
    "--cache", "disk"
]

result = subprocess.run(command, capture_output=True, text=True)

print(result.stdout)
print(result.stderr)


[31m[1mrequirements:[0m Ultralytics requirement ['opencv-python>=4.1.1'] not found, attempting AutoUpdate...
Collecting opencv-python>=4.1.1
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (63.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.0 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/63.0 MB[0m [31m1.8 MB/s[0m eta [36m0:00:36[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/63.0 MB[0m [31m3.3 MB/s[0m eta [36m0:00:19[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/63.0 MB[0m [31m6.1 MB/s[0m eta [36m0:00:11[0m
[2K   [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/63.0 MB[0m [31m10.2 MB/s[0m eta [36m0:00:07[0m
[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━