In [56]:
# Run this cell at the start of every new Colab session

import os
import sys
from google.colab import drive

# --- Mount Google Drive ---
drive.mount('/content/drive')
print("Drive mounted")

# --- Clone kits21 repo ---
if not os.path.exists("/content/kits21"):
    !git clone https://github.com/neheller/kits21.git /content/kits21 -q
    print("kits21 repo cloned")
else:
    print("kits21 repo already exists")

# --- Install kits21 package ---
%cd /content/kits21
!pip install -e . -q
print("kits21 package installed")

# --- Redirect TRAINING_DIR to Drive ---
paths_file = "/content/kits21/kits21/configuration/paths.py"
new_content = '''from pathlib import Path
import os

TRAINING_DIR = Path("/content/drive/MyDrive/kidney-tumour-detection/dataset/raw")
TESTING_DIR = Path(os.environ["KITS21_TEST_DIR"]).resolve(strict=True) if "KITS21_TEST_DIR" in os.environ.keys() else None
SRC_DIR = Path(os.environ["KITS21_SERVER_DATA"]).resolve(strict=True) if "KITS21_SERVER_DATA" in os.environ.keys() else None
CACHE_FILE = Path(__file__).parent.parent / "annotation" / "cache.json"
'''
with open(paths_file, "w") as f:
    f.write(new_content)
print("TRAINING_DIR redirected to Drive")

# --- Add kits21 to Python path ---
sys.path.insert(0, "/content/kits21")

# --- Verify everything is working ---
from kits21.configuration.paths import TRAINING_DIR

cases_on_drive = len([
    c for c in os.listdir(str(TRAINING_DIR))
    if c.startswith("case_")
])

print(f"\n{'='*50}")
print(f"TRAINING_DIR : {TRAINING_DIR}")
print(f"Drive accessible: {os.path.exists(str(TRAINING_DIR))}")
print(f"Cases on Drive: {cases_on_drive}/300")
print(f"{'='*50}")

if cases_on_drive == 300:
    print("Session ready - all 300 cases confirmed")
else:
    print("Warning: Expected 300 cases, found", cases_on_drive)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted
kits21 repo already exists
/content/kits21
  Preparing metadata (setup.py) ... [?25l[?25hdone
kits21 package installed
TRAINING_DIR redirected to Drive

TRAINING_DIR : /content/drive/MyDrive/kidney-tumour-detection/dataset/raw
Drive accessible: True
Cases on Drive: 300/300
Session ready - all 300 cases confirmed


In [None]:
# DRIVE MOUNT AND ENVIRONMENT VERIFICATION
# Mounts Google Drive, makes sure the project folder layout exists on Drive,
# then reports GPU, RAM and disk availability for this runtime.

import os
import subprocess
import sys

import psutil
from google.colab import drive

# --- Mount Google Drive ---
drive.mount('/content/drive')
print("Google Drive mounted.")

# --- Verify project folder exists on Drive ---
PROJECT_ROOT = '/content/drive/MyDrive/kidney-tumour-detection'

required_folders = [
    'dataset/raw',
    'dataset/processed',
    'checkpoints',
    'logs',
    'outputs'
]

print("Verifying folder structure...")
all_good = True
for folder in required_folders:
    full_path = os.path.join(PROJECT_ROOT, folder)
    if os.path.exists(full_path):
        print(f" {folder}")
    else:
        print(f" {folder} missing, creating it now...")
        # os.makedirs creates intermediate directories as well
        # (the previous os.mkdirs call does not exist and raised AttributeError).
        os.makedirs(full_path, exist_ok=True)
        print(f" {folder} created!")
        all_good = False

if all_good:
    print("All folders have been verified")
else:
    print("Missing folders have been created")

# --- Check GPU availability ---
print("Checking GPU availability...")
try:
    gpu_info = subprocess.run(
        ['nvidia-smi'],
        capture_output=True,
        text=True
    )
    if gpu_info.returncode == 0:
        # Keep only the nvidia-smi lines that mention a known GPU model name.
        gpu_lines = [
            line.strip()
            for line in gpu_info.stdout.split('\n')
            if any(x in line for x in ['Tesla', 'A100', 'T4', 'V100', 'L4'])
        ]
        for gpu_line in gpu_lines:
            print(f" GPU detected: {gpu_line}")
        if not gpu_lines:
            # Previously this case printed nothing at all.
            print(" nvidia-smi ran, but no known GPU model was recognised in its output")
    else:
        print("nvidia-smi returned an error")
except FileNotFoundError:
    # nvidia-smi is absent on CPU-only runtimes.
    print("No GPU detected - please change runtime type")
    print("Go to: Runtime → Change runtime type → T4 GPU")
except Exception as e:
    print(f" GPU check failed: {e}")

# --- Check RAM ---
ram = psutil.virtual_memory()
# The two f-string pieces are concatenated, so the separator space is explicit.
print(f"RAM available: {ram.available / (1024**3):.1f} GB "
      f"/ {ram.total / (1024**3):.1f} GB total")

# --- Check Runtime Disk Usage ---
disk = psutil.disk_usage('/')
print(f"Runtime disk: {disk.free / (1024**3):.1f} GB free "
      f"/ {disk.total / (1024**3):.1f} GB total")
print("Drive storage: 2TB (psutil cannot read network drives accurately)")
print("Verify manually at drive.google.com")

# --- Check Drive storage ---
# Best-effort figure for the mounted Drive; see the caveat printed above.
try:
    drive_disk = psutil.disk_usage(PROJECT_ROOT)
    print(f"Drive storage: {drive_disk.free / (1024**3):.1f} GB free "
          f"/ {drive_disk.total / (1024**3):.1f} GB total")
except OSError as e:
    print(f"Drive storage: could not be read ({e})")

print("\n" + "="*50)
print("Session ready. Project root:", PROJECT_ROOT)



MessageError: Error: credential propagation was unsuccessful

In [4]:
# GITHUB REPOSITORY SYNC
# Clones the project repository into /content on first use, otherwise pulls
# the latest main, then makes the checkout importable.

import os
import subprocess
import sys

GITHUB_REPO = "https://github.com/danokundaye/kidney-tumour-detection.git"
REPO_NAME = "kidney-tumour-detection"
CLONE_PATH = f"/content/{REPO_NAME}"

# --- Clone or update repository ---
if os.path.exists(CLONE_PATH):
    print("Repository already exists, pulling latest changes...")
    os.chdir(CLONE_PATH)  # git pull must run inside the checkout
    git_command = ["git", "pull", "origin", "main"]
    success_message = "Repository updated"
else:
    print("Cloning repository...")
    git_command = ["git", "clone", GITHUB_REPO, CLONE_PATH]
    success_message = "Repository cloned"

# Run git without a shell and inspect the exit status; os.system discarded it,
# so "Repository cloned/updated" used to be printed even when git failed.
git_result = subprocess.run(git_command, capture_output=True, text=True)
if git_result.stdout.strip():
    print(git_result.stdout.strip())

if git_result.returncode == 0:
    print(success_message)
elif os.path.isdir(CLONE_PATH):
    # A failed pull still leaves a usable (possibly stale) checkout.
    print(git_result.stderr.strip())
    print(f"WARNING: '{' '.join(git_command)}' failed with exit code "
          f"{git_result.returncode}; continuing with the existing checkout")
else:
    print(git_result.stderr.strip())
    raise RuntimeError(
        f"'{' '.join(git_command)}' failed with exit code {git_result.returncode}"
    )

# --- Add repo to Python path so we can import our modules ---
if CLONE_PATH not in sys.path:
    sys.path.insert(0, CLONE_PATH)
    print(f" Added {CLONE_PATH} to Python path")

print(f"\nWorking directory: {CLONE_PATH}")
print("GitHub sync complete")

Cloning repository...
Repository cloned
 Added /content/kidney-tumour-detection to Python path

Working directory: /content/kidney-tumour-detection
GitHub sync complete


In [None]:
# INSTALL REQUIRED LIBRARIES
# Announces the installation step and imports the modules the rest of this
# cell uses to invoke pip (subprocess) with the current interpreter (sys).

print("Installing required libraries...")
print("This will take 3-5 minutes. Stay calm.")

import subprocess
import sys

def install(package):
    """Quietly pip-install ``package`` using the interpreter running this notebook.

    pip's stdout and stderr are discarded. ``subprocess.check_call`` raises
    ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package, "-q"]
    silenced_streams = {
        "stdout": subprocess.DEVNULL,
        "stderr": subprocess.DEVNULL,
    }
    subprocess.check_call(pip_command, **silenced_streams)

# Verify PyTorch has already been installed
# If it is missing, install it and then import it so that `torch` is bound
# for the rest of the cell (the GPU report further down calls torch.cuda).
print(" Checking PyTorch...", end=" ")
try:
    import torch
    if torch.cuda.is_available():
        print(f"already installed (v{torch.__version__})")
    else:
        print("Installed but no CUDA - check runtime type")
except ImportError:
    print("Not found, installing...")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install",
         "torch", "torchvision", "-q"]
    )
    # Packages installed while the interpreter is running are only found
    # reliably after the import caches are invalidated.
    import importlib
    importlib.invalidate_caches()
    import torch
    print("Installed")

# (pip package name, human-readable label) pairs, installed in this order.
libraries = [
    ("ultralytics", "YOLOv8"),
    ("segmentation-models-pytorch", "U-Net with ResNet50"),
    ("monai", "Medical imaging utilities"),
    ("nibabel", "NIfTI file reading"),
    ("albumentations", "Data augmentation"),
    ("shap", "Explainability"),
    ("opencv-python-headless", "Image processing"),
    ("scikit-learn", "Metrics"),
    ("matplotlib", "Matplotlib"),
    ("seaborn", "Seaborn"),
    ("tqdm", "Progress bars"),
]

# Install one package at a time so a single failure does not stop the rest.
for pip_name, label in libraries:
    print(f" Installing {label}...", end=" ")
    try:
        install(pip_name)
    except Exception as e:
        print(f" Failed: {e}")
    else:
        print("installed")

print("\nVerifying critical imports...")
# Importable module name -> label shown in the report.
verification = {
    "torch": "PyTorch",
    "torchvision": "TorchVision",
    "ultralytics": "YOLOv8",
    "segmentation_models_pytorch": "U-Net",
    "monai": "MONAI",
    "nibabel": "NiBabel",
    "albumentations": "Albumentations",
    "shap": "SHAP",
    "cv2": "OpenCV",
    "sklearn": "Scikit-learn",
}

# Try each import and remember the ones that fail.
failed_modules = []
for module, name in verification.items():
    try:
        __import__(module)
    except ImportError:
        print(f" {name} - FAILED TO IMPORT")
        failed_modules.append(module)
    else:
        print(f" {name}")

all_imported = not failed_modules

if not all_imported:
    print("\nSome libraries failed - rerun this cell")
else:
    print("\nAll libraries installed and verified")

# Verify PyTorch access to GPU
# Report the name and total memory of CUDA device 0, or say CUDA is missing.
if not torch.cuda.is_available():
    print("CUDA not available - check runtime type")
else:
    gpu_name = torch.cuda.get_device_name(0)
    vram_gib = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"GPU: {gpu_name}")
    print(f"VRAM: {vram_gib:.1f} GB")

Installing required libraries...
This will take 3-5 minutes. Stay calm.
 Checking PyTorch... already installed (v2.9.0+cu128)
 Installing YOLOv8... installed
 Installing U-Net with ResNet50... installed
 Installing Medical imaging utilities... installed
 Installing NIfTI file reading... installed
 Installing Data augmentation... installed
 Installing Explainability... installed
 Installing Image processing... installed
 Installing Metrics... installed
 Installing Matplotlib... installed
 Installing Seaborn... installed
 Installing Progress bars... installed

Verifying critical imports...
 PyTorch
 TorchVision
Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
 YOLOv8
 U-Net




 MONAI
 NiBabel
 Albumentations
 SHAP
 OpenCV
 Scikit-learn

All libraries installed and verified
GPU: Tesla T4
VRAM: 14.6 GB


In [None]:
# Clone official KiTS21 Repository

import os

# Define paths for download
DRIVE_PROJECT = "/content/drive/MyDrive/kidney-tumour-detection"
DATASET_RAW = os.path.join(DRIVE_PROJECT, "dataset", "raw")
KITS_REPO = "/content/kits21" # Temporary code storage

# Clone the KiTS21 repository into KITS_REPO
if not os.path.exists(KITS_REPO):
  !git clone https://github.com/neheller/kits21.git /content/kits21
  print("KiTS21 repository successfully cloned!")
else:
  print("KiTS21 repository already exists")

# Install package
%cd /content/kits21
!pip install -e . -q
print("KiTS21 package installed")

Cloning into '/content/kits21'...
remote: Enumerating objects: 87173, done.[K
remote: Counting objects: 100% (1155/1155), done.[K
remote: Compressing objects: 100% (571/571), done.[K
^C
KiTS21 repository successfully cloned!
[Errno 2] No such file or directory: '/content/kits21'
/content
[31mERROR: file:///content does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0mKiTS21 package installed


In [None]:
import os

# Confirm number of cases
cases = list(filter(
    lambda entry: entry.startswith("case_"),
    os.listdir("/content/kits21/kits21/data"),
))
print(f"Total cases found: {len(cases)} \n")

# Confirm case contents
sample_case = "/content/kits21/kits21/data/case_00000"
for entry in os.listdir(sample_case):
    print(f"{entry}")

# Confirm raw folder contents
print("\n--- raw folder ---")
raw_path = os.path.join(sample_case, "raw")
for entry in os.listdir(raw_path):
    print(entry)

# Confirm segmentation folder contents
print("\n--- segmentations folder ---")
seg_path = os.path.join(sample_case, "segmentations")
for entry in os.listdir(seg_path):
    print(entry)

download_path = "/content/kits21/kits21/data"
# Check if there's a download script: print every file under the package
# whose name contains "download" (case-insensitive).
for root, dirs, files in os.walk("/content/kits21/kits21"):
    matching = (name for name in files if "download" in name.lower())
    for name in matching:
        print(os.path.join(root, name))

FileNotFoundError: [Errno 2] No such file or directory: '/content/kits21/kits21/data'

In [None]:
# Redirect TRAINING_DIR to Google Drive to save downloads permanently
# The kits21 paths module is overwritten with the replacement below, then
# read back and printed so the change can be checked by eye.
paths_file = "/content/kits21/kits21/configuration/paths.py"

new_content = '''from pathlib import Path
import os

# Redirected to Google Drive for permanent storage
TRAINING_DIR = Path("/content/drive/MyDrive/kidney-tumour-detection/dataset/raw")
TESTING_DIR = Path(os.environ["KITS21_TEST_DIR"]).resolve(strict=True) if "KITS21_TEST_DIR" in os.environ.keys() else None
SRC_DIR = Path(os.environ["KITS21_SERVER_DATA"]).resolve(strict=True) if "KITS21_SERVER_DATA" in os.environ.keys() else None
CACHE_FILE = Path(__file__).parent.parent / "annotation" / "cache.json"
'''

with open(paths_file, "w") as handle:
    handle.write(new_content)

print("TRAINING_DIR redirected to Drive")

# Verify the change
with open(paths_file, "r") as handle:
    written_back = handle.read()
print(written_back)

In [None]:
# Copy existing case folders in Temporary Storage to Google Drive
# Each case is copied into a hidden staging folder and renamed into place only
# once the copy finished. An interrupted copy therefore never leaves a
# half-filled case_* folder that later runs would treat as already done.

import shutil
import os
from tqdm import tqdm

SOURCE_DIR = "/content/kits21/kits21/data"
DEST_DIR = "/content/drive/MyDrive/kidney-tumour-detection/dataset/raw"

os.makedirs(DEST_DIR, exist_ok=True)

# Get all case folders
cases = sorted([c for c in os.listdir(SOURCE_DIR) if c.startswith("case_")])
print(f"Cases to copy: {len(cases)}")

copied_count = 0
skipped_count = 0
for case in tqdm(cases, desc="Copying cases to Drive"):
    src = os.path.join(SOURCE_DIR, case)
    dst = os.path.join(DEST_DIR, case)

    # Only copy if not already in Drive
    if os.path.exists(dst):
        skipped_count += 1
        continue

    # The staging name does not start with "case_", so checks that count
    # case_* folders never see an unfinished copy.
    staging = os.path.join(DEST_DIR, f".partial_{case}")
    if os.path.exists(staging):
        shutil.rmtree(staging)  # leftover from an interrupted earlier run
    shutil.copytree(src, staging)
    os.rename(staging, dst)
    copied_count += 1

print(f"\n All cases copied to Drive")
print(f"Copied now: {copied_count}, already present: {skipped_count}")
print(f"Contents of Drive raw folder:")
print(len(os.listdir(DEST_DIR)), "items")

In [None]:
# Verify all 300 cases and their contents are on Drive
# Lists the case_* folders in the Drive dataset folder and shows the contents
# of case_00000 as a spot check.

import os

DEST_DIR = "/content/drive/MyDrive/kidney-tumour-detection/dataset/raw"

# Count case folders
cases_on_drive = sorted([
    item for item in os.listdir(DEST_DIR)
    if item.startswith("case_")
])

print(f"Total case folders on Drive: {len(cases_on_drive)}")
if cases_on_drive:
    print(f"First case: {cases_on_drive[0]}")
    print(f"Last case: {cases_on_drive[-1]}")
else:
    # Indexing an empty list would raise IndexError and hide the real problem.
    print("No case folders found - check that the copy step has been run")

# Also verify case_00000 has its contents
sample = os.path.join(DEST_DIR, "case_00000")
print(f"\nContents of case_00000 on Drive:")
if os.path.isdir(sample):
    for item in os.listdir(sample):
        print(f"  {item}")
else:
    print("  case_00000 is missing on Drive")

In [None]:
# Download CT Scans to Drive
# Setup for the download: imports, a sanity check on the destination, and the
# remote URL / file-name template used by the helpers below.

import os
import sys
# Make the local kits21 checkout importable before importing from it.
sys.path.insert(0, "/content/kits21")

from pathlib import Path
from kits21.configuration.paths import TRAINING_DIR
import requests
import shutil
from tqdm import tqdm

# Verify TRAINING_DIR destination in Drive
# The check is a plain substring test on the path text.
print(f"Download destination: {TRAINING_DIR}")
assert "drive" in str(TRAINING_DIR), "TRAINING_DIR is not pointing to Drive! Stop and fix this."

# Base URL and per-case file-name template (case id zero-padded to 5 digits).
imaging_url = "https://kits19.sfo2.digitaloceanspaces.com/"
imaging_name_tmplt =  "master_{:05d}.nii.gz"
# Local scratch file each download is streamed into before being moved.
temp_f = Path("/content/temp.tmp")

def get_destination(i):
    """Return the path of case ``i``'s imaging file under TRAINING_DIR.

    The case id is zero-padded to five digits, giving
    ``TRAINING_DIR/case_XXXXX/imaging.nii.gz``.
    """
    case_folder = f"case_{i:05d}"
    return TRAINING_DIR.joinpath(case_folder, "imaging.nii.gz")

def download_case(cid):
    """Download the CT volume for case ``cid`` to its destination file.

    The file is streamed into the local scratch file ``temp_f``, moved next to
    the destination under a ``.part`` name, and renamed to the final name only
    once the move completed. A transfer that is cut short therefore never
    leaves a truncated ``imaging.nii.gz`` that would later be treated as
    already downloaded.

    Returns True on success. On any error the leftovers are removed, the error
    is printed and False is returned.
    """
    remote_name = imaging_name_tmplt.format(cid)
    url = imaging_url + remote_name
    dst = get_destination(cid)
    partial = dst.with_name(dst.name + ".part")
    try:
        # (connect, read) timeouts: a stalled connection fails this case
        # instead of blocking the whole download loop indefinitely.
        with requests.get(url, stream=True, timeout=(10, 120)) as r:
            r.raise_for_status()
            with temp_f.open('wb') as f:
                shutil.copyfileobj(r.raw, f)
        # The case folder may not exist yet; the move would fail without it.
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(temp_f), str(partial))
        partial.replace(dst)
        return True
    except Exception as e:
        for leftover in (temp_f, partial):
            if leftover.exists():
                leftover.unlink()
        print(f"\n Case {cid:05d} failed: {e}")
        return False

# Find cases still needing download
# A case is pending when its imaging file does not exist at the destination.
left_to_download = [cid for cid in range(300) if not get_destination(cid).exists()]

already_downloaded = 300 - len(left_to_download)
print(f"Cases already downloaded: {already_downloaded}")
print(f"Cases remaining: {len(left_to_download)}")
print("Starting download...\n")

# Download the pending cases one by one and remember the ids that failed.
failed = []
for cid in tqdm(left_to_download, desc="Downloading CT scans"):
    if not download_case(cid):
        failed.append(cid)

succeeded = len(left_to_download) - len(failed)
print("\n Download complete")
print(f"Successful: {succeeded}")
print(f"Failed: {len(failed)}")
if failed:
    print(f"Failed cases: {failed}")

In [None]:
# Step 4.1: Pull latest and Run Data Exploration

!cd /content/kidney-tumour-detection && git pull origin main

import subprocess
result = subprocess.run(
    ["python", "/content/kidney-tumour-detection/src/preprocessing/data_exploration.py"],
    capture_output=True,
    text=True
)
print(result.stdout)


KiTS21 DATASET EXPLORATION

Dataset root: /content/drive/MyDrive/kidney-tumour-detection/dataset/raw
Logs output:  /content/drive/MyDrive/kidney-tumour-detection/outputs/logs

Step 1: File Integrity Check

Total cases expected:        300
Folders found:               300
Imaging files found:         300
Segmentation files found:    300
Successfully loaded (both):  300

All cases passed integrity check

Step 2: Slice Count Distribution
Min slices:    512
Max slices:    796
Mean slices:   512.95
Median slices: 512.00
Std deviation: 16.40

All cases have 50+ slices

Step 3: Intensity Statistics (sample of 20 cases)

Global intensity min:  -2048.0 HU
Global intensity max:  3071.0 HU
Mean of case means:    -551.6 HU
Mean of case stds:     545.8 HU
5th percentile min:    -2048.0 HU
95th percentile max:   3071.0 HU

Config window range:   -79 to 304 HU
→ Review if global min/max suggests a different window is needed

Step 4: Segmentation Label Analysis (sample of 50 cases)

Out of 50 sampled

In [None]:
# Pull latest and run Step 4.2: Data Splitting

!cd /content/kidney-tumour-detection && git pull origin main

import subprocess
result = subprocess.run(
    ["python", "/content/kidney-tumour-detection/src/preprocessing/data_splitting.py"],
    capture_output=True,
    text=True
)
print(result.stdout)
if result.stderr:
    print("STDERR:")
    print(result.stderr)

From https://github.com/danokundaye/kidney-tumour-detection
 * branch            main       -> FETCH_HEAD
Already up to date.

 Patient-level Data Splitting

Strategy : Stratified by malignant label
Seed     : 42
Split    : 110 detection / 120 segmentation / 70 test

Total cases loaded: 300
 Malignant: 275
 Benign:    25

 Splits must sum to 300. 300 splits confirmed.
Patient-level Split Summary

Detection Train (110 cases):
   Malignant : 101 (91.8)
   Benign    : 9 (8.2)

Segmentation Train (120 cases):
   Malignant : 110 (91.7)
   Benign    : 10 (8.3)

Test (70 cases):
   Malignant : 64 (91.4)
   Benign    : 6 (8.6)

 Uniqueness Check
  Total cases assigned : 300
  Unique case IDs      : 300
  No overlaps detected : PASS

Splits saved to: /content/drive/MyDrive/kidney-tumour-detection/dataset/processed/splits
  detection_train.csv    (110 cases)
  segmentation_train.csv (120 cases)
  test.csv               (70 cases)
  split_summary.txt
Splitting Complete 

Do NOT modify the split C

In [57]:
!cd /content/kidney-tumour-detection && git pull origin main

# Single Slice Extraction Test
import subprocess
result = subprocess.run(
    ["python", "-c", """
import yaml
import numpy as np
import nibabel as nib
from PIL import Image
from pathlib import Path
import sys
sys.path.insert(0, '/content/kidney-tumour-detection/src/preprocessing')
from slice_extraction import process_case

with open('/content/kidney-tumour-detection/configs/config.yaml', 'r') as f:
    import yaml
    config = yaml.safe_load(f)

dataset_root = Path(config['paths']['dataset_root'])
slices_dir   = Path(config['paths']['slices_dir'])
window_min   = config['preprocessing']['ct_window_min']
window_max   = config['preprocessing']['ct_window_max']

# Re-process the test case (will overwrite previous test output)
test_case  = 'case_00000'
output_dir = slices_dir / 'test_single_case'

result = process_case(
    case_id      = test_case,
    dataset_root = dataset_root,
    output_dir   = output_dir,
    window_min   = window_min,
    window_max   = window_max,
    kidney_only  = False
)

print(f"Total slices : {result['total_slices']}")
print(f"Saved slices : {result['saved_slices']}")

# Check a MIDDLE slice instead of the first one
images_dir  = output_dir / test_case / 'images'
masks_dir   = output_dir / test_case / 'masks'
all_images  = sorted(images_dir.glob('*.png'))

# Pick slice 250 (middle of a 512-slice volume)
mid_img  = np.array(Image.open(all_images[250]))
mid_mask = np.array(Image.open(masks_dir / all_images[250].name))

print(f"\\nMiddle slice ({all_images[250].name}):")
print(f"  Image - min: {mid_img.min()}, max: {mid_img.max()}, dtype: {mid_img.dtype}")
print(f"  Mask  - unique values: {np.unique(mid_mask).tolist()}")
"""],
    capture_output=True,
    text=True
)
print(result.stdout)
if result.stderr:
    print("STDERR:")
    print(result.stderr)

From https://github.com/danokundaye/kidney-tumour-detection
 * branch            main       -> FETCH_HEAD
Already up to date.
Total slices : 512
Saved slices : 512

Middle slice (slice_0250.png):
  Image - min: 0, max: 255, dtype: uint8
  Mask  - unique values: [0]



In [55]:
# Find a slice with actual kidney content and check its mask
import numpy as np
import nibabel as nib
from pathlib import Path

seg_path = Path("/content/drive/MyDrive/kidney-tumour-detection/dataset/raw/case_00000/aggregated_MAJ_seg.nii.gz")
# Load the segmentation and round it to integer labels.
seg_volume = nib.load(str(seg_path))
seg_data = np.round(seg_volume.get_fdata()).astype(np.uint8)

# Find all slices with non-zero labels: walk the third axis and keep every
# index whose slice holds more than one distinct value, with those values.
labels_per_slice = (
    (slice_idx, np.unique(seg_data[:, :, slice_idx]))
    for slice_idx in range(seg_data.shape[2])
)
kidney_slices = [
    (slice_idx, found.tolist())
    for slice_idx, found in labels_per_slice
    if len(found) > 1  # more than just background
]

print(f"Slices with organ content: {len(kidney_slices)}")
for heading, subset in (
    ("\nFirst 5:", kidney_slices[:5]),
    ("\nLast 5:", kidney_slices[-5:]),
):
    print(heading)
    for idx, labels in subset:
        print(f"  slice_{idx:04d}: {labels}")

Slices with organ content: 151

First 5:
  slice_0139: [0, 1]
  slice_0140: [0, 1]
  slice_0141: [0, 1]
  slice_0142: [0, 1]
  slice_0143: [0, 1]

Last 5:
  slice_0361: [0, 1]
  slice_0362: [0, 1]
  slice_0363: [0, 1]
  slice_0364: [0, 1]
  slice_0365: [0, 1]


In [54]:
from PIL import Image
import numpy as np
from pathlib import Path

# Check the saved mask for slice_0139 (first kidney slice)
mask_path = Path("/content/drive/MyDrive/kidney-tumour-detection/dataset/processed/slices/test_single_case/case_00000/masks/slice_0139.png")

mask = np.array(Image.open(mask_path))
mask_values = np.unique(mask).tolist()

# Print the observed pixel values next to the expected encoding.
report_lines = (
    f"Mask unique values: {mask_values}",
    "Expected          : [0, 85]",
    "  0  = background",
    "  85 = kidney (1 × 85)",
)
for report_line in report_lines:
    print(report_line)

Mask unique values: [0, 85]
Expected          : [0, 85]
  0  = background
  85 = kidney (1 × 85)


In [7]:
# Step 4.3: Full Slice Extraction
# Runs the project's slice extraction script as a shell command.
# NOTE(review): the recorded output suggests the script skips cases that are
# already processed, so re-running this cell should be safe - confirm in
# slice_extraction.py.
!python /content/kidney-tumour-detection/src/preprocessing/slice_extraction.py



Slice Extraction
Dataset root : /content/drive/MyDrive/kidney-tumour-detection/dataset/raw
Output dir   : /content/drive/MyDrive/kidney-tumour-detection/dataset/processed/slices
CT window    : [-79, 304] HU

Processing:       Detection_train
Kidney-only filter: False
Total cases    : 110
Already done   : 110
To process     : 0
All cases already processed. Skipping.

Processing:       Segmentation_train
Kidney-only filter: True
Total cases    : 120
Already done   : 120
To process     : 0
All cases already processed. Skipping.

Processing:       Test
Kidney-only filter: False
Total cases    : 70
Already done   : 50
To process     : 20
Extracting test: 100% 20/20 [07:14<00:00, 21.73s/it]

test summary:
  Cases processed  : 20
  Total slices     : 10240
  Saved slices     : 10240
  Skipped slices   : 0
  Avg saved/case   : 512.0

 Slice extraction complete
Check your Drive for processed/slices/
Verify a few slices visually.


In [14]:
# Delete and re-slice case_00261 due to mismatch in image and mask (79, 78)
# The cell is safe to re-run: the old output folder is only removed when it
# exists, and the preprocessing folder is added to sys.path only once.
import shutil
import sys
from pathlib import Path

import yaml

PREPROCESSING_DIR = '/content/kidney-tumour-detection/src/preprocessing'
if PREPROCESSING_DIR not in sys.path:
    sys.path.insert(0, PREPROCESSING_DIR)
from slice_extraction import process_case

with open('/content/kidney-tumour-detection/configs/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

slices_dir   = Path(config['paths']['slices_dir'])
dataset_root = Path(config['paths']['dataset_root'])
window_min   = config['preprocessing']['ct_window_min']
window_max   = config['preprocessing']['ct_window_max']

case_id    = 'case_00261'
output_dir = slices_dir / 'detection_train'
case_dir   = output_dir / case_id

# Delete incomplete folder (rmtree raises FileNotFoundError when it is missing)
if case_dir.exists():
    shutil.rmtree(case_dir)
    print(f"Deleted: {case_dir}")
else:
    print(f"Nothing to delete: {case_dir} does not exist")

# Reprocess
result = process_case(
    case_id      = case_id,
    dataset_root = dataset_root,
    output_dir   = output_dir,
    window_min   = window_min,
    window_max   = window_max,
    kidney_only  = False
)

# Verify: every saved image must have a matching mask
images_count = len(list((output_dir / case_id / 'images').glob('*.png')))
masks_count  = len(list((output_dir / case_id / 'masks').glob('*.png')))
print(f"Images : {images_count}")
print(f"Masks  : {masks_count}")
print(f"Match  : {images_count == masks_count}")

Deleted: /content/drive/MyDrive/kidney-tumour-detection/dataset/processed/slices/detection_train/case_00261
Images : 512
Masks  : 512
Match  : True


In [29]:
# Drive File Verification — Check slice extraction completeness
# For every split, each case listed in the split CSV is sorted into one of:
#   missing    - no images/ folder
#   incomplete - zero images, zero masks, or differing image/mask counts
#   complete   - equal, non-zero image and mask counts
import pandas as pd
from pathlib import Path

slices_dir = Path("/content/drive/MyDrive/kidney-tumour-detection/dataset/processed/slices")
splits_dir = Path("/content/drive/MyDrive/kidney-tumour-detection/dataset/processed/splits")

splits = {
    split_key: splits_dir / f"{split_key}.csv"
    for split_key in ("detection_train", "segmentation_train", "test")
}

print("Drive Verification")

for split_name, csv_path in splits.items():
    cases_df = pd.read_csv(csv_path)
    case_ids = cases_df['case_id'].tolist()

    complete, incomplete, missing = [], [], []

    for case_id in case_ids:
        case_root  = slices_dir / split_name / case_id
        images_dir = case_root / "images"
        masks_dir  = case_root / "masks"

        if not images_dir.exists():
            missing.append(case_id)
            continue

        img_count  = sum(1 for _ in images_dir.glob("*.png"))
        mask_count = sum(1 for _ in masks_dir.glob("*.png"))

        # Complete means both counts are non-zero and equal.
        if img_count > 0 and img_count == mask_count:
            complete.append(case_id)
        else:
            incomplete.append((case_id, img_count, mask_count))

    print(f"\n{split_name.upper()}")
    summary_rows = (
        ("Expected  ", case_ids),
        ("Complete  ", complete),
        ("Incomplete", incomplete),
        ("Missing   ", missing),
    )
    for row_label, bucket in summary_rows:
        print(f"  {row_label}: {len(bucket)} cases")

    if incomplete:
        print("\n  Incomplete cases (case_id, images, masks):")
        for item in incomplete:
            print(f"    {item}")

    if missing:
        print("\n  Missing cases:")
        for case_id in missing:
            print(f"    {case_id}")

Drive Verification

DETECTION_TRAIN
  Expected  : 110 cases
  Complete  : 110 cases
  Incomplete: 0 cases
  Missing   : 0 cases

SEGMENTATION_TRAIN
  Expected  : 120 cases
  Complete  : 120 cases
  Incomplete: 0 cases
  Missing   : 0 cases

TEST
  Expected  : 70 cases
  Complete  : 70 cases
  Incomplete: 0 cases
  Missing   : 0 cases


In [36]:
 # Pull latest changes and run Step 4.4: YOLO Label Generation
# Update the project checkout from the main branch of origin.
!cd /content/kidney-tumour-detection && git pull origin main

# Run the label generation script from the updated checkout.
!python /content/kidney-tumour-detection/src/preprocessing/yolo_label_generation.py

remote: Enumerating objects: 13, done.[K
remote: Counting objects:   7% (1/13)[Kremote: Counting objects:  15% (2/13)[Kremote: Counting objects:  23% (3/13)[Kremote: Counting objects:  30% (4/13)[Kremote: Counting objects:  38% (5/13)[Kremote: Counting objects:  46% (6/13)[Kremote: Counting objects:  53% (7/13)[Kremote: Counting objects:  61% (8/13)[Kremote: Counting objects:  69% (9/13)[Kremote: Counting objects:  76% (10/13)[Kremote: Counting objects:  84% (11/13)[Kremote: Counting objects:  92% (12/13)[Kremote: Counting objects: 100% (13/13)[Kremote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (1/1)[Kremote: Compressing objects: 100% (1/1), done.[K
remote: Total 7 (delta 5), reused 7 (delta 5), pack-reused 0 (from 0)[K
Unpacking objects:  14% (1/7)Unpacking objects:  28% (2/7)Unpacking objects:  42% (3/7)Unpacking objects:  57% (4/7)Unpacking objects:  71% (5/7)Unpacking objects:  85% (6/7)Unpacking objects: 100% (7

In [38]:
# Spot-check the generated YOLO labels of a few randomly chosen cases.
# For each sampled case the label files are counted, and every row of the
# first non-empty file must be "class cx cy w h" with the four box values
# inside [0, 1].
import random
from pathlib import Path

slices_dir = Path("/content/drive/MyDrive/kidney-tumour-detection/dataset/processed/slices")
detection_dir = slices_dir / "detection_train"

# Pick up to 3 random cases; ignore stray files, and do not let
# random.sample raise when fewer than 3 cases exist.
cases = sorted(p for p in detection_dir.iterdir() if p.is_dir())
sample_cases = random.sample(cases, min(3, len(cases)))

checked_rows = 0
for case_dir in sample_cases:
    labels_dir = case_dir / "labels"
    all_labels = sorted(labels_dir.glob("*.txt"))

    positive = [f for f in all_labels if f.stat().st_size > 0]
    empty    = [f for f in all_labels if f.stat().st_size == 0]

    print(f"\n{case_dir.name}")
    print(f"  Total label files : {len(all_labels)}")
    print(f"  Positive          : {len(positive)}")
    print(f"  Empty             : {len(empty)}")

    # Show content of first positive label
    if positive:
        print(f"  Sample ({positive[0].name}):")
        with open(positive[0]) as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue  # tolerate blank lines
                assert len(parts) == 5, (
                    f"{positive[0]}: expected 5 fields per row, got {len(parts)}"
                )
                print(f"    class={parts[0]}  cx={parts[1]}  cy={parts[2]}  w={parts[3]}  h={parts[4]}")
                # Verify values are in range
                values = [float(x) for x in parts[1:]]
                assert all(0.0 <= v <= 1.0 for v in values), "VALUE OUT OF RANGE"
                checked_rows += 1

# Only claim success when at least one label row was actually validated.
if checked_rows:
    print("\nAll checks passed")
else:
    print("\nNo positive labels found in the sampled cases - nothing was verified")


case_00297
  Total label files : 512
  Positive          : 153
  Empty             : 359
  Sample (slice_0136.txt):
    class=0  cx=0.601562  cy=0.046875  w=0.074219  h=0.007812

case_00238
  Total label files : 512
  Positive          : 174
  Empty             : 338
  Sample (slice_0127.txt):
    class=0  cx=0.551758  cy=0.074219  w=0.087891  h=0.007812

case_00104
  Total label files : 512
  Positive          : 167
  Empty             : 345
  Sample (slice_0110.txt):
    class=0  cx=0.525391  cy=0.091797  w=0.058594  h=0.007812

All checks passed


In [40]:
from pathlib import Path

detection_dir = Path("/content/drive/MyDrive/kidney-tumour-detection/dataset/processed/slices/detection_train")

# Check first 3 cases: report which of images/, masks/ and labels/ exist and
# how many .txt label files the case has.
first_three = sorted(detection_dir.iterdir())[:3]
for case_dir in first_three:
    images, masks, labels = (
        (case_dir / subfolder).exists()
        for subfolder in ("images", "masks", "labels")
    )

    if labels:
        label_count = sum(1 for _ in (case_dir / "labels").glob("*.txt"))
    else:
        label_count = 0

    print(f"{case_dir.name}  images={images}  masks={masks}  labels={labels}  ({label_count} files)")

case_00000  images=True  masks=True  labels=True  (512 files)
case_00004  images=True  masks=True  labels=True  (512 files)
case_00005  images=True  masks=True  labels=True  (512 files)


In [52]:
# Pull latest changes and run Step 4.5: YOLO Dataset Structure
# Update the project checkout from the main branch of origin.
!cd /content/kidney-tumour-detection && git pull origin main

# Run the dataset structure script from the updated checkout.
!python /content/kidney-tumour-detection/src/preprocessing/yolo_dataset_structure.py

From https://github.com/danokundaye/kidney-tumour-detection
 * branch            main       -> FETCH_HEAD
Already up to date.

YOLO Dataset Structure

Step 1: Splitting detection cases...
  Malignant cases : 101
  Benign cases    : 9
  Train cases : 100
  Val cases   : 10
  Val set     : ['case_00034', 'case_00052', 'case_00089', 'case_00127', 'case_00193', 'case_00222', 'case_00233', 'case_00238', 'case_00267', 'case_00289']

Step 2: Collecting image paths...
  Train images : 51484
  Val images   : 5120

Step 3: Writing output files...
  Written : /content/drive/MyDrive/kidney-tumour-detection/dataset/processed/splits/yolo_train.txt (51484 images)
  Written : /content/drive/MyDrive/kidney-tumour-detection/dataset/processed/splits/yolo_val.txt (5120 images)
  Written : /content/drive/MyDrive/kidney-tumour-detection/dataset/processed/splits/yolo_data.yaml

Step 4: Verification checks...
  Total images (train + val) : 56604 — matches Step 4.4? YES
  Sample path exists         : YES
  Sam