In [92]:
import SimpleITK as sitk
import numpy as np
import csv
import pandas as pd
import os
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline


In [93]:
def load_itk_image(filename):
    itkimage = sitk.ReadImage(filename)
    numpyImage = sitk.GetArrayFromImage(itkimage)
     
    numpyOrigin = np.array(list(reversed(itkimage.GetOrigin())))
    numpySpacing = np.array(list(reversed(itkimage.GetSpacing())))
     
    return numpyImage, numpyOrigin, numpySpacing

In [94]:
def readCSV(filename):
    lines = []
    with open(filename, "rb") as f:
        csvreader = csv.reader(f)
        for line in csvreader:
            lines.append(line)
    return lines

In [95]:
def worldToVoxelCoord(worldCoord, origin, spacing):
     
    stretchedVoxelCoord = np.absolute(worldCoord - origin)
    voxelCoord = stretchedVoxelCoord / spacing
    return voxelCoord

In [96]:
def normalizePlanes(npzarray):
     
    maxHU = 400.
    minHU = -1000.
 
    npzarray = (npzarray - minHU) / (maxHU - minHU)
    npzarray[npzarray>1] = 1.
    npzarray[npzarray<0] = 0.
    return npzarray

In [97]:
folder_path = "subset0"
files = os.listdir(folder_path)
mhd_files = [f for f in files if f.endswith('.mhd')]
first_file = os.path.join(folder_path, mhd_files[0])



In [98]:

volumes = {}

for mhd_file in mhd_files:
    full_path = os.path.join(folder_path, mhd_file)

    # load the image
    numpyImage, numpyOrigin, numpySpacing = load_itk_image(full_path)

    # make a key from filename (this will match seriesuid in candidates.csv)
    seriesuid = os.path.splitext(mhd_file)[0]

    volumes[seriesuid] = {
        "image": numpyImage,
        "origin": numpyOrigin,
        "spacing": numpySpacing,
        "path": full_path,
    }

# optional: inspect what we loaded
print(f"Loaded {len(volumes)} volumes.")
for uid, data in list(volumes.items())[:3]:   # show first 3
    print("UID:", uid)
    print("  shape:", data["image"].shape)
    print("  origin:", data["origin"])
    print("  spacing:", data["spacing"])


Loaded 89 volumes.
UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.430109407146633213496148200410
  shape: (733, 512, 512)
  origin: [-441.         -332.68164062 -168.68164062]
  spacing: [0.5        0.63671875 0.63671875]
UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.621916089407825046337959219998
  shape: (133, 512, 512)
  origin: [-357.5 -220.  -223. ]
  spacing: [2.5      0.859375 0.859375]
UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.238522526736091851696274044574
  shape: (183, 512, 512)
  origin: [-342.6     -286.66797 -169.66797]
  spacing: [1.79999995 0.6640625  0.6640625 ]


In [99]:
  # show first few files

img_path  = 'data/1.3.6.1.4.1.14519.5.2.1.6279.6001.148447286464082095534651426689.mhd'
cand_path = "candidates_V2.csv"

In [100]:
numpyImage, numpyOrigin, numpySpacing = load_itk_image(first_file)
print(numpyImage.shape)
print(numpyOrigin)
print(numpySpacing)

(733, 512, 512)
[-441.         -332.68164062 -168.68164062]
[0.5        0.63671875 0.63671875]


In [101]:
cands = pd.read_csv(cand_path)
cands.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,68.42,-74.48,-288.7,0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-95.209361,-91.809406,-377.42635,0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-24.766755,-120.379294,-273.361539,0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-63.08,-65.74,-344.24,0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,52.946688,-92.688873,-241.067872,0


In [102]:
for idx, row in cands.iterrows():
    worldCoord = np.asarray([row["coordX"], row["coordY"], row["coordZ"]], dtype=float)
    voxelCoord = worldToVoxelCoord(worldCoord, numpyOrigin, numpySpacing)
    voxelWidth = 65


In [103]:

outputDir = "patches"
os.makedirs(outputDir, exist_ok=True)

voxelWidth = 65
half = voxelWidth // 2   # 32

n_total = 0
n_uid_missing = 0
n_saved = 0
n_padded = 0

for _, row in cands.iterrows():
    n_total += 1
    uid = row["seriesuid"]

    # 1) skip candidates from CTs we didn't load
    if uid not in volumes:
        n_uid_missing += 1
        continue

    vol = volumes[uid]
    numpyImage  = vol["image"]   # (Z, Y, X)
    numpyOrigin = vol["origin"]
    numpySpacing= vol["spacing"]

    # CSV is (X,Y,Z); our origin/spacing are (Z,Y,X) â†’ reverse
    worldCoord = np.array([row["coordZ"], row["coordY"], row["coordX"]], dtype=float)

    voxelCoord = worldToVoxelCoord(worldCoord, numpyOrigin, numpySpacing).astype(int)
    z, y, x = voxelCoord
    Z, Y, X = numpyImage.shape

    # z must be valid
    if z < 0 or z >= Z:
        continue

    # slice bounds (clamped)
    y1 = max(0, y - half)
    y2 = min(Y, y + half + 1)   # +1 so we can actually reach 65 sometimes
    x1 = max(0, x - half)
    x2 = min(X, x + half + 1)

    # extract whatever we can
    patch = numpyImage[z, y1:y2, x1:x2]

    # now pad to exactly (65, 65) if needed
    pad_y = voxelWidth - patch.shape[0]
    pad_x = voxelWidth - patch.shape[1]

    if pad_y > 0 or pad_x > 0:
        # pad as (top,bottom), (left,right)
        patch = np.pad(
            patch,
            ((0, max(0, pad_y)), (0, max(0, pad_x))),
            mode='constant',
            constant_values=0
        )
        n_padded += 1

    # normalize
    patch = normalizePlanes(patch)

    # save
    save_name = f"patch_{uid}_{worldCoord[0]:.2f}_{worldCoord[1]:.2f}_{worldCoord[2]:.2f}.tiff"
    save_path = os.path.join(outputDir, save_name)
    Image.fromarray((patch * 255).astype(np.uint8)).save(save_path)
    n_saved += 1

# summary
print("TOTAL candidates:", n_total)
print("  skipped (uid not loaded):", n_uid_missing)
print("  padded patches:", n_padded)
print("  SAVED patches:", n_saved)
print("Files in patches/:", os.listdir(outputDir)[:10])


TOTAL candidates: 754975
  skipped (uid not loaded): 675840
  padded patches: 62
  SAVED patches: 79135
Files in patches/: ['patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.210837812047373739447725050963_-263.43_-80.75_-93.55.tiff', 'patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.657775098760536289051744981056_-161.22_58.23_69.25.tiff', 'patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.716498695101447665580610403574_-181.19_-112.83_47.07.tiff', 'patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.525937963993475482158828421281_-72.02_57.39_47.44.tiff', 'patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.194440094986948071643661798326_-98.80_-165.46_-64.19.tiff', 'patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.905371958588660410240398317235_-120.16_43.19_78.45.tiff', 'patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.137763212752154081977261297097_-447.90_220.12_-39.09.tiff', 'patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.323859712968543712594665815359_-167.81_13.40_73.29.tiff', 'patch_1.3.6.1.4.1.14519.5.2.1.6279.6001.310548927038333190233889983845_-121.