In [None]:
# Development convenience: load IPython's autoreload extension so that edits
# to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import torch

from diffdrr.drr import DRR
from diffdrr.data import load_example_ct
from diffdrr.visualization import plot_drr

In [None]:
# Read in the example volume and its spacing
volume, spacing = load_example_ct()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Get parameters for the detector: the translation is half of
# (volume shape * spacing) along each axis.
bx, by, bz = np.array(volume.shape) * np.array(spacing) / 2

# Create both pose tensors directly on `device` with an explicit dtype.
# Leaving dtype to inference means it depends on how torch treats NumPy
# scalars (bx, by, bz) versus Python numbers (np.pi, 0); pinning it
# guarantees the two tensors always agree.
translations = torch.tensor([[bx, by, bz]], dtype=torch.float32, device=device)
rotations = torch.tensor([[np.pi, 0, np.pi / 2]], dtype=torch.float32, device=device)

In [None]:
#|cuda
height = 100

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0).to(device)
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

9.27 ms ± 359 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
#|cuda
height = 200

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0).to("cuda" if torch.cuda.is_available() else "cpu")
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

33.4 ms ± 57.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
#|cuda
height = 300

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0).to("cuda" if torch.cuda.is_available() else "cpu")
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

72.3 ms ± 16.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
#|cuda
height = 400

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0).to("cuda" if torch.cuda.is_available() else "cpu")
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

123 ms ± 31.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
#|cuda
height = 500

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0).to("cuda" if torch.cuda.is_available() else "cpu")
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

187 ms ± 15 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Memory constraints

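Up until this point, we could compute every ray in the DRR in one go on the GPU. However, as the DRRs get bigger, we will quickly run out of memory. For example, on a 12 GB GPU, computing a 600 by 600 DRR in a single pass will raise a CUDA out-of-memory error. To get around this, the cells below pass a `patch_size` argument to `DRR` so that the image is not computed all at once. In each example, `height` is a multiple of `patch_size`.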

In [None]:
#|cuda
height = 600
patch_size = 150

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0, patch_size=patch_size).to("cuda" if torch.cuda.is_available() else "cpu")
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

184 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
#|cuda
height = 750
patch_size = 150

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0, patch_size=patch_size).to("cuda" if torch.cuda.is_available() else "cpu")
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

261 ms ± 367 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
#|cuda
height = 1000
patch_size = 250

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0, patch_size=patch_size).to("cuda" if torch.cuda.is_available() else "cpu")
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

419 ms ± 85.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
#|cuda
height = 1500
patch_size = 250

drr = DRR(volume, spacing, sdr=300.0, height=height, delx=4.0, patch_size=patch_size).to("cuda" if torch.cuda.is_available() else "cpu")
%timeit drr(rotations, translations, "euler_angles", "ZYX")
del drr

837 ms ± 273 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


With `patch_size`, the limiting factor is no longer the memory needed to compute the DRR, only the memory needed to store the finished image.