Load file

In [1]:
import os
from pathlib import Path
from typing import Dict, Optional, Union

import numpy as np
import pandas as pd
from numpy.typing import NDArray


def load_file(file_path: Path) -> Union[NDArray, Dict[str, NDArray]]:
    """
    Load a file in one of the formats provided in the OpenKBP dataset
    """
    if file_path.stem == "voxel_dimensions":
        return np.loadtxt(file_path)

    loaded_file_df = pd.read_csv(file_path, index_col=0)
    if loaded_file_df.isnull().values.any():  # Data is a mask
        loaded_file = np.array(loaded_file_df.index).squeeze()
    else:  # Data is a sparse matrix
        loaded_file = {"indices": loaded_file_df.index.values, "data": loaded_file_df.data.values}

    return loaded_file


def get_paths(directory_path: Path, extension: Optional[str] = None) -> list[Path]:
    """
    Get the paths of every file contained in `directory_path` that also has the extension `extension` if one is provided.
    """
    all_paths = []

    if not directory_path.is_dir():
        pass
    elif extension is None:
        dir_list = os.listdir(directory_path)
        for name in dir_list:
            if "." != name[0]:  # Ignore hidden files
                all_paths.append(directory_path / str(name))
    else:
        data_root = Path(directory_path)
        for file_path in data_root.glob("*.{}".format(extension)):
            file_path = Path(file_path)
            if "." != file_path.stem[0]:
                all_paths.append(file_path)

    return all_paths


def sparse_vector_function(x, indices=None) -> dict[str, NDArray]:
    """Convert a tensor into a dictionary of the non-zero values and their corresponding indices
    :param x: the tensor or, if indices is not None, the values that belong at each index
    :param indices: the raveled indices of the tensor
    :return:  sparse vector in the form of a dictionary
    """
    if indices is None:
        y = {"data": x[x > 0], "indices": np.nonzero(x.flatten())[-1]}
    else:
        y = {"data": x[x > 0], "indices": indices[x > 0]}
    return y


Data Shpae

In [2]:
from typing import Union

from numpy.typing import NDArray


class DataShapes:
    def __init__(self, num_rois):
        self.num_rois = num_rois
        self.patient_shape = (128, 128, 128)

    @property
    def dose(self) -> tuple[int, int, int, int]:
        """Dose deposited within the patient tensor"""
        return self.patient_shape + (1,)

    @property
    def predicted_dose(self) -> tuple[int, int, int, int]:
        """Predicted dose that should be deposited within the patient tensor"""
        return self.dose

    @property
    def ct(self) -> tuple[int, int, int, int]:
        """CT image grey scale within the patient tensor"""
        return self.patient_shape + (1,)

    @property
    def structure_masks(self) -> tuple[int, int, int, int]:
        """Mask of all structures in patient"""
        return self.patient_shape + (self.num_rois,)

    @property
    def possible_dose_mask(self) -> tuple[int, int, int, int]:
        """Mask where dose can be deposited"""
        return self.patient_shape + (1,)

    @property
    def voxel_dimensions(self) -> tuple[float]:
        """Physical dimensions of patient voxels (in mm)"""
        return tuple((3,))

    def from_data_names(self, data_names: list[str]) -> dict[str, Union[NDArray, tuple[float]]]:
        data_shapes = {}
        for name in data_names:
            data_shapes[name] = getattr(self, name)
        return data_shapes


Batch Data


In [3]:
from __future__ import annotations

from pathlib import Path
from typing import Optional

import numpy as np
from numpy.typing import NDArray


class DataBatch:
    def __init__(
        self,
        dose: Optional[NDArray] = None,
        predicted_dose: Optional[NDArray] = None,
        ct: Optional[NDArray] = None,
        structure_masks: Optional[NDArray] = None,
        structure_mask_names: Optional[list[str]] = None,
        possible_dose_mask: Optional[NDArray] = None,
        voxel_dimensions: Optional[NDArray] = None,
        patient_list: Optional[list[str]] = None,
        patient_path_list: Optional[list[Path]] = None,
    ):
        self.dose = dose
        self.predicted_dose = predicted_dose
        self.ct = ct
        self.structure_masks = structure_masks
        self.structure_mask_names = structure_mask_names
        self.possible_dose_mask = possible_dose_mask
        self.voxel_dimensions = voxel_dimensions
        self.patient_list = patient_list
        self.patient_path = patient_path_list

    @classmethod
    def initialize_from_required_data(cls, data_dimensions: dict[str, NDArray], batch_size: int) -> DataBatch:
        attribute_values = {}
        for data, dimensions in data_dimensions.items():
            batch_data_dimensions = (batch_size, *dimensions)
            attribute_values[data] = np.zeros(batch_data_dimensions)
        return cls(**attribute_values)

    def set_values(self, data_name: str, batch_index: int, values: NDArray):
        getattr(self, data_name)[batch_index] = values

    def get_index_structure_from_structure(self, structure_name: str):
        return self.structure_mask_names.index(structure_name)


Data Load

In [4]:
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Union

import numpy as np
from more_itertools import windowed
from numpy.typing import NDArray
from tqdm import tqdm


class DataLoader:
    """Loads OpenKBP csv data in structured format for dose prediction models."""

    def __init__(self, patient_paths: List[Path], batch_size: int = 2):
        """
        :param patient_paths: list of the paths where data for each patient is stored
        :param batch_size: the number of data points to lead in a single batch
        """
        self.patient_paths = patient_paths
        self.batch_size = batch_size

        # Light processing of attributes
        self.paths_by_patient_id = {patient_path.stem: patient_path for patient_path in self.patient_paths}
        self.required_files: Optional[Dict] = None
        self.mode_name: Optional[str] = None

        # Parameters that should not be changed unless OpenKBP data is modified
        self.rois = dict(
            oars=["Brainstem", "SpinalCord", "RightParotid", "LeftParotid", "Esophagus", "Larynx", "Mandible"],
            targets=["PTV56", "PTV63", "PTV70"],
        )
        self.full_roi_list = sum(map(list, self.rois.values()), [])  # make a list of all rois
        self.num_rois = len(self.full_roi_list)
        self.data_shapes = DataShapes(self.num_rois)

    @property
    def patient_id_list(self) -> List[str]:
        return list(self.paths_by_patient_id.keys())

    def get_batches(self) -> Iterator[DataBatch]:
        batches = windowed(self.patient_paths, n=self.batch_size, step=self.batch_size)
        complete_batches = (batch for batch in batches if None not in batch)
        for batch_paths in tqdm(complete_batches):
            yield self.prepare_data(batch_paths)

    def get_patients(self, patient_list: List[str]) -> DataBatch:
        file_paths_to_load = [self.paths_by_patient_id[patient] for patient in patient_list]
        return self.prepare_data(file_paths_to_load)

    def set_mode(self, mode: str) -> None:
        """Set parameters based on `mode`."""
        self.mode_name = mode
        if mode == "training_model":
            required_data = ["dose", "ct", "structure_masks", "possible_dose_mask", "voxel_dimensions"]
        elif mode == "predicted_dose":
            required_data = [mode]
            self._force_batch_size_one()
        elif mode == "evaluation":
            required_data = ["dose", "structure_masks", "possible_dose_mask", "voxel_dimensions"]
            self._force_batch_size_one()
        elif mode == "dose_prediction":
            required_data = ["ct", "structure_masks", "possible_dose_mask", "voxel_dimensions"]
            self._force_batch_size_one()
        else:
            raise ValueError(f"Mode `{mode}` does not exist. Mode must be either training_model, prediction, predicted_dose, or evaluation")
        self.required_files = self.data_shapes.from_data_names(required_data)

    def _force_batch_size_one(self) -> None:
        if self.batch_size != 1:
            self.batch_size = 1
            Warning("Batch size has been changed to 1 for dose prediction mode")

    def shuffle_data(self) -> None:
        np.random.shuffle(self.patient_paths)

    def prepare_data(self, file_paths_to_load: List[Path]) -> DataBatch:
        """Prepares data containing samples in batch so that they are loaded in the proper shape: (n_samples, *dim, n_channels)"""

        batch_data = DataBatch.initialize_from_required_data(self.required_files, self.batch_size)
        batch_data.patient_list = [patient_path.stem for patient_path in file_paths_to_load]
        batch_data.patient_path_list = file_paths_to_load
        batch_data.structure_mask_names = self.full_roi_list

        # Populate batch with requested data
        for index, patient_path in enumerate(file_paths_to_load):
            raw_data = self.load_data(patient_path)
            for key in self.required_files:
                batch_data.set_values(key, index, self.shape_data(key, raw_data))

        return batch_data

    def load_data(self, path_to_load: Path) -> Union[NDArray, dict[str, NDArray]]:
        """Load data in its raw form."""
        data = {}
        if path_to_load.is_dir():
            files_to_load = get_paths(path_to_load)
            for file_path in files_to_load:
                is_required = file_path.stem in self.required_files
                is_required_roi = file_path.stem in self.full_roi_list
                if is_required or is_required_roi:
                    data[file_path.stem] = load_file(file_path)
        else:
            data[self.mode_name] = load_file(path_to_load)

        return data

    def shape_data(self, key: str, data: dict) -> NDArray:
        """Shapes into form that is amenable to tensorflow and other deep learning packages."""

        shaped_data = np.zeros(self.required_files[key])

        if key == "structure_masks":
            for roi_idx, roi in enumerate(self.full_roi_list):
                if roi in data.keys():
                    np.put(shaped_data, self.num_rois * data[roi] + roi_idx, int(1))
        elif key == "possible_dose_mask":
            np.put(shaped_data, data[key], int(1))
        elif key == "voxel_dimensions":
            shaped_data = data[key]
        else:
            np.put(shaped_data, data[key]["indices"], data[key]["data"])

        return shaped_data


Dose Evalution

In [5]:
from typing import Optional

import numpy as np
import pandas as pd
from numpy.typing import NDArray

class DoseEvaluator:
    """Evaluate a full dose distribution against the reference dose on the OpenKBP competition metrics"""

    def __init__(self, reference_data_loader: DataLoader, prediction_loader: Optional[DataLoader] = None):
        self.reference_data_loader = reference_data_loader
        self.prediction_loader = prediction_loader

        # Initialize objects for later
        self.reference_batch: Optional[DataBatch] = None
        self.prediction_batch: Optional[DataBatch] = None

        # Define evaluation metrics for each roi
        oar_dvh_metrics = {oar: ["D_0.1_cc", "mean"] for oar in self.reference_data_loader.rois["oars"]}
        target_dvh_metrics = {target: ["D_99", "D_95", "D_1"] for target in self.reference_data_loader.rois["targets"]}
        self.all_dvh_metrics = oar_dvh_metrics | target_dvh_metrics

        # Make data frames to cache evaluation metrics
        metric_columns = [(m, roi) for roi, metrics in self.all_dvh_metrics.items() for m in metrics]
        self.dose_errors = pd.Series(index=self.reference_data_loader.patient_id_list, data=None, dtype=float)
        self.dvh_metric_differences_df = pd.DataFrame(index=self.reference_data_loader.patient_id_list, columns=metric_columns)
        self.reference_dvh_metrics_df = self.dvh_metric_differences_df.copy()
        self.prediction_dvh_metrics_df = self.dvh_metric_differences_df.copy()

    def evaluate(self):
        """Calculate the  dose and DVH scores for the "new_dose" relative to the "reference_dose"""
        if not self.reference_data_loader.patient_paths:
            raise ValueError("No reference patient data was provided, so no metrics can be calculated")
        if self.prediction_loader:
            Warning("No predicted dose loader was provided. Metrics were only calculated for the reference dose.")
        self._set_data_loader_mode()

        for self.reference_batch in self.reference_data_loader.get_batches():
            self.reference_dvh_metrics_df = self._calculate_dvh_metrics(self.reference_dvh_metrics_df, self.reference_dose)

            self.prediction_batch = self.prediction_loader.get_patients([self.patient_id]) if self.prediction_loader else None
            if self.predicted_dose is not None:
                patient_dose_error = np.sum(np.abs(self.reference_dose - self.predicted_dose)) / np.sum(self.possible_dose_mask)
                self.dose_errors[self.patient_id] = patient_dose_error
                self.prediction_dvh_metrics_df = self._calculate_dvh_metrics(self.prediction_dvh_metrics_df, self.predicted_dose)

    def get_scores(self) -> tuple[NDArray, NDArray]:
        dose_score = np.nanmean(self.dose_errors)
        dvh_errors = np.abs(self.reference_dvh_metrics_df - self.prediction_dvh_metrics_df)
        dvh_score = np.nanmean(dvh_errors.values)
        return dose_score, dvh_score

    def _set_data_loader_mode(self) -> None:
        self.reference_data_loader.set_mode("evaluation")
        if self.prediction_loader:
            self.prediction_loader.set_mode("predicted_dose")

    def _calculate_dvh_metrics(self, metric_df: pd.DataFrame, dose: NDArray) -> pd.DataFrame:
        """
        Calculate the DVH values that were used to evaluate submissions in the competition.
        :param metric_df: A DataFrame with columns indexed by the metric name and the structure name
        :param dose: the dose to be evaluated
        :return: the same metric_df that is input, but now with the metrics for the provided dose
        """
        voxels_within_tenths_cc = np.maximum(1, np.round(100 / self.voxel_size))
        for roi in self.reference_data_loader.full_roi_list:
            roi_mask = self.get_roi_mask(roi)
            if roi_mask is None:
                continue  # Skip over ROIs when the ROI is missing (i.e., not contoured)
            roi_dose = dose[roi_mask]
            for metric in self.all_dvh_metrics[roi]:
                if metric == "D_0.1_cc":
                    roi_size = len(roi_dose)
                    fractional_volume_to_evaluate = 100 - voxels_within_tenths_cc / roi_size * 100
                    metric_value = np.percentile(roi_dose, fractional_volume_to_evaluate)
                elif metric == "mean":
                    metric_value = roi_dose.mean()
                elif metric == "D_99":
                    metric_value = np.percentile(roi_dose, 1)
                elif metric == "D_95":
                    metric_value = np.percentile(roi_dose, 5)
                elif metric == "D_1":
                    metric_value = np.percentile(roi_dose, 99)
                else:
                    raise ValueError(f"Metrics {metric} is not supported.")
                metric_df.at[self.patient_id, (metric, roi)] = metric_value

        return metric_df

    def get_roi_mask(self, roi_name: str) -> Optional[NDArray]:
        roi_index = self.reference_batch.get_index_structure_from_structure(roi_name)
        mask = self.reference_batch.structure_masks[:, :, :, :, roi_index].astype(bool)
        flat_mask = mask.flatten()
        return flat_mask if any(flat_mask) else None

    @property
    def patient_id(self) -> str:
        patient_id, *_ = self.reference_batch.patient_list if self.reference_batch.patient_list else [None]
        return patient_id

    @property
    def voxel_size(self) -> NDArray:
        return np.prod(self.reference_batch.voxel_dimensions)

    @property
    def possible_dose_mask(self) -> NDArray:
        return self.reference_batch.possible_dose_mask

    @property
    def reference_dose(self) -> NDArray:
        return self.reference_batch.dose.flatten()

    @property
    def predicted_dose(self) -> NDArray:
        return self.prediction_batch.predicted_dose.flatten()


In [6]:
import tensorflow as tf
from keras.layers import Layer
class Mish(Layer):
    def __init__(self):
        super(Mish, self).__init__()

    def call(self, inputs):
        return inputs * tf.math.tanh(tf.math.softplus(inputs))

tf.keras.utils.get_custom_objects().update({'Mish': Mish})

Network Achitecture

In [7]:
from typing import Optional
from keras.layers import Activation, AveragePooling3D, Conv3D, Conv3DTranspose, Input, LeakyReLU, SpatialDropout3D, concatenate, MaxPooling3D, Dense, UpSampling3D
from keras.layers import BatchNormalization
from keras.models import Model

class DefineDoseFromCT:

    def __init__(
        self,
        data_shapes: DataShapes,
        gen_optimizer,
    ):
        self.data_shapes = data_shapes
        self.gen_optimizer = gen_optimizer

    def make_downsample_block(self, number_of_filter, x, apply_batchnorm = True):
        x = Conv3D(filters=number_of_filter, kernel_size=(4,4,4), strides= 2, padding='same', use_bias=False)(x)
        if apply_batchnorm == True:
            x = BatchNormalization(momentum=0.99, epsilon=1e-3)(x)
        x = Mish()(x)

        return x

    def make_upsamle_block(self, number_of_filter, x, apply_dropout = True, skip_x = None):
        if skip_x is not None:
            x = concatenate([x, skip_x])
        x = Conv3DTranspose(filters=number_of_filter, kernel_size=(4,4,4), strides=(2,2,2), padding='same', use_bias=False)(x)
        x = BatchNormalization(momentum=0.99, epsilon=1e-3)(x)
        if apply_dropout:
            x = SpatialDropout3D(0.2)(x)
        x = Mish()(x)
        return x

    def make_resnet_block(self, input_filter, x):
        x = Conv3D(filters=64, kernel_size=(1,1,1), strides=(1,1,1), padding='same', use_bias=False)(x)
        x = Conv3D(filters=64, kernel_size=(3,3,3), strides=(1,1,1), padding='same', use_bias=False)(x)
        x = Conv3D(filters=input_filter, kernel_size=(1,1,1), strides=(1,1,1), padding='same', use_bias=False)(x)
        return x

    def define_generator(self) -> Model:

        ct_image = Input(self.data_shapes.ct)
        roi_masks = Input(self.data_shapes.structure_masks)

        x = concatenate([ct_image, roi_masks])

        conv0 = self.make_downsample_block(64, x)
        conv0 = self.make_resnet_block(64, conv0)

        conv1 = self.make_downsample_block(128, conv0)
        conv1 = self.make_resnet_block(128, conv1)

        conv2 = self.make_downsample_block(256, conv1)
        conv2 = self.make_resnet_block(256, conv2)

        conv3 = self.make_downsample_block(512, conv2)
        conv3 = self.make_resnet_block(512, conv3)

        conv4 = self.make_downsample_block(512, conv3)
        conv4 = self.make_resnet_block(512, conv4)

        conv5 = self.make_downsample_block(512, conv4)
        conv5 = self.make_resnet_block(512, conv5)

        conv6 = self.make_downsample_block(512, conv5)

        conv5b = self.make_upsamle_block(number_of_filter=512, x = conv6, apply_dropout=False)
        conv5b = self.make_resnet_block(512, conv5b)

        conv4b = self.make_upsamle_block(number_of_filter=512, x = conv5b, skip_x=conv5)
        conv4b = self.make_resnet_block(512, conv4b)

        conv3b = self.make_upsamle_block(number_of_filter=512, x = conv4b, skip_x=conv4, apply_dropout=False)
        conv3b = self.make_resnet_block(512, conv3b)

        conv2b = self.make_upsamle_block(number_of_filter=256, x = conv3b, skip_x=conv3)
        conv2b = self.make_resnet_block(256, conv2b)

        conv1b = self.make_upsamle_block(number_of_filter=128, x = conv2b, skip_x=conv2, apply_dropout=False)
        conv1b = self.make_resnet_block(128, conv1b)

        conv0b = self.make_upsamle_block(number_of_filter=64, x=conv1b, skip_x=conv1, apply_dropout=False)
        conv0b = self.make_resnet_block(64, conv0b)

        xb = concatenate([conv0b, conv0])
        x0b = Conv3DTranspose(1, kernel_size=(4,4,4), strides=(2,2,2), padding="same")(xb)

        x_final = AveragePooling3D((3, 3, 3), strides=(1, 1, 1), padding="same")(x0b)
        output = Activation("relu")(x_final)

        generator = Model(inputs=[ct_image, roi_masks], outputs=output, name="generator")
        generator.compile(loss="mean_absolute_error", optimizer=self.gen_optimizer)
        generator.summary()
        return generator


In [8]:
import os
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from keras.models import load_model
from keras.optimizers import Adam

class PredictionModel(DefineDoseFromCT):
    def __init__(self, data_loader: DataLoader, results_patent_path: Path, model_name: str, stage: str) -> None:
        """
        :param data_loader: An object that loads batches of image data
        :param results_patent_path: The path at which all results and generated models will be saved
        :param model_name: The name of your model, used when saving and loading data
        :param stage: Identify stage of model development (train, validation, test)
        """
        super().__init__(
            data_shapes=data_loader.data_shapes,
            gen_optimizer=Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.999),
        )

        # set attributes for data shape from data loader
        self.generator = None
        self.model_name = model_name
        self.data_loader = data_loader
        self.full_roi_list = data_loader.full_roi_list

        # Define training parameters
        self.current_epoch = 0
        self.last_epoch = 200

        # Make directories for data and models
        model_results_path = results_patent_path / model_name
        self.model_dir = model_results_path / "models"
        self.model_dir.mkdir(parents=True, exist_ok=True)
        self.prediction_dir = model_results_path / f"{stage}-predictions"
        self.prediction_dir.mkdir(parents=True, exist_ok=True)

        # Make template for model path
        self.model_path_template = self.model_dir / "epoch_"

    def train_model(self, epochs: int = 200, save_frequency: int = 5, keep_model_history: int = 2) -> None:
        """
        :param epochs: the number of epochs the model will be trained over
        :param save_frequency: how often the model will be saved (older models will be deleted to conserve storage)
        :param keep_model_history: how many models are kept on a rolling basis (deletes older than save_frequency * keep_model_history epochs)
        """
        self._set_epoch_start()
        self.last_epoch = epochs
        self.initialize_networks()
        if self.current_epoch == epochs:
            print(f"The model has already been trained for {epochs}, so no more training will be done.")
            return
        self.data_loader.set_mode("training_model")
        for epoch in range(self.current_epoch, epochs):
            self.current_epoch = epoch
            print(f"Beginning epoch {self.current_epoch}")
            self.data_loader.shuffle_data()

            for idx, batch in enumerate(self.data_loader.get_batches()):
                model_loss = self.generator.train_on_batch([batch.ct, batch.structure_masks], [batch.dose])
                print(f"Model loss at epoch {self.current_epoch} batch {idx} is {model_loss:.3f}")

            self.manage_model_storage(save_frequency, keep_model_history)

    def _set_epoch_start(self) -> None:
        all_model_paths = get_paths(self.model_dir, extension="h5")
        for model_path in all_model_paths:
            *_, epoch_number = model_path.stem.split("epoch_")
            if epoch_number.isdigit():
                self.current_epoch = max(self.current_epoch, int(epoch_number))

    def initialize_networks(self) -> None:
        if self.current_epoch >= 1:
            self.generator = load_model(self._get_generator_path(self.current_epoch))
        else:
            self.generator = self.define_generator()

    def manage_model_storage(self, save_frequency: int = 1, keep_model_history: Optional[int] = None) -> None:
        """
        Manage the model storage while models are trained. Note that old models are deleted based on how many models the users has asked to keep.
        We overwrite old files (rather than deleting them) to ensure the Collab users don't fill up their Google Drive trash.
        :param save_frequency: how often the model will be saved (older models will be deleted to conserve storage)
        :param keep_model_history: how many models back are kept (older models will be deleted to conserve storage)
        """
        effective_epoch_number = self.current_epoch + 1  # Epoch number + 1 because we're at the start of the next epoch
        if 0 < np.mod(effective_epoch_number, save_frequency) and effective_epoch_number != self.last_epoch:
            Warning(f"Model at the end of epoch {self.current_epoch} was not saved because it is skipped when save frequency {save_frequency}.")
            return

        # The code below is clunky and was only included to bypass the Google Drive trash, which fills quickly with normal save/delete functions
        epoch_to_overwrite = effective_epoch_number - keep_model_history * (save_frequency or float("inf"))
        if epoch_to_overwrite >= 0:
            initial_model_path = self._get_generator_path(epoch_to_overwrite)
            self.generator.save(initial_model_path)
            os.rename(initial_model_path, self._get_generator_path(effective_epoch_number))  # Helps bypass Google Drive trash
        else:  # Save via more conventional method because there is no model to overwrite
            self.generator.save(self._get_generator_path(effective_epoch_number))

    def _get_generator_path(self, epoch: Optional[int] = None) -> Path:
        epoch = epoch or self.current_epoch
        return self.model_dir / f"epoch_{epoch}.h5"

    def predict_dose(self, epoch: int = 1) -> None:
        """Predicts the dose for the given epoch number"""
        self.generator = load_model(self._get_generator_path(epoch))
        os.makedirs(self.prediction_dir, exist_ok=True)
        self.data_loader.set_mode("dose_prediction")

        print("Predicting dose with generator.")
        for batch in self.data_loader.get_batches():
            dose_pred = self.generator.predict([batch.ct, batch.structure_masks])
            dose_pred = dose_pred * batch.possible_dose_mask
            dose_pred = np.squeeze(dose_pred)
            dose_to_save = sparse_vector_function(dose_pred)
            dose_df = pd.DataFrame(data=dose_to_save["data"].squeeze(), index=dose_to_save["indices"].squeeze(), columns=["data"])
            (patient_id,) = batch.patient_list
            dose_df.to_csv("{}/{}.csv".format(self.prediction_dir, patient_id))


In [9]:
import shutil
from pathlib import Path

if __name__ == "__main__":

    prediction_name = "baseline"  # Name model to train and number of epochs to train it for
    test_time = False  # Only change this to True when the model has been fully tuned on the validation set
    num_epochs = 100  # This should probably be increased to 100-200 after your dry run

    # Define project directories
    primary_directory = Path().resolve()  # directory where everything is stored
    provided_data_dir = primary_directory / "provided-data"
    training_data_dir = provided_data_dir / "train-pats"
    validation_data_dir = provided_data_dir / "validation-pats"
    testing_data_dir = provided_data_dir / "test-pats"
    results_dir = primary_directory / "results_3"  # where any data generated by this code (e.g., predictions, models) are stored

    # Prepare the data directory
    training_plan_paths = get_paths(training_data_dir)  # gets the path of each plan's directory

    # Train a model
    data_loader_train = DataLoader(training_plan_paths, batch_size = 2)
    dose_prediction_model_train = PredictionModel(data_loader_train, results_dir, prediction_name, "train")
    dose_prediction_model_train.train_model(num_epochs, save_frequency=1, keep_model_history=1)

    # Define hold out set
    hold_out_data_dir = validation_data_dir if test_time is False else testing_data_dir
    stage_name, _ = hold_out_data_dir.stem.split("-")
    hold_out_plan_paths = get_paths(hold_out_data_dir)

    # Predict dose for the held out set
    data_loader_hold_out = DataLoader(hold_out_plan_paths)
    dose_prediction_model_hold_out = PredictionModel(data_loader_hold_out, results_dir, model_name=prediction_name, stage=stage_name)
    dose_prediction_model_hold_out.predict_dose(epoch=num_epochs)

    # Evaluate dose metrics
    data_loader_hold_out_eval = DataLoader(hold_out_plan_paths)
    prediction_paths = get_paths(dose_prediction_model_hold_out.prediction_dir, extension="csv")
    hold_out_prediction_loader = DataLoader(prediction_paths)
    dose_evaluator = DoseEvaluator(data_loader_hold_out_eval, hold_out_prediction_loader)

    # print out scores if data was left for a hold out set
    if not data_loader_hold_out_eval.patient_paths:
        print("No patient information was given to calculate metrics")
    else:
        dose_evaluator.evaluate()
        dvh_score, dose_score = dose_evaluator.get_scores()
        print(f"For this out-of-sample test on {stage_name}:\n\tthe DVH score is {dvh_score:.3f}\n\tthe dose score is {dose_score:.3f}")

    # Zip dose to submit
    submission_dir = results_dir / "submissions"
    submission_dir.mkdir(exist_ok=True)
    shutil.make_archive(str(submission_dir / prediction_name), "zip", dose_prediction_model_hold_out.prediction_dir)


The model has already been trained for 100, so no more training will be done.
Predicting dose with generator.


0it [00:00, ?it/s]



1it [00:16, 16.80s/it]



2it [00:33, 16.55s/it]



3it [00:37, 10.90s/it]



4it [00:42,  8.50s/it]



5it [00:47,  7.41s/it]



6it [00:53,  6.82s/it]



7it [00:59,  6.54s/it]



8it [01:05,  6.35s/it]



9it [01:10,  6.15s/it]



10it [01:16,  6.00s/it]



11it [01:22,  6.02s/it]



12it [01:28,  5.94s/it]



13it [01:33,  5.70s/it]



14it [01:39,  5.75s/it]



15it [01:46,  6.17s/it]



16it [01:53,  6.32s/it]



17it [02:00,  6.67s/it]



18it [02:05,  6.22s/it]



19it [02:12,  6.47s/it]



20it [02:19,  6.64s/it]



21it [02:24,  6.14s/it]



22it [02:30,  5.90s/it]



23it [02:36,  5.89s/it]



24it [02:41,  5.76s/it]



25it [02:47,  5.76s/it]



26it [02:52,  5.59s/it]



27it [02:57,  5.50s/it]



28it [03:03,  5.62s/it]



29it [03:09,  5.54s/it]



30it [03:14,  5.51s/it]



31it [03:19,  5.33s/it]



32it [03:24,  5.26s/it]



33it [03:30,  5.55s/it]



34it [03:36,  5.72s/it]



35it [03:42,  5.74s/it]



36it [03:48,  5.74s/it]



37it [03:53,  5.59s/it]



38it [03:59,  5.62s/it]



39it [04:04,  5.53s/it]



40it [04:10,  6.26s/it]
40it [00:40,  1.01s/it]


For this out-of-sample test on validation:
	the DVH score is 6.680
	the dose score is 4.179
