In [1]:
import cv2
import os
import numpy as np
import pandas as pd
import pydicom
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Training a 3D classifier on 1 MRI Type

# define some data hyperparameters
MRI_TYPES = ["FLAIR", "T1w", "T1wCE", "T2w"]
MRI_TYPE = "FLAIR"
DATA_DIR = "/kaggle/input/rsna-miccai-brain-tumor-radiogenomic-classification"
BUFFER_SIZE = 128

VALID_PROP = 0.2
BATCH_SIZE = 32
EPOCHS = 10
WIDTH = 128
HEIGHT = 128
DEPTH = 16

# get the training labels and remove bad data
label_df = pd.read_csv(os.path.join(DATA_DIR, "train_labels.csv"))
label_df = label_df.loc[~label_df["BraTS21ID"].isin([109, 123, 709])]

# create train/validation split
train_df, valid_df = train_test_split(label_df, test_size=VALID_PROP)
print(train_df.shape, valid_df.shape)
train_df.head()

(465, 2) (117, 2)


Unnamed: 0,BraTS21ID,MGMT_value
233,339,0
142,214,0
123,186,1
82,124,0
232,338,1


In [3]:
# 1st WAY TO LOAD AND PREPROCESS DATA - SLOW

# sorting a file by its number
def filenum(filename):
    return int(re.search("\d+", filename).group(0))

# define function that obtains data given id
def id2data(id, mri_type, split="train"):
    scans_dir = os.path.join(DATA_DIR, split, id, mri_type)
    scan_files = sorted(os.listdir(scans_dir), key=filenum)
    data = []

    # want DEPTH scans
    num_scans = len(scan_files)
    if num_scans >= DEPTH:
        # get middle scans if enough scans
        start = num_scans // 2 - DEPTH // 2
        selected_files = scan_files[start:start+DEPTH]
    else:
        # pad if not enough scans
        total_pad = DEPTH - num_scans
        left_pad = ["PAD"] * (total_pad // 2)
        right_pad = ["PAD"] * (total_pad - len(left_pad))
        selected_files = left_pad + scan_files + right_pad
    
    data3d = []
    for scan_file in selected_files:
        if scan_file != "PAD":
            # read the scan and reshape
            scan = pydicom.dcmread(os.path.join(scans_dir, scan_file))
            scan_arr = scan.pixel_array
            scan_arr = cv2.resize(scan_arr, (WIDTH, HEIGHT), cv2.INTER_LANCZOS4)
        else:
            # pad with zero image
            scan_arr = np.zeros((WIDTH, HEIGHT))
        
        data3d.append(scan_arr)
        
    # stack in to nparray and remove nans
    data3d = np.stack(data3d)
    
    # remove nans and normalize pixel values to [0, 1]
    data3d = np.nan_to_num(data3d, nan=0.0)
    data3d = data3d - np.min(data3d)
    if np.max(data3d) > 0:
        data3d = data3d / np.max(data3d)
    
    assert data3d.shape == (DEPTH, WIDTH, HEIGHT)
    assert not np.any(np.isnan(data3d))
    return tf.reshape(tf.convert_to_tensor(data3d, dtype=tf.float32), (DEPTH, WIDTH, HEIGHT, 1))
    

# create a generator initializer
def file_gen_init(df, mri_type):
    
    def file_gen():
        for idx in range(df.shape[0]):
            id = str(df["BraTS21ID"].iloc[idx]).zfill(5)
            x = id2data(id, mri_type)
            y = int(df["MGMT_value"].iloc[idx])
            yield x, y
            
    return file_gen

In [4]:
# 2nd WAY TO LOAD DATA AND PREPROCESS DATA - slow at start, but faster loading times

def pad_depth(data):
    
    num_scans = data.shape[0]
    if num_scans >= DEPTH:
        # choose middle DEPTH scans
        start = num_scans // 2 - DEPTH // 2
        return data[start:start+DEPTH]
    else:
        # pad the scans with empty images
        pad_size = DEPTH - num_scans
        pad_left = np.zeros((pad_size // 2, WIDTH, HEIGHT, 1))
        pad_right = np.zeros((pad_size - pad_left.shape[0], WIDTH, HEIGHT, 1))
        return np.concatenate([pad_left, data, pad_right])
        

def load_data(df, mri_type):
    
    data = {}
    
    # for each patients scan
    for id in tqdm(sorted(df["BraTS21ID"])):
        patient_id = str(id).zfill(5)
        scans_dir = os.path.join(DATA_DIR, "train", patient_id, mri_type)
        scans = []
        for scan_file in sorted(os.listdir(scans_dir), key=filenum):
            scan = pydicom.dcmread(os.path.join(scans_dir, scan_file))
            scan_arr = scan.pixel_array
            
            # include non-empty files, replace nans, reshape, and normalize to range [0, 1]
            if np.max(scan_arr) != 0:
                scan_arr = np.nan_to_num(scan_arr, nan=0.0)
                scan_arr = cv2.resize(scan_arr, (WIDTH, HEIGHT), interpolation=cv2.INTER_LANCZOS4)
                scan_arr = scan_arr - np.min(scan_arr)
                scan_arr = scan_arr / np.max(scan_arr)
                scan_arr = np.reshape(scan_arr, (WIDTH, HEIGHT, 1))
                
                scans.append(scan_arr)
                
        data[patient_id] = pad_depth(np.stack(scans))
        
    return data
    

def data_gen_init(df, mri_type):
    
    data = load_data(df, mri_type)
    info = [(str(id).zfill(5), label) for id, label in zip(df["BraTS21ID"].tolist(), df["MGMT_value"].tolist())]
    
    def file_gen():
        for id, label in info:
            yield data[id], label

    return file_gen

In [5]:
train_ds = tf.data.Dataset.from_generator(data_gen_init(train_df, "FLAIR"), output_signature=(tf.TensorSpec(shape=(DEPTH, WIDTH, HEIGHT, 1)), tf.TensorSpec(shape=(), dtype=tf.int32)))
valid_ds = tf.data.Dataset.from_generator(data_gen_init(valid_df, "FLAIR"), output_signature=(tf.TensorSpec(shape=(DEPTH, WIDTH, HEIGHT, 1)), tf.TensorSpec(shape=(), dtype=tf.int32)))

train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(1)
valid_ds = valid_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(1)

100%|██████████| 465/465 [09:20<00:00,  1.21s/it]
100%|██████████| 117/117 [02:34<00:00,  1.32s/it]


In [6]:
import tensorflow.keras.layers as layers

# create a model
def create_model():
    model = tf.keras.Sequential()
    model.add(layers.InputLayer(input_shape=(DEPTH, WIDTH, HEIGHT, 1)))
    model.add(layers.Conv3D(8, (3,3,3), strides=(2,2,2)))
    model.add(layers.Conv3D(16, (3,3,3), strides=(1,1,1)))
    model.add(layers.MaxPool3D((3,3,3)))
    
    model.add(layers.Conv3D(32, (1,3,3), strides=(1,1,1)))
    model.add(layers.MaxPool3D((1,3,3)))
    
    model.add(layers.Flatten())
    model.add(layers.Dense(32, activation="relu"))
    model.add(layers.Dense(1, activation="sigmoid"))
    
    return model

model = create_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d (Conv3D)              (None, 7, 63, 63, 8)      224       
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 5, 61, 61, 16)     3472      
_________________________________________________________________
max_pooling3d (MaxPooling3D) (None, 1, 20, 20, 16)     0         
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 1, 18, 18, 32)     4640      
_________________________________________________________________
max_pooling3d_1 (MaxPooling3 (None, 1, 6, 6, 32)       0         
_________________________________________________________________
flatten (Flatten)            (None, 1152)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                3

In [7]:

# compile and train the model
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=["accuracy"])

model.fit(train_ds,
          epochs=EPOCHS,
          validation_data=valid_ds,
          verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f10d2a8e110>

In [8]:
from tqdm import tqdm
import pandas as pd

# predict on test data and submit

results = {"BraTS21ID": [], "MGMT_value": []}
test_dir = os.path.join(DATA_DIR, "test")

for patient_id in tqdm(sorted(os.listdir(test_dir), key=filenum)):
    y_pred = model.predict(tf.expand_dims(id2data(patient_id, "FLAIR", split="test"), axis=0))
    results["BraTS21ID"].append(patient_id)
    results["MGMT_value"].append(y_pred[0][0])

submission_df = pd.DataFrame(results)
submission_df.to_csv("submission.csv", index=False)

100%|██████████| 87/87 [00:17<00:00,  4.84it/s]


In [9]:
pd.read_csv("submission.csv").head(5)

Unnamed: 0,BraTS21ID,MGMT_value
0,1,0.53438
1,13,0.574129
2,15,0.496479
3,27,0.525022
4,37,0.565296
