In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.callbacks import LearningRateScheduler
from sklearn.model_selection import train_test_split
# from skimage import io
import pandas as pd
import os
import matplotlib.pyplot as plt
import pathlib
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import tensorflow_io as tfio

import PIL
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [2]:
# Remember the directory the notebook started in so later cells can return to it.
# NOTE: os.getcwd() reflects the current directory, so re-running this cell after
# the chdir below would overwrite PROJECT_PATH with the dataset directory.
PROJECT_PATH = os.getcwd()

# Move into the dataset folder; later relative paths are resolved from here.
os.chdir('data/Healthcare AI Datasets/Brain_MRI')
# Bare last expression: displays the entries of the dataset folder.
os.listdir()

['data.csv',
 'data_mask.csv',
 'resnet-50-MRI.json',
 'ResUNet-MRI.json',
 'TCGA_CS_4941_19960909',
 'TCGA_CS_4942_19970222',
 'TCGA_CS_4943_20000902',
 'TCGA_CS_4944_20010208',
 'TCGA_CS_5393_19990606',
 'TCGA_CS_5395_19981004',
 'TCGA_CS_5396_20010302',
 'TCGA_CS_5397_20010315',
 'TCGA_CS_6186_20000601',
 'TCGA_CS_6188_20010812',
 'TCGA_CS_6290_20000917',
 'TCGA_CS_6665_20010817',
 'TCGA_CS_6666_20011109',
 'TCGA_CS_6667_20011105',
 'TCGA_CS_6668_20011025',
 'TCGA_CS_6669_20020102',
 'TCGA_DU_5849_19950405',
 'TCGA_DU_5851_19950428',
 'TCGA_DU_5852_19950709',
 'TCGA_DU_5853_19950823',
 'TCGA_DU_5854_19951104',
 'TCGA_DU_5855_19951217',
 'TCGA_DU_5871_19941206',
 'TCGA_DU_5872_19950223',
 'TCGA_DU_5874_19950510',
 'TCGA_DU_6399_19830416',
 'TCGA_DU_6400_19830518',
 'TCGA_DU_6401_19831001',
 'TCGA_DU_6404_19850629',
 'TCGA_DU_6405_19851005',
 'TCGA_DU_6407_19860514',
 'TCGA_DU_6408_19860521',
 'TCGA_DU_7008_19830723',
 'TCGA_DU_7010_19860307',
 'TCGA_DU_7013_19860523',
 'TCGA_DU_7014_

In [3]:
def plot_loss(history):
    """Plot the training loss curve (and validation loss, if recorded) on a log scale.

    Args:
        history: object exposing a `history` dict with a 'loss' entry and,
            optionally, a 'val_loss' entry (as returned by `model.fit`).
    """
    recorded = history.history
    plt.plot(recorded['loss'], label='loss')

    # Validation loss is drawn only when it is present and non-empty.
    val_curve = recorded.get('val_loss')
    if val_curve:
        plt.plot(val_curve, label='val_loss', alpha=0.5)

    plt.yscale('log')
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)
    plt.show()

In [4]:
# Load the mask table; the path is relative to the current working directory.
data = pd.read_csv('data_mask.csv')

In [5]:
def balance_data(data, random_state=2022):
  """Return a class-balanced, reproducibly shuffled copy of `data`.

  Rows are split on the `mask` column (0 vs. 1). Whichever class is larger is
  randomly down-sampled to the size of the smaller one, the two halves are
  concatenated, and the result is shuffled.

  Args:
    data: DataFrame with a `mask` column holding 0/1 values.
    random_state: Seed used for both the down-sampling and the final shuffle,
      so repeated calls on the same input return the same frame.

  Returns:
    A new DataFrame with equal numbers of `mask == 0` and `mask == 1` rows.
    The original row labels are kept in an `index` column (reset_index is
    called without drop=True), so the column layout seen by positional
    consumers is unchanged.
  """
  data_healthy = data.query('mask == 0')
  data_unhealthy = data.query('mask == 1')

  # Down-sample to the minority size; sampling n == len(frame) keeps every row,
  # so the smaller class is never truncated and never over-sampled.
  n_per_class = min(len(data_healthy), len(data_unhealthy))
  data_healthy = data_healthy.sample(n=n_per_class, random_state=random_state)
  data_unhealthy = data_unhealthy.sample(n=n_per_class, random_state=random_state)

  balanced = pd.concat([data_healthy, data_unhealthy])
  # Seed the shuffle as well, otherwise row order changes on every run.
  return balanced.sample(frac=1, random_state=random_state).reset_index()

In [6]:
# Replace the raw table with its class-balanced, shuffled version.
# NOTE: `data` is rebound here, so re-running this cell re-applies balance_data to its own output.
data = balance_data(data)

In [7]:
# Bare last expression: renders the balanced table as the cell output.
data

Unnamed: 0,index,patient_id,image_path,mask_path,mask
0,1186,TCGA_DU_7013_19860523,TCGA_HT_8018_19970411/TCGA_HT_8018_19970411_11...,TCGA_HT_8018_19970411/TCGA_HT_8018_19970411_11...,1
1,2359,TCGA_FG_A4MT_20020212,TCGA_HT_7616_19940813/TCGA_HT_7616_19940813_22...,TCGA_HT_7616_19940813/TCGA_HT_7616_19940813_22...,1
2,2806,TCGA_HT_7690_19960312,TCGA_DU_A5TP_19970614/TCGA_DU_A5TP_19970614_28...,TCGA_DU_A5TP_19970614/TCGA_DU_A5TP_19970614_28...,1
3,1777,TCGA_DU_A5TT_19980318,TCGA_DU_5872_19950223/TCGA_DU_5872_19950223_17...,TCGA_DU_5872_19950223/TCGA_DU_5872_19950223_17...,0
4,3681,TCGA_FG_6688_20020215,TCGA_DU_A5TT_19980318/TCGA_DU_A5TT_19980318_50...,TCGA_DU_A5TT_19980318/TCGA_DU_A5TT_19980318_50...,1
...,...,...,...,...,...
2741,17,TCGA_CS_5395_19981004,TCGA_DU_5872_19950223/TCGA_DU_5872_19950223_1.tif,TCGA_DU_5872_19950223/TCGA_DU_5872_19950223_1_...,0
2742,1253,TCGA_DU_7008_19830723,TCGA_DU_A5TS_19970726/TCGA_DU_A5TS_19970726_12...,TCGA_DU_A5TS_19970726/TCGA_DU_A5TS_19970726_12...,1
2743,3168,TCGA_HT_8111_19980330,TCGA_DU_6399_19830416/TCGA_DU_6399_19830416_34...,TCGA_DU_6399_19830416/TCGA_DU_6399_19830416_34...,0
2744,2669,TCGA_HT_7616_19940813,TCGA_FG_5964_20010511/TCGA_FG_5964_20010511_26...,TCGA_FG_5964_20010511/TCGA_FG_5964_20010511_26...,0


In [8]:
# Build a class-per-folder image tree (data/tumors/0 and data/tumors/1).
os.chdir(PROJECT_PATH)
os.chdir('data/')

# Only populate the tree when it does not exist yet, so re-runs skip the copy.
if 'tumors' not in os.listdir():
    import shutil

    os.mkdir('tumors')
    os.mkdir('tumors/0')
    os.mkdir('tumors/1')

    # Read the columns by name: positional access into `data.values` silently
    # breaks whenever the frame gains, loses or reorders a column.
    for rel_image_path, mask_flag in zip(data['image_path'], data['mask']):
        src = 'Healthcare AI Datasets/Brain_MRI/' + rel_image_path

        print(src)
        # mask == 0 goes to class folder 0; anything else goes to class folder 1.
        dest = 'tumors/0/' if mask_flag == 0 else 'tumors/1/'
        shutil.copy(src, dest)

In [None]:
# Settings shared by the cells below.
# data_dir is a relative path, so it is resolved against the current working directory.
data_dir = pathlib.Path('./tumors/')
# Number of elements per batch in the tf.data pipelines.
batch_size = 64
# Height and width used when declaring the model's input shape.
img_height = 256
img_width = 256

In [None]:
# train_ds = tf.keras.utils.image_dataset_from_directory(
#   data_dir,
#   validation_split=0.2,
#   subset="training",
#   seed=123,
#   image_size=(img_height, img_width),
#   batch_size=batch_size)

In [None]:
# Count the .tif files located exactly one directory level below data_dir.
image_count = len(list(data_dir.glob('*/*.tif')))
print(image_count)

In [None]:
# List every file one level below data_dir in a deterministic order, then
# shuffle once with a buffer covering the whole listing.
list_ds = tf.data.Dataset.list_files(f'{data_dir}/*/*', shuffle=False)
# A fixed seed keeps the later skip/take train-validation split identical across
# kernel restarts; reshuffle_each_iteration=False keeps it identical across
# iterations, so no file can move between the two splits.
list_ds = list_ds.shuffle(image_count, seed=2022, reshuffle_each_iteration=False)

In [None]:
# Peek at five file paths from the listing.
for f in list_ds.take(5):
  print(f.numpy())

In [None]:
# Class names are the sub-directory names of data_dir, sorted so that the index
# of a name is its integer label. Only directories are considered: a stray file
# in data_dir would otherwise be counted as an extra class.
class_names = np.array(sorted(item.name for item in data_dir.glob('*') if item.is_dir()))
print(class_names)

In [None]:
# Hold out 20% of the files for validation: the first val_size entries of the
# listing become val_ds, everything after them becomes train_ds.
val_size = int(image_count * 0.2)
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)

# Report the number of elements in each split.
print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())

In [None]:
def get_label(file_path):
  """Return the integer class label encoded in `file_path`.

  The label is the position, within the module-level `class_names` array, of
  the directory that directly contains the file.
  """
  # Break the path into its components using the OS path separator.
  path_components = tf.strings.split(file_path, os.path.sep)
  # The parent directory of the file is the second-to-last component.
  class_dir = path_components[-2]
  # Boolean vector marking which class name matches; argmax gives its index.
  matches = class_dir == class_names
  return tf.argmax(matches)

In [None]:
def decode_img(img):
  """Decode raw TIFF file contents into an image tensor.

  No resizing is applied; the decoded tensor is returned as-is. A
  `tf.image.resize(img, [img_height, img_width])` step is intentionally left
  disabled here.
  """
  # NOTE(review): the CNN in this notebook declares 4 input channels, which
  # presumably matches decode_tiff's output — confirm against the tfio docs.
  return tfio.experimental.image.decode_tiff(img)

In [None]:
def process_path(file_path):
  """Turn a file path into an `(image, label)` pair.

  The label comes from `get_label(file_path)`; the image is the file's raw
  contents passed through `decode_img`.
  """
  label = get_label(file_path)
  # Read the file as a raw byte string and decode it in one step.
  image = decode_img(tf.io.read_file(file_path))
  return image, label

In [None]:
# Let tf.data choose the parallelism / buffer sizes wherever AUTOTUNE is passed.
AUTOTUNE = tf.data.AUTOTUNE

# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
# After this step each dataset element is whatever process_path returns for a path.
train_ds = train_ds.map(process_path, num_parallel_calls=AUTOTUNE)
val_ds = val_ds.map(process_path, num_parallel_calls=AUTOTUNE)

In [None]:
# Sanity check: pull one (image, label) element and show its shape and label.
for image, label in train_ds.take(1):
  print("Image shape: ", image.numpy().shape)
  print("Label: ", label.numpy())

In [None]:
def configure_for_performance(ds):
  """Return `ds` cached, shuffled, batched and prefetched, in that order.

  The batch size comes from the module-level `batch_size`; the prefetch buffer
  size from the module-level `AUTOTUNE`. The shuffle buffer holds 1000 elements.
  """
  return (
      ds.cache()
        .shuffle(buffer_size=1000)
        .batch(batch_size)
        .prefetch(buffer_size=AUTOTUNE)
  )

# Run both splits through configure_for_performance.
# NOTE: the names are rebound, so re-running this cell would batch the already-batched datasets again.
train_ds = configure_for_performance(train_ds)
val_ds = configure_for_performance(val_ds)

In [None]:
# Take the first batch from the training pipeline.
image_batch, label_batch = next(iter(train_ds))

# Show the first nine images of that batch in a 3x3 grid, each titled with its class name.
plt.figure(figsize=(10, 10))
for i in range(9):
  ax = plt.subplot(3, 3, i + 1)
  # Convert to a uint8 NumPy array for display.
  plt.imshow(image_batch[i].numpy().astype("uint8"))
  label = label_batch[i]
  plt.title(class_names[label])
  plt.axis("off")

In [None]:
# One output unit per entry in class_names.
num_classes = len(class_names)

# Small CNN: rescaling, three Conv2D + MaxPooling2D stages, then a dense head.
model = Sequential([
  # Multiply inputs by 1/255; the declared input is img_height x img_width with 4 channels.
  layers.Rescaling(1./255, input_shape=(img_height, img_width, 4)),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(64, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  # No activation on the last layer: the model outputs raw logits.
  layers.Dense(num_classes)
])

In [None]:
# from_logits=True because the model's final Dense layer has no activation;
# the sparse loss takes integer class labels.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Upper bound on the number of epochs; EarlyStopping may end training sooner.
epochs=30
# Both callbacks set only `patience`, so they watch their default metric
# (presumably val_loss — confirm against the installed Keras version).
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=4)
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(patience=2)

# Train on train_ds and validate on val_ds after each epoch.
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs,
  callbacks=[early_stopping_cb, reduce_lr_cb]
)

In [None]:
# Evaluate the trained model on the validation split.
model.evaluate(val_ds)

In [None]:
# Per-epoch metrics recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Derive the x-axis from the number of epochs actually run: EarlyStopping can
# stop training at any epoch, and plt.plot raises when x and y differ in length.
epochs_range = range(len(acc))

plt.figure(figsize=(8, 8))
# Left panel: accuracy curves.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss curves.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

def model300(input_shape=(256, 256, 4)):
    """Build the un-compiled dense trunk of the baseline classifier.

    The network flattens the image and applies Dense(1000) -> Dropout(0.2) ->
    Dense(250) -> Dropout(0.2) -> Dense(100), all with ReLU activations. It has
    no output layer; the caller is expected to add the classification head.

    Args:
        input_shape: Shape of a single input image. Defaults to 256x256 with
            4 channels, matching the input shape declared for the CNN in this
            notebook, which is trained on the same datasets.

    Returns:
        A `tf.keras.models.Sequential` ending in the 100-unit ReLU layer.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Flatten(input_shape=input_shape))
    model.add(tf.keras.layers.Dense(1000, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dropout(.2))
    model.add(tf.keras.layers.Dense(250, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dropout(.2))
    model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
    return model


def nn():
    """Train the dense baseline and report its validation performance.

    Builds the trunk with `model300()`, adds a 2-unit softmax head, trains on
    the module-level `train_ds` with `val_ds` as validation data (at most 200
    epochs, with learning-rate reduction and early stopping), then plots the
    loss curves and prints the loss and accuracy measured on `val_ds`.

    Returns:
        None. Results are shown via the plot and the printed metrics.
    """
    reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(patience=5)
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10)

    model = model300()

    # Softmax head, so the loss below works on probabilities (not logits).
    model.add(tf.keras.layers.Dense(2, activation=tf.nn.softmax))
    model.compile(optimizer=tf.optimizers.Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # validation_data already supplies the held-out set; validation_split does
    # not apply to tf.data inputs, so it is not passed.
    history = model.fit(train_ds, validation_data=val_ds, epochs=200,
                        callbacks=[reduce_lr_cb, early_stopping_cb])

    # Evaluate on the validation split so the printed numbers match their labels
    # (evaluating on train_ds would report training performance instead).
    val_loss, val_acc = model.evaluate(val_ds)
    plot_loss(history)
    print('Validation loss: ', val_loss)
    print('Accuracy: ', round(val_acc, 2))


# Build, train and evaluate the dense baseline.
nn()

In [None]:
# X_train.shape

In [None]:
# from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
#
# model = ResNet50(input_shape=(256,256,3),include_top=False)
#
# model.compile(optimizer=tf.optimizers.Adam(),
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])
#
# history = model.fit(X_train, y_train, epochs=5, validation_split=0.2)
#
# val_loss, val_acc = model.evaluate(X_test, y_test)

In [None]:
# from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
#
# model = ResNet50(input_shape=(256,256,3),include_top=False)
#
# model.compile(optimizer=tf.optimizers.Adam(),
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])
#
# history = model.fit(X_train, y_train, epochs=5, validation_split=0.2)
#
# val_loss, val_acc = model.evaluate(X_test, y_test)