In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import torch

In [2]:
# Mount Google Drive when running on Colab so the dataset archive is reachable
# under /content/gdrive. Outside Colab the `google.colab` package is not
# installed, so the mount is skipped instead of raising ImportError; this keeps
# "Restart & Run All" usable on a local machine.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("Not running on Colab; skipping Google Drive mount.")
else:
    drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# if this does not run, change to the file path where ALL is located on your device
!unzip gdrive/MyDrive/ALL.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ALL/all_pro/all_pro_4792.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_4792.jpg  
  inflating: ALL/all_pro/all_pro_1832.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_1832.jpg  
  inflating: ALL/all_pro/all_pro_0292.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_0292.jpg  
  inflating: ALL/all_pro/all_pro_3943.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_3943.jpg  
  inflating: ALL/all_pro/all_pro_2485.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_2485.jpg  
  inflating: ALL/all_pro/all_pro_3957.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_3957.jpg  
  inflating: ALL/all_pro/all_pro_2491.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_2491.jpg  
  inflating: ALL/all_pro/all_pro_0286.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_0286.jpg  
  inflating: ALL/all_pro/all_pro_1198.jpg  
  inflating: __MACOSX/ALL/all_pro/._all_pro_1198.jpg  
  inflating: ALL/all_pro/all_pro_1826.jpg  


In [5]:
# Root folder of the extracted dataset, relative to the working directory.
data_dir = "ALL"

In [6]:
# Training set (70%) and a temporary hold-out set (30%), loaded in one call.
# Requesting both subsets together scans the directory once and guarantees the
# two subsets share the same seed, split fraction, image size and batch size,
# so they cannot drift apart and overlap.
# NOTE(review): with subset="both" Keras is expected to apply per-epoch shuffling
# to the training subset only and leave the hold-out subset in a fixed order.
# The later take()/skip() split of temp_ds relies on that fixed order — confirm
# against the installed Keras version.
train_ds, temp_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.3,   # 70% train / 30% temporary hold-out
    subset="both",          # returns (training, validation) as a tuple
    seed=123,               # shuffling seed; keeps the split reproducible
    image_size=(128, 128),  # resize all images
    batch_size=32,
)

Found 20000 files belonging to 4 classes.
Using 14000 files for training.
Found 20000 files belonging to 4 classes.
Using 6000 files for validation.


In [7]:
# Divide the hold-out set evenly: first half -> validation, second half -> test,
# i.e. 15% of the full dataset each.
# 6000 files / 32 per batch = 187.5, so temp_ds holds 188 batches (the last one partial).
# NOTE(review): take()/skip() select batches by position, so the split is only
# stable if temp_ds yields its batches in the same order on every pass — confirm
# it is not reshuffled per iteration, otherwise validation and test samples can mix.
val_batches = temp_ds.cardinality()  # number of batches in temp_ds (not just the validation half)
half_point = val_batches // 2
val_ds = temp_ds.take(half_point)   # batches [0, half_point)
test_ds = temp_ds.skip(half_point)  # batches [half_point, end)

# each half is expected to report 94 batches (188 / 2)
print("Cardinality of validation set:", val_ds.cardinality().numpy())
print("Cardinality of test set:", test_ds.cardinality().numpy())

Cardinality of validation set: 94
Cardinality of test set: 94


In [8]:
# Rescale pixel values from [0, 255] down to [0, 1].
# Smaller, uniformly scaled inputs speed up convergence and keep all features consistent.
# NOTE: every dataset name is rebound to its rescaled version, so running this
# cell a second time divides by 255 again. Re-create the datasets before re-running it.
normalization_layer = tf.keras.layers.Rescaling(1./255)


def _rescale_images(images, labels):
    """Apply the rescaling layer to the images and pass the labels through unchanged."""
    return normalization_layer(images), labels


train_ds = train_ds.map(_rescale_images)
val_ds = val_ds.map(_rescale_images)
test_ds = test_ds.map(_rescale_images)

In [9]:
# Pick the PyTorch device: CUDA when torch can see a GPU, CPU otherwise.
# NOTE(review): this reports PyTorch's view of the hardware only; the model in
# this notebook is built with tf.keras — confirm TensorFlow also sees the GPU
# if that is what this check is meant to establish.
gpu_available = torch.cuda.is_available()
print("GPU detected" if gpu_available else "No GPU detected")
device = torch.device("cuda" if gpu_available else "cpu")

GPU detected


In [12]:
num_classes = 4  # benign, early, pre, pro

# Small CNN: two convolution + max-pooling stages followed by a dense classifier head.
# The input shape is declared with an explicit Input object instead of an
# `input_shape=` argument on the first layer, which is the form current Keras
# recommends for Sequential models.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(128, 128, 3)),  # 128x128 images with 3 channels, matching the loader's image_size
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),  # one probability per class
])

# The sparse loss expects integer class ids rather than one-hot vectors
# (presumably what the dataset loader yields by default — verify label_mode).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs, reporting validation metrics after each epoch.
history = model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 53ms/step - accuracy: 0.6038 - loss: 0.9702 - val_accuracy: 0.7244 - val_loss: 0.6619
Epoch 2/10
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 50ms/step - accuracy: 0.8629 - loss: 0.3312 - val_accuracy: 0.9239 - val_loss: 0.1965
Epoch 3/10
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 47ms/step - accuracy: 0.9439 - loss: 0.1503 - val_accuracy: 0.9714 - val_loss: 0.0839
Epoch 4/10
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 51ms/step - accuracy: 0.9632 - loss: 0.0979 - val_accuracy: 0.9541 - val_loss: 0.1233
Epoch 5/10
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 47ms/step - accuracy: 0.9843 - loss: 0.0502 - val_accuracy: 0.9621 - val_loss: 0.1072
Epoch 6/10
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 54ms/step - accuracy: 0.9776 - loss: 0.0655 - val_accuracy: 0.9731 - val_loss: 0.0776
Epoch 7/10
[1m4