## Classification of skin cancer

In [None]:
import tensorflow as tf

# GPU setup for remote server
gpus = tf.config.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
    print(f"Found GPU {gpu.name}, and set memory growth to True")

2024-11-17 23:39:12.062147: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731883155.205229 1127666 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731883155.758714 1127666 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Found GPU /physical_device:GPU:0, and set memory growth to True


## Loading the dataset

The dataset in processed for is too big to fit into memory *133 * 133 * 3 * float32*, even if we would use a data type with smaller precision. Our solution is to use tf.keras.utils.PyDataset as a base class for our dataset, and let it handle the dynamic loading of the data. The `create_dataset()` utility function uses this class to create a dataset object from the metadata that it receives.

However first, we are going to train an autoencoder model to create an embedding for our data, to which we can append the metadata. The `SkinCancerReconstructionDataset` object generates batches where the taget is the same as the input. It has a utility function as well: `create_reconstruction_dataset()`.

In [2]:
from preprocessing import create_reconstruction_dataset, load_metadata, upsample_metadata
from sklearn.model_selection import train_test_split
import pandas as pd


pd.options.mode.copy_on_write = True

# Load the metadata and create train, test and validation split
metadata = load_metadata()
metadata = upsample_metadata(metadata, upsample_factor=5)
metadata_train, metadata_test = train_test_split(metadata, test_size=0.3)
metadata_test, metadata_valid = train_test_split(metadata_test, test_size=0.4)

# Load the dataset generators
batch_size=32
ds_train = create_reconstruction_dataset(metadata_train, batch_size)
ds_test = create_reconstruction_dataset(metadata_test, batch_size)
ds_valid = create_reconstruction_dataset(metadata_valid, batch_size)

  metadata = pd.read_csv(METADATA_PATH, dtype={"target": "int8", "age_approx": "Int8"})


In [3]:
# Construct the input shape from the size of the images
# and the number of channels (RGB)

input_shape = (*ds_train[0][0].shape[1:3], 3)
input_shape

(133, 133, 3)

In [4]:
from tensorflow.keras.layers import Input, Conv2D, Conv2DTranspose, Cropping2D #, Dropout, Flatten, Dense, Reshape
from tensorflow.keras.models import Sequential, Model


class Autoencoder(Model):
    """Autoencoder to create an embedding for the images"""

    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = Sequential([
            Input(input_shape),
            Conv2D(32, 5, activation="relu", padding="same", strides=2),
            Conv2D(16, 3, activation="relu", padding="same", strides=2),
            Conv2D(1, 3, activation="relu", padding="same", strides=2),
        ])
        self.decoder = Sequential([
            Conv2DTranspose(8, 3, strides=2, padding="same", activation="relu"),
            Conv2DTranspose(16, 3, strides=2, padding="same", activation="relu"),
            Conv2DTranspose(32, 5, strides=2, padding="same", activation="relu"),
            Conv2D(1, 3, activation="sigmoid", padding="same"),
            Cropping2D(((2,1), (2,1)))
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

model = Autoencoder()
model.compile(optimizer="adam", loss="mean_absolute_error", metrics=["mean_squared_error"])
model.encoder.summary()

I0000 00:00:1731883310.958452 1127666 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 234 MB memory:  -> device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:01:00.0, compute capability: 6.1


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import wandb


run = wandb.init(project="skin-cancer-detection")

callbacks = [
    EarlyStopping(patience=20, start_from_epoch=20, restore_best_weights=True),
    ModelCheckpoint("autoencoder.keras", save_best_only=True),
    wandb.keras.WandbMetricsLogger(),
    wandb.keras.WandbModelCheckpoint("autoencoder.keras", save_best_only=True)
]

model.fit(ds_train, batch_size=batch_size, epochs=150, validation_data=ds_valid, callbacks=callbacks)

run.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcsonto-benjamin[0m ([33mcorgi-vision[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/300
[1m7812/8817[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m2:33[0m 153ms/step - loss: 0.0149

2024-11-17 21:02:07.104284: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:557] Omitted potentially buggy algorithm eng14{} for conv (f32[4,1,136,136]{3,2,1,0}, u8[0]{0}) custom-call(f32[4,64,136,136]{3,2,1,0}, f32[1,64,3,3]{3,2,1,0}, f32[1]{0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]}


[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - loss: 0.0149

2024-11-17 21:11:42.096763: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:557] Omitted potentially buggy algorithm eng14{} for conv (f32[12,1,136,136]{3,2,1,0}, u8[0]{0}) custom-call(f32[12,64,136,136]{3,2,1,0}, f32[1,64,3,3]{3,2,1,0}, f32[1]{0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]}


[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1775s[0m 201ms/step - loss: 0.0149 - val_loss: 0.0150
Epoch 2/300
[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m941s[0m 107ms/step - loss: 0.0149 - val_loss: 0.0148
Epoch 3/300
[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1150s[0m 130ms/step - loss: 0.0149 - val_loss: 0.0148
Epoch 4/300
[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m656s[0m 74ms/step - loss: 0.0149 - val_loss: 0.0148
Epoch 5/300
[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1018s[0m 115ms/step - loss: 0.0148 - val_loss: 0.0148
Epoch 6/300
[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1271s[0m 144ms/step - loss: 0.0149 - val_loss: 0.0148
Epoch 7/300
[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1272s[0m 144ms/step - loss: 0.0148 - val_loss: 0.0148
Epoch 8/300
[1m8817/8817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - loss: 0.0148

### Class weights

Positive samples are heavily under-represented, which needs to be balanced out. We use the following techniques to compensate:
* **Upsampling**<br>
    Datapoints which belong to the positive samples are added to the dataset multiple times. This is indicated by the `upscale_factor` <br>
    parameter when calling the `upscale_metata()` method.
* **Data augmenting**<br>
    To make the upsampled images more unique, some image augmentation techniques are applied. In particular horizontal and vertical mirroring <br>
    and cropping then rescaling the images. Either one or two methods are applied randomly.
* **Sample weights**<br>
    For each sample the loss function is evaluated using a corresponding weight, <br>
    which is higher for the positive samples. We use to following formula: $c_d / (2 * c_s)$, <br>
    where $c_d$ is the count of all samples and $c_s$ is the count of samples for a given class of labels.

In [10]:
ds_train.class_weights

{0: 0.5029020849376801, 1: 86.64496314496314}