## Classification of skin cancer

## Loading the dataset

The dataset in processed for is too big to fit into memory *133 * 133 * 3 * float32*, even if we would use a data type with smaller precision. Our solution is to use tf.keras.utils.PyDataset as a base class for our dataset, and let it handle the dynamic loading of the data. The `create_dataset()` utility function uses this class to create a dataset object from the metadata that it receives.

However first, we are going to train an autoencoder model to create an embedding for our data, to which we can append the metadata. The `SkinCancerReconstructionDataset` object generates batches where the taget is the same as the input. It has a utility function as well: `create_reconstruction_dataset()`.

In [1]:
from preprocessing import create_dataset, load_metadata, upsample_metadata
from sklearn.model_selection import train_test_split
import pandas as pd


pd.options.mode.copy_on_write = True

metadata = load_metadata(.01)
metadata = upsample_metadata(metadata, upsample_factor=5)
metadata_train, metadata_test = train_test_split(metadata, test_size=0.3)
metadata_test, metadata_valid = train_test_split(metadata_test, test_size=0.4)

# Load the dataset generators
batch_size = 32
ds_train = create_dataset(metadata_train, batch_size)
ds_test = create_dataset(metadata_test, batch_size)
ds_valid = create_dataset(metadata_valid, batch_size)


2024-11-19 01:03:05.219673: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-19 01:03:05.229446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-19 01:03:05.240440: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-19 01:03:05.243567: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-19 01:03:05.251805: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Construct the input shape from the size of the images
# and the number of channels (RGB)

input_shape = (*ds_train[0][0].shape[1:3], 3)
input_shape

(133, 133, 3)

In [3]:
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2,preprocess_input,decode_predictions
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D,BatchNormalization, Dropout
from tensorflow.keras import backend as K
import numpy as np

base_model = InceptionResNetV2(weights='imagenet', include_top=False)
x = base_model.output
x = GlobalAveragePooling2D()(x)  
x = BatchNormalization()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(1, activation='sigmoid')(x)

model = Model(inputs=base_model.input, outputs=predictions)

for layer in base_model.layers:
    layer.trainable = False

model.compile(optimizer='adam', metrics=['accuracy'],loss='binary_crossentropy')
model.summary()

2024-11-19 01:03:07.907578: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-11-19 01:03:07.907593: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:135] retrieving CUDA diagnostic information for host: ec3ac572229b
2024-11-19 01:03:07.907596: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:142] hostname: ec3ac572229b
2024-11-19 01:03:07.907686: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:166] libcuda reported version is: 560.35.3
2024-11-19 01:03:07.907695: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:170] kernel reported version is: NOT_FOUND: could not find kernel module information in driver version file contents: "NVRM version: NVIDIA UNIX Open Kernel Module for x86_64  560.35.03  Release Build  (dvs-builder@U16-I1-N07-12-3)  Fri Aug 16 21:42:42 UTC 2024
GCC version:  gcc version 14.2.0 (Ubuntu 14.2.0-4u

In [4]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import wandb


#run = wandb.init(project="skin-cancer-detection")

callbacks = [
    EarlyStopping(patience=20, start_from_epoch=20, restore_best_weights=True),
    ModelCheckpoint("resnet.keras", save_best_only=True),
    #wandb.keras.WandbMetricsLogger(),
    #wandb.keras.WandbModelCheckpoint("resnet.keras", save_best_only=True)
]

model.fit(ds_train, batch_size=batch_size, epochs=5,validation_steps=10, validation_data=ds_valid, callbacks=callbacks)



Epoch 1/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 306ms/step - accuracy: 0.9651 - loss: 0.1353 - val_accuracy: 0.9969 - val_loss: 0.0815
Epoch 2/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step - accuracy: 0.9965 - loss: 0.0351

2024-11-19 01:04:08.710477: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
  self.gen.throw(typ, value, traceback)


[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 267ms/step - accuracy: 0.9965 - loss: 0.0350 - val_accuracy: 1.0000 - val_loss: 9.0557e-04
Epoch 3/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 313ms/step - accuracy: 0.9961 - loss: 0.0284 - val_accuracy: 0.9969 - val_loss: 0.0154
Epoch 4/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 299ms/step - accuracy: 0.9998 - loss: 0.0026 - val_accuracy: 0.9948 - val_loss: 0.0120
Epoch 5/5


2024-11-19 01:05:04.831980: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 291ms/step - accuracy: 0.9994 - loss: 0.0182 - val_accuracy: 0.9969 - val_loss: 0.0044


<keras.src.callbacks.history.History at 0x722e0fb14710>

### Training the last layers of the used InceptionResNetV2
Time consuming process, we won't use it at the current phase

In [5]:
for layer in model.layers[:350]:
       layer.trainable = False
for layer in model.layers[350:]:
       layer.trainable = True

model.compile(optimizer=SGD(learning_rate=0.0001, momentum=0.9), metrics=['accuracy'], loss='binary_crossentropy')
model.fit(ds_train, batch_size=batch_size, epochs=5, validation_data=ds_valid, callbacks=callbacks)
#run.finish()


Epoch 1/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 616ms/step - accuracy: 0.9768 - loss: 0.1470 - val_accuracy: 0.9897 - val_loss: 0.0494
Epoch 2/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 633ms/step - accuracy: 0.9931 - loss: 0.0394 - val_accuracy: 0.9855 - val_loss: 0.0474
Epoch 3/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 599ms/step - accuracy: 0.9858 - loss: 0.0611 - val_accuracy: 0.9938 - val_loss: 0.0473
Epoch 4/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 647ms/step - accuracy: 0.9951 - loss: 0.0249 - val_accuracy: 0.9917 - val_loss: 0.0429
Epoch 5/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 586ms/step - accuracy: 0.9947 - loss: 0.0162 - val_accuracy: 0.9917 - val_loss: 0.0649


<keras.src.callbacks.history.History at 0x722e1305c650>

### Class weights

Positive samples are heavily under-represented, which needs to be balanced out. We use the following techniques to compensate:
* **Upsampling**<br>
    Datapoints which belong to the positive samples are added to the dataset multiple times. This is indicated by the `upscale_factor` <br>
    parameter when calling the `upscale_metata()` method.
* **Data augmenting**<br>
    To make the upsampled images more unique, some image augmentation techniques are applied. In particular horizontal and vertical mirroring <br>
    and cropping then rescaling the images. Either one or two methods are applied randomly.
* **Sample weights**<br>
    For each sample the loss function is evaluated using a corresponding weight, <br>
    which is higher for the positive samples. We use to following formula: $c_d / (2 * c_s)$, <br>
    where $c_d$ is the count of all samples and $c_s$ is the count of samples for a given class of labels.

In [6]:
ds_train.class_weights

{0: 0.5023172905525847, 1: 108.38461538461539}

## Calculate accuracy measurement

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# 1. Make predictions on the validation or test set
y_true = []  # True labels
y_pred = []  # Predicted labels

# Iterate over your validation dataset to get the true and predicted labels
for batch in ds_test:
    images, labels = batch
    predictions = model.predict(images) 
    
    y_true.extend(labels)     
    y_pred.extend((predictions > 0.5).astype(int))  



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

  self.gen.throw(typ, value, traceback)


ValueError: math domain error

In [None]:
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', cbar=False, 
            xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
 
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')