## Classification of skin cancer

## Loading the dataset

The dataset in its processed form is too big to fit into memory (each image takes `133 * 133 * 3` `float32` values), and this would still be the case with a lower-precision data type. Our solution is to use `tf.keras.utils.PyDataset` as a base class for our dataset, and let it handle the dynamic loading of the data. The `create_dataset()` utility function uses this class to create a dataset object from the metadata that it receives.

First, however, we are going to train an autoencoder model to create an embedding for our data, to which we can append the metadata. The `SkinCancerReconstructionDataset` object generates batches where the target is the same as the input. It has a utility function as well: `create_reconstruction_dataset()`.

In [None]:
%pip install keras-visualizer
from preprocessing import create_dataset, load_metadata, upsample_metadata
from sklearn.model_selection import train_test_split
import pandas as pd


pd.options.mode.copy_on_write = True

# Fixed seed so that the train/test/validation partition is identical on every run.
SPLIT_SEED = 42
UPSAMPLE_FACTOR = 5

metadata = load_metadata(.01)

# Split BEFORE upsampling. Upsampling the whole table first puts copies of the same
# positive sample into several partitions, so the model would be validated and tested
# on samples it has already been trained on.
metadata_train, metadata_holdout = train_test_split(
    metadata, test_size=0.3, random_state=SPLIT_SEED
)
metadata_test, metadata_valid = train_test_split(
    metadata_holdout, test_size=0.2, random_state=SPLIT_SEED
)

# Each partition is upsampled on its own, so duplicates never cross a partition boundary.
# NOTE(review): the held-out partitions are still upsampled (as they were before), so the
# reported metrics describe the rebalanced class distribution, not the real one.
metadata_train = upsample_metadata(metadata_train, upsample_factor=UPSAMPLE_FACTOR)
metadata_test = upsample_metadata(metadata_test, upsample_factor=UPSAMPLE_FACTOR)
metadata_valid = upsample_metadata(metadata_valid, upsample_factor=UPSAMPLE_FACTOR)

# Load the dataset generators
batch_size = 32
ds_train = create_dataset(metadata_train, batch_size)
ds_test = create_dataset(metadata_test, batch_size)
ds_valid = create_dataset(metadata_valid, batch_size)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


  metadata = pd.read_csv(METADATA_PATH, dtype={"target": "int8", "age_approx": "Int8"})
  1: half_count / positive_samples


In [None]:
# Derive the model input shape: the two spatial dimensions are read from the
# images of the first training batch, the channel count is fixed to 3 (RGB).

input_shape = tuple(ds_train[0][0].shape[1:3]) + (3,)
input_shape

(133, 133, 3)

In [None]:

from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2,preprocess_input,decode_predictions
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D,BatchNormalization, Dropout, Concatenate, Conv2D
from tensorflow.keras import backend as K

def create_metadata_model():
    """Build the small MLP network of the metadata branch.

    Returns:
        A `Sequential` model made of two ReLU `Dense` layers with 8 and 4 units.
        No input shape is declared here, so the model is not built until it is
        first called on data.
    """
    return Sequential([
        Dense(8, activation="relu"),
        Dense(4, activation="relu"),
    ])

In [None]:
def create_cnn_model():
    """Create the image feature extractor on top of an ImageNet InceptionResNetV2.

    The backbone is instantiated without its classification top; its output goes
    through global average pooling, batch normalization and 50% dropout.

    Returns:
        The symbolic output tensor of the dropout layer. Note that this is a
        tensor, not a `Model`: it has no `.input` or `.layers` attribute.
    """
    backbone = InceptionResNetV2(weights='imagenet', include_top=False)
    pooled = GlobalAveragePooling2D()(backbone.output)
    normalized = BatchNormalization()(pooled)
    return Dropout(0.5)(normalized)



In [None]:
import numpy as np
from keras_visualizer import visualizer

# Instantiated for the planned metadata branch, but NOT connected to the graph yet: the
# dataset batches contain images only (`ds_train[0][0]` is a single image array), so there
# is no metadata tensor that could be fed into it.
# TODO: once the dataset also yields the metadata features, add a second `Input`, pass it
# through `metadata_model` and `Concatenate` the result with `cnn_features`.
metadata_model = create_metadata_model()

# The backbone is built here instead of through `create_cnn_model()`: that helper returns
# only an output tensor, while this cell needs the backbone object itself, both for
# `.input` (to define the `Model`) and for `.layers` (to freeze the pretrained weights).
cnn_model = InceptionResNetV2(weights='imagenet', include_top=False, input_shape=input_shape)

# Freeze the pretrained backbone so that only the newly added head is trained in this phase.
for layer in cnn_model.layers:
    layer.trainable = False

# Same pooling / normalization / dropout head as in `create_cnn_model()`.
cnn_features = GlobalAveragePooling2D()(cnn_model.output)
cnn_features = BatchNormalization()(cnn_features)
cnn_features = Dropout(0.5)(cnn_features)

x = Dense(4, activation="relu")(cnn_features)
# NOTE(review): a 1-unit linear layer directly followed by a 1-unit sigmoid layer adds no
# expressive power over the sigmoid layer alone; kept to preserve the architecture.
x = Dense(1, activation="linear")(x)

predictions = Dense(1, activation='sigmoid')(x)

model = Model(inputs=cnn_model.input, outputs=predictions)

model.compile(optimizer='adam', metrics=['accuracy'],loss='binary_crossentropy')
#model.summary()
visualizer(model, format='png')

2024-12-07 22:29:51.942656: W external/local_tsl/tsl/framework/bfc_allocator.cc:482] Allocator (GPU_0_bfc) ran out of memory trying to allocate 540.0KiB (rounded to 552960)requested by op StatelessRandomUniformV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-12-07 22:29:51.942724: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-12-07 22:29:51.942730: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 840, Chunks in use: 840. 210.0KiB allocated for chunks. 210.0KiB in use in bin. 119.2KiB client-requested in use in bin.
2024-12-07 22:29:51.942733: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 829, Chunks in use: 829. 564.5KiB allocated for chunks. 564.5KiB in use in bin. 537.9KiB client-requested in use in bi

ResourceExhaustedError: {{function_node __wrapped__StatelessRandomUniformV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[3,3,80,192] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StatelessRandomUniformV2] name: 

In [None]:
# Render the model graph as a PNG (same call as at the end of the model-building cell).
visualizer(model, format='png')

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import wandb


#run = wandb.init(project="skin-cancer-detection")

# ModelCheckpoint keeps the best model seen so far in "resnet.keras".
# NOTE(review): with `epochs=5` below, `start_from_epoch=20` means EarlyStopping never
# starts monitoring in this run - confirm whether a longer run is intended.
callbacks = [
    EarlyStopping(patience=20, start_from_epoch=20, restore_best_weights=True),
    ModelCheckpoint("resnet.keras", save_best_only=True),
    #wandb.keras.WandbMetricsLogger(),
    #wandb.keras.WandbModelCheckpoint("resnet.keras", save_best_only=True)
]

# `batch_size` is deliberately not passed: `ds_train` already yields complete batches, and
# Keras documents that the argument must not be given for such inputs.
# `validation_steps` is not passed either, so every epoch validates on the whole of
# `ds_valid`, exactly like the `fit` call of the fine-tuning cell.
model.fit(ds_train, epochs=5, validation_data=ds_valid, callbacks=callbacks)



Epoch 1/5


I0000 00:00:1733605747.929735     182 service.cc:146] XLA service 0x7a76740cd2f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733605747.929759     182 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2024-12-07 21:09:08.320960: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-07 21:09:10.628371: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8906


[1m 3/89[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 44ms/step - accuracy: 0.6840 - loss: 0.6296  

I0000 00:00:1733605758.465926     182 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m14/89[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m8s[0m 108ms/step - accuracy: 0.8439 - loss: 0.3316

Process Keras_worker_ForkPoolWorker-5:
Process Keras_worker_ForkPoolWorker-3:
Process Keras_worker_ForkPoolWorker-1:
Process Keras_worker_ForkPoolWorker-2:
Process Keras_worker_ForkPoolWorker-6:
Process Keras_worker_ForkPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.11/multiprocessing/process.

KeyboardInterrupt: 

### Training the last layers of the used InceptionResNetV2
This is a time-consuming process, so we do not use it in the current phase.

In [None]:
# Index of the first layer that is unfrozen for fine-tuning; all earlier layers stay frozen.
FINE_TUNE_FROM = 350

for layer_index, layer in enumerate(model.layers):
    # BatchNormalization layers stay frozen even inside the unfrozen range: a frozen
    # BatchNormalization layer runs in inference mode, so its moving statistics are not
    # overwritten by the small fine-tuning batches.
    layer.trainable = (
        layer_index >= FINE_TUNE_FROM and not isinstance(layer, BatchNormalization)
    )

# The model has to be recompiled for the changed `trainable` flags to take effect.
# A low learning rate keeps the updates to the pretrained weights small.
model.compile(optimizer=SGD(learning_rate=0.0001, momentum=0.9), metrics=['accuracy'], loss='binary_crossentropy')
# `batch_size` is not passed because `ds_train` already yields complete batches.
model.fit(ds_train, epochs=5, validation_data=ds_valid, callbacks=callbacks)
#run.finish()


Epoch 1/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 616ms/step - accuracy: 0.9768 - loss: 0.1470 - val_accuracy: 0.9897 - val_loss: 0.0494
Epoch 2/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 633ms/step - accuracy: 0.9931 - loss: 0.0394 - val_accuracy: 0.9855 - val_loss: 0.0474
Epoch 3/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 599ms/step - accuracy: 0.9858 - loss: 0.0611 - val_accuracy: 0.9938 - val_loss: 0.0473
Epoch 4/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 647ms/step - accuracy: 0.9951 - loss: 0.0249 - val_accuracy: 0.9917 - val_loss: 0.0429
Epoch 5/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 586ms/step - accuracy: 0.9947 - loss: 0.0162 - val_accuracy: 0.9917 - val_loss: 0.0649


<keras.src.callbacks.history.History at 0x722e1305c650>

### Class weights

Positive samples are heavily under-represented, which needs to be balanced out. We use the following techniques to compensate:
* **Upsampling**<br>
    Datapoints which belong to the positive samples are added to the dataset multiple times. This is controlled by the `upsample_factor` <br>
    parameter when calling the `upsample_metadata()` function.
* **Data augmenting**<br>
    To make the upsampled images more unique, some image augmentation techniques are applied. In particular horizontal and vertical mirroring <br>
    and cropping then rescaling the images. Either one or two methods are applied randomly.
* **Sample weights**<br>
    For each sample the loss function is evaluated using a corresponding weight, <br>
    which is higher for the positive samples. We use the following formula: $c_d / (2 \cdot c_s)$, <br>
    where $c_d$ is the count of all samples and $c_s$ is the count of samples for a given class of labels.

In [None]:
# Display the per-class weights stored on the training dataset object.
ds_train.class_weights

{0: 0.5023172905525847, 1: 108.38461538461539}

## Calculate accuracy measurement

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# 1. Make predictions on the test set
y_true = []  # True labels
y_pred = []  # Predicted labels (predicted probability thresholded at 0.5)

# Index the dataset explicitly over its length. A plain `for batch in ds_test` falls back
# to calling `__getitem__` with ever-growing indices when the class defines no `__iter__`,
# and then only stops once an IndexError is raised, so it can run past the last batch.
# NOTE(review): this is presumably what caused the `ValueError: math domain error`
# recorded for the previous version of this cell - confirm against the dataset class.
for batch_index in range(len(ds_test)):
    images, labels = ds_test[batch_index]
    # verbose=0: one progress bar per batch would flood the cell output.
    batch_probabilities = model.predict(images, verbose=0)

    # Flatten to 1-D so that both lists hold plain scalars rather than (1,)-shaped arrays.
    y_true.extend(np.asarray(labels).ravel())
    y_pred.extend((batch_probabilities > 0.5).astype(int).ravel())



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

  self.gen.throw(typ, value, traceback)


ValueError: math domain error

In [None]:
# Pin the label order so that the matrix is always 2x2 and lines up with the tick labels,
# even when one of the classes occurs in neither `y_true` nor `y_pred`.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Rows are the true classes, columns the predicted classes.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', cbar=False,
            xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'],
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the three scores in one pass; each scorer compares the collected
# true labels with the thresholded predictions.
scorers = (accuracy_score, precision_score, recall_score)
accuracy, precision, recall = [scorer(y_true, y_pred) for scorer in scorers]

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')