Load the dataset from kaggle as the harvard link did not work

In [None]:
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download HAM10000 dataset
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0
Downloading skin-cancer-mnist-ham10000.zip to /content
100% 5.20G/5.20G [02:50<00:00, 35.5MB/s]
100% 5.20G/5.20G [02:50<00:00, 32.7MB/s]


Unzip the dataset

In [None]:
import os
import zipfile
import pandas as pd

with zipfile.ZipFile("skin-cancer-mnist-ham10000.zip", 'r') as zip_ref:
    zip_ref.extractall("HAM10000")



**Columns:**


1. lesion_id: Identifier for the lesion which may be shared by multiple images of the same lesion
2. image_id: Unique identifier for each image
3. dx: Diagnosis label for the lesion:
  - nv: Melanocytic nevi
  - mel: Melanoma
  - bkl: Benign keratosis
  - ...
4. dx_type: Method to obtain the diagnosis:
  - histo: Analyzing a biopsy under microscope
  - follow_up: Diagnosis was determined through clinical follow-up over time
5. age: Age of the participant
6. sex: Sex of the participant
7. localization: Body site where the lesion was found



In [None]:
metadata_path = "HAM10000/HAM10000_metadata.csv"
metadata = pd.read_csv(metadata_path)

print(metadata.head())

     lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear


Check data for correctness

In [None]:
metadata.isnull().sum()

Unnamed: 0,0
lesion_id,0
image_id,0
dx,0
dx_type,0
age,57
sex,0
localization,0


As we are training a model which should determine the illness based on the image, rows with age column == null do not have to be dropped

In [None]:
%cd HAM10000

/content/HAM10000


In [None]:
import shutil

target_dir = "HAM10000_images_combined"
os.makedirs(target_dir, exist_ok=True)

for folder in ["HAM10000_images_part_1", "HAM10000_images_part_2"]:
    for file in os.listdir(folder):
        shutil.copy(os.path.join(folder, file), target_dir)

1. HAM10000_images_part_1/ and HAM10000_images_part_2/: These are the directories being removed.
2. ham10000_images_part_1/ and ham10000_images_part_2/: These are the directories being removed.

In [None]:
!rm -r HAM10000_images_part_1/
!rm -r HAM10000_images_part_2/
!rm -r ham10000_images_part_1/
!rm -r ham10000_images_part_2/

Check if there are entries without images

In [None]:
import os

image_dir = "HAM10000_images_combined/"

image_files = set(os.listdir(image_dir))

metadata['image_id'] = metadata['image_id'] + ".jpg"

missing_files = metadata.loc[~metadata['image_id'].isin(image_files), 'image_id']

if len(missing_files) > 0:
    print(f"Missing files for {len(missing_files)} image IDs:")
    print(missing_files.tolist())
else:
    print("All image files are present.")


All image files are present.


Add the file path for each image to the metadata

In [None]:
metadata['image_path'] = metadata['image_id'].apply(lambda x: os.path.join("HAM10000_images_combined", f"{x}"))

In [None]:
print(metadata.head())

     lesion_id          image_id   dx dx_type   age   sex localization  \
0  HAM_0000118  ISIC_0027419.jpg  bkl   histo  80.0  male        scalp   
1  HAM_0000118  ISIC_0025030.jpg  bkl   histo  80.0  male        scalp   
2  HAM_0002730  ISIC_0026769.jpg  bkl   histo  80.0  male        scalp   
3  HAM_0002730  ISIC_0025661.jpg  bkl   histo  80.0  male        scalp   
4  HAM_0001466  ISIC_0031633.jpg  bkl   histo  75.0  male          ear   

                                  image_path  
0  HAM10000_images_combined/ISIC_0027419.jpg  
1  HAM10000_images_combined/ISIC_0025030.jpg  
2  HAM10000_images_combined/ISIC_0026769.jpg  
3  HAM10000_images_combined/ISIC_0025661.jpg  
4  HAM10000_images_combined/ISIC_0031633.jpg  


Balance the label set

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(metadata['dx']),
    y=metadata['dx']
)
class_weights_dict = dict(enumerate(class_weights))

In [None]:
print(class_weights_dict)

{0: 4.375273044997815, 1: 2.78349082823791, 2: 1.301832835044846, 3: 12.440993788819876, 4: 1.2854575792581184, 5: 0.21338020666879728, 6: 10.075452716297788}


Generate train / test split

In [None]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(metadata, test_size=0.2, stratify=metadata['dx'], random_state=42)

Boilerplate for data loading

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

print(train_df.head())

img_size = 128

datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2
)

train_generator = datagen.flow_from_dataframe(
    train_df,
    x_col='image_path',
    y_col='dx',
    target_size=(img_size, img_size),
    batch_size=32,
    class_mode='categorical'
)

val_generator = datagen.flow_from_dataframe(
    val_df,
    x_col='image_path',
    y_col='dx',
    target_size=(img_size, img_size),
    batch_size=32,
    class_mode='categorical'
)


        lesion_id          image_id     dx    dx_type   age     sex  \
8050  HAM_0005972  ISIC_0033319.jpg     nv      histo  35.0  female   
4898  HAM_0004902  ISIC_0030823.jpg     nv  follow_up  40.0    male   
9695  HAM_0005282  ISIC_0028730.jpg  akiec      histo  65.0    male   
4090  HAM_0000475  ISIC_0027299.jpg     nv  follow_up  40.0    male   
8625  HAM_0000949  ISIC_0032444.jpg     nv      histo  65.0    male   

         localization                                 image_path  
8050  lower extremity  HAM10000_images_combined/ISIC_0033319.jpg  
4898            trunk  HAM10000_images_combined/ISIC_0030823.jpg  
9695  lower extremity  HAM10000_images_combined/ISIC_0028730.jpg  
4090  lower extremity  HAM10000_images_combined/ISIC_0027299.jpg  
8625             back  HAM10000_images_combined/ISIC_0032444.jpg  
Found 8012 validated image filenames belonging to 7 classes.
Found 2003 validated image filenames belonging to 7 classes.


Train the model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

num_classes = train_df['dx'].nunique()

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(img_size, img_size, 3)),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    class_weight=class_weights_dict
)


Epoch 1/10


  self._warn_if_super_not_called()


[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 1s/step - accuracy: 0.2761 - loss: 23.1052 - val_accuracy: 0.0295 - val_loss: 2.3635
Epoch 2/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m366s[0m 1s/step - accuracy: 0.2294 - loss: 6.3015 - val_accuracy: 0.0704 - val_loss: 1.9699
Epoch 3/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 1s/step - accuracy: 0.1478 - loss: 3.0821 - val_accuracy: 0.0744 - val_loss: 2.0031
Epoch 4/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 1s/step - accuracy: 0.1510 - loss: 2.5484 - val_accuracy: 0.2886 - val_loss: 2.2320
Epoch 5/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 1s/step - accuracy: 0.1526 - loss: 2.2719 - val_accuracy: 0.1288 - val_loss: 1.9459
Epoch 6/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 1s/step - accuracy: 0.1328 - loss: 2.0747 - val_accuracy: 0.1393 - val_loss: 2.3207
Epoch 7/10
[1m251/251[0m [32m

Monte Carlo Dropout

In [None]:
import tensorflow as tf
import numpy as np

mc_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(img_size, img_size, 3)),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5, name='mc_dropout'),
    Dense(num_classes, activation='softmax')
])

mc_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

mc_model.set_weights(model.get_weights())

@tf.function
def monte_carlo_predictions(mc_model, x_batch, n_simulations=50):
    preds = []
    for _ in range(n_simulations):
        preds.append(mc_model(x_batch, training=True))  # Force dropout during inference
    preds = tf.stack(preds, axis=0)  # Shape: (n_simulations, batch_size, num_classes)
    return tf.reduce_mean(preds, axis=0), tf.math.reduce_std(preds, axis=0)

Save the models

In [None]:
model.save("skin_lesion_model.keras")

mc_model.save("monte_carlo_model.keras")

Test on Validation Data

In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array

test_image_path = val_df.iloc[0]['image_path']
test_image_result = val_df.iloc[0]['dx']
test_image = load_img(test_image_path, target_size=(img_size, img_size))
test_image_array = img_to_array(test_image) / 255.0
test_image_array = np.expand_dims(test_image_array, axis=0)

mean_pred, uncertainty = monte_carlo_predictions(mc_model, test_image_array, n_simulations=50)

predicted_class = np.argmax(mean_pred.numpy(), axis=-1)[0]
print("Predicted class:", val_df['dx'].unique()[predicted_class])
print("Real class:", test_image_result)
print("Uncertainty:", uncertainty.numpy())

Predicted class: nv
Real class: nv
Uncertainty: [[4.4703484e-08 4.4703484e-08 5.9604645e-08 8.9406967e-08 5.9604645e-08
  2.9802322e-08 7.4505806e-08]]
