# Skin Disease Detection using Mobile Application
## Final Year Project 2
Ahmad Daniel Ikhwan Bin Rosli <br>
1201103071

In [14]:
import os
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import backend as K
from tensorflow.keras.applications import MobileNet, MobileNetV2, MobileNetV3Small
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Rescaling
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm import tqdm


In [None]:
# kaggle.json location
# The Kaggle CLI reads its credentials from the folder named by KAGGLE_CONFIG_DIR
# (default: ~/.kaggle). PATH is only searched for executables, so appending a
# kaggle.json shortcut to it has no effect and grows PATH on every re-run.
# setdefault keeps any value already exported in the environment and is idempotent.
os.environ.setdefault("KAGGLE_CONFIG_DIR", str(Path.home() / ".kaggle"))

In [None]:
# download dataset
# Fetches the Kaggle dataset into datasets/ham10000_raw and unzips it there.
# The archive is about 2.59 GB (see the progress output), so avoid re-running this
# cell when the folder is already populated.
!kaggle datasets download -d surajghuwalewala/ham1000-segmentation-and-classification -p datasets/ham10000_raw --unzip

Dataset URL: https://www.kaggle.com/datasets/surajghuwalewala/ham1000-segmentation-and-classification


  0%|          | 0.00/2.59G [00:00<?, ?B/s]
  3%|▎         | 70.0M/2.59G [00:00<00:03, 691MB/s]
  6%|▌         | 152M/2.59G [00:00<00:03, 757MB/s] 
  8%|▊         | 225M/2.59G [00:00<00:04, 631MB/s]
 11%|█         | 288M/2.59G [00:00<00:03, 640MB/s]
 13%|█▎        | 351M/2.59G [00:00<00:04, 547MB/s]
 16%|█▌        | 422M/2.59G [00:00<00:03, 597MB/s]
 18%|█▊        | 482M/2.59G [00:00<00:03, 573MB/s]
 20%|██        | 539M/2.59G [00:00<00:03, 577MB/s]
 22%|██▏       | 596M/2.59G [00:01<00:03, 544MB/s]
 25%|██▍       | 659M/2.59G [00:01<00:04, 494MB/s]
 27%|██▋       | 718M/2.59G [00:01<00:03, 525MB/s]
 29%|██▉       | 777M/2.59G [00:01<00:03, 550MB/s]
 31%|███▏      | 831M/2.59G [00:01<00:03, 511MB/s]
 33%|███▎      | 886M/2.59G [00:01<00:03, 528MB/s]
 35%|███▌      | 941M/2.59G [00:01<00:03, 541MB/s]
 37%|███▋      | 994M/2.59G [00:01<00:03, 501MB/s]
 40%|███▉      | 1.02G/2.59G [00:01<00:03, 518MB/s]
 42%|████▏     | 1.08G/2.59G [00:02<00:03, 527MB/s]
 43%|████▎     | 1.12G/2.59G [00:


License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading ham1000-segmentation-and-classification.zip to datasets/ham10000_raw



In [None]:
# locations inside the downloaded dataset: image folder and ground-truth CSV
raw_path = Path("datasets") / "ham10000_raw"
images_path = raw_path.joinpath("images")
labels_path = raw_path.joinpath("GroundTruth.csv")

# one row per image, with one indicator column per diagnosis
df = pd.read_csv(labels_path)

In [5]:
# list the column names of the ground-truth table
df.columns

Index(['image', 'MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC'], dtype='object')

In [7]:
# preview the first rows of the ground-truth table
df.head()

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC
0,ISIC_0024306,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0024307,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0024308,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0024309,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0024310,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# collapse the per-class indicator columns into one "label" column:
# for every row, take the name of the column that holds the largest value
class_columns = ["MEL", "NV", "BCC", "AKIEC", "BKL", "DF", "VASC"]
df["label"] = df.loc[:, class_columns].idxmax(axis="columns")

In [9]:
# preview again to check the newly added "label" column
df.head()

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC,label
0,ISIC_0024306,0.0,1.0,0.0,0.0,0.0,0.0,0.0,NV
1,ISIC_0024307,0.0,1.0,0.0,0.0,0.0,0.0,0.0,NV
2,ISIC_0024308,0.0,1.0,0.0,0.0,0.0,0.0,0.0,NV
3,ISIC_0024309,0.0,1.0,0.0,0.0,0.0,0.0,0.0,NV
4,ISIC_0024310,1.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL


In [None]:
# keep only the target classes (labels lower-cased first), then count images per class
target_classes = ["mel", "nv", "bcc", "bkl", "akiec"]

df["label"] = df["label"].str.lower()
is_target = df["label"].isin(target_classes)
df = df.loc[is_target].reset_index(drop=True)

df["label"].value_counts()

label
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
Name: count, dtype: int64

In [None]:
# create file path for each image
# Reuse images_path from the path-setup cell so the image folder is defined in one place
# (it resolves to the same datasets/ham10000_raw/images location).
image_dir = images_path

df["file_path"] = [image_dir / f"{image_id}.jpg" for image_id in df["image"]]

# Fail here, with a clear message, rather than halfway through the later copy step.
missing_files = [path for path in df["file_path"] if not path.exists()]
assert not missing_files, (
    f"{len(missing_files)} image file(s) not found, e.g. {missing_files[0]}"
)

df.head()

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC,label,file_path
0,ISIC_0024306,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nv,datasets\ham10000_raw\images\ISIC_0024306.jpg
1,ISIC_0024307,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nv,datasets\ham10000_raw\images\ISIC_0024307.jpg
2,ISIC_0024308,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nv,datasets\ham10000_raw\images\ISIC_0024308.jpg
3,ISIC_0024309,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nv,datasets\ham10000_raw\images\ISIC_0024309.jpg
4,ISIC_0024310,1.0,0.0,0.0,0.0,0.0,0.0,0.0,mel,datasets\ham10000_raw\images\ISIC_0024310.jpg


In [None]:
# stratified 80/10/10 split: hold out 20% first, then halve the hold-out into val and test
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42,
)

named_splits = (("Train", train_df), ("Val", val_df), ("Test", test_df))

# shapes first ...
for split_title, split_frame in named_splits:
    print(f"{split_title}:", split_frame.shape)

# ... then the per-class counts of every split
for split_title, split_frame in named_splits:
    print(f"\n{split_title} Distribution:\n", split_frame["label"].value_counts())

Train: (7806, 10)
Val: (976, 10)
Test: (976, 10)

Train Distribution:
 label
nv       5364
mel       890
bkl       879
bcc       411
akiec     262
Name: count, dtype: int64

Val Distribution:
 label
nv       671
mel      111
bkl      110
bcc       52
akiec     32
Name: count, dtype: int64

Test Distribution:
 label
nv       670
mel      112
bkl      110
bcc       51
akiec     33
Name: count, dtype: int64


In [16]:
# copy every image into datasets/ham10000_prepared/<split>/<label>/
output_dir = Path("datasets/ham10000_prepared")
splits = [("train", train_df), ("val", val_df), ("test", test_df)]

for split_name, df_split in splits:
    split_root = output_dir / split_name

    # one folder per label that actually occurs in this split
    for label in df_split["label"].unique():
        (split_root / label).mkdir(parents=True, exist_ok=True)

    # the copy keeps the original file name
    for label, src in zip(df_split["label"], df_split["file_path"]):
        shutil.copy(src, split_root / label / Path(src).name)

print("All images copied to datasets/ham10000_prepared/")

All images copied to datasets/ham10000_prepared/


In [17]:
# add class weights to handle the imbalanced dataset
# Labels are indexed in alphabetical order.
# NOTE(review): Keras' flow_from_directory also orders classes alphanumerically, so these
# indices should line up with the generator's class_indices - confirm once it is created.
class_names = sorted(train_df['label'].unique())

label_to_index = {label: idx for idx, label in enumerate(class_names)}

# assign() returns a new frame, which avoids the SettingWithCopyWarning that in-place
# column assignment can raise on a frame produced by train_test_split.
train_df = train_df.assign(label_idx=train_df['label'].map(label_to_index))

class_ids = np.unique(train_df['label_idx'])
weights = compute_class_weight(
    class_weight='balanced',
    classes=class_ids,
    y=train_df['label_idx']
)

# Key each weight by the class id it was computed for (not by list position) and use
# plain Python numbers so the mapping prints and serialises cleanly.
class_weights_dict = {
    int(class_id): float(weight) for class_id, weight in zip(class_ids, weights)
}
print("Class weights:", class_weights_dict)

Class weights: {0: 5.958778625954198, 1: 3.7985401459854016, 2: 1.7761092150170648, 3: 1.7541573033707865, 4: 0.2910514541387025}


In [19]:
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# data augmentation (training images only)
augmentation_options = dict(
    rotation_range=20,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    shear_range=0.15,
    fill_mode='nearest',
)

# both generators multiply pixel values by 1/255; only the training one augments
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)
val_test_datagen = ImageDataGenerator(rescale=1./255)

In [20]:
# directory iterators for the three splits
base_dir = Path("datasets/ham10000_prepared")

# options shared by every split
shared_flow_options = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="categorical",
)

# training batches are shuffled
train_gen = train_datagen.flow_from_directory(
    directory=base_dir / "train",
    shuffle=True,
    **shared_flow_options,
)

# validation and test batches keep a fixed order
val_gen, test_gen = (
    val_test_datagen.flow_from_directory(
        directory=base_dir / split_folder,
        shuffle=False,
        **shared_flow_options,
    )
    for split_folder in ("val", "test")
)

Found 7806 images belonging to 5 classes.
Found 976 images belonging to 5 classes.
Found 976 images belonging to 5 classes.


In [22]:
# mobilenet
print("Training MobileNet (50 epochs)...")

# The data generators yield pixels in [0, 1] (rescale=1./255), while the ImageNet weights
# of MobileNet expect inputs scaled to [-1, 1]. Map the range inside the model so the
# saved network still accepts [0, 1] images.
inputs = Input(shape=(*IMG_SIZE, 3))
x = Rescaling(scale=2.0, offset=-1.0)(inputs)

base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(*IMG_SIZE, 3))
# Frozen feature extractor: only the new classification head is trained.
base_model.trainable = False

# training=False keeps the backbone's BatchNorm layers in inference mode.
x = base_model(x, training=False)
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
num_classes = len(train_gen.class_indices)
preds = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=preds)

model.compile(optimizer=Adam(1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
# Stop after 20 epochs without improvement in the monitored metric (Keras default:
# val_loss) and roll back to the best weights seen.
early_stop = EarlyStopping(patience=20, restore_best_weights=True, verbose=1)

history_mobilenet = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=50,
    callbacks=[early_stop],
    class_weight=class_weights_dict
)

model.save("mobilenet_model.h5")
# Release the graph/session state before the next architecture is built.
K.clear_session()

Training MobileNet (50 epochs)...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [23]:
# mobilenetv2
print("Training MobileNetV2 (50 epochs)...")

# The data generators yield pixels in [0, 1] (rescale=1./255), while the ImageNet weights
# of MobileNetV2 expect inputs scaled to [-1, 1]. Map the range inside the model so the
# saved network still accepts [0, 1] images.
inputs = Input(shape=(*IMG_SIZE, 3))
x = Rescaling(scale=2.0, offset=-1.0)(inputs)

base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(*IMG_SIZE, 3))
# Frozen feature extractor: only the new classification head is trained.
base_model.trainable = False

# training=False keeps the backbone's BatchNorm layers in inference mode.
x = base_model(x, training=False)
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
num_classes = len(train_gen.class_indices)
preds = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=preds)

model.compile(optimizer=Adam(1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop after 20 epochs without improvement in the monitored metric (Keras default:
# val_loss) and roll back to the best weights seen.
early_stop = EarlyStopping(patience=20, restore_best_weights=True, verbose=1)

history_mobilenetv2 = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=50,
    callbacks=[early_stop],
    class_weight=class_weights_dict
)

model.save("mobilenetv2_model.h5")
# Release the graph/session state before the next architecture is built.
K.clear_session()


Training MobileNetV2 (50 epochs)...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
# mobilenetv3
print("Training MobileNetV3Small (50 epochs)...")

# Keras' MobileNetV3 models include their own preprocessing by default and expect raw
# pixel values in [0, 255]. The data generators already divide by 255, so feeding their
# output directly squeezes every image into a near-constant tensor. Undo the generator's
# rescaling inside the model; the saved network then still accepts [0, 1] images.
inputs = Input(shape=(*IMG_SIZE, 3))
x = Rescaling(scale=255.0)(inputs)

base_model = MobileNetV3Small(weights='imagenet', include_top=False, input_shape=(*IMG_SIZE, 3))
# Frozen feature extractor: only the new classification head is trained.
base_model.trainable = False

# training=False keeps the backbone's BatchNorm layers in inference mode.
x = base_model(x, training=False)
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
num_classes = len(train_gen.class_indices)
preds = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=preds)

model.compile(optimizer=Adam(1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop after 20 epochs without improvement in the monitored metric (Keras default:
# val_loss) and roll back to the best weights seen.
early_stop = EarlyStopping(patience=20, restore_best_weights=True, verbose=1)

history_mobilenetv3 = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=50,
    callbacks=[early_stop],
    class_weight=class_weights_dict
)

# NOTE(review): the file name says "Large" but the network trained here is
# MobileNetV3Small. The name is kept because other cells/tools may load this exact
# path - rename it together with its consumers.
model.save("mobilenetv3Large_model.h5")
# Release the graph/session state before anything else is built.
K.clear_session()


Training MobileNetV3Small (50 epochs)...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 24: early stopping
