# Skin Disease Detection using Mobile Application
## Final Year Project 2
Ahmad Daniel Ikhwan Bin Rosli <br>
1201103071

### Load dataset

In [9]:
import os
import shutil
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, array_to_img, save_img
from tensorflow.keras.applications import MobileNet, MobileNetV2, MobileNetV3Small
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K


In [None]:
# Locate the Kaggle API credentials (kaggle.json).
# The Kaggle CLI looks for kaggle.json in the directory named by the
# KAGGLE_CONFIG_DIR environment variable, or in its per-user default folder.
# It never consults PATH, and PATH entries must be directories of executables,
# so nothing is appended to PATH here (that would also grow on every re-run).
kaggle_config_dir = Path(os.environ.get("KAGGLE_CONFIG_DIR", Path.home() / ".kaggle"))
if (kaggle_config_dir / "kaggle.json").is_file():
    # Make the resolved location explicit for the `!kaggle ...` shell call below.
    os.environ["KAGGLE_CONFIG_DIR"] = str(kaggle_config_dir)
else:
    # Do not raise: credentials may also be supplied via KAGGLE_USERNAME / KAGGLE_KEY.
    print(
        f"kaggle.json not found in {kaggle_config_dir}. "
        "Copy it there, or set KAGGLE_CONFIG_DIR (or KAGGLE_USERNAME / KAGGLE_KEY) before downloading."
    )

In [None]:
# Download the HAM10000 segmentation-and-classification dataset with the Kaggle CLI.
# -p sets the target folder (datasets/ham10000_raw) and --unzip extracts the archive there.
# Requires valid Kaggle API credentials; the archive is about 2.6 GB.
!kaggle datasets download -d surajghuwalewala/ham1000-segmentation-and-classification -p datasets/ham10000_raw --unzip

Dataset URL: https://www.kaggle.com/datasets/surajghuwalewala/ham1000-segmentation-and-classification


  0%|          | 0.00/2.59G [00:00<?, ?B/s]
  3%|▎         | 70.0M/2.59G [00:00<00:03, 691MB/s]
  6%|▌         | 152M/2.59G [00:00<00:03, 757MB/s] 
  8%|▊         | 225M/2.59G [00:00<00:04, 631MB/s]
 11%|█         | 288M/2.59G [00:00<00:03, 640MB/s]
 13%|█▎        | 351M/2.59G [00:00<00:04, 547MB/s]
 16%|█▌        | 422M/2.59G [00:00<00:03, 597MB/s]
 18%|█▊        | 482M/2.59G [00:00<00:03, 573MB/s]
 20%|██        | 539M/2.59G [00:00<00:03, 577MB/s]
 22%|██▏       | 596M/2.59G [00:01<00:03, 544MB/s]
 25%|██▍       | 659M/2.59G [00:01<00:04, 494MB/s]
 27%|██▋       | 718M/2.59G [00:01<00:03, 525MB/s]
 29%|██▉       | 777M/2.59G [00:01<00:03, 550MB/s]
 31%|███▏      | 831M/2.59G [00:01<00:03, 511MB/s]
 33%|███▎      | 886M/2.59G [00:01<00:03, 528MB/s]
 35%|███▌      | 941M/2.59G [00:01<00:03, 541MB/s]
 37%|███▋      | 994M/2.59G [00:01<00:03, 501MB/s]
 40%|███▉      | 1.02G/2.59G [00:01<00:03, 518MB/s]
 42%|████▏     | 1.08G/2.59G [00:02<00:03, 527MB/s]
 43%|████▎     | 1.12G/2.59G [00:


License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading ham1000-segmentation-and-classification.zip to datasets/ham10000_raw



In [6]:
# Derive one string label per image from the one-hot ground-truth columns,
# keep only the five classes used in this project, and attach the image path.
class_columns = ["MEL", "NV", "BCC", "AKIEC", "BKL", "DF", "VASC"]
kept_labels = ["mel", "nv", "bcc", "akiec", "bkl"]

df = pd.read_csv("datasets/ham10000_raw/GroundTruth.csv")
# idxmax picks the column holding the 1.0 in each row; lower-case it for folder names.
df["label"] = df[class_columns].idxmax(axis=1).str.lower()
df = df.loc[df["label"].isin(kept_labels)].reset_index(drop=True)
df["file_path"] = [
    f"datasets/ham10000_raw/images/{image_id}.jpg" for image_id in df["image"]
]


In [7]:
# Preview the first rows to check the derived `label` and `file_path` columns.
df.head()

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC,label,file_path
0,ISIC_0024306,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nv,datasets/ham10000_raw/images/ISIC_0024306.jpg
1,ISIC_0024307,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nv,datasets/ham10000_raw/images/ISIC_0024307.jpg
2,ISIC_0024308,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nv,datasets/ham10000_raw/images/ISIC_0024308.jpg
3,ISIC_0024309,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nv,datasets/ham10000_raw/images/ISIC_0024309.jpg
4,ISIC_0024310,1.0,0.0,0.0,0.0,0.0,0.0,0.0,mel,datasets/ham10000_raw/images/ISIC_0024310.jpg


In [8]:
# Class distribution: number of images for each of the five target classes.
target_classes = ["mel", "nv", "bcc", "bkl", "akiec"]

# Normalise label case, then restrict the frame to the target classes.
df["label"] = df["label"].str.lower()
is_target = df["label"].isin(target_classes)
df = df.loc[is_target].reset_index(drop=True)

df["label"].value_counts()

label
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
Name: count, dtype: int64

### Phase 1 Prep

In [10]:
# Targeted offline augmentation: generate extra images for the minority classes
# and save them under datasets/ham10000_augmented/<label>/.
aug = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    shear_range=0.15,
    fill_mode='nearest'
)

# Number of augmented copies to create per original image, keyed by class label.
# "nv" is absent on purpose: it is already the majority class.
augmentation_plan = {
    "mel": 6,
    "bkl": 6,
    "bcc": 12,
    "akiec": 20
}

augmented_dir = Path("datasets/ham10000_augmented")
augmented_dir.mkdir(parents=True, exist_ok=True)

for label, times in augmentation_plan.items():
    subset = df[df["label"] == label]
    save_folder = augmented_dir / label
    save_folder.mkdir(parents=True, exist_ok=True)
    # Iterate over the two needed columns directly instead of building a Series per row.
    for image_id, file_path in zip(subset["image"], subset["file_path"]):
        img_array = img_to_array(load_img(file_path))
        # flow() expects a batch, so add a leading batch axis of size 1.
        gen = aug.flow(np.expand_dims(img_array, axis=0), batch_size=1)
        for i in range(times):
            # Clip before casting so out-of-range floats cannot wrap around in uint8.
            aug_img = np.clip(next(gen)[0], 0, 255).astype(np.uint8)
            # scale=False: save_img's default (scale=True) min-max stretches each image
            # to 0..255, which would give augmented copies a different intensity
            # distribution from the untouched originals they are later mixed with.
            save_img(save_folder / f"{image_id}_aug{i}.jpg", aug_img, scale=False)


In [11]:
# Gather every original image and every augmented image into one flat folder:
# datasets/phase1_prepared/skin_disease/
phase1_path = Path("datasets/phase1_prepared/skin_disease")
phase1_path.mkdir(parents=True, exist_ok=True)

# Originals, stored as <image id>.jpg.
for image_id, original_path in zip(df["image"], df["file_path"]):
    shutil.copy(original_path, phase1_path / f"{image_id}.jpg")

# Augmented copies keep the file names they were saved with.
for class_name in augmentation_plan:
    class_folder = augmented_dir / class_name
    for augmented_file in class_folder.glob("*.jpg"):
        shutil.copy(augmented_file, phase1_path / augmented_file.name)

In [12]:
# Split the Phase 1 "skin" images roughly 80/10/10 into train/val/test folders.
#
# sorted(): Path.glob yields files in filesystem order, which differs between
# machines and runs; without a stable order random_state=42 is not reproducible.
image_paths = sorted(phase1_path.glob("*.jpg"))

df_phase1 = pd.DataFrame({
    "file_path": image_paths,
    "label": ["skin"] * len(image_paths)
})

# Augmented files are named "<image id>_aug<i>.jpg". Group every file by its source
# image so that an original and its augmented variants always land in the same
# split; otherwise near-duplicates leak from train into val/test.
# NOTE(review): HAM10000 may also contain several photos of one lesion; grouping by
# lesion would need metadata that is not loaded in this notebook.
df_phase1["source_id"] = [p.stem.split("_aug")[0] for p in image_paths]

# The 80/10/10 ratio is applied to source images, so per-file proportions can
# deviate slightly because classes are augmented by different factors.
source_ids = sorted(df_phase1["source_id"].unique())
train_ids, temp_ids = train_test_split(source_ids, test_size=0.2, random_state=42)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)

train_df = df_phase1[df_phase1["source_id"].isin(train_ids)]
val_df = df_phase1[df_phase1["source_id"].isin(val_ids)]
test_df = df_phase1[df_phase1["source_id"].isin(test_ids)]
assert len(train_df) + len(val_df) + len(test_df) == len(df_phase1)

split_data = [("train", train_df), ("val", val_df), ("test", test_df)]
for split_name, split_df in split_data:
    out_dir = Path(f"datasets/phase1_prepared/{split_name}/skin")
    # Clear this cell's previous output so files from an older split cannot
    # linger and make the three splits overlap.
    shutil.rmtree(out_dir, ignore_errors=True)
    out_dir.mkdir(parents=True, exist_ok=True)
    for src in split_df["file_path"]:
        shutil.copy(src, out_dir / src.name)

print("Done splitting and copying Phase 1 dataset.")

Done splitting and copying Phase 1 dataset.


### Phase 1 Train Model