# Skin Disease Detection using Mobile Application
## Final Year Project 2
Ahmad Daniel Ikhwan Bin Rosli <br>
1201103071

In [15]:
import os
import shutil
import random
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [1]:
# Notebook-wide configuration constants.
# Target image size in pixels; presumably (height, width) for later resizing —
# not referenced in the cells shown here, TODO confirm against later cells.
img_size = (224, 224)
split_ratio = [0.8, 0.1, 0.1]  # intended train / validation / test fractions
# Seed intended to make shuffling and splitting reproducible.
random_seed = 42

In [2]:
# Download the DermNet dataset from Kaggle and unzip it into datasets/dermnet_raw.
# Assumes the Kaggle CLI is installed and authenticated in this environment.
!kaggle datasets download -d shubhamgoel27/dermnet --unzip -p datasets/dermnet_raw

Dataset URL: https://www.kaggle.com/datasets/shubhamgoel27/dermnet
License(s): Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)


In [12]:
# Maps each DermNet source folder name (key) to the short class label (value)
# used for the prepared folder name and as the prefix of the copied file names.
# Only the classes listed here are carried over into the prepared dataset.
target_classes = {
    "Acne and Rosacea Photos": "Acne",
    "Eczema Photos": "Eczema",
    "Psoriasis pictures Lichen Planus and related diseases": "Psoriasis",
    "Tinea Ringworm Candidiasis and other Fungal Infections": "Tinea",
    "Melanoma Skin Cancer Nevi and Moles": "Melanoma"
}

raw_path = Path("datasets/dermnet_raw")     # where the Kaggle download is unzipped
prep_path = Path("datasets/dermnet_prep")    # where the merged, renamed images are written

In [13]:
# Rebuild the prepared dataset from scratch so that re-running this cell never
# leaves stale files from a previous run mixed in with the new copy.
if prep_path.exists():
    shutil.rmtree(prep_path)
prep_path.mkdir(parents=True)

for original_name, clean_name in target_classes.items():
    dst_dir = prep_path / clean_name
    dst_dir.mkdir(parents=True, exist_ok=True)

    # One counter per class, shared by both upstream splits, so the file names
    # stay unique when the original train/ and test/ folders are merged.
    img_counter = 0

    for split in ['train', 'test']:
        src_dir = raw_path / split / original_name
        if not src_dir.exists():
            # Say so instead of skipping silently: a missing folder usually
            # means the download failed or the dataset layout has changed.
            print(f"Warning: source folder not found, skipping: {src_dir}")
            continue

        # glob() yields entries in arbitrary filesystem order; sorting makes the
        # image -> "<class>_<NNNNN>.jpg" assignment identical on every machine.
        src_files = sorted(src_dir.glob("*.jpg"))

        for img_file in tqdm(src_files, desc=f"Copying {clean_name} ({split})"):
            dst_file = dst_dir / f"{clean_name}_{img_counter:05d}.jpg"
            shutil.copy(img_file, dst_file)
            img_counter += 1

Copying Acne (train): 100%|██████████| 840/840 [00:08<00:00, 104.16it/s]
Copying Acne (test): 100%|██████████| 308/308 [00:02<00:00, 131.26it/s]
Copying Eczema (train): 100%|██████████| 1235/1235 [00:11<00:00, 103.21it/s]
Copying Eczema (test): 100%|██████████| 309/309 [00:02<00:00, 105.07it/s]
Copying Psoriasis (train): 100%|██████████| 1405/1405 [00:13<00:00, 103.14it/s]
Copying Psoriasis (test): 100%|██████████| 352/352 [00:03<00:00, 103.64it/s]
Copying Tinea (train): 100%|██████████| 1300/1300 [00:12<00:00, 105.06it/s]
Copying Tinea (test): 100%|██████████| 325/325 [00:03<00:00, 106.83it/s]
Copying Melanoma (train): 100%|██████████| 463/463 [00:04<00:00, 100.85it/s]
Copying Melanoma (test): 100%|██████████| 116/116 [00:01<00:00, 103.52it/s]


In [16]:
# Build an index of every prepared image: one row per file, labelled with the
# name of the class folder that contains it.
filepaths = []
labels = []

# iterdir() and glob() yield entries in arbitrary filesystem order. Sorting both
# levels fixes the row order, which the seeded train/val/test split depends on
# to be reproducible across machines and re-runs.
for class_folder in sorted(prep_path.iterdir()):
    if not class_folder.is_dir():
        continue
    class_images = sorted(class_folder.glob("*.jpg"))
    filepaths.extend(str(img_file) for img_file in class_images)
    labels.extend([class_folder.name] * len(class_images))

df = pd.DataFrame({
    'filepath': filepaths,
    'label': labels
})

# Quick sanity check: sample rows and the number of images per class.
print(df.head())
print(df['label'].value_counts())


                                    filepath label
0  datasets\dermnet_prep\Acne\Acne_00000.jpg  Acne
1  datasets\dermnet_prep\Acne\Acne_00001.jpg  Acne
2  datasets\dermnet_prep\Acne\Acne_00002.jpg  Acne
3  datasets\dermnet_prep\Acne\Acne_00003.jpg  Acne
4  datasets\dermnet_prep\Acne\Acne_00004.jpg  Acne
label
Psoriasis    1757
Tinea        1625
Eczema       1544
Acne         1148
Melanoma      579
Name: count, dtype: int64


In [17]:
# Stratified train / validation / test split driven by the config constants
# (split_ratio, random_seed) rather than duplicated magic numbers, so changing
# the configuration actually changes the split.
train_frac, val_frac, test_frac = split_ratio
assert abs(train_frac + val_frac + test_frac - 1.0) < 1e-9, "split_ratio must sum to 1"

# Step 1: hold out validation + test together.
holdout_frac = val_frac + test_frac
train_df, temp_df = train_test_split(
    df,
    test_size=holdout_frac,
    stratify=df['label'],
    random_state=random_seed,
)

# Step 2: divide the hold-out set into validation and test; test_size is the
# test share *of the hold-out set*, not of the full dataset.
val_df, test_df = train_test_split(
    temp_df,
    test_size=test_frac / holdout_frac,
    stratify=temp_df['label'],
    random_state=random_seed,
)

# The three subsets must partition the full index: no row lost or duplicated.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

print("Train:", train_df.shape)
print("Val:", val_df.shape)
print("Test:", test_df.shape)


Train: (5322, 2)
Val: (665, 2)
Test: (666, 2)
