# **Import used libraries**


In [6]:
import pandas as pd
from google.colab import drive
import os
%matplotlib notebook
from random import shuffle
import numpy as np
from tqdm import tqdm
import zipfile
from shutil import copy, move
from PIL import Image

# **Connecting to Google Drive**





Use `flush_and_unmount` and `force_remount=True` for a safe connection.

---

In [41]:
# NOTE(review): `drive` is already imported in the imports cell; the repeated
# import presumably lets this cell be re-run on its own when reconnecting.
from google.colab import drive
# Flush/unmount first, then force a remount at /content/drive, which is the
# root that every dataset path in this notebook starts with.
drive.flush_and_unmount()
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# **Helper functions and information are here**

> Specify the folder paths that are used later in the code.






In [8]:
# Every path below lives under this single Google Drive directory.
_dataset_root = "/content/drive/MyDrive/BSc Project/Skin Cancer Datasets/ISIC_2019_Dataset"

# Metadata CSV and the raw / partitioned image folders.
metadata_path = _dataset_root + "/Metadata/challenge-2019-training_metadata_2023-07-28.csv"
images_lake_path = _dataset_root + "/ISIC_2019_Training_Input"
partitioned_images_path = _dataset_root + "/Partitioned Data"

# Centralised train / validation / test splits.
# NOTE(review): "Trainning" is kept exactly as written; presumably it matches
# the folder name on Drive, so do not "fix" the spelling here alone.
train_dataset_path = _dataset_root + "/Trainning Dataset"
validation_dataset_path = _dataset_root + "/Validation Dataset"
test_dataset_path = _dataset_root + "/Test Dataset"

# Splits whose folder names carry the "FL" prefix.
fl_train_dataset_path = _dataset_root + "/FL Training Dataset"
fl_test_dataset_path = _dataset_root + "/FL Test Dataset"

In [9]:
# Class labels for 3-type detection (in use). `filter_images` only copies
# images whose diagnosis is one of these labels.
diseases = ['nevus', 'melanoma', 'basal cell carcinoma']

# Use this list for 8-type detection (you should make many changes to use this list in detection).
# diseases = ['nevus', 'melanoma', 'actinic keratosis', 'basal cell carcinoma', 'dermatofibroma', \
# 'pigmented benign keratosis', 'squamous cell carcinoma', 'vascular lesion']

> Implement a function that zips a folder so it can be downloaded on the server.

In [10]:
def zip_folder(folder_path, output_zip):
    """
    Zip the contents of a folder.

    Every file found under ``folder_path`` (recursively) is stored in the
    archive under its path relative to ``folder_path``. If the archive itself
    is located inside ``folder_path`` it is skipped, so the zip never tries to
    include the file that is currently being written.

    :param folder_path: The path to the folder you want to zip.
    :param output_zip: The path to the output zip file.
    """
    # Resolve once so the self-inclusion check also works for relative paths.
    output_abs_path = os.path.abspath(output_zip)

    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)

                # The archive is created before the walk starts, so it shows up
                # in the listing when it lives inside folder_path.
                if os.path.abspath(file_path) == output_abs_path:
                    continue

                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname=arcname)

**Partitioning Process**
> Split the image data lake into smaller partitions so that the images are easier to process.

In [11]:
def create_partitions(images_lake_path: str, partitioned_images_path: str, filenames_list: list, partition_size: int):
  """
  Copy the images of the data lake into fixed-size partition folders.

  Partition ``k`` is the folder ``images_partition_k`` under
  ``partitioned_images_path`` and receives the k-th slice of
  ``filenames_list``. Only as many partitions as are needed are created, so
  no empty trailing partition appears when the number of files is an exact
  multiple of ``partition_size`` (or when the list is empty).

  :param images_lake_path: Folder that contains the source images.
  :param partitioned_images_path: Folder in which the partition folders are created.
  :param filenames_list: File names (relative to ``images_lake_path``) to distribute.
  :param partition_size: Maximum number of files per partition; must be positive.
  :raises ValueError: If ``partition_size`` is not positive.
  """
  if partition_size <= 0:
    raise ValueError("partition_size must be a positive integer.")

  # Stepping by partition_size yields exactly ceil(len / partition_size) slices.
  for partition_id, start in enumerate(range(0, len(filenames_list), partition_size)):
    partition_name = "images_partition_{}".format(partition_id)
    destination_abs_path = os.path.join(partitioned_images_path, partition_name)
    # exist_ok lets the cell be re-run without failing on existing folders.
    os.makedirs(destination_abs_path, exist_ok=True)

    # Slicing past the end of a list is safe, so the last partition is simply shorter.
    partition_filenames = filenames_list[start:start + partition_size]

    for filename in tqdm(partition_filenames, desc="Partitionind data lake ... "):
      filename_abs_path = os.path.join(images_lake_path, filename)
      copy(filename_abs_path, destination_abs_path)

    print(" --- Partition {} is created successfully.".format(partition_id))

**Cleaning Process**

> Format the dataset folders (i.e. delete the files they contain), including:


*   Train Dataset
*   Test Dataset
*   Validation Dataset





In [12]:
def format_dataset(target_dataset_path: str):
  """
  Empty every class folder of a dataset while keeping the folders themselves.

  Only the immediate sub-directories of ``target_dataset_path`` are processed;
  stray files that sit directly in the dataset root are ignored instead of
  raising an error. Inside each sub-directory only regular files are deleted,
  nested directories are left untouched.

  :param target_dataset_path: Root folder of the dataset to clean.
  """
  # Keep directories only: calling os.listdir on a plain file would fail.
  folders_name_list = [
      entry_name for entry_name in os.listdir(target_dataset_path)
      if os.path.isdir(os.path.join(target_dataset_path, entry_name))
  ]
  print("There are {} folders in this path: ".format(len(folders_name_list)))
  for folder_name in folders_name_list:
    print(folder_name)

  print("\n\nStart removing files from folders in this path ... \n\n")
  for folder_name in folders_name_list:
    folder_path = os.path.join(target_dataset_path, folder_name)
    filenames_list = os.listdir(folder_path)

    for file_name in tqdm(filenames_list, desc="Removing {} files in {} folder - path {}".format(len(filenames_list), folder_name, target_dataset_path)):
      file_path = os.path.join(folder_path, file_name)

      # Directories inside a class folder are skipped on purpose.
      if os.path.isfile(file_path):
        os.remove(file_path)


**Datasets Status**

> Show the number of image files per disease in each dataset.






In [13]:
def show_dataset_status(datasets_paths: dict = None):
  """
  Print the number of files in every class folder of each dataset.

  :param datasets_paths: Optional mapping ``{dataset name: dataset root folder}``.
      When omitted, the notebook-level train / validation / test / FL train /
      FL test paths are reported, exactly as before.
  """
  if datasets_paths is None:
    # Add any further dataset whose status you need to this dict.
    datasets_paths = {"Train": train_dataset_path, "Validation": validation_dataset_path, "Test": test_dataset_path,
                      "FL Train": fl_train_dataset_path, "FL Test": fl_test_dataset_path}

  for dataset_name, dataset_path in datasets_paths.items():
    print("\n\nCount of images in {} dataset per disease: \n".format(dataset_name))

    # Sorted so the report order is stable between runs.
    for folders_name in sorted(os.listdir(dataset_path)):
      folder_path = os.path.join(dataset_path, folders_name)

      # Stray files in the dataset root are not class folders; skip them.
      if not os.path.isdir(folder_path):
        continue

      files_number_per_folder = len(os.listdir(folder_path))
      print("Number of images in {} dataset : {}  --> {}".format(dataset_name, folders_name, files_number_per_folder))

**Categorising Images**

> Categorize the images by diagnosis and copy them into the training dataset folder.


In [14]:
def filter_images(partitioned_images_path: str, train_dataset_path: str, metadata=None, target_diseases=None):
  """
  Copy partitioned images into per-disease folders of the training dataset.

  For every file in every partition folder, the first 12 characters of the
  file name are looked up in the ``isic_id`` column of the metadata. When the
  matching ``diagnosis`` is one of the target diseases, the file is copied to
  ``<train_dataset_path>/<diagnosis>/``; all other files are skipped.

  :param partitioned_images_path: Folder that contains the partition folders.
  :param train_dataset_path: Root folder that receives one sub-folder per disease.
  :param metadata: Optional DataFrame with ``isic_id`` and ``diagnosis`` columns.
      Defaults to the notebook-level ``metadata_df``.
  :param target_diseases: Optional collection of diagnosis labels to keep.
      Defaults to the notebook-level ``diseases`` list.
  """
  # Fall back to the notebook globals so existing two-argument calls keep working.
  if metadata is None:
    metadata = metadata_df
  if target_diseases is None:
    target_diseases = diseases

  # Build the id -> diagnosis lookup once instead of scanning the whole frame
  # (and parsing its printed representation) for every single image.
  diagnosis_by_id = dict(zip(metadata['isic_id'], metadata['diagnosis']))

  partitions_names_list = os.listdir(partitioned_images_path)

  for partition_name in partitions_names_list:
    partition_directory_abs_path = os.path.join(partitioned_images_path, partition_name)
    filenames_per_partition = os.listdir(partition_directory_abs_path)

    for filename in tqdm(filenames_per_partition, desc="Filtering the partion {} ...".format(partition_name.split("_")[-1])):
      # Unknown ids give None and missing diagnoses give NaN; neither is a target label.
      disease_label = diagnosis_by_id.get(filename[:12])
      if disease_label not in target_diseases:
        continue

      filename_abs_path = os.path.join(partition_directory_abs_path, filename)
      destination_abs_path = os.path.join(train_dataset_path, disease_label)

      # Without the folder, copy() would create a *file* named after the disease.
      os.makedirs(destination_abs_path, exist_ok=True)
      copy(filename_abs_path, destination_abs_path)

    print(" --- Partition {} filtered into the categories.".format(partition_name.split("_")[-1]))

**Create validation dataset**


> Move a **validation_percent** fraction of the images from the train dataset to the validation dataset.






In [39]:
def create_validation_dataset(train_dataset_path: str, validation_dataset_path:str, validation_percent: float, instances_per_class: int = None):
    """
    Move a random subset of every class folder into the validation dataset.

    For each class folder under ``train_dataset_path`` the file list is
    shuffled and the first ``int(len(files) * validation_percent)`` files are
    moved to ``<validation_dataset_path>/<class name>/``.

    :param train_dataset_path: Root folder whose class folders provide the images.
    :param validation_dataset_path: Root folder that receives the moved images.
    :param validation_percent: Fraction (0 to 1) of each class to move.
    :param instances_per_class: Optional fixed number of images to move per
        class. When given it takes precedence over ``validation_percent``
        (e.g. ``44`` reproduces the previously hard-coded behaviour).
    :raises ValueError: If the applicable amount is out of range.
    """
    if instances_per_class is None:
        if not 0 <= validation_percent <= 1:
            raise ValueError("validation_percent must be between 0 and 1.")
    elif instances_per_class < 0:
        raise ValueError("instances_per_class must not be negative.")

    disease_names_list = os.listdir(train_dataset_path)

    for disease_name in disease_names_list:
        disease_directory_abs_path = os.path.join(train_dataset_path, disease_name)

        # Stray files in the dataset root are not class folders; skip them.
        if not os.path.isdir(disease_directory_abs_path):
            continue

        filenames_per_disease = os.listdir(disease_directory_abs_path)
        shuffle(filenames_per_disease)

        if instances_per_class is None:
            instance_number_validation = int(len(filenames_per_disease) * validation_percent)
        else:
            # Never report more images than the class actually contains.
            instance_number_validation = min(instances_per_class, len(filenames_per_disease))

        destination_abs_path = os.path.join(validation_dataset_path, disease_name)
        # Without the folder, move() would rename the image to the folder path itself.
        os.makedirs(destination_abs_path, exist_ok=True)

        for filename in tqdm(filenames_per_disease[:instance_number_validation], desc="Moving {} images from {} folder to validation dataset...".format(instance_number_validation, disease_name)):
            filename_abs_path = os.path.join(disease_directory_abs_path, filename)
            move(filename_abs_path, destination_abs_path)

# **Analyze images metadata**

In [None]:
# Load the ISIC 2019 training metadata CSV. `filter_images` reads this
# notebook-level frame, so this cell has to run before that function is called.
metadata_df = pd.read_csv(metadata_path)

In [None]:
# Display the metadata as loaded (before any cleaning).
metadata_df

Unnamed: 0,isic_id,attribution,copyright_license,age_approx,anatom_site_general,benign_malignant,clin_size_long_diam_mm,dermoscopic_type,diagnosis,diagnosis_confirm_type,family_hx_mm,image_type,melanocytic,nevus_type,personal_hx_mm,sex
0,ISIC_0000000,Anonymous,CC-0,55.0,anterior torso,benign,,,nevus,,,dermoscopic,True,,,female
1,ISIC_0000000,Anonymous,CC-0,55.0,anterior torso,benign,,,nevus,,,dermoscopic,True,,,female
2,ISIC_0000000,Anonymous,CC-0,55.0,anterior torso,benign,,,nevus,,,dermoscopic,True,,,female
3,ISIC_0000000,Anonymous,CC-0,55.0,anterior torso,benign,,,nevus,,,dermoscopic,True,,,female
4,ISIC_0000001,Anonymous,CC-0,30.0,anterior torso,benign,,,nevus,,,dermoscopic,True,,,female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44050,ISIC_0073247,Hospital Clínic de Barcelona,CC-BY-NC,85.0,head/neck,,,,basal cell carcinoma,,,,False,,,female
44051,ISIC_0073248,Hospital Clínic de Barcelona,CC-BY-NC,65.0,anterior torso,,,,seborrheic keratosis,histopathology,,,False,,,male
44052,ISIC_0073249,Hospital Clínic de Barcelona,CC-BY-NC,70.0,lower extremity,malignant,,,melanoma,histopathology,,,True,,,male
44053,ISIC_0073251,Hospital Clínic de Barcelona,CC-BY-NC,55.0,palms/soles,benign,,,nevus,histopathology,,,True,,,female


In [None]:
# Drop fully duplicated rows and renumber the index from 0, then display the result.
metadata_df = metadata_df.drop_duplicates().reset_index(drop=True)
metadata_df

Unnamed: 0,isic_id,attribution,copyright_license,age_approx,anatom_site_general,benign_malignant,clin_size_long_diam_mm,dermoscopic_type,diagnosis,diagnosis_confirm_type,family_hx_mm,image_type,melanocytic,nevus_type,personal_hx_mm,sex
0,ISIC_0000000,Anonymous,CC-0,55.0,anterior torso,benign,,,nevus,,,dermoscopic,True,,,female
1,ISIC_0000001,Anonymous,CC-0,30.0,anterior torso,benign,,,nevus,,,dermoscopic,True,,,female
2,ISIC_0000002,Anonymous,CC-0,60.0,upper extremity,malignant,,,melanoma,histopathology,,dermoscopic,True,,,female
3,ISIC_0000003,Anonymous,CC-0,30.0,upper extremity,benign,,,nevus,,,dermoscopic,True,,,male
4,ISIC_0000004,Anonymous,CC-0,80.0,posterior torso,malignant,,,melanoma,histopathology,,dermoscopic,True,,,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,Hospital Clínic de Barcelona,CC-BY-NC,85.0,head/neck,,,,basal cell carcinoma,,,,False,,,female
25327,ISIC_0073248,Hospital Clínic de Barcelona,CC-BY-NC,65.0,anterior torso,,,,seborrheic keratosis,histopathology,,,False,,,male
25328,ISIC_0073249,Hospital Clínic de Barcelona,CC-BY-NC,70.0,lower extremity,malignant,,,melanoma,histopathology,,,True,,,male
25329,ISIC_0073251,Hospital Clínic de Barcelona,CC-BY-NC,55.0,palms/soles,benign,,,nevus,histopathology,,,True,,,female


**Statistical Information of ISIC 2019 Dataset**










In [None]:
# Percentage of rows per `sex` value; missing values (NaN) are counted as their own category.
metadata_df['sex'].value_counts(normalize=True, dropna=False) * 100



```
male      52.449568
female    46.034503
NaN        1.515929
Name: sex, dtype: float64
```




In [None]:
# Percentage of rows per `anatom_site_general` value; NaN is counted as its own category.
metadata_df['anatom_site_general'].value_counts(normalize=True, dropna=False) * 100



```
anterior torso     27.298567
lower extremity    19.699183
head/neck          18.108247
upper extremity    11.487900
posterior torso    11.002329
NaN                10.386483
palms/soles         1.571197
oral/genital        0.232916
lateral torso       0.213178
Name: anatom_site_general, dtype: float64
```



In [None]:
# Percentage of rows per `age_approx` value; NaN is counted as its own category.
metadata_df['age_approx'].value_counts(normalize=True, dropna=False) * 100



```
45.0    10.204887
50.0     9.825905
40.0     8.866606
55.0     8.566579
70.0     8.369192
65.0     8.191544
60.0     8.037582
75.0     7.090127
35.0     6.517706
80.0     5.759741
85.0     5.207059
30.0     4.733331
25.0     2.672615
NaN      1.902807
20.0     1.531720
15.0     1.480400
10.0     0.560578
5.0      0.446094
0.0      0.035530
Name: age_approx, dtype: float64
```



In [None]:
# Percentage of rows per `benign_malignant` value; NaN is counted as its own category.
metadata_df['benign_malignant'].value_counts(normalize=True, dropna=False) * 100



```
benign                  52.319293
NaN                     29.793534
malignant               17.855592
indeterminate            0.019739
indeterminate/benign     0.011843
Name: benign_malignant, dtype: float64
```



In [None]:
# Percentage of rows per `diagnosis` value (the column `filter_images` uses as
# the class label); NaN would be counted as its own category.
metadata_df['diagnosis'].value_counts(normalize=True, dropna=False) * 100



```
nevus                         50.827050
melanoma                      17.851644
basal cell carcinoma          13.118314
seborrheic keratosis           5.195215
pigmented benign keratosis     4.338557
actinic keratosis              3.422684
squamous cell carcinoma        2.479176
vascular lesion                0.998776
dermatofibroma                 0.943508
solar lentigo                  0.825076
Name: diagnosis, dtype: float64
```



In [None]:
# Percentage of rows per `melanocytic` value; NaN would be counted as its own category.
metadata_df['melanocytic'].value_counts(normalize=True, dropna=False) * 100



```
True     68.678694
False    31.321306
Name: melanocytic, dtype: float64
```



In [None]:
# Percentage of rows per `dermoscopic_type` value; NaN is counted as its own category.
metadata_df['dermoscopic_type'].value_counts(normalize=True, dropna=False) * 100



```
NaN                      94.354743
contact non-polarized     5.645257
Name: dermoscopic_type, dtype: float64
```



In [None]:
# Percentage of rows per `image_type` value; NaN is counted as its own category.
metadata_df['image_type'].value_counts(normalize=True, dropna=False) * 100



```
dermoscopic    50.996802
NaN            49.003198
Name: image_type, dtype: float64
```



# **Dataset handling**



---



>   **Be careful when running these cells: they copy, move, or delete files in the dataset folders.**



---



In [None]:
# make a list from all images filename

# filenames_list = os.listdir(images_lake_path)

In [None]:
# create_partitions(images_lake_path, partitioned_images_path, filenames_list, 250)

In [None]:
# format_dataset(fl_train_dataset_path)

In [None]:
# filter_images(partitioned_images_path, train_dataset_path)

In [38]:
# create_validation_dataset(test_dataset_path, validation_dataset_path , 0.1)

In [None]:
# show_dataset_status()

> **Zip the dataset folder for download on the server**

In [None]:
# Example usage: choose the folder to archive and where the archive is written,
# then uncomment the zip_folder call at the bottom of this cell to create it.
folder_to_zip = '/content/drive/MyDrive/BSc Project/Skin Cancer Datasets/ISIC_2019_Dataset/FL Training Dataset'  # Replace with the path to your folder
output_zipfile = '/content/drive/MyDrive/BSc Project/Skin Cancer Datasets/ISIC_2019_Dataset/FL_Training_Dataset.zip'  # Replace with the desired output path

# The call is left commented out so that running all cells does not build the zip.
# zip_folder(folder_to_zip, output_zipfile)