## Import all of the required libraries

All of the cells in this section need to be run.

In [1]:
# Import library that needed for colab
from google.colab import drive # colab only
from google.colab import files # colab only

import os
import cv2
import zipfile
import pandas as pd
import requests
import random

from shutil import move
from shutil import copy
from shutil import make_archive
from shutil import rmtree

In [2]:
# Mount Google Drive (Colab only)

def google_drive_mount(mounting=False):
  """Mount Google Drive at /content/drive when `mounting` is truthy.

  With the default `mounting=False` the call is a no-op, so the notebook can
  be run without triggering the Drive authorisation prompt.
  """
  if not mounting:
    return
  drive.mount('/content/drive')

## Converting the datasets

The purpose of this section is to convert the dataset into a smaller, more memory-friendly format.
This section has already been run, so there is no need to run its cells again.

The dataset images are resized to a small size and rearranged to make training faster. <br>
Source datasets: https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000

In [None]:
# Mount Google Drive at /content/drive (Colab only)
google_drive_mount(mounting=True)

In [3]:
# Upload kaggle.json (the API token file from your Kaggle account) to the
# Colab session. It is the credential used to download the dataset through
# the Kaggle API.
files.upload() 

# Install the Kaggle CLI and copy the token into ~/.kaggle with owner-only permissions
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json  # set permission

# Source: https://www.kaggle.com/general/74235

Saving kaggle.json to kaggle.json
kaggle.json


In [4]:
# Work inside <cwd>/fix-datasets/skin-cancer-mnist-ham10000, creating it if needed.
# The path is relative to the current working directory (the recorded run used
# /content), not to the mounted Google Drive.
DATASET_SUBDIR = os.path.join('fix-datasets', 'skin-cancer-mnist-ham10000')


def enter_dataset_dir():
  """Create the dataset working directory under the cwd and chdir into it.

  Does nothing when the cwd already is that directory, so re-running the
  cell does not create nested fix-datasets/... folders.
  """
  current = os.getcwd()
  if current.endswith(os.sep + DATASET_SUBDIR):
    return
  target = os.path.join(current, DATASET_SUBDIR)
  os.makedirs(target, exist_ok=True)  # also creates the intermediate fix-datasets folder
  os.chdir(target)


enter_dataset_dir()

In [5]:
# Download the HAM10000 dataset archive into the current working directory
# using the Kaggle CLI (requires the kaggle.json token installed above)
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

Downloading skin-cancer-mnist-ham10000.zip to /content/fix-datasets/skin-cancer-mnist-ham10000
100% 5.20G/5.20G [01:53<00:00, 62.8MB/s]
100% 5.20G/5.20G [01:53<00:00, 49.1MB/s]


In [6]:
# Extract the downloaded archive into the current working directory
local_zip = f'{os.getcwd()}/skin-cancer-mnist-ham10000.zip'

# The context manager closes the archive even if extraction fails part-way
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
  zip_ref.extractall(f'{os.getcwd()}/')

# Remove the large zip now that its contents have been extracted
os.remove(local_zip)

In [7]:
# Resizing and moving function
def resize_img(SOURCE, DEST, SIZE=150, remove_after_resize=False):
  """Resize every .jpg in SOURCE to SIZE x SIZE pixels and move it into DEST.

  Args:
    SOURCE: Directory holding the original images, with or without a
      trailing slash.
    DEST: Directory receiving the resized images; created (including any
      missing parent directories) when it does not exist.
    SIZE: Width and height, in pixels, of the resized images.
    remove_after_resize: When True, delete SOURCE and everything left in it
      once all images have been processed.

  Zero-length files, files without a .jpg extension and files OpenCV cannot
  decode are skipped and stay in SOURCE (unless SOURCE is removed afterwards).
  Progress and the final file count of DEST are printed.
  """
  os.makedirs(DEST, exist_ok=True)

  # Collect the non-empty entries first so the reported count matches the listing
  non_empty = []
  for filename in os.listdir(SOURCE):
    if os.path.getsize(os.path.join(SOURCE, filename)) > 0:
      non_empty.append(filename)
    else:
      print(filename + " is zero length, so ignoring.")

  print(f"{SOURCE}: {len(non_empty)}")

  for filename in non_empty:
    # Match on the extension rather than on any '.jpg' substring in the name
    if not filename.lower().endswith('.jpg'):
      continue

    src_path = os.path.join(SOURCE, filename)
    img = cv2.imread(src_path)
    if img is None:
      # cv2.imread signals an unreadable file by returning None, not by raising
      print(filename + " could not be read as an image, so ignoring.")
      continue

    resized = cv2.resize(img, (SIZE, SIZE))
    # Overwrite in place, then move, so DEST only ever receives resized files
    cv2.imwrite(src_path, resized)
    move(src_path, os.path.join(DEST, filename))

  print(f"Succesfully moving {SOURCE} to {DEST}")

  # Report how many files DEST holds now, to make sure no file was lost
  print(f"After Moving {DEST}: {len(os.listdir(DEST))}\n")

  if remove_after_resize:
    rmtree(SOURCE)

In [8]:
# Resize both image folders into a single HAM10000/ folder
dataset_dir = os.getcwd()
image_parts = ('part_1', 'part_2')

for part in image_parts:
  resize_img(f'{dataset_dir}/HAM10000_images_{part}/', f'{dataset_dir}/HAM10000/', remove_after_resize=True)

# Remove the lowercase-named image folders as well, to keep the directory clean
for part in image_parts:
  rmtree(f'{dataset_dir}/ham10000_images_{part}')

/content/fix-datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1/: 5000
Succesfully moving /content/fix-datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1/ to /content/fix-datasets/skin-cancer-mnist-ham10000/HAM10000/
After Moving /content/fix-datasets/skin-cancer-mnist-ham10000/HAM10000/: 5000

/content/fix-datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_2/: 5015
Succesfully moving /content/fix-datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_2/ to /content/fix-datasets/skin-cancer-mnist-ham10000/HAM10000/
After Moving /content/fix-datasets/skin-cancer-mnist-ham10000/HAM10000/: 10015



In [None]:
# Human-readable names for the `dx` lesion codes
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

# Rows to sample from each non-melanoma class (melanoma rows are all kept).
# The order here is the order in which the samples are appended below.
sample_sizes = {
    'Melanocytic nevi': 250,
    'Dermatofibroma': 115,
    'Vascular lesions': 142,
    'Basal cell carcinoma': 250,
    'Actinic keratoses': 250,
    'Benign keratosis-like lesions': 250,
}

# Read the metadata shipped with the dataset
df = pd.read_csv(f'{os.getcwd()}/HAM10000_metadata.csv')

# Add a readable lesion name next to the `dx` code
df['cell_type'] = df['dx'].map(lesion_type_dict.get)

# Keep every row outside the down-sampled classes and add a fixed-seed sample
# of each down-sampled class. pd.concat is used because DataFrame.append was
# removed in pandas 2.0; the original row index is preserved either way.
kept_rows = df[~df['cell_type'].isin(list(sample_sizes))]
sampled_rows = [
    df[df['cell_type'] == cell_type].sample(n=n_rows, random_state=1)
    for cell_type, n_rows in sample_sizes.items()
]
df = pd.concat([kept_rows, *sampled_rows])

# Only two classes are detected: Melanoma becomes 'Malignant' and every other
# lesion type becomes 'Benign'.
# NOTE(review): this labels basal cell carcinoma and actinic keratoses as
# 'Benign' too - confirm that this is the intended labelling.
binary_label = {
    cell_type: 'Malignant' if cell_type == 'Melanoma' else 'Benign'
    for cell_type in lesion_type_dict.values()
}
df['cell_type'] = df['cell_type'].replace(binary_label)

# Integer-encode the label (categorical codes, not one-hot vectors)
df['cell_type_idx'] = pd.Categorical(df['cell_type']).codes

# Write the new metadata.csv (the row index is written as the first column)
df.to_csv(f'{os.getcwd()}/metadata.csv')

# Move only the images listed in the new metadata from HAM10000/ into data/
df_read = pd.read_csv(f'{os.getcwd()}/metadata.csv')
os.makedirs(f'{os.getcwd()}/data', exist_ok=True)
for image_id in df_read['image_id'].values:
  move(f"{os.getcwd()}/HAM10000/{image_id}.jpg", f"{os.getcwd()}/data/{image_id}.jpg")

# Delete the images that were not selected and the unused pixel-value CSVs
rmtree(f'{os.getcwd()}/HAM10000/')
for unused_csv in ('hmnist_28_28_L.csv', 'hmnist_28_28_RGB.csv',
                   'hmnist_8_8_L.csv', 'hmnist_8_8_RGB.csv'):
  os.remove(f'{os.getcwd()}/{unused_csv}')

In [None]:
# Show how many rows ended up in each of the two classes
df['cell_type'].value_counts()

Benign       1257
Malignant    1113
Name: cell_type, dtype: int64

In [None]:
# Build the zip from inside /content/fix-datasets so that the archive is
# written next to the dataset folder it packs
archive_root = '/content/fix-datasets'
os.chdir(archive_root)
make_archive('another_ham10000_2Class', 'zip', f'{archive_root}/skin-cancer-mnist-ham10000')

'/content/fix-datasets/another_ham10000_2Class.zip'

In [None]:
# Copy the archive to Google Drive (Drive must be mounted at /content/drive).
# NOTE(review): if the fix-datasets-new folder does not exist yet, shutil.copy
# treats that path as the destination file name - confirm it is created first.
copy('/content/fix-datasets/another_ham10000_2Class.zip', '/content/drive/MyDrive/fix-datasets-new')

'/content/drive/MyDrive/fix-datasets-new/another_ham10000_2Class.zip'

We successfully reduced the dataset from 5 GB to only 60 MB.
After that, we hosted the zip file on Google Drive. <br> Here is the link: https://drive.google.com/file/d/1-dOK_6g-Bkf8_SKcZKUwiCIj8TSqC7O5/view?usp=sharing