In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Activation, GlobalAveragePooling2D, Conv2D, MaxPool2D, Flatten
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.applications import MobileNetV3Small
from tensorflow.keras.applications import imagenet_utils
from sklearn.metrics import pair_confusion_matrix
from sklearn.metrics import confusion_matrix
import itertools
import os
import shutil
import random
from matplotlib import pyplot as plt
%matplotlib inline

In [6]:
# Downloaded dataset from:
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T

In [3]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
# Path to destination directory where we want subfolders
dest_dir = os.getcwd() + "/gdrive/My Drive/Colab Notebooks/Dissertation/Reorganised_binary/"

In [8]:
# Dump all images into a folder and specify the path:
data_dir = os.getcwd() + "/gdrive/My Drive/Colab Notebooks/Dissertation/Dataset HAM10000/HAM10000_images/"

In [None]:
len(os.listdir(data_dir))

10015

In [9]:
# Read the csv file containing image names and corresponding labels
skin_df = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Dissertation/HAM10000_metadata.csv')

In [10]:
# extracting image number in new column
skin_df ['image_n'] = skin_df['image_id'].str.split('_', 1, expand=True)[1].astype(int)
skin_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,image_n
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,27419
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,25030
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,26769
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,25661
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,31633
...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern,33084
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern,33550
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern,33536
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern,32854


In [11]:
label=skin_df['dx'].unique().tolist()  #Extract labels into a list
label

['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec']

In [12]:
my_array = np.asarray(skin_df['dx'])

In [13]:
my_array[my_array != 'mel'] = 'non_mel'

In [14]:
skin_df['label_binary'] = my_array.tolist()

In [15]:
skin_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,image_n,label_binary
0,HAM_0000118,ISIC_0027419,non_mel,histo,80.0,male,scalp,vidir_modern,27419,non_mel
1,HAM_0000118,ISIC_0025030,non_mel,histo,80.0,male,scalp,vidir_modern,25030,non_mel
2,HAM_0002730,ISIC_0026769,non_mel,histo,80.0,male,scalp,vidir_modern,26769,non_mel
3,HAM_0002730,ISIC_0025661,non_mel,histo,80.0,male,scalp,vidir_modern,25661,non_mel
4,HAM_0001466,ISIC_0031633,non_mel,histo,75.0,male,ear,vidir_modern,31633,non_mel
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,non_mel,histo,40.0,male,abdomen,vidir_modern,33084,non_mel
10011,HAM_0002867,ISIC_0033550,non_mel,histo,40.0,male,abdomen,vidir_modern,33550,non_mel
10012,HAM_0002867,ISIC_0033536,non_mel,histo,40.0,male,abdomen,vidir_modern,33536,non_mel
10013,HAM_0000239,ISIC_0032854,non_mel,histo,80.0,male,face,vidir_modern,32854,non_mel


In [16]:
print(skin_df['label_binary'].value_counts())

non_mel    8902
mel        1113
Name: label_binary, dtype: int64


In [17]:
label=skin_df['label_binary'].unique().tolist()  #Extract labels into a list
label

['non_mel', 'mel']

In [18]:
# now I need to filter the df so that I take 1113 random non_mel images:
skin_df_balance = skin_df[skin_df['label_binary'] == 'non_mel']
skin_df_mel = skin_df[skin_df['label_binary'] == 'mel']

In [19]:
skin_df_balance = skin_df_balance.sample(n=1113)

In [20]:
skin_df_final = pd.concat([skin_df_balance, skin_df_mel], axis=0)

In [22]:
print(skin_df_final['label_binary'].value_counts())

non_mel    1113
mel        1113
Name: label_binary, dtype: int64


In [21]:
label_images = []
sample = skin_df_final[skin_df_final['label_binary'] == 'mel']['image_id']
#sample   
label_images.extend(sample)

In [None]:
label_images = []
# Copy images to new folders
for i in label:
    os.mkdir(dest_dir + str(i) + "/")
    sample = skin_df_final[skin_df_final['label_binary'] == i]['image_id']
    label_images.extend(sample)
    for id in label_images:
        shutil.copyfile((data_dir + "/"+ id +".jpg"), (dest_dir + i + "/"+id+".jpg"))
    label_images=[]    

In [7]:
dest_dir

'/content/gdrive/My Drive/Colab Notebooks/Dissertation/Reorganised_binary/'

In [None]:
# organise data into train, valid,test directories: 1778 training, 224 validation, 224 test
os.chdir(dest_dir)   #'/gdrive/My Drive/Colab Notebooks/Dissertation/Reorganised_binary/')
if os.path.isdir('train/0/') is False:
  os.mkdir('train')
  os.mkdir('valid')
  os.mkdir('test')

  for i in label:
    shutil.move (f'{i}', 'train')
    os.mkdir(f'valid/{i}')
    os.mkdir(f'test/{i}')

    valid_samples = random.sample(os.listdir(f'train/{i}'), 112)
    for j in valid_samples:
      shutil.move(f'train/{i}/{j}', f'valid/{i}')
    
    test_samples = random.sample(os.listdir(f'train/{i}'), 112)
    for k in test_samples:
      shutil.move(f'train/{i}/{k}', f'test/{i}')

In [None]:
print ("train mel: " + str(len(os.listdir(dest_dir + 'train/mel'))))
print ("train non mel: " + str(len(os.listdir(dest_dir + 'train/non_mel'))))
print ("valid mel: " + str(len(os.listdir(dest_dir + 'valid/mel'))))
print ("valid non mel: " + str(len(os.listdir(dest_dir + 'valid/non_mel'))))
print ("test mel: " + str(len(os.listdir(dest_dir + 'test/mel'))))
print ("test non mel: " + str(len(os.listdir(dest_dir + 'test/non_mel'))))

train mel: 889
train non mel: 889
valid mel: 112
valid non mel: 112
test mel: 112
test non mel: 112
