In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pydicom as dicom
import os, sys, time, shutil, scipy, cv2, json, datetime
import PIL.Image
from tqdm.notebook import tqdm

import warnings
# Installs a blanket "ignore" filter: every warning category is silenced for the
# rest of the session, including any library warnings later cells would otherwise show.
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split


# Dataset 1 (Train and Validation)

In [2]:
# Metadata table of the first dataset; each row carries an image id (`file`) and a
# `target` label. The "Unnamed: 0*" columns in the preview look like row indexes
# written by earlier CSV exports (unverified).
df = pd.read_csv(filepath_or_buffer='../data/train_df.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target,gen_enc,site_enc,diag_enc
0,0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0,1,0,8
1,1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0,0,6,8
2,2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0,0,1,5
3,3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0,0,0,8
4,4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0,0,6,8


In [3]:
# Partition the metadata by label (target 1 -> melanoma, target 0 -> non-melanoma) and,
# per class, separate the label column from the remaining columns so that each class
# can be split on its own in the next cell.
melanoma = df.loc[df['target'] == 1]
non_melanoma = df.loc[df['target'] == 0]

X_mel, y_mel = melanoma.drop(columns='target'), melanoma['target']
X_non_mel, y_non_mel = non_melanoma.drop(columns='target'), non_melanoma['target']

In [4]:
# Hold out 20% of each class separately, using the same seed for both, then glue the
# label column back onto every part.
# NOTE(review): the split is per image row; the table also has a `patient_id` column, so
# images of one patient may land in both train and val — confirm this is acceptable.
X_tr_m, X_val_m, y_tr_m, y_val_m = train_test_split(
    X_mel, y_mel, test_size=0.2, random_state=42
)
X_tr_nm, X_val_nm, y_tr_nm, y_val_nm = train_test_split(
    X_non_mel, y_non_mel, test_size=0.2, random_state=42
)

mel_train = pd.concat([X_tr_m, y_tr_m], axis='columns')
mel_val = pd.concat([X_val_m, y_val_m], axis='columns')
non_mel_train = pd.concat([X_tr_nm, y_tr_nm], axis='columns')
non_mel_val = pd.concat([X_val_nm, y_val_nm], axis='columns')

# Stack melanoma rows first, then non-melanoma rows, and renumber from 0.
train = pd.concat([mel_train, non_mel_train], axis='index').reset_index(drop=True)
val = pd.concat([mel_val, non_mel_val], axis='index').reset_index(drop=True)

In [5]:
# Peek at the combined training table; melanoma rows come first because the two
# classes were concatenated in that order.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,gen_enc,site_enc,diag_enc,target
0,4886,ISIC_1569119,IP_6120178,IL_3157235,male,65.0,torso,melanoma,malignant,1,4,4,1
1,24825,ISIC_7536704,IP_3994607,IL_1685088,male,75.0,torso,melanoma,malignant,1,4,4,1
2,12008,ISIC_3696488,IP_6051587,IL_8878499,female,90.0,upper extremity,melanoma,malignant,0,6,4,1
3,10831,ISIC_3343475,IP_4576785,IL_9262145,male,80.0,head/neck,melanoma,malignant,1,0,4,1
4,25661,ISIC_7785723,IP_8447624,IL_2711936,female,50.0,lower extremity,melanoma,malignant,0,1,4,1


In [6]:
# Keep only the image id and the label. `.copy()` makes `train` an independent frame,
# so the `jpg` column that is added to it later in the notebook is a plain column
# assignment rather than a write to a selection of the wider table (which pandas
# reports as SettingWithCopyWarning — currently hidden by the global warning filter).
train = train[['file', 'target']].copy()
train.head()

Unnamed: 0,file,target
0,ISIC_1569119,1
1,ISIC_7536704,1
2,ISIC_3696488,1
3,ISIC_3343475,1
4,ISIC_7785723,1


In [7]:
# Peek at the combined validation table; melanoma rows come first because the two
# classes were concatenated in that order.
val.head(n=5)

Unnamed: 0.1,Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,gen_enc,site_enc,diag_enc,target
0,22329,ISIC_6767569,IP_7665112,IL_1769376,male,70.0,head/neck,melanoma,malignant,1,0,4,1
1,24088,ISIC_7312977,IP_8675890,IL_2723500,male,55.0,lower extremity,melanoma,malignant,1,1,4,1
2,5627,ISIC_1785627,IP_0170821,IL_6029631,female,40.0,upper extremity,melanoma,malignant,0,6,4,1
3,26613,ISIC_8066110,IP_3055814,IL_7066374,male,50.0,lower extremity,melanoma,malignant,1,1,4,1
4,29231,ISIC_8838753,IP_6292815,IL_9486623,female,65.0,lower extremity,melanoma,malignant,0,1,4,1


In [8]:
# Keep only the image id and the label. `.copy()` makes `val` an independent frame,
# so the `jpg` column that is added to it later in the notebook is a plain column
# assignment rather than a write to a selection of the wider table (which pandas
# reports as SettingWithCopyWarning — currently hidden by the global warning filter).
val = val[['file', 'target']].copy()
val.head()

Unnamed: 0,file,target
0,ISIC_6767569,1
1,ISIC_7312977,1
2,ISIC_1785627,1
3,ISIC_8066110,1
4,ISIC_8838753,1


In [9]:
# Disabled export of the two splits (presumably run once and then commented out — confirm).
# NOTE(review): without `index=False`, `to_csv` also writes the row index, which shows up
# as an "Unnamed: 0" column when the file is read back.
# train.to_csv('../data/train.csv')
# val.to_csv('../data/val.csv')

# Proper Folder Structure

In [10]:
# Root folder under which the per-split / per-class image folders are built by the
# following cells. Relative path, so it resolves against the kernel's working directory.
new_dir = 'images/'
# One-time creation of the root folder, kept disabled:
# os.mkdir(new_dir)

## Training Set

In [11]:
# Target layout for the training images: <new_dir>/train/{mel,not_mel}.
tr_fldr = os.path.join(new_dir, 'train')
tr_mel, tr_nm = (os.path.join(tr_fldr, cls) for cls in ('mel', 'not_mel'))

# One-time directory creation, kept disabled:
# for folder in (tr_fldr, tr_mel, tr_nm):
#     os.mkdir(folder)

In [12]:
# Derive the image file name ("<file>.jpg") for every training row, then separate the
# rows by class.
train['jpg'] = train['file'].map(lambda stem: str(stem) + '.jpg')
mel_train = train.loc[train['target'] == 1]
non_mel_train = train.loc[train['target'] == 0]

In [13]:
# Folder the source JPEGs are copied from, plus the per-class lists of file names.
# NOTE(review): the CSVs are read from '../data/...' while this path has no '../' —
# confirm which working directory the copy cells are meant to run from.
images_dir = 'data/train_jpg'
mel_tr_ids = mel_train['jpg'].tolist()
nm_tr_ids = non_mel_train['jpg'].tolist()

In [14]:
# for img in mel_tr_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(tr_mel, img))

In [15]:
# for img in tqdm(nm_tr_ids):
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(tr_nm, img))

## Validation Set

In [16]:
# Target layout for the validation images: <new_dir>/val/{mel,not_mel}.
val_fldr = os.path.join(new_dir, 'val')
val_mel, val_nm = (os.path.join(val_fldr, cls) for cls in ('mel', 'not_mel'))

# One-time directory creation, kept disabled:
# for folder in (val_fldr, val_mel, val_nm):
#     os.mkdir(folder)

In [17]:
# Derive the image file name for every validation row and collect the names per class.
# The suffix must include the dot ('.jpg', exactly as in the training cell); with a bare
# 'jpg' the names came out as e.g. 'ISIC_6767569jpg' and would not match the
# '<id>.jpg' names used for the training images.
val['jpg'] = val.file.apply(lambda x: str(x) + '.jpg')
mel_val = val[val.target == 1]
non_mel_val = val[val.target == 0]

mel_val_ids = mel_val.jpg.to_list()
nm_val_ids = non_mel_val.jpg.to_list()

In [18]:
# for img in mel_val_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(val_mel, img))    

In [19]:
# for img in nm_val_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(val_nm, img))

# 2019 Dataset

In [20]:
# ISIC 2019 ground truth: discard the listed diagnosis columns, rename what is left to
# the (`file`, `target`) schema used for the first dataset, and store the label as int.
# NOTE(review): the rename is positional and needs exactly two columns to remain after
# the drop; pandas raises a length-mismatch error otherwise.
train2 = pd.read_csv('../data/ISIC_2019_Training_GroundTruth.csv').drop(
    columns=['NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', "SCC", 'UNK']
)
train2.columns = ['file', 'target']
train2['target'] = train2['target'].astype(int)
train2.head(n=5)

Unnamed: 0,file,target
0,ISIC_0000000,0
1,ISIC_0000001,0
2,ISIC_0000002,1
3,ISIC_0000003,0
4,ISIC_0000004,1


In [21]:
# File name of each 2019 image. The dot is part of the suffix ('.jpg', as in the other
# datasets' cells); concatenating a bare 'jpg' produced names such as 'ISIC_0000000jpg'.
train2['jpg'] = train2.file.apply(lambda x: str(x) + '.jpg')


In [22]:
# Source folder of the 2019 JPEGs and the class folders they get sorted into:
# <new_dir>/2019/{mel,not_mel}.
# NOTE(review): the source folder is called 'images/test' although it feeds the 2019
# training table — confirm this is the intended location.
tr19_dir = 'images/test'
tr19_fldr = os.path.join(new_dir, '2019')
tr19_mel, tr19_nm = (os.path.join(tr19_fldr, cls) for cls in ('mel', 'not_mel'))

# One-time directory creation, kept disabled:
# for folder in (tr19_fldr, tr19_mel, tr19_nm):
#     os.mkdir(folder)

In [23]:
# Separate the 2019 rows by class and collect the image file names of each class.
tr19_mel_df = train2.loc[train2['target'] == 1]
tr19_nm_df = train2.loc[train2['target'] == 0]

tr19_mel_ids = tr19_mel_df['jpg'].tolist()
tr19_nm_ids = tr19_nm_df['jpg'].tolist()

In [24]:
# for img in tr19_mel_ids:
#     shutil.copyfile(os.path.join(tr19_dir, img), os.path.join(tr19_mel, img))

In [25]:
# for img in tr19_nm_ids:
#     shutil.copyfile(os.path.join(tr19_dir, img), os.path.join(tr19_nm, img))

# 2018 Dataset

In [26]:
# ISIC 2018 (Task 3) ground truth: discard the listed diagnosis columns, rename what is
# left to the (`file`, `target`) schema, and store the label as int.
# NOTE(review): the rename is positional and needs exactly two columns to remain after
# the drop; pandas raises a length-mismatch error otherwise.
train3 = pd.read_csv('../data/ISIC2018_Task3_Training_GroundTruth.csv').drop(
    columns=['NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC']
)
train3.columns = ['file', 'target']
train3['target'] = train3['target'].astype(int)
train3.head(n=5)

Unnamed: 0,file,target
0,ISIC_0024306,0
1,ISIC_0024307,0
2,ISIC_0024308,0
3,ISIC_0024309,0
4,ISIC_0024310,1


In [27]:
# Row count of the 2018 table.
train3.shape[0]

10015

In [27]:
# File name ("<file>.jpg") of each 2018 image; the cells below read this column.
train3['jpg'] = train3['file'].map(lambda stem: str(stem) + '.jpg')

In [28]:
# The 2018 images are organised under a different root: ../split/2018/{mel,not_mel}.
# NOTE(review): this rebinds `new_dir`; any earlier cell re-run after this one builds
# its paths under '../split/' instead of 'images/'.
new_dir = "../split/"
tr18_fldr = os.path.join(new_dir, '2018')
tr18_mel, tr18_nm = (os.path.join(tr18_fldr, cls) for cls in ('mel', 'not_mel'))

# One-time directory creation, kept disabled:
# for folder in (tr18_fldr, tr18_mel, tr18_nm):
#     os.mkdir(folder)

In [29]:
# The move cells below take the 2018 images from '../split/2018'; the per-class lists
# of file names are built here.
# Depends on the `jpg` column of `train3`: executing this cell before that column has
# been added fails with AttributeError (run the notebook top to bottom).
tr18_dir = "../split/2018"
tr18_mel_df = train3.loc[train3.target == 1]
tr18_nm_df = train3.loc[train3.target == 0]

tr18_mel_ids = tr18_mel_df.jpg.tolist()
tr18_nm_ids = tr18_nm_df.jpg.tolist()

AttributeError: 'DataFrame' object has no attribute 'jpg'

In [30]:
# Number of 2018 melanoma file names; the next cell slices this list into batches.
len(tr18_mel_ids)

1113

In [31]:
# Cut the melanoma file names into batches for the move cells: four consecutive
# slices of 200 names each, and a final slice with everything from position 800 on.
tr1, tr2, tr3, tr4 = (tr18_mel_ids[lo:lo + 200] for lo in range(0, 800, 200))
tr5 = tr18_mel_ids[800:]

In [33]:
# Move batch `tr1` of the 2018 melanoma images from `tr18_dir` into `tr18_mel`.
# Written to be re-runnable: the target folder is created on demand, and a file that
# is already at its destination (source gone) is skipped instead of making
# shutil.move raise FileNotFoundError on a second run. A file missing from both
# places still raises.
os.makedirs(tr18_mel, exist_ok=True)
for img in tqdm(tr1):
    src = os.path.join(tr18_dir, img)
    dst = os.path.join(tr18_mel, img)
    if os.path.exists(dst) and not os.path.exists(src):
        continue  # moved by a previous run
    shutil.move(src, dst)

  0%|          | 0/200 [00:00<?, ?it/s]

In [34]:
# Move batch `tr2` of the 2018 melanoma images from `tr18_dir` into `tr18_mel`.
# Written to be re-runnable: the target folder is created on demand, and a file that
# is already at its destination (source gone) is skipped instead of making
# shutil.move raise FileNotFoundError on a second run. A file missing from both
# places still raises.
os.makedirs(tr18_mel, exist_ok=True)
for img in tqdm(tr2):
    src = os.path.join(tr18_dir, img)
    dst = os.path.join(tr18_mel, img)
    if os.path.exists(dst) and not os.path.exists(src):
        continue  # moved by a previous run
    shutil.move(src, dst)

  0%|          | 0/200 [00:00<?, ?it/s]

In [35]:
# Move batch `tr3` of the 2018 melanoma images from `tr18_dir` into `tr18_mel`.
# Written to be re-runnable: the target folder is created on demand, and a file that
# is already at its destination (source gone) is skipped instead of making
# shutil.move raise FileNotFoundError on a second run. A file missing from both
# places still raises.
os.makedirs(tr18_mel, exist_ok=True)
for img in tqdm(tr3):
    src = os.path.join(tr18_dir, img)
    dst = os.path.join(tr18_mel, img)
    if os.path.exists(dst) and not os.path.exists(src):
        continue  # moved by a previous run
    shutil.move(src, dst)

  0%|          | 0/200 [00:00<?, ?it/s]

In [36]:
# Move batch `tr4` of the 2018 melanoma images from `tr18_dir` into `tr18_mel`.
# Written to be re-runnable: the target folder is created on demand, and a file that
# is already at its destination (source gone) is skipped instead of making
# shutil.move raise FileNotFoundError on a second run. A file missing from both
# places still raises.
os.makedirs(tr18_mel, exist_ok=True)
for img in tqdm(tr4):
    src = os.path.join(tr18_dir, img)
    dst = os.path.join(tr18_mel, img)
    if os.path.exists(dst) and not os.path.exists(src):
        continue  # moved by a previous run
    shutil.move(src, dst)

  0%|          | 0/200 [00:00<?, ?it/s]

In [32]:
# Move batch `tr5` (the remainder) of the 2018 melanoma images from `tr18_dir` into
# `tr18_mel`.
# Written to be re-runnable: the target folder is created on demand, and a file that
# is already at its destination (source gone) is skipped instead of making
# shutil.move raise FileNotFoundError on a second run. A file missing from both
# places still raises.
os.makedirs(tr18_mel, exist_ok=True)
for img in tqdm(tr5):
    src = os.path.join(tr18_dir, img)
    dst = os.path.join(tr18_mel, img)
    if os.path.exists(dst) and not os.path.exists(src):
        continue  # moved by a previous run
    shutil.move(src, dst)

  0%|          | 0/313 [00:00<?, ?it/s]