In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pydicom as dicom
import os, sys, time, shutil, scipy, cv2, json, datetime
import PIL.Image
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split

%reload_ext autoreload
%autoreload 2
from utils import *


# Dataset 1 (Train and Validation)

In [2]:
# Load the Dataset 1 metadata table; the first CSV column becomes the index.
df = pd.read_csv('../data/train20.csv', index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33126 entries, 0 to 33125
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file        33126 non-null  object 
 1   patient_id  33126 non-null  object 
 2   lesion_id   33126 non-null  object 
 3   gender      33061 non-null  object 
 4   age         33058 non-null  float64
 5   site        32599 non-null  object 
 6   diagnosis   33126 non-null  object 
 7   ben_mal     33126 non-null  object 
 8   target      33126 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 2.5+ MB


In [4]:
# Partition the metadata by label so that each class can be split on its own.
melanoma = df[df['target'] == 1]
non_melanoma = df[df['target'] == 0]

# Features (every column except the label) and labels, per class.
X_mel, y_mel = melanoma.drop(columns='target'), melanoma['target']
X_non_mel, y_non_mel = non_melanoma.drop(columns='target'), non_melanoma['target']

In [5]:
# Hold out 20% of each class separately (fixed seed), so both classes keep
# the same train/validation proportion.
# NOTE(review): the split ignores patient_id, so images of one patient may end
# up on both sides - confirm whether that is acceptable for evaluation.
X_tr_m, X_val_m, y_tr_m, y_val_m = train_test_split(
    X_mel, y_mel, test_size=0.2, random_state=42
)
X_tr_nm, X_val_nm, y_tr_nm, y_val_nm = train_test_split(
    X_non_mel, y_non_mel, test_size=0.2, random_state=42
)

# Re-attach the label column to each feature table.
mel_train = pd.concat([X_tr_m, y_tr_m], axis='columns')
non_mel_train = pd.concat([X_tr_nm, y_tr_nm], axis='columns')
mel_val = pd.concat([X_val_m, y_val_m], axis='columns')
non_mel_val = pd.concat([X_val_nm, y_val_nm], axis='columns')

# Stack melanoma rows first, then non-melanoma, and renumber from zero.
train = pd.concat([mel_train, non_mel_train]).reset_index(drop=True)
validation = pd.concat([mel_val, non_mel_val]).reset_index(drop=True)

In [6]:
train.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
0,ISIC_1569119,IP_6120178,IL_3157235,male,65.0,torso,melanoma,malignant,1
1,ISIC_7536704,IP_3994607,IL_1685088,male,75.0,torso,melanoma,malignant,1
2,ISIC_3696488,IP_6051587,IL_8878499,female,90.0,upper extremity,melanoma,malignant,1
3,ISIC_3343475,IP_4576785,IL_9262145,male,80.0,head/neck,melanoma,malignant,1
4,ISIC_7785723,IP_8447624,IL_2711936,female,50.0,lower extremity,melanoma,malignant,1


In [7]:
train.tail()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
26495,ISIC_9166979,IP_7279968,IL_5062492,male,45.0,torso,unknown,benign,0
26496,ISIC_1738297,IP_9965542,IL_7933843,male,60.0,head/neck,unknown,benign,0
26497,ISIC_0365032,IP_3908059,IL_2756378,female,60.0,upper extremity,unknown,benign,0
26498,ISIC_4919947,IP_8663649,IL_9363786,male,45.0,lower extremity,unknown,benign,0
26499,ISIC_7309302,IP_1590260,IL_5077089,male,45.0,torso,unknown,benign,0


In [8]:
# Reduce the training table to the image id and its label.
id_and_label = ['file', 'target']
tr = train[id_and_label]
tr.head()

Unnamed: 0,file,target
0,ISIC_1569119,1
1,ISIC_7536704,1
2,ISIC_3696488,1
3,ISIC_3343475,1
4,ISIC_7785723,1


In [9]:
tr.target.value_counts()

0    26033
1      467
Name: target, dtype: int64

In [10]:
validation.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
0,ISIC_6767569,IP_7665112,IL_1769376,male,70.0,head/neck,melanoma,malignant,1
1,ISIC_7312977,IP_8675890,IL_2723500,male,55.0,lower extremity,melanoma,malignant,1
2,ISIC_1785627,IP_0170821,IL_6029631,female,40.0,upper extremity,melanoma,malignant,1
3,ISIC_8066110,IP_3055814,IL_7066374,male,50.0,lower extremity,melanoma,malignant,1
4,ISIC_8838753,IP_6292815,IL_9486623,female,65.0,lower extremity,melanoma,malignant,1


In [11]:
# Reduce the validation table to the image id and its label.
# An explicit copy is taken because a later cell adds a `jpg` column to `val`;
# assigning onto a bare column slice triggers pandas' SettingWithCopyWarning
# (hidden here by the global warnings filter).
val = validation[['file', 'target']].copy()
val.head()

Unnamed: 0,file,target
0,ISIC_6767569,1
1,ISIC_7312977,1
2,ISIC_1785627,1
3,ISIC_8066110,1
4,ISIC_8838753,1


In [12]:
val.target.value_counts()

0    6509
1     117
Name: target, dtype: int64

In [13]:
len(train)

26500

In [14]:
len(val)

6626

In [15]:
# tr.to_csv('../data/train.csv')
# val.to_csv('../data/val.csv')

# Proper Folder Structure

In [19]:
# Root folder under which the per-split, per-class image folders are placed.
new_dir = 'images/'
# Folder creation is disabled; presumably it already exists on disk - TODO confirm.
# os.mkdir(new_dir)

## Training Set

In [20]:
# Training split folder and its two class sub-folders (melanoma / not melanoma).
tr_fldr = os.path.join(new_dir, 'train')
tr_mel = os.path.join(tr_fldr, 'mel')
tr_nm = os.path.join(tr_fldr, 'not_mel')

# Folder creation is disabled; presumably the folders already exist - TODO confirm.
# os.mkdir(tr_fldr)
# os.mkdir(tr_mel)
# os.mkdir(tr_nm)

In [21]:
# Derive each image's filename from its id, then split the table by label.
train['jpg'] = train['file'].map(lambda stem: f'{stem}.jpg')
mel_train = train[train['target'] == 1]
non_mel_train = train[train['target'] == 0]

In [22]:
# Source folder of the Dataset 1 JPEGs.
# NOTE(review): this path has no '../' prefix, unlike the CSV paths - confirm.
images_dir = 'data/train_jpg'

# Filenames per class, as plain Python lists.
mel_tr_ids = mel_train['jpg'].tolist()
nm_tr_ids = non_mel_train['jpg'].tolist()

In [23]:
# for img in tqdm (mel_tr_ids):
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(tr_mel, img))

In [24]:
# for img in tqdm(nm_tr_ids):
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(tr_nm, img))

## Validation Set

In [25]:
# Validation split folder and its two class sub-folders (melanoma / not melanoma).
val_fldr = os.path.join(new_dir, 'val')
val_mel = os.path.join(val_fldr, 'mel')
val_nm = os.path.join(val_fldr, 'not_mel')

# Folder creation is disabled; presumably the folders already exist - TODO confirm.
# os.mkdir(val_fldr)
# os.mkdir(val_mel)
# os.mkdir(val_nm)

In [26]:
# Derive each validation image's filename from its id.
# Fix: the suffix must be '.jpg' (with the dot); the previous 'jpg' produced
# names such as 'ISIC_6767569jpg' that match no file on disk.
# `assign` returns a new frame, so the cell is re-runnable and does not write
# onto a slice of `validation`.
val = val.assign(jpg=val['file'].astype(str) + '.jpg')
mel_val = val[val['target'] == 1]
non_mel_val = val[val['target'] == 0]

# Filenames per class, as plain Python lists.
mel_val_ids = mel_val['jpg'].to_list()
nm_val_ids = non_mel_val['jpg'].to_list()

In [27]:
# for img in mel_val_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(val_mel, img))    

In [28]:
# for img in nm_val_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(val_nm, img))

# New Train-Validation Split

In [29]:
# Load the metadata table for the new split; the first CSV column becomes the index.
df2 = pd.read_csv('../data/tr_tot.csv', index_col=0)
# Preview the first rows.
df2.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0


In [30]:
df2.target.value_counts()

0    32542
1     5106
Name: target, dtype: int64

In [31]:
df2.file.value_counts()

ISIC_2138559    1
ISIC_6322207    1
ISIC_8248353    1
ISIC_0068303    1
ISIC_2664946    1
               ..
ISIC_6673956    1
ISIC_5674857    1
ISIC_3850432    1
ISIC_3530337    1
ISIC_4452219    1
Name: file, Length: 37648, dtype: int64

In [32]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37648 entries, 0 to 37647
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file        37648 non-null  object 
 1   patient_id  33126 non-null  object 
 2   lesion_id   37311 non-null  object 
 3   gender      37502 non-null  object 
 4   age         37495 non-null  float64
 5   site        36994 non-null  object 
 6   diagnosis   33126 non-null  object 
 7   ben_mal     33126 non-null  object 
 8   target      37648 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 2.9+ MB


In [33]:
# Partition the metadata by label so that each class can be split on its own.
melanoma2 = df2[df2['target'] == 1]
non_melanoma2 = df2[df2['target'] == 0]

# Features (every column except the label) and labels, per class.
X_mel2, y_mel2 = melanoma2.drop(columns='target'), melanoma2['target']
X_non_mel2, y_non_mel2 = non_melanoma2.drop(columns='target'), non_melanoma2['target']

In [34]:
# Hold out 20% of each class separately (fixed seed), so both classes keep
# the same train/validation proportion.
X_tr_m2, X_val_m2, y_tr_m2, y_val_m2 = train_test_split(
    X_mel2, y_mel2, test_size=0.2, random_state=42
)
X_tr_nm2, X_val_nm2, y_tr_nm2, y_val_nm2 = train_test_split(
    X_non_mel2, y_non_mel2, test_size=0.2, random_state=42
)

# Re-attach the label column to each feature table.
mel_train2 = pd.concat([X_tr_m2, y_tr_m2], axis='columns')
non_mel_train2 = pd.concat([X_tr_nm2, y_tr_nm2], axis='columns')
mel_val2 = pd.concat([X_val_m2, y_val_m2], axis='columns')
non_mel_val2 = pd.concat([X_val_nm2, y_val_nm2], axis='columns')

# Stack melanoma rows first, then non-melanoma, and renumber from zero.
train2 = pd.concat([mel_train2, non_mel_train2]).reset_index(drop=True)
validation2 = pd.concat([mel_val2, non_mel_val2]).reset_index(drop=True)

In [35]:
# Reduce the training table to the image id and its label.
# An explicit copy is taken because a later cell adds a `jpg` column to `tr2`;
# assigning onto a bare column slice triggers pandas' SettingWithCopyWarning
# (hidden here by the global warnings filter).
tr2 = train2[['file', 'target']].copy()
tr2.head()

Unnamed: 0,file,target
0,ISIC_0065823,1
1,ISIC_0013908_downsampled,1
2,ISIC_0054089,1
3,ISIC_0061371,1
4,ISIC_8702237,1


In [36]:
len(tr2)

30117

In [37]:
tr2.target.value_counts()

0    26033
1     4084
Name: target, dtype: int64

In [38]:
# Reduce the validation table to the image id and its label.
# An explicit copy is taken because a later cell adds a `jpg` column to `val2`;
# assigning onto a bare column slice triggers pandas' SettingWithCopyWarning
# (hidden here by the global warnings filter).
val2 = validation2[['file', 'target']].copy()
val2.head()

Unnamed: 0,file,target
0,ISIC_0066225,1
1,ISIC_0000013,1
2,ISIC_9367522,1
3,ISIC_3969411,1
4,ISIC_0026531,1


In [39]:
len(val2)

7531

In [40]:
val2.target.value_counts()

0    6509
1    1022
Name: target, dtype: int64

In [41]:
# tr2.to_csv("../data/train2.csv")
# val2.to_csv("../data/val2.csv")

## Training Dataset

In [42]:
# Derive each training image's filename from its id.
# `assign` returns a new frame instead of writing a column onto a slice of
# `train2`, which keeps the cell re-runnable and avoids chained assignment.
tr2 = tr2.assign(jpg=tr2['file'].astype(str) + '.jpg')
mel_tr = tr2[tr2['target'] == 1]
nm_tr = tr2[tr2['target'] == 0]

# Filenames per class, as plain Python lists.
mel_tr_ids = mel_tr['jpg'].to_list()
nm_tr_ids = nm_tr['jpg'].to_list()

In [46]:
# Folder the images are moved out of.
images_dir = '../jpegs'
# Destination class folders for the training split.
tr_mel = '../split/train/mel'
tr_nm = '../split/train/not_mel'


In [47]:
# Move every melanoma training image out of the shared image folder into its
# class folder. One loop replaces the four hand-sliced chunks (mel1..mel4),
# which all ran this same statement.
os.makedirs(tr_mel, exist_ok=True)  # the destination must exist before moving into it
for img in tqdm(mel_tr_ids, desc='train/mel'):
    src = os.path.join(images_dir, img)
    dst = os.path.join(tr_mel, img)
    # shutil.move removes the source, so a second run would raise
    # FileNotFoundError; skip files that a previous run already moved.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

In [52]:
# Move every non-melanoma training image out of the shared image folder into
# its class folder. One loop replaces the thirteen hand-sliced chunks
# (nm1..nm13), which all ran this same statement.
os.makedirs(tr_nm, exist_ok=True)  # the destination must exist before moving into it
for img in tqdm(nm_tr_ids, desc='train/not_mel'):
    src = os.path.join(images_dir, img)
    dst = os.path.join(tr_nm, img)
    # shutil.move removes the source, so a second run would raise
    # FileNotFoundError; skip files that a previous run already moved.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

## Sample Training Dataset

In [72]:
tr2.head()

Unnamed: 0,file,target,jpg
0,ISIC_0065823,1,ISIC_0065823.jpg
1,ISIC_0013908_downsampled,1,ISIC_0013908_downsampled.jpg
2,ISIC_0054089,1,ISIC_0054089.jpg
3,ISIC_0061371,1,ISIC_0061371.jpg
4,ISIC_8702237,1,ISIC_8702237.jpg


In [77]:
tr2.target.value_counts()

0    26033
1     4084
Name: target, dtype: int64

In [76]:
# Draw a sample of the training table stratified on `target`, with a fixed seed.
# `stratified_sample` is not defined in this notebook; presumably it comes from
# the `utils` star-import - TODO confirm, and confirm that size=0.2 means 20%.
strat_tr = stratified_sample(df = tr2, strata = ['target'], size = 0.2, seed = 42)
# Preview the first rows of the sample.
strat_tr.head()

Unnamed: 0,index,file,target,jpg
0,22645,ISIC_5247383,0,ISIC_5247383.jpg
1,19797,ISIC_4422476,0,ISIC_4422476.jpg
2,18060,ISIC_8124459,0,ISIC_8124459.jpg
3,29506,ISIC_1669883,0,ISIC_1669883.jpg
4,17825,ISIC_5842418,0,ISIC_5842418.jpg


In [78]:
strat_tr.target.value_counts()

0    5206
1     817
Name: target, dtype: int64

In [90]:
# Split the sampled training rows by label.
samp_mel_tr = strat_tr[strat_tr['target'] == 1]
samp_nm_tr = strat_tr[strat_tr['target'] == 0]

# Filenames per class, as plain Python lists.
samp_mel_tr_ids = samp_mel_tr['jpg'].tolist()
samp_nm_tr_ids = samp_nm_tr['jpg'].tolist()

In [91]:
# Source (full training split) and destination (sampled subset) folders, per class.
tr_mel_ori = '../split/train/mel'
tr_mel_dest = '../sample/train/mel'
tr_nm_ori = '../split/train/not_mel'
tr_nm_dest = '../sample/train/not_mel'

In [92]:
# Copy the sampled melanoma images into the sample folder; the originals stay put.
for fname in tqdm(samp_mel_tr_ids):
    source_path = os.path.join(tr_mel_ori, fname)
    target_path = os.path.join(tr_mel_dest, fname)
    shutil.copyfile(source_path, target_path)

  0%|          | 0/817 [00:00<?, ?it/s]

In [93]:
# Copy the sampled non-melanoma images into the sample folder; the originals
# stay put. One loop replaces the three hand-sliced chunks (sam_tr_1..sam_tr_3),
# which all ran this same statement.
os.makedirs(tr_nm_dest, exist_ok=True)  # copyfile does not create missing folders
for img in tqdm(samp_nm_tr_ids, desc='sample/train/not_mel'):
    shutil.copyfile(os.path.join(tr_nm_ori, img), os.path.join(tr_nm_dest, img))

## Validation Dataset

In [53]:
# Derive each validation image's filename from its id.
# `assign` returns a new frame instead of writing a column onto a slice of
# `validation2`, which keeps the cell re-runnable and avoids chained assignment.
val2 = val2.assign(jpg=val2['file'].astype(str) + '.jpg')
mel_val = val2[val2['target'] == 1]
nm_val = val2[val2['target'] == 0]

# Filenames per class, as plain Python lists.
mel_val_ids = mel_val['jpg'].to_list()
nm_val_ids = nm_val['jpg'].to_list()

In [54]:
# Folder the images are moved out of.
images_dir = '../jpegs'
# Destination class folders for the validation split.
val_mel = '../split/val/mel'
val_nm = '../split/val/not_mel'

In [55]:
# Move every melanoma validation image out of the shared image folder into its
# class folder.
os.makedirs(val_mel, exist_ok=True)  # the destination must exist before moving into it
for img in tqdm(mel_val_ids, desc='val/mel'):
    src = os.path.join(images_dir, img)
    dst = os.path.join(val_mel, img)
    # shutil.move removes the source, so a second run would raise
    # FileNotFoundError; skip files that a previous run already moved.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

In [60]:
# Move every non-melanoma validation image out of the shared image folder into
# its class folder. One loop replaces the three hand-sliced chunks, whose names
# were out of order (val_nm3 held the middle slice and val_nm2 the last one).
os.makedirs(val_nm, exist_ok=True)  # the destination must exist before moving into it
for img in tqdm(nm_val_ids, desc='val/not_mel'):
    src = os.path.join(images_dir, img)
    dst = os.path.join(val_nm, img)
    # shutil.move removes the source, so a second run would raise
    # FileNotFoundError; skip files that a previous run already moved.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

## Sample Validation Set

In [79]:
val2.head()

Unnamed: 0,file,target,jpg
0,ISIC_0066225,1,ISIC_0066225.jpg
1,ISIC_0000013,1,ISIC_0000013.jpg
2,ISIC_9367522,1,ISIC_9367522.jpg
3,ISIC_3969411,1,ISIC_3969411.jpg
4,ISIC_0026531,1,ISIC_0026531.jpg


In [80]:
val2.target.value_counts()

0    6509
1    1022
Name: target, dtype: int64

In [81]:
# Draw a sample of the validation table stratified on `target`, with a fixed seed.
# `stratified_sample` is not defined in this notebook; presumably it comes from
# the `utils` star-import - TODO confirm, and confirm that size=0.2 means 20%.
strat_val = stratified_sample(df = val2, strata = ['target'], size = 0.2, seed = 42)

In [83]:
strat_val.target.value_counts()

0    1302
1     204
Name: target, dtype: int64

In [None]:
# Source (full validation split) and destination (sampled subset) folders, per class.
val_mel_ori = '../split/val/mel'
val_mel_dest = '../sample/val/mel'
val_nm_ori = '../split/val/not_mel'
val_nm_dest = '../sample/val/not_mel'

In [None]:
# Split the sampled validation rows by label.
samp_mel_val = strat_val[strat_val['target'] == 1]
samp_nm_val = strat_val[strat_val['target'] == 0]

# Filenames per class, as plain Python lists.
samp_mel_val_ids = samp_mel_val['jpg'].tolist()
samp_nm_val_ids = samp_nm_val['jpg'].tolist()

In [None]:
# Copy the sampled melanoma validation images into the sample folder; the
# originals stay put.
os.makedirs(val_mel_dest, exist_ok=True)  # copyfile does not create missing folders
for img in tqdm(samp_mel_val_ids, desc='sample/val/mel'):
    shutil.copyfile(os.path.join(val_mel_ori, img), os.path.join(val_mel_dest, img))

In [None]:
# Copy the sampled non-melanoma validation images into the sample folder; the
# originals stay put.
os.makedirs(val_nm_dest, exist_ok=True)  # copyfile does not create missing folders
for img in tqdm(samp_nm_val_ids, desc='sample/val/not_mel'):
    shutil.copyfile(os.path.join(val_nm_ori, img), os.path.join(val_nm_dest, img))

# Testing Set

In [2]:
# Load the test-set metadata table; the first CSV column becomes the index.
test2 = pd.read_csv('../data/test2.csv', index_col = 0)
# Preview the first rows.
test2.head()

Unnamed: 0,file,age,gender,target
0,ISIC_0000000,55,female,0
1,ISIC_0000001,30,female,0
2,ISIC_0000002,60,female,1
3,ISIC_0000003,30,male,0
4,ISIC_0000004,80,male,1


In [3]:
# Derive each test image's filename from its id, then split the table by label.
# (A former rename of an `image` column to `file` is no longer applied.)
test2['jpg'] = test2['file'].map(lambda stem: f'{stem}.jpg')
test_nm = test2[test2['target'] == 0]
test_mel = test2[test2['target'] == 1]

In [4]:
# Test filenames per class, as plain Python lists.
ttnm, ttmel = test_nm['jpg'].tolist(), test_mel['jpg'].tolist()

In [5]:
# Folder that currently holds all test images, and the two class sub-folders
# they are sorted into.
# NOTE(review): the negative class folder is 'non_mel' here but 'not_mel' in the
# train/val splits - confirm that downstream loaders expect this.
direc = '../split/test2/'
direc2 = '../split/test2/mel/'
direc3 = '../split/test2/non_mel'

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError
# once the folders had been created.
os.makedirs(direc2, exist_ok=True)
os.makedirs(direc3, exist_ok=True)

In [7]:
# Move every non-melanoma test image from the flat test folder into its class folder.
for img in tqdm(ttnm, desc='test2/non_mel'):
    src = os.path.join(direc, img)
    dst = os.path.join(direc3, img)
    # shutil.move removes the source, so a second run would raise
    # FileNotFoundError; skip files that a previous run already moved.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

  0%|          | 0/1626 [00:00<?, ?it/s]

In [6]:
# Move every melanoma test image from the flat test folder into its class folder.
for img in tqdm(ttmel, desc='test2/mel'):
    src = os.path.join(direc, img)
    dst = os.path.join(direc2, img)
    # shutil.move removes the source, so a second run would raise
    # FileNotFoundError; skip files that a previous run already moved.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

  0%|          | 0/374 [00:00<?, ?it/s]