# Importing Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pydicom as dicom
import os, sys, time, shutil, scipy, cv2, json, datetime
import PIL.Image
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split

%reload_ext autoreload
%autoreload 2
from utils import *


# Dataset 1 (80-10-10 Split)

In [2]:
# Load the dataset-1 metadata table; the first CSV column is used as the index.
df = pd.read_csv('../data/train20.csv', index_col=0)
df.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0


In [3]:
# Column dtypes and non-null counts of the dataset-1 metadata.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33126 entries, 0 to 33125
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file        33126 non-null  object 
 1   patient_id  33126 non-null  object 
 2   lesion_id   33126 non-null  object 
 3   gender      33061 non-null  object 
 4   age         33058 non-null  float64
 5   site        32599 non-null  object 
 6   diagnosis   33126 non-null  object 
 7   ben_mal     33126 non-null  object 
 8   target      33126 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 2.5+ MB


In [4]:
# Partition the metadata by class label (target == 1 vs target == 0), then
# separate features from the label within each class so that each class can
# be split on its own in the following cells.
melanoma = df[df['target'] == 1]
non_melanoma = df[df['target'] == 0]

X_mel, y_mel = melanoma.drop(columns=['target']), melanoma['target']
X_non_mel, y_non_mel = non_melanoma.drop(columns=['target']), non_melanoma['target']

In [5]:
# 80/20 split applied to each class separately (same seed for both), so both
# parts keep the class ratio of the full table.
X_tr_m, X_val_m, y_tr_m, y_val_m = train_test_split(
    X_mel, y_mel, test_size=0.2, random_state=42
)
X_tr_nm, X_val_nm, y_tr_nm, y_val_nm = train_test_split(
    X_non_mel, y_non_mel, test_size=0.2, random_state=42
)

# Re-attach the label column to each class, then stack melanoma rows on top of
# non-melanoma rows with a fresh 0..n-1 index.
mel_train = pd.concat([X_tr_m, y_tr_m], axis='columns')
non_mel_train = pd.concat([X_tr_nm, y_tr_nm], axis='columns')
train = pd.concat([mel_train, non_mel_train], axis='index', ignore_index=True)

In [47]:
# Halve the 20% hold-out of each class into a validation part and a test part
# (same seed for both classes).
X_val_m2, X_tt_m, y_val_m2, y_tt_m = train_test_split(
    X_val_m, y_val_m, test_size=0.5, random_state=42
)
X_val_nm2, X_tt_nm, y_val_nm2, y_tt_nm = train_test_split(
    X_val_nm, y_val_nm, test_size=0.5, random_state=42
)

# Re-attach the label column to every piece.
mel_val = pd.concat([X_val_m2, y_val_m2], axis='columns')
non_mel_val = pd.concat([X_val_nm2, y_val_nm2], axis='columns')
mel_tt = pd.concat([X_tt_m, y_tt_m], axis='columns')
non_mel_tt = pd.concat([X_tt_nm, y_tt_nm], axis='columns')

# Stack melanoma rows above non-melanoma rows and renumber from 0.
validation = pd.concat([mel_val, non_mel_val], axis='index', ignore_index=True)

test = pd.concat([mel_tt, non_mel_tt], axis='index', ignore_index=True)

In [14]:
# Keep only the image id and the label for the training table that is written to CSV.
tr = train[['file', 'target']]
tr.head()

Unnamed: 0,file,target
0,ISIC_1569119,1
1,ISIC_7536704,1
2,ISIC_3696488,1
3,ISIC_3343475,1
4,ISIC_7785723,1


In [15]:
# Rows per class in the dataset-1 training table.
tr.target.value_counts()

0    26033
1      467
Name: target, dtype: int64

In [17]:
# Keep only the image id and the label. The explicit .copy() makes `val` an
# independent frame: a later cell adds a 'jpg' column to it, which would
# otherwise be a write into a slice of `validation`.
val = validation[['file', 'target']].copy()
val.head()

Unnamed: 0,file,target
0,ISIC_6124481,1
1,ISIC_1255336,1
2,ISIC_2408815,1
3,ISIC_5343231,1
4,ISIC_8306697,1


In [18]:
# Rows per class in the dataset-1 validation table.
val.target.value_counts()

0    3254
1      58
Name: target, dtype: int64

In [21]:
# Keep only the image id and the label. The explicit .copy() makes `tt` an
# independent frame: a later cell adds a 'jpg' column to it, which would
# otherwise be a write into a slice of `test`.
tt = test[['file', 'target']].copy()
tt.head()

Unnamed: 0,file,target
0,ISIC_7346228,1
1,ISIC_8838753,1
2,ISIC_3084312,1
3,ISIC_3068885,1
4,ISIC_9164054,1


In [22]:
# Rows per class in the dataset-1 test table.
tt.target.value_counts()

0    3255
1      59
Name: target, dtype: int64

In [25]:
# Number of rows in the dataset-1 training table.
len(tr)

26500

In [26]:
# Number of rows in the dataset-1 validation table.
len(val)

3312

In [27]:
# Number of rows in the dataset-1 test table.
len(tt)

3314

In [74]:
# Persist the three dataset-1 tables (file id + target) as CSV files under ../data.
# to_csv is called with its defaults, so the row index is written as the first column.
tr.to_csv('../data/train.csv')
val.to_csv('../data/val.csv')
tt.to_csv("../data/test.csv")

# Creating Proper Folder Structure

In [29]:
# Root folder under which the train/val/test class folders are built.
top_dir = 'split/'
# os.mkdir(top_dir)

## Training Set

In [20]:
# Training folder plus one sub-folder per class ('mel' / 'not_mel').
tr_fldr = os.path.join(top_dir, 'train')
tr_mel, tr_nm = (os.path.join(tr_fldr, label) for label in ('mel', 'not_mel'))

# os.mkdir(tr_fldr)
# os.mkdir(tr_mel)
# os.mkdir(tr_nm)

In [21]:
# Image file name for every row ('<file id>.jpg'), then the per-class views of
# the training table.
train['jpg'] = train['file'].map('{}.jpg'.format)
mel_train = train.loc[train['target'] == 1]
non_mel_train = train.loc[train['target'] == 0]

In [22]:
# Source folder of the images and the per-class lists of image file names.
# NOTE(review): unlike the CSV paths in this notebook, this path has no '../'
# prefix — confirm it resolves from the notebook's working directory.
images_dir = 'data/train_jpg'
mel_tr_ids = mel_train.jpg.to_list()
nm_tr_ids = non_mel_train.jpg.to_list()

In [23]:
# for img in tqdm (mel_tr_ids):
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(tr_mel, img))

In [24]:
# for img in tqdm(nm_tr_ids):
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(tr_nm, img))

## Validation Set

In [25]:
# Validation folder plus one sub-folder per class ('mel' / 'not_mel').
val_fldr = os.path.join(top_dir, 'val')
val_mel, val_nm = (os.path.join(val_fldr, label) for label in ('mel', 'not_mel'))

# os.mkdir(val_fldr)
# os.mkdir(val_mel)
# os.mkdir(val_nm)

In [26]:
# Image file name for every validation row. The suffix must be '.jpg' (with the
# dot), matching how the training cell builds its names; appending a bare 'jpg'
# yields names such as 'ISIC_1234567jpg' that match no image file.
# assign() returns a new frame, so this cell is safe to re-run and never writes
# into a slice of `validation`.
val = val.assign(jpg=val.file.apply(lambda x: str(x) + '.jpg'))

# Per-class views. Note: this rebinds mel_val / non_mel_val, which the
# dataset-1 split cell also defines.
mel_val = val[val.target == 1]
non_mel_val = val[val.target == 0]

mel_val_ids = mel_val.jpg.to_list()
nm_val_ids = non_mel_val.jpg.to_list()

In [27]:
# for img in mel_val_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(val_mel, img))    

In [28]:
# for img in nm_val_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(val_nm, img))

## Testing Set

In [31]:
# Test folder plus one sub-folder per class ('mel' / 'not_mel').
tt_fldr = os.path.join(top_dir, 'test')
tt_mel, tt_nm = (os.path.join(tt_fldr, label) for label in ('mel', 'not_mel'))

In [53]:
# Image file name for every test row. The suffix must be '.jpg' (with the dot),
# matching how the training cell builds its names; appending a bare 'jpg'
# yields names such as 'ISIC_1234567jpg' that match no image file.
# assign() returns a new frame, so this cell is safe to re-run and never writes
# into a slice of `test`.
tt = tt.assign(jpg=tt.file.apply(lambda x: str(x) + '.jpg'))

# Per-class views. Note: this rebinds mel_tt / non_mel_tt, which the dataset-1
# split cell also defines.
mel_tt = tt[tt.target == 1]
non_mel_tt = tt[tt.target == 0]

mel_tt_ids = mel_tt.jpg.to_list()
nm_tt_ids = non_mel_tt.jpg.to_list()

In [60]:
# images_dir = '../split/val/mel'
# for img in mel_tt_ids:
#      shutil.copyfile(os.path.join(images_dir, img), os.path.join(tt_mel, img))   

# Post-Transformation Train-Validation-Test Split

In [55]:
# Load the metadata table used for the post-transformation split; the first CSV
# column is used as the index.
df2 = pd.read_csv('../data/tr_tot.csv', index_col=0)
df2.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0


In [56]:
# Rows per class in the post-transformation table.
df2.target.value_counts()

0    32542
1     5106
Name: target, dtype: int64

In [57]:
# Occurrences of each file id (a quick check for duplicated ids).
df2.file.value_counts()

ISIC_0072182    1
ISIC_3179296    1
ISIC_5587291    1
ISIC_9935970    1
ISIC_7309194    1
               ..
ISIC_9448599    1
ISIC_0026430    1
ISIC_9898848    1
ISIC_8572893    1
ISIC_5423896    1
Name: file, Length: 37648, dtype: int64

In [58]:
# Column dtypes and non-null counts of the post-transformation table.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37648 entries, 0 to 37647
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file        37648 non-null  object 
 1   patient_id  33126 non-null  object 
 2   lesion_id   37311 non-null  object 
 3   gender      37502 non-null  object 
 4   age         37495 non-null  float64
 5   site        36994 non-null  object 
 6   diagnosis   33126 non-null  object 
 7   ben_mal     33126 non-null  object 
 8   target      37648 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 2.9+ MB


In [59]:
# Partition the post-transformation table by class label, then separate
# features from the label within each class.
melanoma2 = df2[df2['target'] == 1]
non_melanoma2 = df2[df2['target'] == 0]

X_mel2, y_mel2 = melanoma2.drop(columns=['target']), melanoma2['target']
X_non_mel2, y_non_mel2 = non_melanoma2.drop(columns=['target']), non_melanoma2['target']

In [61]:
# 80/20 split applied to each class separately (same seed for both).
X_tr2_m, X_val2_m, y_tr2_m, y_val2_m = train_test_split(
    X_mel2, y_mel2, test_size=0.2, random_state=42
)
X_tr2_nm, X_val2_nm, y_tr2_nm, y_val2_nm = train_test_split(
    X_non_mel2, y_non_mel2, test_size=0.2, random_state=42
)

# Re-attach the label column to each class.
mel_train2 = pd.concat([X_tr2_m, y_tr2_m], axis='columns')
non_mel_train2 = pd.concat([X_tr2_nm, y_tr2_nm], axis='columns')

# Stack melanoma rows above non-melanoma rows and renumber from 0.
train2 = pd.concat([mel_train2, non_mel_train2], axis='index', ignore_index=True)

In [62]:
# Halve the 20% hold-out of each class into a validation part and a test part
# (same seed for both classes).
X_val2_m2, X_tt2_m, y_val2_m2, y_tt2_m = train_test_split(X_val2_m, y_val2_m, random_state=42, test_size=0.5)
X_val2_nm2, X_tt2_nm, y_val2_nm2, y_tt2_nm = train_test_split(X_val2_nm, y_val2_nm, random_state=42, test_size=0.5)

# Re-attach the label column to every piece. The melanoma validation frame gets
# its own name (mel_val2) so it is not overwritten by the non-melanoma one.
mel_val2 = pd.concat([X_val2_m2, y_val2_m2], axis=1)
non_mel_val2 = pd.concat([X_val2_nm2, y_val2_nm2], axis=1)
mel_tt2 = pd.concat([X_tt2_m, y_tt2_m], axis=1)
non_mel_tt2 = pd.concat([X_tt2_nm, y_tt2_nm], axis=1)

# Build validation2 from this section's frames. mel_val / non_mel_val belong to
# the dataset-1 split and must not be used here.
validation2 = pd.concat([mel_val2, non_mel_val2], axis=0)
validation2 = validation2.reset_index(drop=True)

test2 = pd.concat([mel_tt2, non_mel_tt2], axis=0)
test2 = test2.reset_index(drop=True)

In [63]:
# Keep only the image id and the label. The explicit .copy() makes `tr2` an
# independent frame: a later cell adds a 'jpg' column to it, which would
# otherwise be a write into a slice of `train2`.
tr2 = train2[['file', 'target']].copy()
tr2.head()

Unnamed: 0,file,target
0,ISIC_0065823,1
1,ISIC_0013908_downsampled,1
2,ISIC_0054089,1
3,ISIC_0061371,1
4,ISIC_8702237,1


In [64]:
# Number of rows in the post-transformation training table.
len(tr2)

30117

In [65]:
# Rows per class in the post-transformation training table.
tr2.target.value_counts()

0    26033
1     4084
Name: target, dtype: int64

In [66]:
# Keep only the image id and the label. The explicit .copy() makes `val2` an
# independent frame: a later cell adds a 'jpg' column to it, which would
# otherwise be a write into a slice of `validation2`.
val2 = validation2[['file', 'target']].copy()
val2.head()

Unnamed: 0,file,target
0,ISIC_0066678,1
1,ISIC_0057668,1
2,ISIC_0053843,1
3,ISIC_0072218,1
4,ISIC_0032685,1


In [67]:
# Number of rows in the post-transformation validation table.
len(val2)

3765

In [68]:
# Rows per class in the post-transformation validation table.
val2.target.value_counts()

0    3254
1     511
Name: target, dtype: int64

In [69]:
# Keep only the image id and the label. The explicit .copy() makes `tt2` an
# independent frame: a later cell adds a 'jpg' column to it, which would
# otherwise be a write into a slice of `test2`.
tt2 = test2[['file', 'target']].copy()
tt2.head()

Unnamed: 0,file,target
0,ISIC_0064868,1
1,ISIC_0032617,1
2,ISIC_0061378,1
3,ISIC_0065382,1
4,ISIC_5732201,1


In [70]:
# Number of rows in the post-transformation test table.
len(tt2)

3766

In [71]:
# Rows per class in the post-transformation test table. Uses tt2 (the trimmed
# table that is saved and used below), in line with the tr2 / val2 checks;
# tt2 carries the same 'target' column as test2.
tt2.target.value_counts()

0    3255
1     511
Name: target, dtype: int64

In [73]:
# Persist the three post-transformation tables (file id + target) under ../data.
# to_csv is called with its defaults, so the row index is written as the first column.
tr2.to_csv("../data/train2.csv")
val2.to_csv("../data/val2.csv")
tt2.to_csv("../data/tt2.csv")

## Training Dataset

In [42]:
# Image file name for every training row ('<file id>.jpg'), then the per-class
# views and their file-name lists.
tr2['jpg'] = tr2['file'].map('{}.jpg'.format)
mel_tr = tr2.loc[tr2['target'] == 1]
nm_tr = tr2.loc[tr2['target'] == 0]

mel_tr_ids = mel_tr['jpg'].to_list()
nm_tr_ids = nm_tr['jpg'].to_list()

In [46]:
# Source folder of the images and the destination folder of each training class.
# These rebind images_dir / tr_mel / tr_nm, which earlier cells set to other paths.
images_dir = '../jpegs'
tr_mel = '../split/train/mel'
tr_nm = '../split/train/not_mel'


In [47]:
# Cut the melanoma training file names into batches of 1,000; mel4 takes
# whatever remains after the first 3,000.
mel1, mel2, mel3 = (mel_tr_ids[start:start + 1000] for start in (0, 1000, 2000))
mel4 = mel_tr_ids[3000:]

In [48]:
# Move (not copy) the files of batch mel1 from images_dir into tr_mel.
for img in tqdm(mel1):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_mel, img))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [49]:
# Move the files of batch mel2 from images_dir into tr_mel.
for img in tqdm(mel2):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_mel, img))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [50]:
# Move the files of batch mel3 from images_dir into tr_mel.
for img in tqdm(mel3):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_mel, img))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [51]:
# Move the files of the last melanoma batch (mel4) from images_dir into tr_mel.
for img in tqdm(mel4):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_mel, img))

  0%|          | 0/1084 [00:00<?, ?it/s]

In [52]:
# Cut the non-melanoma training file names into twelve batches of 2,000;
# nm13 takes whatever remains after the first 24,000.
nm1, nm2, nm3, nm4, nm5, nm6, nm7, nm8, nm9, nm10, nm11, nm12 = (
    nm_tr_ids[start:start + 2000] for start in range(0, 24000, 2000)
)
nm13 = nm_tr_ids[24000:]

In [62]:
# Move (not copy) the files of batch nm1 from images_dir into tr_nm.
for img in tqdm(nm1):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [63]:
# Move the files of batch nm2 from images_dir into tr_nm.
for img in tqdm(nm2):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [64]:
# Move the files of batch nm3 from images_dir into tr_nm.
for img in tqdm(nm3):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [65]:
# Move the files of batch nm4 from images_dir into tr_nm.
for img in tqdm(nm4):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [66]:
# Move the files of batch nm5 from images_dir into tr_nm.
for img in tqdm(nm5):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [67]:
# Move the files of batch nm6 from images_dir into tr_nm.
for img in tqdm(nm6):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [68]:
# Move the files of batch nm7 from images_dir into tr_nm.
for img in tqdm(nm7):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [69]:
# Move the files of batch nm8 from images_dir into tr_nm.
for img in tqdm(nm8):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [84]:
# Move the files of batch nm9 from images_dir into tr_nm.
for img in tqdm(nm9):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [85]:
# Move the files of batch nm10 from images_dir into tr_nm.
for img in tqdm(nm10):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [87]:
# Move the files of batch nm11 from images_dir into tr_nm.
for img in tqdm(nm11):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [88]:
# Move the files of batch nm12 from images_dir into tr_nm.
for img in tqdm(nm12):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [89]:
# Move the files of the last non-melanoma batch (nm13) from images_dir into tr_nm.
for img in tqdm(nm13):
    shutil.move(os.path.join(images_dir, img), os.path.join(tr_nm, img))

  0%|          | 0/2033 [00:00<?, ?it/s]

## Sample Training Dataset

In [72]:
# Training table, now including the 'jpg' file-name column.
tr2.head()

Unnamed: 0,file,target,jpg
0,ISIC_0065823,1,ISIC_0065823.jpg
1,ISIC_0013908_downsampled,1,ISIC_0013908_downsampled.jpg
2,ISIC_0054089,1,ISIC_0054089.jpg
3,ISIC_0061371,1,ISIC_0061371.jpg
4,ISIC_8702237,1,ISIC_8702237.jpg


In [77]:
# Rows per class in the full training table, for comparison with the sample drawn next.
tr2.target.value_counts()

0    26033
1     4084
Name: target, dtype: int64

In [76]:
# Draw a sample of the training table stratified on 'target'.
# stratified_sample is not defined in this notebook (it arrives via
# `from utils import *`); presumably size=0.2 means 20% of each stratum and
# seed fixes the random draw — confirm against utils.
strat_tr = stratified_sample(df = tr2, strata = ['target'], size = 0.2, seed = 42)
strat_tr.head()

Unnamed: 0,index,file,target,jpg
0,22645,ISIC_5247383,0,ISIC_5247383.jpg
1,19797,ISIC_4422476,0,ISIC_4422476.jpg
2,18060,ISIC_8124459,0,ISIC_8124459.jpg
3,29506,ISIC_1669883,0,ISIC_1669883.jpg
4,17825,ISIC_5842418,0,ISIC_5842418.jpg


In [78]:
# Rows per class in the sampled training table.
strat_tr.target.value_counts()

0    5206
1     817
Name: target, dtype: int64

In [90]:
# Per-class views of the sampled training table and their image file-name lists.
samp_mel_tr = strat_tr[strat_tr.target == 1]
samp_nm_tr = strat_tr[strat_tr.target == 0]

samp_mel_tr_ids = samp_mel_tr.jpg.to_list()
samp_nm_tr_ids = samp_nm_tr.jpg.to_list()

In [91]:
# Source (*_ori, the full training split) and destination (*_dest, the sample
# tree) folders for each class.
tr_mel_ori = '../split/train/mel'
tr_mel_dest = '../sample/train/mel'
tr_nm_ori = '../split/train/not_mel'
tr_nm_dest = '../sample/train/not_mel'

In [92]:
# Copy (the originals stay in place) the sampled melanoma training images into the sample tree.
for img in tqdm(samp_mel_tr_ids):
    shutil.copyfile(os.path.join(tr_mel_ori, img), os.path.join(tr_mel_dest, img))

  0%|          | 0/817 [00:00<?, ?it/s]

In [93]:
# Cut the sampled non-melanoma file names into batches of 2,000; sam_tr_3
# takes whatever remains after the first 4,000.
sam_tr_1, sam_tr_2 = (samp_nm_tr_ids[start:start + 2000] for start in (0, 2000))
sam_tr_3 = samp_nm_tr_ids[4000:]

In [94]:
# Copy batch sam_tr_1 of the sampled non-melanoma training images into the sample tree.
for img in tqdm(sam_tr_1):
    shutil.copyfile(os.path.join(tr_nm_ori, img), os.path.join(tr_nm_dest, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [95]:
# Copy batch sam_tr_2 of the sampled non-melanoma training images into the sample tree.
for img in tqdm(sam_tr_2):
    shutil.copyfile(os.path.join(tr_nm_ori, img), os.path.join(tr_nm_dest, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [96]:
# Copy the last batch (sam_tr_3) of the sampled non-melanoma training images into the sample tree.
for img in tqdm(sam_tr_3):
    shutil.copyfile(os.path.join(tr_nm_ori, img), os.path.join(tr_nm_dest, img))

  0%|          | 0/1206 [00:00<?, ?it/s]

## Validation Dataset

In [75]:
# Image file name for every validation row ('<file id>.jpg'), then the
# per-class views and their file-name lists.
val2['jpg'] = val2['file'].map('{}.jpg'.format)
mel_val = val2.loc[val2['target'] == 1]
nm_val = val2.loc[val2['target'] == 0]

mel_val_ids = mel_val['jpg'].to_list()
nm_val_ids = nm_val['jpg'].to_list()

In [54]:
# Source folder of the images and the destination folder of each validation class.
images_dir = '../jpegs'
val_mel = '../split/val/mel'
val_nm = '../split/val/not_mel'

In [55]:
# Move (not copy) the melanoma validation images from images_dir into val_mel.
for img in mel_val_ids:
    shutil.move(os.path.join(images_dir, img), os.path.join(val_mel, img))

In [60]:
# Cut the non-melanoma validation file names into batches for the move loops below.
# Mind the numbering: val_nm3 holds the middle slice [2000:4000] and val_nm2
# holds the tail [4000:].
val_nm1 = nm_val_ids[0:2000]
val_nm3 = nm_val_ids[2000:4000]
val_nm2 = nm_val_ids[4000:]


In [58]:
# Move the files of batch val_nm1 from images_dir into val_nm.
for img in tqdm(val_nm1):
    shutil.move(os.path.join(images_dir, img), os.path.join(val_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [59]:
# Move the files of batch val_nm2 (the tail slice, [4000:]) from images_dir into val_nm.
for img in tqdm(val_nm2):
    shutil.move(os.path.join(images_dir, img), os.path.join(val_nm, img))

  0%|          | 0/2509 [00:00<?, ?it/s]

In [61]:
# Move the files of batch val_nm3 (the middle slice, [2000:4000]) from images_dir into val_nm.
for img in tqdm(val_nm3):
    shutil.move(os.path.join(images_dir, img), os.path.join(val_nm, img))

  0%|          | 0/2000 [00:00<?, ?it/s]

## Sample Validation Set

In [84]:
# Validation table, now including the 'jpg' file-name column.
val2.head()

Unnamed: 0,file,target,jpg
0,ISIC_0066678,1,ISIC_0066678.jpg
1,ISIC_0057668,1,ISIC_0057668.jpg
2,ISIC_0053843,1,ISIC_0053843.jpg
3,ISIC_0072218,1,ISIC_0072218.jpg
4,ISIC_0032685,1,ISIC_0032685.jpg


In [85]:
# Rows per class in the full validation table, for comparison with the sample drawn next.
val2.target.value_counts()

0    3254
1     511
Name: target, dtype: int64

In [86]:
# Draw a sample of the validation table stratified on 'target'.
# stratified_sample arrives via `from utils import *`; presumably size=0.2
# means 20% of each stratum — confirm against utils.
strat_val = stratified_sample(df = val2, strata = ['target'], size = 0.2, seed = 42)

In [87]:
# Rows per class in the sampled validation table.
strat_val.target.value_counts()

0    651
1    102
Name: target, dtype: int64

In [94]:
# Source (*_ori, the full validation split) and destination (*_dest, the sample
# tree) folders for each class.
val_mel_ori = '../split/val/mel'
val_mel_dest = '../sample/val/mel'
val_nm_ori = '../split/val/not_mel'
val_nm_dest = '../sample/val/not_mel'

In [95]:
# Per-class views of the sampled validation table and their image file-name lists.
samp_mel_val = strat_val[strat_val.target == 1]
samp_nm_val = strat_val[strat_val.target == 0]

samp_mel_val_ids = samp_mel_val.jpg.to_list()
samp_nm_val_ids = samp_nm_val.jpg.to_list()

In [96]:
# Copy (the originals stay in place) the sampled melanoma validation images into the sample tree.
for img in tqdm(samp_mel_val_ids):
    shutil.copyfile(os.path.join(val_mel_ori, img), os.path.join(val_mel_dest, img))

  0%|          | 0/102 [00:00<?, ?it/s]

In [97]:
# Copy the sampled non-melanoma validation images into the sample tree.
for img in tqdm(samp_nm_val_ids):
    shutil.copyfile(os.path.join(val_nm_ori, img), os.path.join(val_nm_dest, img))

  0%|          | 0/651 [00:00<?, ?it/s]

## Testing Dataset

In [76]:
# Image file name for every test row ('<file id>.jpg'), then the per-class
# views and their file-name lists.
tt2['jpg'] = tt2['file'].map('{}.jpg'.format)
mel_tt = tt2.loc[tt2['target'] == 1]
nm_tt = tt2.loc[tt2['target'] == 0]

mel_tt_ids = mel_tt['jpg'].to_list()
nm_tt_ids = nm_tt['jpg'].to_list()

In [78]:
# direc = '../split/te/'

# os.mkdir('../split/test/mel')
# os.mkdir('../split/test/non_mel')

# Source (direc, direc2) and destination (direc3, direc4) folders for the test
# images: they are taken out of the validation folders, not out of ../jpegs.
# NOTE(review): the validation cells in this notebook only move the validation
# file names into ../split/val/...; the test images are presumably already
# there from an earlier run in which the whole hold-out was moved — confirm
# before a Restart & Run All.
direc = '../split/val/mel/'
direc2 = '../split/val/not_mel'
direc3 = '../split/test/mel'
direc4 = '../split/test/not_mel'

In [81]:
# Move the non-melanoma test images from direc2 (validation folder) to direc4 (test folder).
# NOTE(review): assumes these files are already present in direc2 — confirm.
for img in tqdm(nm_tt_ids):
    shutil.move(os.path.join(direc2, img), os.path.join(direc4, img))

  0%|          | 0/3255 [00:00<?, ?it/s]

In [80]:
# Move the melanoma test images from direc (validation folder) to direc3 (test folder).
# NOTE(review): assumes these files are already present in direc — confirm.
for img in tqdm(mel_tt_ids):
    shutil.move(os.path.join(direc, img), os.path.join(direc3, img))

  0%|          | 0/511 [00:00<?, ?it/s]

In [82]:
# Test table, now including the 'jpg' file-name column.
tt2.head()

Unnamed: 0,file,target,jpg
0,ISIC_0064868,1,ISIC_0064868.jpg
1,ISIC_0032617,1,ISIC_0032617.jpg
2,ISIC_0061378,1,ISIC_0061378.jpg
3,ISIC_0065382,1,ISIC_0065382.jpg
4,ISIC_5732201,1,ISIC_5732201.jpg


In [83]:
# Rows per class in the full test table, for comparison with the sample drawn next.
tt2.target.value_counts()

0    3255
1     511
Name: target, dtype: int64

In [90]:
# Draw a sample of the test table stratified on 'target'.
# stratified_sample arrives via `from utils import *`; presumably size=0.2
# means 20% of each stratum — confirm against utils.
strat_tt = stratified_sample(df = tt2, strata = ['target'], size = 0.2, seed = 42)

In [91]:
# Rows per class in the sampled test table.
strat_tt.target.value_counts()

0    651
1    102
Name: target, dtype: int64

In [98]:
# Source (*_ori, the full test split) and destination (*_dest, the sample tree)
# folders for each class.
tt_mel_ori = '../split/test/mel'
tt_mel_dest = '../sample/test/mel'
tt_nm_ori = '../split/test/not_mel'
tt_nm_dest = '../sample/test/not_mel'

In [99]:
# Per-class views of the sampled test table and their image file-name lists.
samp_mel_tt = strat_tt[strat_tt.target == 1]
samp_nm_tt = strat_tt[strat_tt.target == 0]

samp_mel_tt_ids = samp_mel_tt.jpg.to_list()
samp_nm_tt_ids = samp_nm_tt.jpg.to_list()

In [102]:
# Copy (the originals stay in place) the sampled melanoma test images into the sample tree.
for img in tqdm(samp_mel_tt_ids):
    shutil.copyfile(os.path.join(tt_mel_ori, img), os.path.join(tt_mel_dest, img))

  0%|          | 0/102 [00:00<?, ?it/s]

In [103]:
# Copy the sampled non-melanoma test images into the sample tree.
for img in tqdm(samp_nm_tt_ids):
    shutil.copyfile(os.path.join(tt_nm_ori, img), os.path.join(tt_nm_dest, img))

  0%|          | 0/651 [00:00<?, ?it/s]