In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pydicom as dicom
import os, sys, time, shutil, scipy, cv2, json, datetime
import PIL.Image
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split


# Dataset 1 (Train and Validation)

In [37]:
# Load the training metadata table; the first CSV column becomes the row index.
df = pd.read_csv('../data/train_df.csv', index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target,gen_enc,site_enc,diag_enc
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0,1,0,8
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0,0,6,8
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0,0,1,5
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0,0,0,8
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0,0,6,8


In [38]:
# Column dtypes and non-null counts of the loaded metadata.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33126 entries, 0 to 33125
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file        33126 non-null  object 
 1   patient_id  33126 non-null  object 
 2   lesion_id   33126 non-null  object 
 3   gender      33126 non-null  object 
 4   age         33126 non-null  float64
 5   site        33126 non-null  object 
 6   diagnosis   33126 non-null  object 
 7   ben_mal     33126 non-null  object 
 8   target      33126 non-null  int64  
 9   gen_enc     33126 non-null  int64  
 10  site_enc    33126 non-null  int64  
 11  diag_enc    33126 non-null  int64  
dtypes: float64(1), int64(4), object(7)
memory usage: 3.3+ MB


In [39]:
# Partition the metadata by label (target == 1 vs. target == 0), then separate
# each partition into its feature columns (X_*) and its label column (y_*).
melanoma = df.loc[df['target'] == 1]
non_melanoma = df.loc[df['target'] == 0]

X_mel, y_mel = melanoma.drop(columns='target'), melanoma['target']
X_non_mel, y_non_mel = non_melanoma.drop(columns='target'), non_melanoma['target']

In [40]:
# Split each class 80/20 on its own with the same seed, so both classes are
# represented in train and validation in the same proportion.
# NOTE(review): rows are split individually; `patient_id` is not used for
# grouping, so one patient's images can land on both sides of the split.
X_tr_m, X_val_m, y_tr_m, y_val_m = train_test_split(
    X_mel, y_mel, test_size=0.2, random_state=42
)
X_tr_nm, X_val_nm, y_tr_nm, y_val_nm = train_test_split(
    X_non_mel, y_non_mel, test_size=0.2, random_state=42
)

# Re-attach the label column to each feature subset (it ends up last).
mel_train = X_tr_m.join(y_tr_m)
non_mel_train = X_tr_nm.join(y_tr_nm)
mel_val = X_val_m.join(y_val_m)
non_mel_val = X_val_nm.join(y_val_nm)

# Stack melanoma rows on top of non-melanoma rows and renumber from 0.
train = pd.concat([mel_train, non_mel_train], axis=0).reset_index(drop=True)
validation = pd.concat([mel_val, non_mel_val], axis=0).reset_index(drop=True)

In [41]:
# Preview the combined training split (melanoma rows come first).
train.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,gen_enc,site_enc,diag_enc,target
0,ISIC_1569119,IP_6120178,IL_3157235,male,65.0,torso,melanoma,malignant,1,4,4,1
1,ISIC_7536704,IP_3994607,IL_1685088,male,75.0,torso,melanoma,malignant,1,4,4,1
2,ISIC_3696488,IP_6051587,IL_8878499,female,90.0,upper extremity,melanoma,malignant,0,6,4,1
3,ISIC_3343475,IP_4576785,IL_9262145,male,80.0,head/neck,melanoma,malignant,1,0,4,1
4,ISIC_7785723,IP_8447624,IL_2711936,female,50.0,lower extremity,melanoma,malignant,0,1,4,1


In [42]:
# Narrow the training split to the image id and label columns.
tr = train.loc[:, ['file', 'target']]
tr.head()

Unnamed: 0,file,target
0,ISIC_1569119,1
1,ISIC_7536704,1
2,ISIC_3696488,1
3,ISIC_3343475,1
4,ISIC_7785723,1


In [43]:
# Label counts in the training split.
tr.target.value_counts()

0    26033
1      467
Name: target, dtype: int64

In [44]:
# Preview the combined validation split (melanoma rows come first).
validation.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,gen_enc,site_enc,diag_enc,target
0,ISIC_6767569,IP_7665112,IL_1769376,male,70.0,head/neck,melanoma,malignant,1,0,4,1
1,ISIC_7312977,IP_8675890,IL_2723500,male,55.0,lower extremity,melanoma,malignant,1,1,4,1
2,ISIC_1785627,IP_0170821,IL_6029631,female,40.0,upper extremity,melanoma,malignant,0,6,4,1
3,ISIC_8066110,IP_3055814,IL_7066374,male,50.0,lower extremity,melanoma,malignant,1,1,4,1
4,ISIC_8838753,IP_6292815,IL_9486623,female,65.0,lower extremity,melanoma,malignant,0,1,4,1


In [45]:
# Narrow the validation split to the image id and label columns.
val = validation.loc[:, ['file', 'target']]
val.head()

Unnamed: 0,file,target
0,ISIC_6767569,1
1,ISIC_7312977,1
2,ISIC_1785627,1
3,ISIC_8066110,1
4,ISIC_8838753,1


In [46]:
# Label counts in the validation split.
val.target.value_counts()

0    6509
1     117
Name: target, dtype: int64

In [47]:
# Number of rows in the training split.
len(train)

26500

In [48]:
# Number of rows in the validation split.
len(val)

6626

In [9]:
# train.to_csv('../data/train.csv')
# val.to_csv('../data/val.csv')

# Proper Folder Structure

In [10]:
# Root folder under which the per-split / per-class image folders are built.
new_dir = 'images/'
# Folder creation is disabled; os.mkdir raises if the folder already exists.
# os.mkdir(new_dir)

## Training Set

In [11]:
# Training images live in <new_dir>/train/<class>, one sub-folder per class.
tr_fldr = os.path.join(new_dir, 'train')
tr_mel, tr_nm = (os.path.join(tr_fldr, label) for label in ('mel', 'not_mel'))

# Folder creation is disabled (os.mkdir raises if the folder already exists):
# os.mkdir(tr_fldr)
# os.mkdir(tr_mel)
# os.mkdir(tr_nm)

In [12]:
# Derive each image's file name from its id, then split the rows by label.
train['jpg'] = train['file'].astype(str) + '.jpg'
mel_train = train.loc[train['target'] == 1]
non_mel_train = train.loc[train['target'] == 0]

In [13]:
# Folder the source JPEGs are read from, and the file names to copy per class.
images_dir = 'data/train_jpg'
mel_tr_ids = mel_train['jpg'].tolist()
nm_tr_ids = non_mel_train['jpg'].tolist()

In [14]:
# for img in tqdm (mel_tr_ids):
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(tr_mel, img))

In [15]:
# for img in tqdm(nm_tr_ids):
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(tr_nm, img))

## Validation Set

In [16]:
# Validation images live in <new_dir>/val/<class>, one sub-folder per class.
val_fldr = os.path.join(new_dir, 'val')
val_mel, val_nm = (os.path.join(val_fldr, label) for label in ('mel', 'not_mel'))

# Folder creation is disabled (os.mkdir raises if the folder already exists):
# os.mkdir(val_fldr)
# os.mkdir(val_mel)
# os.mkdir(val_nm)

In [17]:
# Derive each validation image's file name from its id.
# The suffix must include the dot: appending a bare 'jpg' produced names such as
# 'ISIC_6767569jpg', which do not match the '<id>.jpg' names built for training.
# `assign` returns a new frame, so the column is not written into a slice of
# `validation` (which is what `val` was).
val = val.assign(jpg=val.file.astype(str) + '.jpg')

# Split the rows by label.
mel_val = val[val.target == 1]
non_mel_val = val[val.target == 0]

# File names to copy into each class folder.
mel_val_ids = mel_val.jpg.to_list()
nm_val_ids = non_mel_val.jpg.to_list()

In [18]:
# for img in mel_val_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(val_mel, img))    

In [19]:
# for img in nm_val_ids:
#     shutil.copyfile(os.path.join(images_dir, img), os.path.join(val_nm, img))

# New Train-Validation Split

In [68]:
# Load the second metadata table; the first CSV column becomes the row index.
df2 = pd.read_csv('../data/tr_tot.csv', index_col=0)
# Preview the first rows.
df2.head()

Unnamed: 0,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0


In [69]:
# Label counts before any de-duplication.
df2.target.value_counts()

0    32542
1     6219
Name: target, dtype: int64

In [70]:
# Occurrences per image id; a count above 1 means the id appears in several rows.
df2.file.value_counts()

ISIC_0033369    2
ISIC_0031561    2
ISIC_0026930    2
ISIC_0027573    2
ISIC_0025451    2
               ..
ISIC_3848572    1
ISIC_7166676    1
ISIC_3192233    1
ISIC_7486963    1
ISIC_4026808    1
Name: file, Length: 37648, dtype: int64

In [71]:
# Keep the first row for every image id, then renumber the rows from 0.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an extra 'index' column.
df2 = df2.drop_duplicates(subset='file').reset_index()
df2.head()

Unnamed: 0,index,file,patient_id,lesion_id,gender,age,site,diagnosis,ben_mal,target
0,0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0


In [72]:
# Column dtypes and non-null counts after de-duplication.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37648 entries, 0 to 37647
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       37648 non-null  int64  
 1   file        37648 non-null  object 
 2   patient_id  33126 non-null  object 
 3   lesion_id   37311 non-null  object 
 4   gender      37502 non-null  object 
 5   age         37495 non-null  float64
 6   site        36994 non-null  object 
 7   diagnosis   33126 non-null  object 
 8   ben_mal     33126 non-null  object 
 9   target      37648 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.9+ MB


In [73]:
# Partition the de-duplicated table by label, then separate each partition
# into its feature columns (X_*) and its label column (y_*).
melanoma2 = df2.loc[df2['target'] == 1]
non_melanoma2 = df2.loc[df2['target'] == 0]

X_mel2, y_mel2 = melanoma2.drop(columns='target'), melanoma2['target']
X_non_mel2, y_non_mel2 = non_melanoma2.drop(columns='target'), non_melanoma2['target']

In [74]:
# Split each class 80/20 on its own with the same seed, so both classes are
# represented in train and validation in the same proportion.
X_tr_m2, X_val_m2, y_tr_m2, y_val_m2 = train_test_split(
    X_mel2, y_mel2, test_size=0.2, random_state=42
)
X_tr_nm2, X_val_nm2, y_tr_nm2, y_val_nm2 = train_test_split(
    X_non_mel2, y_non_mel2, test_size=0.2, random_state=42
)

# Re-attach the label column to each feature subset (it ends up last).
mel_train2 = X_tr_m2.join(y_tr_m2)
non_mel_train2 = X_tr_nm2.join(y_tr_nm2)
mel_val2 = X_val_m2.join(y_val_m2)
non_mel_val2 = X_val_nm2.join(y_val_nm2)

# Stack melanoma rows on top of non-melanoma rows and renumber from 0.
train2 = pd.concat([mel_train2, non_mel_train2], axis=0).reset_index(drop=True)
validation2 = pd.concat([mel_val2, non_mel_val2], axis=0).reset_index(drop=True)

In [75]:
# Narrow the new training split to the image id and label columns.
tr2 = train2.loc[:, ['file', 'target']]
tr2.head()

Unnamed: 0,file,target
0,ISIC_0065823,1
1,ISIC_0013908_downsampled,1
2,ISIC_0054089,1
3,ISIC_0061371,1
4,ISIC_8702237,1


In [76]:
# Number of rows in the new training split.
len(tr2)

30117

In [77]:
# Label counts in the new training split.
tr2.target.value_counts()

0    26033
1     4084
Name: target, dtype: int64

In [78]:
# Narrow the new validation split to the image id and label columns.
val2 = validation2.loc[:, ['file', 'target']]
val2.head()

Unnamed: 0,file,target
0,ISIC_0066225,1
1,ISIC_0000013,1
2,ISIC_9367522,1
3,ISIC_3969411,1
4,ISIC_0026531,1


In [79]:
# Number of rows in the new validation split.
len(val2)

7531

In [63]:
# Label counts in the new validation split.
val2.target.value_counts()

0    6509
1    1022
Name: target, dtype: int64

# 2019 Dataset

In [21]:

# NOTE(review): no visible cell produces a `train2` containing the columns
# dropped below; the `train2` built in the "New Train-Validation Split" section
# has none of them. Presumably the cell that loaded the 2019 label table was
# removed — confirm the source before re-running. This cell also rebinds
# `train2`, replacing the split built earlier.
# Drop the listed label columns; the column assignment below requires that
# exactly two columns remain.
train2 = train2.drop(columns=['NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', "SCC", 'UNK'], axis=1)
# Rename the two remaining columns to the names used by the rest of the notebook.
train2.columns = ['file', 'target']
# Cast the label to int so it can be compared against 0 / 1.
train2.target = train2.target.astype(int)
train2.head()

In [22]:
# Source folder for the 2019 images, and the <new_dir>/2019/<class> targets.
tr19_dir = 'images/test'
tr19_fldr = os.path.join(new_dir, '2019')
tr19_mel, tr19_nm = (os.path.join(tr19_fldr, label) for label in ('mel', 'not_mel'))

# Folder creation is disabled (os.mkdir raises if the folder already exists):
# os.mkdir(tr19_fldr)
# os.mkdir(tr19_mel)
# os.mkdir(tr19_nm)

In [23]:
# Split the 2019 rows by label.
tr19_mel_df = train2[train2.target == 1]
tr19_nm_df = train2[train2.target == 0]

# `train2` only has the columns 'file' and 'target' at this point, so there is
# no 'jpg' column to read. Build the '<id>.jpg' file names from the id instead.
tr19_mel_ids = (tr19_mel_df.file.astype(str) + '.jpg').to_list()
tr19_nm_ids = (tr19_nm_df.file.astype(str) + '.jpg').to_list()

In [24]:
# for img in tr19_mel_ids:
#     shutil.copyfile(os.path.join(tr19_dir, img), os.path.join(tr19_mel, img))

In [25]:
# for img in tr19_nm_ids:
#     shutil.copyfile(os.path.join(tr19_dir, img), os.path.join(tr19_nm, img))

# Testing Set

In [2]:
# Load the test metadata table; the first CSV column becomes the row index.
test2 = pd.read_csv('../data/test2.csv', index_col = 0)
# Preview the first rows.
test2.head()

Unnamed: 0,file,age,gender,target
0,ISIC_0000000,55,female,0
1,ISIC_0000001,30,female,0
2,ISIC_0000002,60,female,1
3,ISIC_0000003,30,male,0
4,ISIC_0000004,80,male,1


In [3]:
# Disabled rename, kept for reference:
# test2 = test2.rename(columns={'image': 'file'})

# Derive each test image's file name from its id, then split the rows by label.
test2['jpg'] = test2['file'].astype(str) + '.jpg'
test_nm = test2.loc[test2['target'] == 0]
test_mel = test2.loc[test2['target'] == 1]

In [4]:
# File names to move into each class folder.
ttnm = test_nm['jpg'].tolist()
ttmel = test_mel['jpg'].tolist()

In [5]:
# Flat folder that currently holds all test images.
direc = '../split/test2/'

# Per-class destination folders (path values unchanged for the cells below).
direc2 = '../split/test2/mel/'
direc3 = '../split/test2/non_mel'

# os.mkdir raises FileExistsError when the folder is already there, which made
# this cell fail on every re-run. makedirs(exist_ok=True) is a no-op in that case.
os.makedirs(direc2, exist_ok=True)
os.makedirs(direc3, exist_ok=True)

In [7]:
# Move every non-melanoma test image from the flat folder into its class folder.
for img in tqdm(ttnm):
    src = os.path.join(direc, img)
    dst = os.path.join(direc3, img)
    # Re-running after a successful move must not crash: skip files that are
    # already at the destination. A file missing from both places still raises,
    # so genuinely absent images are not hidden.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

  0%|          | 0/1626 [00:00<?, ?it/s]

In [6]:
# Move every melanoma test image from the flat folder into its class folder.
for img in tqdm(ttmel):
    src = os.path.join(direc, img)
    dst = os.path.join(direc2, img)
    # Re-running after a successful move must not crash: skip files that are
    # already at the destination. A file missing from both places still raises,
    # so genuinely absent images are not hidden.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

  0%|          | 0/374 [00:00<?, ?it/s]