In [12]:
from os.path import join
from os import makedirs
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from coreml.utils.io import save_yml, read_yml

In [2]:
# Root folder of the processed SIIM-ISIC melanoma data; every path used in
# this notebook is built relative to it.
data_root = "/data/siim-isic-melanoma/processed/"

In [3]:
# Load the train and test metadata tables that live directly under `data_root`.
train_df, test_df = (
    pd.read_csv(join(data_root, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Peek at the first five rows of the training metadata.
train_df.head(n=5)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [5]:
# Folder under `data_root` where the version (split definition) files are kept.
version_save_dir = join(data_root, 'versions')

# exist_ok=True keeps this cell safe to re-run once the folder exists.
makedirs(name=version_save_dir, exist_ok=True)

In [6]:
# File name of this split definition and its full path inside `version_save_dir`.
version_name = "v1.0.yml"
version_save_path = join(
    version_save_dir,
    version_name,
)

In [7]:
# Distinct patient ids present in the training table; the train/val split is
# drawn over these ids rather than over individual images.
training_patients = train_df.loc[:, 'patient_id'].unique()

In [8]:
# Hold out 20% of the patient ids for validation. A fixed `random_state`
# makes the partition repeatable across runs.
train_patients, val_patients = train_test_split(
    training_patients,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Number of patients assigned to (train, val).
tuple(len(patient_group) for patient_group in (train_patients, val_patients))

(1644, 412)

In [10]:
# Turn both patient-id collections into sets so the per-row membership tests
# in the next step are hash lookups.
train_patients, val_patients = set(train_patients), set(val_patients)

In [11]:
# Map the patient-level split back to rows: collect the row positions in
# `train_df` whose patient belongs to each split.
train_indices = [
    position
    for position, patient_id in enumerate(train_df['patient_id'].values)
    if patient_id in train_patients
]
val_indices = [
    position
    for position, patient_id in enumerate(train_df['patient_id'].values)
    if patient_id in val_patients
]

In [12]:
# Number of rows (images) assigned to (train, val).
tuple(len(row_positions) for row_positions in (train_indices, val_indices))

(26377, 6749)

In [13]:
# Sanity check: no row may belong to both splits.
# `isdisjoint` hashes one side and probes it, instead of scanning the whole
# `val_indices` list once for every train index (which is quadratic).
assert set(train_indices).isdisjoint(val_indices), 'train and val splits share rows'

In [37]:
# `train_indices` holds row *positions* (produced by `enumerate`), so rows are
# selected with the positional indexer `.iloc`. Label-based `.loc` only gives
# the same rows while the frame's index happens to be the default 0..n-1 range.
# Selecting the column first also avoids materialising the full row subset twice.

# Absolute image path for every training row.
train_files = [
    join(data_root, 'images', f'{image_name}.jpg')
    for image_name in train_df['image_name'].iloc[train_indices]
]

# One label dict per training row; `.tolist()` converts the numpy scalars in
# `target` to plain Python numbers.
train_labels = [
    {'classification': label}
    for label in train_df['target'].iloc[train_indices].tolist()
]

In [39]:
# Pair each training image path with its label dict, one row per image.
df_train = pd.DataFrame(dict(file=train_files, label=train_labels))

In [40]:
# Peek at the first five (file, label) rows of the training split.
df_train.head(n=5)

Unnamed: 0,file,label
0,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}
1,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}
2,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}
3,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}
4,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}


In [41]:
# `val_indices` holds row *positions* (produced by `enumerate`), so rows are
# selected with the positional indexer `.iloc`. Label-based `.loc` only gives
# the same rows while the frame's index happens to be the default 0..n-1 range.
# Selecting the column first also avoids materialising the full row subset twice.

# Absolute image path for every validation row.
val_files = [
    join(data_root, 'images', f'{image_name}.jpg')
    for image_name in train_df['image_name'].iloc[val_indices]
]

# One label dict per validation row; `.tolist()` converts the numpy scalars in
# `target` to plain Python numbers.
val_labels = [
    {'classification': label}
    for label in train_df['target'].iloc[val_indices].tolist()
]

In [42]:
# Pair each validation image path with its label dict, one row per image.
df_val = pd.DataFrame(dict(file=val_files, label=val_labels))

In [43]:
# Peek at the first five (file, label) rows of the validation split.
df_val.head(n=5)

Unnamed: 0,file,label
0,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}
1,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}
2,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}
3,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}
4,/data/siim-isic-melanoma/processed/images/ISIC...,{'classification': 0}


In [44]:
# Row counts of the (train, val) frames.
tuple(len(split_frame) for split_frame in (df_train, df_val))

(26377, 6749)

In [45]:
# Every row of the original training table must land in exactly one split.
assert len(train_df) == len(df_train) + len(df_val)

In [7]:
# Absolute image path for every row of the test metadata table.
test_files = [
    join(data_root, 'images', f'{image_name}.jpg')
    for image_name in test_df['image_name'].values
]

In [8]:
# Single-column frame of test image paths (no labels are built for the test set here).
df_test = pd.DataFrame(dict(file=test_files))

In [9]:
# Peek at the first five test file paths.
df_test.head(n=5)

Unnamed: 0,file
0,/data/siim-isic-melanoma/processed/images/ISIC...
1,/data/siim-isic-melanoma/processed/images/ISIC...
2,/data/siim-isic-melanoma/processed/images/ISIC...
3,/data/siim-isic-melanoma/processed/images/ISIC...
4,/data/siim-isic-melanoma/processed/images/ISIC...


In [10]:
# Number of test images.
df_test.shape[0]

10982

In [11]:
# The path frame must contain exactly one row per test metadata row.
assert len(test_df) == len(df_test)

In [52]:
# Container for the split definition; the 'train', 'val' and 'test' entries
# are filled in by the following cells.
version = {}

In [53]:
# Training split: parallel lists of image paths and label dicts.
version['train'] = dict(
    file=df_train['file'].tolist(),
    label=df_train['label'].tolist(),
)

In [54]:
# Validation split: parallel lists of image paths and label dicts.
version['val'] = dict(
    file=df_val['file'].tolist(),
    label=df_val['label'].tolist(),
)

In [15]:
# Test split: image paths plus placeholder labels.
version['test'] = {
    'file': df_test['file'].values.tolist(),
    # Adding a dummy label just to be compatible with the data format used by
    # the 'train' and 'val' entries.
    # A fresh dict is built per file: `[{'classification': 0}] * n` would repeat
    # ONE shared dict object n times, so mutating any entry would change all of
    # them, and a YAML dumper may serialise the repeats as anchors/aliases
    # instead of plain mappings.
    'label': [{'classification': 0} for _ in range(len(df_test['file']))]
}

In [19]:
# Persist the assembled split definition (`version`, with its 'train', 'val'
# and 'test' entries) at `version_save_path`.
# NOTE(review): `save_yml` is a project helper (coreml.utils.io) whose
# implementation is not visible here — presumably it takes (path, data) and
# overwrites an existing file; confirm before re-running against a published version.
save_yml(version_save_path, version)