In [None]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from glob import glob
from PIL import Image

from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical

# Root folder of the HAM10000 dataset on the mounted Google Drive.
PATH = './drive/MyDrive/Skin Cancer MNIST: HAM10000/data'

In [None]:
# Index every JPEG located one directory level below PATH by its file stem
# (file name without extension), which is used as the image id.
all_image_path = glob(os.path.join(PATH, '*', '*.jpg'))
imageid_path_dict = {}
for jpg_path in all_image_path:
    stem, _ext = os.path.splitext(os.path.basename(jpg_path))
    imageid_path_dict[stem] = jpg_path
# Maps the short diagnosis code stored in the metadata `dx` column to a
# human-readable lesion name.
#
# 'mel' is melanoma; it was previously labelled 'dermatofibroma', which
# duplicated the 'df' label apart from letter case.
# NOTE: `cell_type_idx` is derived from these display names via
# pd.Categorical (categories are sorted), so correcting this name changes the
# integer codes assigned to 'mel' and 'vasc'. Regenerate any labels or models
# saved with the old mapping.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

In [None]:
# Load the metadata table and derive three columns from it:
#   path          - image file path looked up by image_id (None when no file matched)
#   cell_type     - readable lesion name looked up by the dx code
#   cell_type_idx - integer category code of cell_type
metadata_csv = os.path.join(PATH, 'HAM10000_metadata.csv')
df_original = pd.read_csv(metadata_csv).assign(
    path=lambda frame: frame['image_id'].map(imageid_path_dict.get),
    cell_type=lambda frame: frame['dx'].map(lesion_type_dict.get),
    cell_type_idx=lambda frame: pd.Categorical(frame['cell_type']).codes,
)
# Keep the loaded table untouched; all later preprocessing works on this copy.
df = df_original.copy()

In [None]:
# --- Age: fill missing values with the column mean, then z-score standardise.
# The filled column is assigned back instead of calling
# `df['age'].fillna(..., inplace=True)`: that chained in-place form raises a
# FutureWarning on pandas 2.x and does not modify `df` under copy-on-write.
# NOTE(review): mean/std are computed on the whole table, i.e. before the
# train/test split further down - confirm this is acceptable for evaluation.
df['age'] = df['age'].fillna(df['age'].mean())
df['age'] = (df['age'] - df['age'].mean()) / df['age'].std()

# --- Categorical columns: one-hot encode and prepend the indicator columns.
# dtype is pinned to uint8 so the indicators stay 0/1 integers regardless of
# the pandas version (the get_dummies default became bool in pandas 2.0).
dxtype_df = pd.get_dummies(df['dx_type'], dtype='uint8')
df = pd.concat([dxtype_df, df], axis=1)

localization_df = pd.get_dummies(df['localization'], dtype='uint8')
df = pd.concat([localization_df, df], axis=1)

# For sex, the 'unknown' indicator is dropped so only female/male remain.
sex_df = pd.get_dummies(df['sex'], dtype='uint8').drop(columns=['unknown'])
df = pd.concat([sex_df, df], axis=1)

In [None]:
# Print a summary of the frame (column names, non-null counts, dtypes) to
# check the result of the one-hot encoding above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 31 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   female           10015 non-null  uint8  
 1   male             10015 non-null  uint8  
 2   abdomen          10015 non-null  uint8  
 3   acral            10015 non-null  uint8  
 4   back             10015 non-null  uint8  
 5   chest            10015 non-null  uint8  
 6   ear              10015 non-null  uint8  
 7   face             10015 non-null  uint8  
 8   foot             10015 non-null  uint8  
 9   genital          10015 non-null  uint8  
 10  hand             10015 non-null  uint8  
 11  lower extremity  10015 non-null  uint8  
 12  neck             10015 non-null  uint8  
 13  scalp            10015 non-null  uint8  
 14  trunk            10015 non-null  uint8  
 15  unknown          10015 non-null  uint8  
 16  upper extremity  10015 non-null  uint8  
 17  confocal    

In [None]:
# Separate the integer class target from the rest of the columns.
label_column = 'cell_type_idx'
y_df = df[label_column]
X_df = df.drop(columns=[label_column])

In [None]:
# Register the `progress_map` / `progress_apply` accessors on pandas objects.
tqdm.pandas()

# Side length (in pixels) every image is resized to.
pixel = 224


def _load_resized_rgb(image_path):
    """Load one image file as a (pixel, pixel, 3) array.

    The image is converted to RGB before resizing so that every row yields an
    array with three channels; a grayscale, palette or RGBA file would
    otherwise produce a different shape and break stacking into one array.
    The context manager closes the underlying file as soon as the pixel data
    has been read.
    """
    with Image.open(image_path) as img:
        return np.asarray(img.convert('RGB').resize((pixel, pixel)))


X_df['image'] = X_df['path'].progress_map(_load_resized_rgb)

HBox(children=(FloatProgress(value=0.0, max=10015.0), HTML(value='')))




------------------------------

### Train/test split with random seed = 0

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is neither stratified by class nor grouped by
# lesion_id - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Columns excluded from the tabular (metadata) features: identifiers, the raw
# categorical columns that were one-hot encoded earlier, the diagnosis columns
# the label is derived from, the file path and the pixel arrays.
# ('path' was listed twice before; the duplicate entry has been removed.)
columns = ['lesion_id', 'image_id', 'dx_type', 'dx', 'path', 'cell_type', 'sex', 'localization', 'image']

X_train_meta = X_train.drop(columns=columns)
X_test_meta = X_test.drop(columns=columns)

In [None]:
def _stack_scaled_images(image_series):
    """Stack the per-row image arrays into one float32 array and divide it by 255."""
    batch = np.array([np.array(img) for img in image_series], dtype='float32')
    batch /= 255.0
    return batch


X_train_img = _stack_scaled_images(X_train['image'])
X_test_img = _stack_scaled_images(X_test['image'])

# One-hot encode the integer class codes into 7 columns.
y_train, y_test = (
    to_categorical(labels, num_classes=7) for labels in (y_train, y_test)
)

In [None]:
# Display the shapes of the training images, metadata features and one-hot
# labels; the first dimension (number of samples) should agree across all three.
X_train_img.shape, X_train_meta.shape, y_train.shape

((8012, 224, 224, 3), (8012, 22), (8012, 7))

In [None]:
# Display the shapes of the test images, metadata features and one-hot
# labels; the first dimension (number of samples) should agree across all three.
X_test_img.shape, X_test_meta.shape, y_test.shape

((2003, 224, 224, 3), (2003, 22), (2003, 7))

In [None]:
# Directory that collects the arrays and tables written below.
output_dir = os.path.join(PATH, "input_feature_extractor")
# exist_ok=True replaces the separate exists()/mkdir() check.
os.makedirs(output_dir, exist_ok=True)

# Tabular metadata features.
X_train_meta.to_csv(os.path.join(output_dir, "X_train_meta_seed0.csv"), index=False)
X_test_meta.to_csv(os.path.join(output_dir, "X_test_meta_seed0.csv"), index=False)

# Arrays are saved by passing the target path to np.save, which opens and
# closes the file itself; the previous `np.save(open(path, 'wb'), arr)` form
# left every file handle unclosed.
np.save(os.path.join(output_dir, f"X_train_img_{pixel}_seed0.npy"), X_train_img)
np.save(os.path.join(output_dir, f"X_test_img_{pixel}_seed0.npy"), X_test_img)

np.save(os.path.join(output_dir, "y_train_seed0.npy"), y_train)
np.save(os.path.join(output_dir, "y_test_seed0.npy"), y_test)
