In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [3]:
df_metadata = pd.read_csv('../data/MILK10k_Training_Metadata.csv')
df_metadata.info()
df_metadata.head()

df_metadata.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10480 entries, 0 to 10479
Data columns (total 17 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   lesion_id                                     10480 non-null  object 
 1   image_type                                    10480 non-null  object 
 2   isic_id                                       10480 non-null  object 
 3   attribution                                   10480 non-null  object 
 4   copyright_license                             10480 non-null  object 
 5   image_manipulation                            10480 non-null  object 
 6   age_approx                                    10440 non-null  float64
 7   sex                                           10480 non-null  object 
 8   skin_tone_class                               10480 non-null  int64  
 9   site                                          10418 non-null 

Unnamed: 0,age_approx,skin_tone_class,MONET_ulceration_crust,MONET_hair,MONET_vasculature_vessels,MONET_erythema,MONET_pigmented,MONET_gel_water_drop_fluid_dermoscopy_liquid,MONET_skin_markings_pen_ink_purple_pen
count,10440.0,10480.0,10480.0,10480.0,10480.0,10480.0,10480.0,10480.0,10480.0
mean,61.355364,3.209542,0.36017,0.281697,0.120773,0.355653,0.312637,0.244018,0.239441
std,15.691646,0.799361,0.231542,0.169726,0.183562,0.226047,0.264055,0.15811,0.220725
min,5.0,0.0,0.004519,0.01286,3.9e-05,0.006812,0.004111,0.005152,0.000345
25%,50.0,3.0,0.165659,0.143422,0.00661,0.157514,0.108379,0.125223,0.063859
50%,65.0,3.0,0.314602,0.251231,0.03042,0.326501,0.206067,0.207873,0.167687
75%,75.0,4.0,0.531885,0.391774,0.16291,0.531584,0.480979,0.328269,0.353858
max,85.0,5.0,0.987837,0.94883,0.959442,0.970064,0.974792,0.973334,0.989992


In [4]:
df_supplement = pd.read_csv('../data/MILK10k_Training_Supplement.csv')
df_supplement.info()

df_supplement.describe()
df_supplement.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10480 entries, 0 to 10479
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   isic_id                      10480 non-null  object
 1   diagnosis_full               10480 non-null  object
 2   diagnosis_confirm_type       10480 non-null  object
 3   invasion_thickness_interval  492 non-null    object
dtypes: object(4)
memory usage: 327.6+ KB


Unnamed: 0,isic_id,diagnosis_full,diagnosis_confirm_type,invasion_thickness_interval
0,ISIC_0051817,"Squamous cell carcinoma, Invasive",histopathology,
1,ISIC_0073863,"Nevus, Reed",histopathology,
2,ISIC_0075884,"Nevus, Acral",histopathology,
3,ISIC_0076255,Basal cell carcinoma,histopathology,
4,ISIC_0077054,Basal cell carcinoma,histopathology,


In [5]:
IMAGE_TYPE = {'clinical: close-up': 0, 'dermoscopic': 1}
IMAGE_MANIPULATION = {'altered': 0, 'instrument only': 1}
SEX = {'male': 0, 'female': 1}
SITE = {'head_neck_face': 0, 'lower_extremity': 1, 'upper_extremity': 2, 'trunk': 3, 'foot': 4, 'genital': 5, 'hand': 6}
DIAGNOSIS_FULL = { 'Squamous cell carcinoma, Invasive': 0, 'Nevus, Reed': 1, 'Nevus, Acral': 2, 'Basal cell carcinoma': 3, 'Squamous cell carcinoma in situ, Bowens disease': 4, 'Nevus, NOS, Dermal': 5, 'Nevus, NOS, Compound': 6, 'Melanoma in situ': 7, 'Seborrheic keratosis': 8, 'Keratoacanthoma': 9, 'Melanoma metastasis': 10, 'Lichen planus like keratosis': 11, 'Hemangioma': 12, 'Nevus': 13, 'Nevus, NOS, Junctional': 14, 'Nevus, Congenital': 15, 'Melanoma Invasive': 16, 'Inflammatory or infectious diseases': 17, 'Solar or actinic keratosis': 18, 'Dermatofibroma': 19, 'Sebaceous hyperplasia': 20, 'Angiokeratoma': 21, 'Trichoblastoma': 22, 'Solar lentigo': 23, 'Nevus, Combined': 24, 'Clear cell acanthoma': 25, 'Benign - Other': 26, 'Benign soft tissue proliferations - Fibro-histiocytic': 27, 'Blue nevus': 28, 'Collision - Only benign proliferations': 29, 'Exogenous': 30, 'Nevus, Spitz': 31, 'Mucosal melanotic macule': 32, 'Infundibular or epidermal cyst': 33, 'Benign soft tissue proliferations - Vascular': 34, 'Nevus, Recurrent or persistent': 35, 'Collision - At least one malignant proliferation': 36, 'Ink-spot lentigo': 37, 'Nevus, BAP-1 deficient': 38, 'Juvenile xanthogranuloma': 39, 'Nevus, Spilus': 40, 'Pyogenic granuloma': 41, 'Supernumerary nipple': 42, 'Porokeratosis': 43, 'Nevus, Balloon cell': 44, 'Hemangioma, Hobnail': 45, 'Molluscum': 46, 'Mastocytosis': 47,}
DIAGNOSIS_CONFIRM_TYPE = {'histopathology': 0, 'single contributor clinical assessment': 1}


df = pd.merge(df_metadata, df_supplement, on='isic_id')
df = df.dropna(subset=['site', 'age_approx'])
df = df.drop(['invasion_thickness_interval', 'attribution', 'copyright_license'], axis=1)

# Map categorical features
df['sex'] = df['sex'].map(SEX)
df['image_manipulation'] = df['image_manipulation'].map(IMAGE_MANIPULATION)
df['site'] = df['site'].map(SITE)
df['image_type'] = df['image_type'].map(IMAGE_TYPE)
df['diagnosis_full'] = df['diagnosis_full'].map(DIAGNOSIS_FULL)
df['diagnosis_confirm_type'] = df['diagnosis_confirm_type'].map(DIAGNOSIS_CONFIRM_TYPE)

# Identify numeric and categorical columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.difference(
    ['sex', 'site', 'image_type', 'image_manipulation', 'diagnosis_full', 'diagnosis_confirm_type']
)
categorical_cols = ['sex', 'site', 'image_type', 'image_manipulation', 'diagnosis_full', 'diagnosis_confirm_type']

# Scale numeric columns
scaler = StandardScaler()
df_scaled_numeric = pd.DataFrame(
    scaler.fit_transform(df[numeric_cols]),
    columns=numeric_cols,
    index=df.index
)

# One-hot encode categorical columns
df_categorical = pd.get_dummies(df[categorical_cols], drop_first=False)

# Combine numeric and categorical
df_final = pd.concat([df_scaled_numeric, df_categorical], axis=1)


# Delete some ugly data
df_truth = pd.read_csv('../data/MILK10k_Training_GroundTruth.csv')
df_truth.head()

df_final.describe()


Unnamed: 0,MONET_erythema,MONET_gel_water_drop_fluid_dermoscopy_liquid,MONET_hair,MONET_pigmented,MONET_skin_markings_pen_ink_purple_pen,MONET_ulceration_crust,MONET_vasculature_vessels,age_approx,skin_tone_class,sex,site,image_type,image_manipulation,diagnosis_full,diagnosis_confirm_type
count,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0,10378.0
mean,-2.73865e-18,-1.314552e-16,1.16735e-16,-2.4647850000000002e-17,4.5187720000000004e-17,4.38184e-17,-7.531287000000001e-17,-1.739043e-16,-4.38184e-17,0.399114,1.670842,0.5,0.968202,6.845057,0.039121
std,1.000048,1.000048,1.000048,1.000048,1.000048,1.000048,1.000048,1.000048,1.000048,0.48974,1.362771,0.500024,0.17547,6.633503,0.193893
min,-1.550425,-1.513718,-1.582515,-1.164326,-1.07931,-1.536067,-0.6599655,-3.60631,-4.032687,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.8760097,-0.7509377,-0.8143308,-0.7693324,-0.7930784,-0.8414017,-0.6235933,-0.7293813,-0.2625328,0.0,0.0,0.0,1.0,3.0,0.0
50%,-0.1250265,-0.2290935,-0.1800304,-0.4022335,-0.3263637,-0.1957207,-0.4919332,0.2295949,-0.2625328,0.0,2.0,0.5,1.0,3.0,0.0
75%,0.778469,0.5358783,0.649549,0.6079548,0.5130742,0.7414724,0.23369,0.8689124,0.9941854,1.0,3.0,1.0,1.0,9.0,0.0
max,2.716401,4.611053,3.931716,2.533754,3.41729,2.706142,4.552716,1.50823,2.250904,1.0,6.0,1.0,1.0,47.0,1.0
