In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# folder path locations
project_folder = "/kaggle/input/isic-2024-challenge"
image_folder = f"{project_folder}/train-image/image"

# file path locations
train_path = f"{project_folder}/train-metadata.csv"
train_image_path = f"{project_folder}/train-image.hdf5"

test_path = f"{project_folder}/test-metadata.csv"
test_image_path = f"{project_folder}/test-image.hdf5"

submission_path = f"{project_folder}/sample_submission.csv"

In [3]:
# Unique identifiers (loaded in as a string column)
identifier_cols = [
    'isic_id', # Unique case identifier.
    'patient_id', # Unique patient identifier.
    'lesion_id' # Unique lesion identifier. Present in lesions that were manually tagged as a lesion of interest.
]

# Numerical columns
numerical_cols = [
    'age_approx', # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm', # Maximum diameter of the lesion.
    'tbp_lv_A', # A inside lesion.+
    'tbp_lv_Aext', # A outside lesion.+
    'tbp_lv_B', # B inside lesion.+
    'tbp_lv_Bext', # B outside lesion.+
    'tbp_lv_C', # Chroma inside lesion.+
    'tbp_lv_Cext', # Chroma outside lesion.+
    'tbp_lv_H', # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext', # Hue outside lesion.+
    'tbp_lv_L', # L inside lesion.+
    'tbp_lv_Lext', #  L outside lesion.+
    'tbp_lv_areaMM2', # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio', # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean', # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA', # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB', # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL', # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB', # ???
    'tbp_lv_deltaLBnorm', # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity', # Eccentricity.+
    'tbp_lv_minorAxisMM', # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence', # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border', # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color', # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM', # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max', # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL', # Standard deviation of L inside lesion.+
    'tbp_lv_stdLExt', # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis', # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle', # Lesion border asymmetry angle.+
    'tbp_lv_x', # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y', # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z', # Z-coordinate of the lesion on 3D TBP.+
    'mel_thick_mm', # Thickness in depth of melanoma invasion.
    'tbp_lv_dnn_lesion_confidence' # Lesion confidence score (0-100 scale).+,
]

# Categorical Data
categorical_cols = [
    'target', # Binary class {0: benign, 1: malignant}.
    'sex', # Sex of the person
    'anatom_site_general', # Location of the lesion on the patient's body
    'image_type', # Structured field of the ISIC Archive for image type
    'tbp_tile_type', # Lighting modality of the 3D TBP source image.
    'tbp_lv_location', # Classification of anatomical location, divides arms & legs to upper & lower; torso into thirds.+
    'tbp_lv_location_simple', # Classification of anatomical location, simple.+
    'attribution', # Image attribution, synonymous with image source.
    'copyright_license', # Copyright license.
    'iddx_full', # Fully classified lesion diagnosis.
    'iddx_1', # First level lesion diagnosis.
    'iddx_2', # Second level lesion diagnosis.
    'iddx_3', # Third level lesion diagnosis.
    'iddx_4', # Fourth level lesion diagnosis.
    'iddx_5', # Fifth level lesion diagnosis.
    'mel_mitotic_index' # Mitotic index of invasive malignant melanomas.
]

# Fields are not in test set
# Will initially load these columns in but can be removed afterwards
to_be_deleted_cols = ['lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence']

In [4]:
df = pd.read_csv(train_path, low_memory=False)

# Convert identifier columns to string
for col in identifier_cols:
    df[col] = df[col].astype('string')
    
# Convert numerical columns to 'numeric'
for col in numerical_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Convert categorical columns to 'category' dtype
for col in categorical_cols:
    df[col] = df[col].astype('category')

In [5]:
# Sanity check: preview the first rows of df (the training metadata read from train_path).
df.head()

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251
