In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Display settings: show every column and allow wider console output
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 120),
):
    pd.set_option(option_name, option_value)

# Location of the raw SCIN CSV files (relative to the notebook)
RAW_DATA_PATH = "../data/raw/"

# Load the four raw tables
cases = pd.read_csv(RAW_DATA_PATH + "dataset_scin_cases.csv")
labels = pd.read_csv(RAW_DATA_PATH + "dataset_scin_labels.csv")
app_questions = pd.read_csv(RAW_DATA_PATH + "dataset_scin_app_questions.csv")
label_questions = pd.read_csv(RAW_DATA_PATH + "dataset_scin_label_questions.csv")

# Report (rows, columns) for each table that was just loaded
for caption, frame in (
    ("Cases shape:", cases),
    ("Labels shape:", labels),
    ("App Questions shape:", app_questions),
    ("Label Questions shape:", label_questions),
):
    print(caption, frame.shape)


Cases shape: (5033, 57)
Labels shape: (5033, 17)
App Questions shape: (11, 3)
Label Questions shape: (11, 4)


In [2]:
# List the column names available in the cases table
cases.columns


Index(['case_id', 'source', 'release', 'year', 'age_group', 'sex_at_birth', 'fitzpatrick_skin_type',
       'race_ethnicity_american_indian_or_alaska_native', 'race_ethnicity_asian',
       'race_ethnicity_black_or_african_american', 'race_ethnicity_hispanic_latino_or_spanish_origin',
       'race_ethnicity_middle_eastern_or_north_african', 'race_ethnicity_native_hawaiian_or_pacific_islander',
       'race_ethnicity_white', 'race_ethnicity_other_race', 'race_ethnicity_prefer_not_to_answer',
       'textures_raised_or_bumpy', 'textures_flat', 'textures_rough_or_flaky', 'textures_fluid_filled',
       'body_parts_head_or_neck', 'body_parts_arm', 'body_parts_palm', 'body_parts_back_of_hand',
       'body_parts_torso_front', 'body_parts_torso_back', 'body_parts_genitalia_or_groin', 'body_parts_buttocks',
       'body_parts_leg', 'body_parts_foot_top_or_side', 'body_parts_foot_sole', 'body_parts_other',
       'condition_symptoms_bothersome_appearance', 'condition_symptoms_bleeding', 'condi

In [3]:
# List the column names available in the labels table
labels.columns


Index(['case_id', 'dermatologist_gradable_for_skin_condition_1', 'dermatologist_gradable_for_skin_condition_2',
       'dermatologist_gradable_for_skin_condition_3', 'dermatologist_skin_condition_on_label_name',
       'dermatologist_skin_condition_confidence', 'weighted_skin_condition_label',
       'dermatologist_gradable_for_fitzpatrick_skin_type_1', 'dermatologist_gradable_for_fitzpatrick_skin_type_2',
       'dermatologist_gradable_for_fitzpatrick_skin_type_3', 'dermatologist_fitzpatrick_skin_type_label_1',
       'dermatologist_fitzpatrick_skin_type_label_2', 'dermatologist_fitzpatrick_skin_type_label_3',
       'gradable_for_monk_skin_tone_india', 'gradable_for_monk_skin_tone_us', 'monk_skin_tone_label_india',
       'monk_skin_tone_label_us'],
      dtype='object')

In [4]:
# Merge the cases table with the labels table on case_id.
# validate="one_to_one" makes pandas raise a MergeError if case_id is
# duplicated on either side, instead of silently multiplying rows in the
# merged frame.
df = cases.merge(labels, on="case_id", how="inner", validate="one_to_one")

print("Merged shape:", df.shape)


Merged shape: (5033, 73)


In [5]:
# Columns kept for the vision dataset
CORE_COLUMNS = [
    # Case identifier and case-level attributes
    "case_id", "age_group", "sex_at_birth", "fitzpatrick_skin_type", "combined_race",
    # Image paths
    "image_1_path", "image_2_path", "image_3_path",
    # Labels (ground truth)
    "monk_skin_tone_label_us",
    "dermatologist_fitzpatrick_skin_type_label_1",
    "weighted_skin_condition_label",
]

# Work on an independent copy so later edits do not touch the merged frame
df_core = df.loc[:, CORE_COLUMNS].copy()
df_core.head()


Unnamed: 0,case_id,age_group,sex_at_birth,fitzpatrick_skin_type,combined_race,image_1_path,image_2_path,image_3_path,monk_skin_tone_label_us,dermatologist_fitzpatrick_skin_type_label_1,weighted_skin_condition_label
0,-1000600354148496558,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,dataset/images/-3205742176803893704.png,,,1.0,FST2,"{'Inflicted skin lesions': 0.41, 'Eczema': 0.4..."
1,-1002039107727665188,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,dataset/images/-4762289084741430925.png,,,3.0,FST1,"{'Prurigo nodularis': 0.41, 'SCC/SCCIS': 0.41,..."
2,-1003358831658393077,AGE_18_TO_29,MALE,NONE_IDENTIFIED,HISPANIC_LATINO_OR_SPANISH_ORIGIN,dataset/images/-4027806997035329030.png,,,4.0,FST4,"{'Impetigo': 0.55, 'Herpes Zoster': 0.23, 'Bul..."
3,-1003826561155964328,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,dataset/images/-5332065579713135540.png,dataset/images/-6353431708064969797.png,dataset/images/742075435141960831.png,4.0,,{}
4,-1003844406100696311,AGE_40_TO_49,FEMALE,FST3,WHITE,dataset/images/-3799298995660217860.png,dataset/images/-5881426422999442186.png,dataset/images/5854025080806696361.png,1.0,FST1,"{'Lichen planus/lichenoid eruption': 0.33, 'Fo..."


In [6]:
# Label normalisation helper
def clean_text(x):
    """Return ``x`` as a lower-cased, whitespace-trimmed string.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged; every other value is converted with ``str`` first, so numeric
    inputs come back as their string form.
    """
    return x if pd.isna(x) else str(x).lower().strip()

# Ground-truth label columns whose values get normalised with clean_text
LABEL_COLS = [
    "monk_skin_tone_label_us",
    "dermatologist_fitzpatrick_skin_type_label_1",
    "weighted_skin_condition_label",
]

# Rewrite each label column element by element
for label_col in LABEL_COLS:
    df_core[label_col] = df_core[label_col].map(clean_text)


In [7]:
# Check how imbalanced the Monk skin tone labels are: normalize=True returns
# the share of each label value rather than raw counts.
df_core["monk_skin_tone_label_us"].value_counts(normalize=True)


monk_skin_tone_label_us
2.0     0.331668
3.0     0.252747
4.0     0.137263
1.0     0.115285
5.0     0.072128
6.0     0.049550
7.0     0.027373
8.0     0.011389
9.0     0.002198
10.0    0.000400
Name: proportion, dtype: float64

In [8]:
# Same balance check for the first dermatologist Fitzpatrick label (shares per class)
df_core["dermatologist_fitzpatrick_skin_type_label_1"].value_counts(normalize=True)


dermatologist_fitzpatrick_skin_type_label_1
fst2    0.332868
fst3    0.309623
fst4    0.167132
fst1    0.090888
fst5    0.083450
fst6    0.016039
Name: proportion, dtype: float64

In [9]:
# Ten most frequent raw values of the weighted skin-condition label column
df_core["weighted_skin_condition_label"].value_counts().head(10)


weighted_skin_condition_label
{}                                                                            1972
{'eczema': 1.0}                                                                127
{'urticaria': 1.0}                                                              85
{'allergic contact dermatitis': 1.0}                                            42
{'folliculitis': 1.0}                                                           38
{'insect bite': 1.0}                                                            28
{'allergic contact dermatitis': 0.5, 'irritant contact dermatitis': 0.5}        27
{'eczema': 0.5, 'allergic contact dermatitis': 0.5}                             27
{'acute dermatitis, nos': 1.0}                                                  27
{'allergic contact dermatitis': 0.67, 'irritant contact dermatitis': 0.33}      21
Name: count, dtype: int64

In [10]:
# Derive a single skin-condition label per case: weighted_skin_condition_label holds a dict-formatted string (condition -> weight), so it has to be parsed first
import ast

def extract_primary_condition(x):
    """Return the highest-weighted condition name from a weighted-label string.

    Parameters
    ----------
    x : str or missing value
        String form of a dict mapping condition name -> weight, e.g.
        "{'eczema': 0.67, 'urticaria': 0.33}".

    Returns
    -------
    str
        The key with the largest weight (the first such key on ties, as with
        ``max``). Returns "none" when ``x`` is missing, is an empty dict, or
        cannot be parsed into a non-empty dict.
    """
    if pd.isna(x) or x == "{}":
        return "none"
    try:
        data = ast.literal_eval(x)
    # Only the parsing failures literal_eval is documented/known to raise are
    # treated as "no label"; a bare except would also swallow
    # KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return "none"
    # Valid literals that are not a non-empty dict (lists, numbers, "{ }")
    # carry no condition weights.
    if not isinstance(data, dict) or not data:
        return "none"
    try:
        return max(data, key=data.get)
    except TypeError:
        # Weights that cannot be compared with each other (e.g. str vs float)
        return "none"

# One categorical condition label per case, taken from the weighted-label strings
df_core["primary_skin_condition"] = df_core["weighted_skin_condition_label"].map(
    extract_primary_condition
)


In [11]:
# Verify the new column: ten most frequent primary conditions
df_core["primary_skin_condition"].value_counts().head(10)


primary_skin_condition
none                           1972
eczema                          488
allergic contact dermatitis     270
urticaria                       214
insect bite                     185
folliculitis                    142
psoriasis                       109
tinea                            93
impetigo                         69
herpes zoster                    68
Name: count, dtype: int64

In [12]:
# Fitzpatrick skin type: text label -> integer code
def encode_fitzpatrick(x):
    """Convert a label such as "fst3" to the integer 3.

    Missing values (per ``pd.isna``) yield ``None``. Any other value has the
    substring "fst" removed and the remainder parsed with ``int``.
    """
    if not pd.isna(x):
        digits = x.replace("fst", "")
        return int(digits)
    return None

# Numeric Fitzpatrick label derived from the first dermatologist label column
df_core["fitzpatrick_label"] = df_core["dermatologist_fitzpatrick_skin_type_label_1"].map(
    encode_fitzpatrick
)


In [13]:
# Verify the encoding: count of cases per Fitzpatrick code, ordered by code
df_core["fitzpatrick_label"].value_counts().sort_index()


fitzpatrick_label
1.0     391
2.0    1432
3.0    1332
4.0     719
5.0     359
6.0      69
Name: count, dtype: int64

In [15]:
# Convert the Monk skin tone label to integers. Values that cannot be parsed
# become missing (errors="coerce"), and the nullable Int64 dtype keeps those
# missing values instead of forcing the column back to float.
df_core["monk_skin_tone_label_us"] = (
    pd.to_numeric(df_core["monk_skin_tone_label_us"], errors="coerce")
    .astype("Int64")
)


In [16]:
# Verify the conversion: count of cases per Monk skin tone value, ordered by value
df_core["monk_skin_tone_label_us"].value_counts().sort_index()


monk_skin_tone_label_us
1      577
2     1660
3     1265
4      687
5      361
6      248
7      137
8       57
9       11
10       2
Name: count, dtype: Int64

In [17]:
# Final check: first rows of the image path next to the three model targets
df_core.loc[
    :,
    [
        "image_1_path",
        "monk_skin_tone_label_us",
        "fitzpatrick_label",
        "primary_skin_condition",
    ],
].head()


Unnamed: 0,image_1_path,monk_skin_tone_label_us,fitzpatrick_label,primary_skin_condition
0,dataset/images/-3205742176803893704.png,1,2.0,inflicted skin lesions
1,dataset/images/-4762289084741430925.png,3,1.0,prurigo nodularis
2,dataset/images/-4027806997035329030.png,4,4.0,impetigo
3,dataset/images/-5332065579713135540.png,4,,none
4,dataset/images/-3799298995660217860.png,1,1.0,lichen planus/lichenoid eruption


In [18]:
# Save the cleaned, labelled dataset to the processed-data folder.
PROCESSED_DATA_FILE = Path("../data/processed/beauty_ml_vision_dataset.csv")

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
PROCESSED_DATA_FILE.parent.mkdir(parents=True, exist_ok=True)

df_core.to_csv(PROCESSED_DATA_FILE, index=False)

print(" Vision dataset saved successfully")


 Vision dataset saved successfully
