In [112]:
import pandas as pd
import numpy as np

df = pd.read_csv('combined_dataset.csv')

In [113]:
# The first step is to semantically standardize each column, i.e. make sure every column ends up with one consistent, compact set of values

# The age column already holds numeric values within the 0-100 range, so it needs no standardization
print("The current age values are: ")
df['age'].unique()

The current age values are: 


array([ 8, 55, 77, 75, 79, 53, 52, 74, 68, 58, 45, 34,  9, 78, 60, 64, 61,
       62, 67, 71, 66, 38, 54, 82, 63, 49, 76, 31, 36, 17, 44, 56, 26, 47,
       80, 48, 51, 35, 87, 41, 22, 69, 59, 57, 65, 83, 92, 33, 46, 50, 70,
       85, 40, 73, 10, 86, 72, 14, 21, 39, 94, 43, 90, 89, 13, 30, 81, 84,
       88, 37, 32, 16, 29, 24, 42, 20, 23, 28, 27, 91, 18, 12, 25,  6, 93,
       99, 95])

In [114]:
# The sex column needs to be standardized because it mixes uppercase and lowercase spellings of the same values
# Several other columns in this dataset have the same issue, so we build one function that converts a column to lowercase and apply it to each of them

def standardize_case_simple(df, column_name):
    """Lowercase the string values of one column, printing the unique values before and after.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column. The column is overwritten in place.
    column_name : str
        Name of the column to lowercase.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, so the call can be written
        as ``df = standardize_case_simple(df, col)``.
    """
    # 1. Print current unique values
    print(f"The current {column_name} values are: ")
    print(df[column_name].unique())

    # 2. Apply lowercasing (The transformation)
    # Only real strings are lowercased. The `.str` accessor would replace every
    # non-string value of an object column (a genuine bool, a number) with NaN
    # and raise AttributeError on a non-object column; here such values pass
    # through untouched.
    df[column_name] = df[column_name].map(
        lambda value: value.lower() if isinstance(value, str) else value
    )

    # 3. Print new unique values
    print("================================")
    print(f"The new {column_name} values are: ")
    print(df[column_name].unique())

    return df

# Columns whose values need to be lowercased; each one is run through standardize_case_simple in turn
columns_lowercase = [
    'sex',
    'alcohol_consumption',
    'smoking',
    'cancer_history',
    'skin_cancer_history',
    'background_father',
    'background_mother',
    'bleed',
    'hurt',
    'itch',
    'changed',
    'grew',
    'elevation',
    'biopsed',
    'has_piped_water',
    'has_sewage_system',
    'pesticide',
]
for col in columns_lowercase:
    # Banner line so the before/after printout of each column is easy to find
    print(f"| PROCESSING: {col.upper()}")
    df = standardize_case_simple(df, col)

| PROCESSING: SEX
The current sex values are: 
['unknown' 'FEMALE' 'MALE' 'female' 'male']
The new sex values are: 
['unknown' 'female' 'male']
| PROCESSING: ALCOHOL_CONSUMPTION
The current alcohol_consumption values are: 
['unknown' 'False' 'True']
The new alcohol_consumption values are: 
['unknown' 'false' 'true']
| PROCESSING: SMOKING
The current smoking values are: 
['unknown' 'False' 'True']
The new smoking values are: 
['unknown' 'false' 'true']
| PROCESSING: CANCER_HISTORY
The current cancer_history values are: 
['unknown' 'True' 'False']
The new cancer_history values are: 
['unknown' 'true' 'false']
| PROCESSING: SKIN_CANCER_HISTORY
The current skin_cancer_history values are: 
['unknown' 'True' 'False']
The new skin_cancer_history values are: 
['unknown' 'true' 'false']
| PROCESSING: BACKGROUND_FATHER
The current background_father values are: 
['unknown' 'POMERANIA' 'GERMANY' 'BRAZIL' 'NETHERLANDS' 'ITALY' 'POLAND'
 'UNK' 'PORTUGAL' 'BRASIL' 'CZECH' 'AUSTRIA' 'SPAIN' 'ISRAEL']


In [115]:
print("The current ethnicity values are: ")
print(df['ethnicity'].unique())

# According to the documentation of the MIDAS dataset, the ethnicity column records whether a participant in the survey is Hispanic or Latino
# It does not help with predicting the disease and would introduce significant bias, so the column is removed
df = df.drop(columns=['ethnicity'])
df.columns

The current ethnicity values are: 
['unknown' 'no' 'yes']


Index(['is_malignant', 'diagnosis', 'dataset_id', 'patient_global', 'img_path',
       'age', 'sex', 'fitzpatrick', 'lesion_size_mm', 'anatomical_site',
       'clinical_impression', 'smoking', 'alcohol_consumption',
       'cancer_history', 'skin_cancer_history', 'background_father',
       'background_mother', 'bleed', 'hurt', 'itch', 'changed', 'grew',
       'elevation', 'biopsed', 'diameter_2', 'has_piped_water',
       'has_sewage_system', 'pesticide', 'clinical_impression_2',
       'clinical_impression_3', 'race', 'distance', 'is_control',
       'melanoma_flag', 'pathology_report'],
      dtype='object')

In [116]:
print("The current fitzpatrick values are: ")
print(df['fitzpatrick'].unique())

The current fitzpatrick values are: 
['unknown' '3.0' '1.0' '2.0' '4.0' '5.0' '6.0' 'ii fair skin, blue eyes'
 'i pale white skin, blue/green eyes, blond/red hair'
 'iii darker white skin' 'iv light brown skin' 'v brown skin'
 'vi dark brown or black skin']


In [117]:
# Fitzpatrick skin types are standardized to the numbers 1-6, with 0 standing in for rows that have no value

fitzpatrick_map = {
    'i pale white skin, blue/green eyes, blond/red hair': 1,
    'ii fair skin, blue eyes': 2,
    'iii darker white skin': 3,
    'iv light brown skin': 4,
    'v brown skin': 5,
    'vi dark brown or black skin': 6,
    'unknown': 0,
}
# Swap the descriptive labels for their number, then parse the remaining numeric strings ('3.0', ...) in the same step
df['fitzpatrick'] = pd.to_numeric(df['fitzpatrick'].replace(fitzpatrick_map))
df['fitzpatrick'].unique()

array([0., 3., 1., 2., 4., 5., 6.])

In [118]:
# The values are whole numbers stored as floats, so the column is converted to integers
df['fitzpatrick'] = df['fitzpatrick'].astype(int)
print("The new fitzpatrick values are: ",df['fitzpatrick'].unique())

The new fitzpatrick values are:  [0 3 1 2 4 5 6]


In [119]:
print("The current lesion size values in mm are: ")
print(df['lesion_size_mm'].unique())
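# The values are already numeric, so no change is needed yet (-1 looks like a placeholder for a missing measurement - TODO confirm and handle it before modelling)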

The current lesion size values in mm are: 
[ -1.     6.     5.    15.     9.    20.    14.    13.     3.     7.
  10.     8.    11.     4.    40.    18.    32.    23.    16.    12.
  30.    19.     2.    17.    22.    24.    31.    35.    25.    21.
   0.    60.     1.     1.2   89.    26.    42.    29.    90.   100.
  70.    45.    50.     6.5    5.5    4.5    7.5    1.5    3.25  11.5
   8.5    9.5   22.5    3.5   10.5    2.5   12.5   15.5   13.5    6.25
  18.5    4.75  17.5   16.5   80.    26.5    5.25  42.5   55.    32.5
   8.25   1.15   0.75  33.5    5.75  14.5    4.25   3.75]


In [120]:
print("The current anatomical site values are: ")
print(df['anatomical_site'].unique())

The current anatomical site values are: 
['ARM' 'NECK' 'FACE' 'HAND' 'FOREARM' 'CHEST' 'NOSE' 'THIGH' 'SCALP' 'EAR'
 'BACK' 'FOOT' 'ABDOMEN' 'LIP' 'chest' 'l lower back' 'left upper buttock'
 'right upper eyelid' 'left forearm' 'left upper back' 'r forehead'
 'r dorsal hand\xa0' 'nasal bridge' 'l dorsal hand\xa0'
 'r posterior shoulder' 'r flank\xa0' 'forehead' 'l ear' 'l cheek'
 'r upper back\xa0' 'l medial ankle' 'l forearm' 'l forehead'
 'r post auricular scalp' 'mid scalp' 'posterior midline neck'
 'r distal lateral upper arm\xa0' 'l proximal dorsal forearm\xa0'
 'r upper back' 'r chest' 'l lateral neck' 'l upper back\xa0'
 'r upper arm\xa0' 'l posterior neck\xa0' 'r upper arm'
 'r posterior helix' 'l upper back' 'umbilicus' 'r nasal ala'
 'l antihelix' 'r elbow' 'l lateral thigh' 'l nasal ala'
 'r medial forearm' 'mid back' 'r mid back' 'l flank' 'chin' 'l elbow'
 'right posterior calf' 'l dorsal 2nd toe' 'l chest' 'r dorsal foot\xa0'
 'r abdomen\xa0' 'r posterior calf' 'right pos

In [121]:
# The raw site labels are numerous and inconsistently written, so they are cleaned first:
# lowercased, non-breaking spaces (\xa0) removed, surrounding whitespace trimmed
df['anatomical_site_clean'] = (
    df['anatomical_site']
    .str.lower()
    .str.replace(r'[\xa0]', '', regex=True)
    .str.strip()
)

# And then be split into smaller sets
body_region_map = {
    'head_neck': ['face', 'ear', 'nose', 'lip', 'cheek', 'scalp', 'neck', 'forehead', 'jaw', 'temple', 'chin', 'eyelid', 'eyebrow', 'preauricular', 'postauricular', 'malar', 'occiput'],
    'trunk': ['chest', 'back', 'abdomen', 'flank', 'shoulder', 'axilla', 'ribcage', 'breast', 'clavicle', 'umbilicus', 'groin', 'inguinal', 'suprapubic'],
    'upper_extremity': ['arm', 'forearm', 'hand', 'elbow', 'wrist', 'finger', 'deltoid', 'thumb'],
    'lower_extremity': ['thigh', 'leg', 'foot', 'calf', 'shin', 'knee', 'ankle', 'toe', 'heel', 'popliteal', 'buttock'],
    'unknown': ['unknown'] 
}
def map_to_region(site):
    """Map a cleaned anatomical-site string to one of the major body regions.

    Matching happens in two passes over ``body_region_map`` (regions and
    keywords are tried in dictionary order in both passes):

    1. A keyword must sit at the start of a word of ``site``. This stops a
       short keyword from firing inside a longer, unrelated word: 'ear' is a
       substring of 'forearm', and because 'head_neck' is scanned first a
       plain substring search files every forearm site under 'head_neck'.
    2. If no keyword starts a word, fall back to a plain substring search so
       run-together spellings such as 'midback' are still mapped.

    Parameters
    ----------
    site : str or missing value
        Lowercased, trimmed site description.

    Returns
    -------
    str
        A key of ``body_region_map``; 'unknown' for a missing or empty site;
        'other_unmapped' when no keyword matches.
    """
    if pd.isna(site) or site == '':
        return 'unknown'

    # Split into words, treating every non-alphanumeric character as a separator
    words = ''.join(ch if ch.isalnum() else ' ' for ch in site).split()

    # Pass 1: keyword at the start of a word
    for region, keywords in body_region_map.items():
        for keyword in keywords:
            if any(word.startswith(keyword) for word in words):
                return region

    # Pass 2: plain substring search
    for region, keywords in body_region_map.items():
        for keyword in keywords:
            if keyword in site:
                return region

    # If no keyword is found
    return 'other_unmapped'

# Apply the mapping function to the cleaned column
df['anatomical_site'] = df['anatomical_site_clean'].apply(map_to_region)

In [122]:
# The reason that the data was split into 5 sets was to reduce the variance and the risk of overfitting the model, as the amount of data is relatively small 
print("The current anatomical site values are: ")
print(df['anatomical_site'].unique())

The current anatomical site values are: 
['upper_extremity' 'head_neck' 'trunk' 'lower_extremity' 'other_unmapped']


In [123]:
print("The current clinical impression values are: ")
print(df['clinical_impression'].unique())
# As we know, we have three clinical impression columns so we need to create a method to clean all three since they use the same values

The current clinical impression values are: 
['unknown' '7-malignant-bcc' '1-benign-melanocytic nevus' '6-benign-other'
 '14-other-non-neoplastic/inflammatory/infectious' '8-malignant-scc'
 '9-malignant-sccis' '10-malignant-ak' '3-benign-fibrous papule'
 '4-benign-dermatofibroma' '2-benign-seborrheic keratosis'
 '5-benign-hemangioma' '11-malignant-melanoma'
 '13-other-melanocytic lesion with possible re-excision (severe/spitz nevus, aimp)'
 '12-malignant-other']


In [124]:
# Raw clinical-impression label -> consolidated diagnostic group.
# Written as (target group, raw labels) pairs so each group's members sit together.
impression_mapping = {
    raw_label: target_group
    for target_group, raw_labels in (
        ('nevus', (
            'benign-melanocytic nevus',
            'other-melanocytic lesion with possible re-excision (severe/spitz nevus, aimp)',
        )),
        ('benign_other', (
            'benign-fibrous papule',
            'benign-dermatofibroma',
            'benign-hemangioma',
            'benign-other',
        )),
        ('seborrheic_keratosis', (
            'benign-seborrheic keratosis',
        )),
        ('bcc', (
            'malignant-bcc',
        )),
        ('scc', (
            'malignant-scc',
            'malignant-sccis',
        )),
        ('melanoma', (
            'malignant-melanoma',
        )),
        # ak = Actinic Keratosis
        ('ak', (
            'malignant-ak',
        )),
        ('other_malignant', (
            'malignant-other',
        )),
        # Unknowns and non-neoplastic findings
        ('other_unclassified', (
            'unknown',
            'other-non-neoplastic/inflammatory/infectious',
        )),
    )
    for raw_label in raw_labels
}

# The following function will clean and map the three clinical impression columns
def standardize_impression_column(df, column_name, mapping_dict):
    """Add a grouped-diagnosis column derived from one clinical-impression column.

    Each raw value is lowercased and trimmed, everything up to and including
    its first hyphen is dropped ('7-malignant-bcc' -> 'malignant-bcc'; a value
    without a hyphen is kept whole), and the remainder is looked up in
    ``mapping_dict``. Anything not found there becomes 'other_unclassified'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column_name``. The new column is added to it in place.
    column_name : str
        Source column, e.g. 'clinical_impression_2'.
    mapping_dict : dict
        Cleaned impression name -> group label.

    Returns
    -------
    pandas.DataFrame
        The same frame object, now with the extra ``*_group`` column.
    """
    print(f"\nProcessing and cleaning column: {column_name}")

    # Step 1: Preliminary Cleaning
    clean_col = df[column_name].astype(str).str.lower().str.strip()

    # Step 2: Extract the Descriptive Impression Name
    # Split once on the first hyphen and keep the last piece. Unlike
    # expand=True followed by [1], this also works when no value in the
    # column contains a hyphen, and the result keeps df's own index.
    impression_name = clean_col.str.split('-', n=1).str[-1]

    # Step 3: Define the new column name
    # NOTE: the second replace also rewrites the output of the first, so the
    # names carry a double underscore ('cli_impres__group',
    # 'cli_impres__2_group'). Kept as-is because other code may already refer
    # to these names.
    new_group_name = column_name.replace('clinical_impression', 'cli_impres').replace('cli_impres', 'cli_impres_') + '_group'

    # Step 4: Apply the Mapping
    # Fill any unmapped values with 'other_unclassified'.
    # impression_name shares df's index, so the assignment lines up row by row
    # even when df does not have a default 0..n-1 index.
    df[new_group_name] = impression_name.map(mapping_dict).fillna('other_unclassified')

    print(f"-> Created new categorical column: {new_group_name}")
    print(f"-> Value Counts for {column_name}:\n{df[new_group_name].value_counts().to_string()}")

    return df


# List of columns to process
impression_columns = ['clinical_impression', 'clinical_impression_2', 'clinical_impression_3']

# To map each column sequentially
for col in impression_columns:
    df = standardize_impression_column(df, col, impression_mapping)
    


Processing and cleaning column: clinical_impression
-> Created new categorical column: cli_impres__group
-> Value Counts for clinical_impression:
cli_impres__group
other_unclassified      2402
nevus                    967
bcc                      689
benign_other             473
seborrheic_keratosis     454
scc                      348
melanoma                 165
ak                       152
other_malignant            9

Processing and cleaning column: clinical_impression_2
-> Created new categorical column: cli_impres__2_group
-> Value Counts for clinical_impression_2:
cli_impres__2_group
other_unclassified      2830
nevus                    664
scc                      576
benign_other             496
melanoma                 460
seborrheic_keratosis     258
ak                       184
bcc                      179
other_malignant           12

Processing and cleaning column: clinical_impression_3
-> Created new categorical column: cli_impres__3_group
-> Value Counts for clinical_i

In [125]:
print("The current diameter values are: ")
print(df['diameter_2'].unique())
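# The values are already numeric, so no change is needed yet (-1 looks like a placeholder for a missing measurement - TODO confirm and handle it before modelling)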

The current diameter values in mm are: 
[-1.   5.  10.   7.   4.  18.  12.  13.   3.   8.   9.   6.  11.  20.
 15.  24.   2.  14.  21.  16.  25.   1.   0.  33.  40.  23.  17.  27.
 22.  30.   1.2 26.  34.  19.  70.  60.  28.  32.  48. ]


In [132]:
print("The current race values are: ")
print(df['race'].unique())
# The current values are consistent so no need for change yet

The current race values are: 
['unknown' 'white' 'asian' 'other' 'black or african american'
 'american indian or alaska native']


In [133]:
# The counts below show how many records fall into each race category
df['race'].value_counts()

race
white                               2890
unknown                             2401
asian                                171
other                                163
black or african american             27
american indian or alaska native       7
Name: count, dtype: int64

In [134]:
# 'other', 'black or african american' and 'american indian or alaska native' are combined into one group because each is too small on its own
race_mapping = {
    'white': 'white',
    'unknown': 'unknown',
    'asian': 'asian',
    'other': 'other_minority',
    'black or african american': 'other_minority',
    'american indian or alaska native': 'other_minority'
}
# A missing race is treated as 'unknown' before mapping; without that step .map() leaves it as NaN
# and the final fillna would label it 'other_minority'. Only labels absent from race_mapping take that fallback.
df['race_group'] = df['race'].fillna('unknown').map(race_mapping).fillna('other_minority')
df['race_group'].value_counts()

race_group
white             2890
unknown           2401
other_minority     197
asian              171
Name: count, dtype: int64

In [137]:
print("The current distance values are: ")
print(df['distance'].unique())
# To clean this we convert every distance to inches (1ft -> 12, 6in -> 6), treat dscope as 1 inch, and leave the remaining values ('-1', 'n/a - virtual') as missing (NaN)

The current distance values are: 
['-1' '1ft' '6in' 'dscope' 'n/a - virtual']


In [141]:
distance_mapping = {
    '1ft': 12,
    '6in': 6,
    'dscope': 1,
    # '-1' and 'n/a - virtual' are deliberately left out of the mapping:
    # .map() turns every value without an entry into NaN
}
df['distance_group'] = df['distance'].map(distance_mapping)
print(df['distance_group'].unique())

[nan 12.  6.  1.]


In [143]:
print("The current is control values are: ")
print(df['is_control'].unique())

The current is control values are: 
['unknown' 'no' 'yes']


In [144]:
print("The current melanoma flag values are: ")
print(df['melanoma_flag'].unique())

The current melanoma flag values are: 
['unknown' 'no' 'yes']


In [145]:
print("The current pathology report values are: ")
print(df['pathology_report'].unique())

The current pathology report values are: 
['unknown'
 'focal squamous cell carcinoma, in situ, \xa0 arising in background actinic keratosis\xa0'
 'intradermal melanocytic nevus, transected at the deep margin\xa0'
 'findings consistent with ruptured \xa0 follicle/epidermal cyst\xa0'
 'well-differentiated invasive squamous cell \xa0 carcinoma'
 'actinic keratosis' 'seborrheic keratosis'
 'basal cell carcinoma, superficial type, transected at the peripheral margin'
 'invasive squamous cell carcinoma\xa0'
 'basal cell carcinoma, infiltrative type\xa0'
 '\xa0at least squamous cell carcinoma, in-situ, \xa0 transected at all margins \xa0 \xa0 incidental intradermal melanocytic nevus, \xa0 transected at all margins\xa0'
 'invasive squamous cell carcinoma, excised\xa0'
 'traumatized seborrheic keratosis\xa0'
 'hyperplastic actinic keratosis\xa0'
 '\xa0basal cell carcinoma, nodular type, \xa0 transected at the deep margin\xa0'
 'superficial invasive squamous cell carcinoma'
 'moderately differen