## Imports and metadata preprocessing

In [None]:
import pandas as pd
import json


# --- Configuration ---------------------------------------------------------
input_csv_path = './data/balanced_data/balanced_train_metadata.csv'
output_csv_path = './data/balanced_data/normalized_metadata.csv'
normalization_file_path = './data/balanced_data/normalization_params.json'

# Columns removed from the metadata before any further processing.
columns_to_drop = [
    "patient_id", "lesion_id", "iddx_full", "iddx_1", "iddx_2", "iddx_3", "iddx_4", "iddx_5",
    "mel_mitotic_index", "mel_thick_mm", "tbp_lv_dnn_lesion_confidence", "attribution", "copyright_license",
    "image_type", "anatom_site_general", "tbp_tile_type", "tbp_lv_location"
]


def clean_metadata(frame, drop_columns):
    """Drop unused columns and incomplete rows, then encode categoricals as integers.

    Args:
        frame: Raw metadata table. Must contain the columns ``sex`` and
            ``tbp_lv_location_simple`` as well as every name in *drop_columns*.
        drop_columns: Column names to remove before cleaning.

    Returns:
        A new DataFrame in which
        * rows containing any missing value have been removed,
        * ``sex`` is mapped to 1 (``'male'``) / 0 (``'female'``); any other
          value becomes NaN,
        * ``tbp_lv_location_simple`` is one-hot encoded into integer columns
          whose names are lower-cased with spaces replaced by underscores,
        * boolean values are replaced by 1 / 0.

    Raises:
        KeyError: If a required column is missing from *frame*.
    """
    # .copy() detaches the result from *frame* so the column assignments
    # below neither touch the caller's data nor trigger chained-assignment
    # warnings.
    cleaned = frame.drop(columns=drop_columns).dropna().copy()

    cleaned['sex'] = cleaned['sex'].map({'male': 1, 'female': 0})

    cleaned = pd.get_dummies(
        cleaned, columns=['tbp_lv_location_simple'], prefix='', prefix_sep='', dtype=int
    )
    location_prefixes = ('Torso', 'Left', 'Right', 'Head', 'Unknown')
    cleaned = cleaned.rename(
        columns={
            col: col.lower().replace(' ', '_')
            for col in cleaned.columns
            if col.startswith(location_prefixes)
        }
    )

    # Replace booleans with 1/0. Only bool and object columns can hold Python
    # booleans, so numeric columns are skipped instead of running a Python
    # callback over every cell (and the deprecated DataFrame.applymap is
    # avoided).
    for col in cleaned.columns:
        if cleaned[col].dtype == bool:
            cleaned[col] = cleaned[col].astype(int)
        elif cleaned[col].dtype == object:
            cleaned[col] = cleaned[col].map(
                lambda value: int(value) if isinstance(value, bool) else value
            )

    return cleaned


def min_max_normalize(frame, columns):
    """Min-max scale *columns* of *frame* to the range [0, 1].

    Args:
        frame: Table holding the columns to scale; it is not modified.
        columns: Names of the numeric columns to scale.

    Returns:
        A tuple ``(scaled, params)``. ``scaled`` is a copy of *frame* with the
        given columns rescaled; ``params`` maps each column name to a dict
        with its original ``'min'``, ``'max'`` and ``'median'`` as floats.
    """
    scaled = frame.copy()
    params = {}
    for col in columns:
        col_min = frame[col].min()
        col_max = frame[col].max()
        col_median = frame[col].median()
        params[col] = {
            'min': float(col_min),
            'max': float(col_max),
            'median': float(col_median)
        }

        value_range = col_max - col_min
        if value_range == 0:
            # A constant column would produce 0/0 = NaN for every row;
            # map it to 0.0 instead so the output stays free of NaNs.
            scaled[col] = 0.0
        else:
            scaled[col] = (frame[col] - col_min) / value_range
    return scaled, params


# The guard is true when this code runs as a notebook cell or as a script, so
# the module-level names below are defined exactly as before; it only keeps
# the file I/O from running when the helpers are imported elsewhere.
if __name__ == "__main__":
    metadata = pd.read_csv(input_csv_path)
    metadata = clean_metadata(metadata, columns_to_drop)

    # 'sex' is always treated as binary; every other two-valued column is
    # detected automatically. Binary columns and the identifier column are
    # excluded from normalization.
    binary_columns = ['sex'] + [col for col in metadata.columns if metadata[col].nunique() == 2]
    feature_columns = [col for col in metadata.columns if col not in ['isic_id'] + binary_columns]

    metadata, normalization_params = min_max_normalize(metadata, feature_columns)

    # Persist the scaling parameters so the same transformation can be
    # re-applied to other data later.
    with open(normalization_file_path, 'w', encoding='utf-8') as f:
        json.dump(normalization_params, f)

    metadata.to_csv(output_csv_path, index=False)

    print(f"Normalized dataset saved to {output_csv_path}")
    print(f"Normalization parameters saved to {normalization_file_path}")


Normalized dataset saved to ./data/balanced_data/normalized_metadata.csv
Normalization parameters saved to ./data/balanced_data/normalization_params.json


  metadata = metadata.applymap(lambda x: 1 if x is True else (0 if x is False else x))
