In [161]:
import pandas as pd

# Load the two raw metadata tables: `pad` from a CSV file, `midas` from an Excel workbook.
pad = pd.read_csv(filepath_or_buffer="metadata.csv")
midas = pd.read_excel(io="release_midas.xlsx")

In [162]:
# We process PAD-UFES first

# PAD-UFES diagnostic codes, grouped into the two stage-1 classes.
malignant_pad = {"BCC", "ACK", "SCC", "MEL"}
benign_pad = {"NEV", "SEK"}

def pad_bin(d):
    """Return the binary label for a PAD-UFES diagnostic code.

    1 when `d` is in `malignant_pad`, 0 when it is in `benign_pad`,
    and None for anything else.
    """
    if d in malignant_pad:
        return 1
    return 0 if d in benign_pad else None

def pad_subtype(d):
    """Translate a PAD-UFES diagnostic code into the subtype name used downstream.

    Returns None when `d` is not one of the six codes listed below.
    """
    subtype_by_code = dict(
        # malignant group
        MEL="melanoma",
        BCC="bcc",
        SCC="scc",
        ACK="ak",
        # benign group
        NEV="nevus",
        SEK="seborrheic_keratosis",
    )
    return subtype_by_code.get(d)

In [163]:
# Build the labelled PAD-UFES frame in one step: stage-1 and stage-2 labels,
# the dataset tag "A", a dataset-qualified patient id, and an image path.
pad_new = pad.assign(
    bin_label=pad["diagnostic"].map(pad_bin),         # stage 1: malignant vs benign
    subtype=pad["diagnostic"].map(pad_subtype),       # stage 2: actual diagnosis
    dataset_id="A",
    patient_global="A_" + pad["patient_id"].astype(str),
    img_path="pad_images/" + pad["img_id"].astype(str),
)

# Rows whose diagnostic produced no binary label are discarded.
has_bin_label = pad_new["bin_label"].notna()
pad_new = pad_new[has_bin_label]

In [164]:
# List every distinct MIDAS pathology label so it can be aligned with PAD-UFES.
# The column mixes str and int values, which sorted() cannot compare with '<',
# so everything is cast to str before stripping and sorting.
distinct_labels = midas["midas_path"].dropna().astype(str).unique()
vals = sorted(map(str.strip, distinct_labels))
print("The following is a list of all disease predictions in the midas dataset:")
for label in vals:
    print(label)

The following is a list of all disease predictions in the midas dataset:
0
benign-dermatofibroma
benign-fibrous papule
benign-hemangioma
benign-melanocytic nevus
benign-other
benign-seborrheic keratosis
malignant- ak
malignant- bcc
malignant- melanoma
malignant- other
malignant- scc
malignant- sccis
melanocytic tumor, possible re-excision (severe, spitz, aimp)
other- melanocytic lesion, possible re-excision (severe, spitz, aimp)
other- non-neoplastic, inflammatory, infectious


In [165]:
# Count how often each pathology label occurs.
# astype(str) turns missing cells into the literal string "nan", so they show
# up as their own entry in the counts below.
midas_clean = midas.assign(
    midas_path_clean=midas["midas_path"].astype(str).str.strip()
)
disease_counts = midas_clean["midas_path_clean"].value_counts()
print(disease_counts)

midas_path_clean
malignant- bcc                                                           608
benign-melanocytic nevus                                                 578
nan                                                                      497
benign-other                                                             421
benign-seborrheic keratosis                                              242
malignant- melanoma                                                      238
malignant- scc                                                           203
malignant- ak                                                            191
malignant- sccis                                                         165
other- melanocytic lesion, possible re-excision (severe, spitz, aimp)    109
benign-dermatofibroma                                                     51
other- non-neoplastic, inflammatory, infectious                           39
benign-hemangioma                                          

In [166]:
# Check whether the "nan" entries are real missing values or literal strings:
# count the Python type of every raw cell. Missing cells are float NaN, so the
# float count shows how many values are genuinely absent.
cell_types = midas["midas_path"].apply(type)
cell_types.value_counts()

midas_path
<class 'str'>      2913
<class 'float'>     497
<class 'int'>         6
Name: count, dtype: int64

In [167]:
# Handle the nan cases: prefer the pathology label and fall back to the first
# clinical impression wherever the pathology label is missing.
path_col = midas_clean["midas_path"]
clin_col = midas_clean["clinical_impression_1"]
midas_clean = (
    midas_clean
    .assign(final_label=path_col.combine_first(clin_col))
    .dropna(subset=["final_label"])   # both sources were NaN
    .copy()
)
print("Rows after dropping cases without label:", len(midas_clean))

Rows after dropping cases without label: 3381


In [168]:
# Make every final label a string without leading/trailing whitespace.
label_text = midas_clean["final_label"].astype(str)
midas_clean["final_label"] = label_text.str.strip()

# Inspect the distinct labels after cleaning.
final_label_counts = midas_clean["final_label"].value_counts()
print(final_label_counts)

final_label
malignant- bcc                                                                      608
benign-melanocytic nevus                                                            578
benign-other                                                                        421
benign-seborrheic keratosis                                                         242
malignant- melanoma                                                                 238
malignant- scc                                                                      203
malignant- ak                                                                       191
malignant- sccis                                                                    165
1-benign-melanocytic nevus                                                          138
2-benign-seborrheic keratosis                                                       117
other- melanocytic lesion, possible re-excision (severe, spitz, aimp)               109
6-benign-other      

In [169]:
import re

def normalize_label(raw):
    """Return a tidied label string, or None when `raw` is missing.

    The value is converted to str and stripped, one leading "<digits>-" prefix
    (e.g. the "1-" in "1-benign-melanocytic nevus") is removed, and every run
    of whitespace is collapsed to a single space.
    """
    if pd.isna(raw):
        return None
    s = str(raw).strip()
    # remove leading "<digits>-"
    s = re.sub(r'^\d+\-', '', s).strip()
    # collapse multiple spaces
    s = re.sub(r'\s+', ' ', s)
    return s


# Normalized label -> (subtype, bin_label) for labels that are matched exactly.
# Malignant labels are listed both with and without the space after the hyphen.
_EXACT_LABEL_MAP = {
    # '0' carries no diagnosis; the "DROP" marker lets the caller remove the row
    "0": ("DROP", None),

    # --- BENIGN main classes ---
    "benign-melanocytic nevus": ("nevus", 0),
    "benign-seborrheic keratosis": ("seborrheic_keratosis", 0),

    # rare benign -> merged into benign_other
    "benign-other": ("benign_other", 0),
    "benign-dermatofibroma": ("benign_other", 0),
    "benign-hemangioma": ("benign_other", 0),
    "benign-fibrous papule": ("benign_other", 0),
    "other- non-neoplastic, inflammatory, infectious": ("benign_other", 0),
    "other-non-neoplastic/inflammatory/infectious": ("benign_other", 0),

    # --- MALIGNANT main classes ---
    "malignant- bcc": ("bcc", 1),
    "malignant-bcc": ("bcc", 1),
    "malignant- melanoma": ("melanoma", 1),
    "malignant-melanoma": ("melanoma", 1),
    "malignant- scc": ("scc", 1),
    "malignant-scc": ("scc", 1),
    "malignant- sccis": ("scc", 1),      # SCC in situ is merged into scc
    "malignant-sccis": ("scc", 1),
    "malignant- ak": ("ak", 1),
    "malignant-ak": ("ak", 1),

    # malignant-other (small but valid) -> other_malignant
    "malignant- other": ("other_malignant", 1),
    # The no-space spelling was previously missing and fell through to
    # "unlabeled", unlike every other malignant class.
    "malignant-other": ("other_malignant", 1),
}


def map_final_label(raw):
    """
    Map final_label -> (subtype, bin_label).

    bin_label: 0 = benign, 1 = malignant, None = unlabeled.

    Special results:
      * (None, None)   -- `raw` is missing or the label is not recognised.
      * ("DROP", None) -- the label is "0"; the caller is expected to drop the row.

    Matching is done on the output of `normalize_label`, in this order:
    exact labels, then any remaining "benign-" prefix, then the borderline
    melanocytic "re-excision" labels.
    """
    label = normalize_label(raw)  # None when raw is NaN/None
    if label is None:
        return None, None

    exact_match = _EXACT_LABEL_MAP.get(label)
    if exact_match is not None:
        return exact_match

    # any other 'benign-' prefix that slipped through -> benign_other
    if label.startswith("benign-"):
        return "benign_other", 0

    # --- BORDERLINE melanocytic: merge spitz/aimp into melanoma ---
    is_melanocytic = "melanocytic lesion" in label or "melanocytic tumor" in label
    if is_melanocytic and "re-excision" in label:
        return "melanoma", 1

    # anything else unexpected -> unlabeled
    return None, None

In [170]:
# Apply mapping
midas_clean = midas_clean.copy()
# map_final_label returns a (subtype, bin_label) tuple; wrapping it in a
# pd.Series makes .apply expand it into two columns.
label_parts = midas_clean["final_label"].apply(
    lambda raw: pd.Series(map_final_label(raw))
)
midas_clean[["subtype", "bin_label"]] = label_parts

# Remove the rows that the mapping flagged as "DROP" (the '0' labels).
not_dropped = midas_clean["subtype"] != "DROP"
midas_clean = midas_clean[not_dropped]

# Supervised subset: only rows that received a bin_label (0/1).
has_bin_label = midas_clean["bin_label"].notna()
midas_supervised = midas_clean[has_bin_label].copy()

print("Supervised rows:", len(midas_supervised))
for heading, column in (
    ("\nSubtype counts:", "subtype"),
    ("\nBenign vs malignant counts:", "bin_label"),
):
    print(heading)
    print(midas_supervised[column].value_counts())

Supervised rows: 3375

Subtype counts:
subtype
nevus                   716
benign_other            703
bcc                     621
scc                     383
seborrheic_keratosis    359
melanoma                357
ak                      222
other_malignant          14
Name: count, dtype: int64

Benign vs malignant counts:
bin_label
0.0    1778
1.0    1597
Name: count, dtype: int64


In [171]:
# Remove the small "other_malignant" class from the supervised subset.
is_rare_malignant = midas_supervised["subtype"] == "other_malignant"
midas_supervised = midas_supervised[~is_rare_malignant]

print("Supervised rows:", len(midas_supervised))
for heading, column in (
    ("\nSubtype counts:", "subtype"),
    ("\nBenign vs malignant counts:", "bin_label"),
):
    print(heading)
    print(midas_supervised[column].value_counts())

Supervised rows: 3361

Subtype counts:
subtype
nevus                   716
benign_other            703
bcc                     621
scc                     383
seborrheic_keratosis    359
melanoma                357
ak                      222
Name: count, dtype: int64

Benign vs malignant counts:
bin_label
0.0    1778
1.0    1583
Name: count, dtype: int64


In [172]:
# Names of the metadata features of interest.
# NOTE(review): not referenced by the other cells visible here, and
# "lesion_size" differs from the "lesion_size_mm" column built later -- confirm.
meta_features = [
    # demographics
    "age", "sex", "fitzpatrick",
    # lesion description
    "lesion_size", "anatomical_site",
    # lifestyle / clinical context
    "smoking", "clinical_impression", "ethnicity",
]

In [173]:
# Show which columns each dataset has before they are harmonised.
pad_column_names = sorted(pad_new.columns)
midas_column_names = sorted(midas_supervised.columns)

print("PAD columns:")
print(pad_column_names)
print(f"\nTotal PAD columns: {len(pad_column_names)}")

print("\n" + "="*50)
print("\nMIDAS columns:")
print(midas_column_names)
print(f"\nTotal MIDAS columns: {len(midas_column_names)}")

PAD columns:
['age', 'background_father', 'background_mother', 'bin_label', 'biopsed', 'bleed', 'cancer_history', 'changed', 'dataset_id', 'diagnostic', 'diameter_1', 'diameter_2', 'drink', 'elevation', 'fitspatrick', 'gender', 'grew', 'has_piped_water', 'has_sewage_system', 'hurt', 'img_id', 'img_path', 'itch', 'lesion_id', 'patient_global', 'patient_id', 'pesticide', 'region', 'skin_cancer_history', 'smoke', 'subtype']

Total PAD columns: 31


MIDAS columns:
['Unnamed: 0', 'bin_label', 'clinical_impression_1', 'clinical_impression_2', 'clinical_impression_3', 'final_label', 'length_(mm)', 'midas_age', 'midas_distance', 'midas_ethnicity', 'midas_file_name', 'midas_fitzpatrick', 'midas_gender', 'midas_iscontrol', 'midas_location', 'midas_melanoma', 'midas_path', 'midas_path_clean', 'midas_pathreport', 'midas_race', 'midas_record_id', 'subtype', 'width_(mm)']

Total MIDAS columns: 23


In [174]:
# Prepare PAD-UFES dataset
pad_final = pad_new.copy()

# Rename PAD columns to standardized names
# NOTE(review): lesion_size_mm is taken from diameter_1 alone here, while the
# MIDAS cell averages length and width -- confirm the two are meant to be comparable.
pad_column_mapping = {
    "gender": "sex",
    "fitspatrick": "fitzpatrick",  # Fix typo
    "smoke": "smoking",
    "region": "anatomical_site",
    "diameter_1": "lesion_size_mm",  # Use diameter_1 as lesion size
    "drink": "alcohol_consumption",  # Standardize name
}

pad_final = pad_final.rename(columns=pad_column_mapping)

# Placeholder columns for metadata that only the MIDAS table provides, so both
# frames end up with the same column set.
pad_missing_cols = {
    "clinical_impression": None,
    "ethnicity": None,
    "clinical_impression_2": None,
    "clinical_impression_3": None,
    "race": None,
    "distance": None,
    "is_control": None,
    "melanoma_flag": None,
    "pathology_report": None,
}

# Only columns that are really absent are created, and the log line below
# reports that number rather than the size of the candidate dict.
pad_added_cols = [col for col in pad_missing_cols if col not in pad_final.columns]
for col in pad_added_cols:
    pad_final[col] = pad_missing_cols[col]

print(f"✓ PAD-UFES prepared: {len(pad_final)} rows")
print(f"Renamed columns: {pad_column_mapping}")
print(f"Added missing MIDAS-specific columns: {len(pad_added_cols)} columns")

✓ PAD-UFES prepared: 2298 rows
Renamed columns: {'gender': 'sex', 'fitspatrick': 'fitzpatrick', 'smoke': 'smoking', 'region': 'anatomical_site', 'diameter_1': 'lesion_size_mm', 'drink': 'alcohol_consumption'}
Added missing MIDAS-specific columns: 9 columns


In [175]:
# Prepare MIDAS dataset
midas_final = midas_supervised.copy()

# Add dataset identifiers (built before the rename below, from the raw MIDAS column names)
midas_final["dataset_id"] = "B"
midas_final["patient_global"] = "B_" + midas_final["midas_record_id"].astype(str)
midas_final["img_path"] = "midas_images/" + midas_final["midas_file_name"].astype(str)

# Rename MIDAS columns to standardized names
midas_column_mapping = {
    "midas_age": "age",
    "midas_gender": "sex",
    "midas_fitzpatrick": "fitzpatrick",
    "midas_ethnicity": "ethnicity",
    "midas_location": "anatomical_site",
    "clinical_impression_1": "clinical_impression",  # Use primary clinical impression
    "midas_race": "race",
    "midas_distance": "distance",
    "midas_iscontrol": "is_control",
    "midas_melanoma": "melanoma_flag",
    "midas_pathreport": "pathology_report",
}

midas_final = midas_final.rename(columns=midas_column_mapping)

# Lesion size = row-wise mean of length and width, when both columns exist.
has_size_cols = 'length_(mm)' in midas_final.columns and 'width_(mm)' in midas_final.columns
if has_size_cols:
    midas_final['lesion_size_mm'] = midas_final[['length_(mm)', 'width_(mm)']].mean(axis=1)
else:
    midas_final['lesion_size_mm'] = None

# Placeholder columns for metadata that only the PAD table provides, so both
# frames end up with the same column set.
midas_missing_cols = {
    # Lifestyle
    "smoking": None,
    "alcohol_consumption": None,
    # Medical history
    "cancer_history": None,
    "skin_cancer_history": None,
    "background_father": None,
    "background_mother": None,
    # Lesion symptoms/characteristics
    "bleed": None,
    "hurt": None,
    "itch": None,
    "changed": None,
    "grew": None,
    "elevation": None,
    # Biopsy status
    "biopsed": None,
    # Additional measurements
    "diameter_2": None,
    # Socioeconomic
    "has_piped_water": None,
    "has_sewage_system": None,
    "pesticide": None,
}

# Only columns that are really absent are created; the log reports that number.
midas_added_cols = [col for col in midas_missing_cols if col not in midas_final.columns]
for col in midas_added_cols:
    midas_final[col] = midas_missing_cols[col]

print(f"✓ MIDAS prepared: {len(midas_final)} rows")
print(f"Renamed columns: {midas_column_mapping}")
# Claim the calculation only when it actually happened.
if has_size_cols:
    print(f"Calculated lesion_size_mm from length and width")
print(f"Added missing PAD-specific columns: {len(midas_added_cols)} columns")


✓ MIDAS prepared: 3361 rows
Renamed columns: {'midas_age': 'age', 'midas_gender': 'sex', 'midas_fitzpatrick': 'fitzpatrick', 'midas_ethnicity': 'ethnicity', 'midas_location': 'anatomical_site', 'clinical_impression_1': 'clinical_impression', 'midas_race': 'race', 'midas_distance': 'distance', 'midas_iscontrol': 'is_control', 'midas_melanoma': 'melanoma_flag', 'midas_pathreport': 'pathology_report'}
Calculated lesion_size_mm from length and width
Added missing PAD-specific columns: 17 columns


In [176]:
# Combine the two datasets with ALL metadata
# Define desired columns (organized logically)
desired_cols = [
    # ===== CORE LABELS =====
    "bin_label",
    "subtype",

    # ===== IDENTIFIERS =====
    "dataset_id",
    "patient_global",
    "img_path",

    # ===== SHARED DEMOGRAPHICS =====
    "age",
    "sex",
    "ethnicity",
    "fitzpatrick",

    # ===== SHARED LESION FEATURES =====
    "lesion_size_mm",
    "anatomical_site",
    "clinical_impression",

    # ===== SHARED LIFESTYLE =====
    "smoking",
    "alcohol_consumption",

    # ===== PAD-SPECIFIC: MEDICAL HISTORY =====
    "cancer_history",
    "skin_cancer_history",
    "background_father",
    "background_mother",

    # ===== PAD-SPECIFIC: LESION SYMPTOMS =====
    "bleed",
    "hurt",
    "itch",
    "changed",
    "grew",
    "elevation",

    # ===== PAD-SPECIFIC: OTHER =====
    "biopsed",
    "diameter_2",
    "has_piped_water",
    "has_sewage_system",
    "pesticide",

    # ===== MIDAS-SPECIFIC: ADDITIONAL CLINICAL DATA =====
    "clinical_impression_2",
    "clinical_impression_3",
    "race",
    "distance",
    "is_control",
    "melanoma_flag",
    "pathology_report",
]

# Keep the desired columns that exist in both dataframes, in the desired order
pad_cols = set(pad_final.columns)
midas_cols = set(midas_final.columns)
common_cols = [col for col in desired_cols if col in pad_cols and col in midas_cols]

print(f"✓ Columns to combine: {len(common_cols)}")
print(common_cols)

# Report desired columns that one of the frames still lacks
missing_from_pad = [col for col in desired_cols if col not in pad_cols]
missing_from_midas = [col for col in desired_cols if col not in midas_cols]

if missing_from_pad:
    print(f"\n⚠ Warning: Still missing from PAD: {missing_from_pad}")
if missing_from_midas:
    print(f"⚠ Warning: Still missing from MIDAS: {missing_from_midas}")

# Combine both datasets with the common columns.
# Each frame carries placeholder columns that are entirely missing; pandas has
# deprecated letting such all-NA columns be ignored when it picks the result
# dtypes. They are therefore dropped before the concat (the other frame alone
# decides the dtype) and re-created, in the original order, by the reindex.
frames_to_stack = [
    frame[common_cols].dropna(axis=1, how="all")
    for frame in (pad_final, midas_final)
]
combined = (
    pd.concat(frames_to_stack, axis=0, ignore_index=True)
    .reindex(columns=common_cols)
)

print(f"\n✓ Combined dataset size: {len(combined)}")
print(f"  - PAD-UFES: {len(pad_final)} rows")
print(f"  - MIDAS: {len(midas_final)} rows")
print(f"  - Total columns (with metadata): {len(combined.columns)}")
print(f"\nDataset distribution:")
print(combined["dataset_id"].value_counts())
print(f"\nBin label distribution:")
print(combined["bin_label"].value_counts())
print(f"\nSubtype distribution:")
print(combined["subtype"].value_counts())

✓ Columns to combine: 36
['bin_label', 'subtype', 'dataset_id', 'patient_global', 'img_path', 'age', 'sex', 'ethnicity', 'fitzpatrick', 'lesion_size_mm', 'anatomical_site', 'clinical_impression', 'smoking', 'alcohol_consumption', 'cancer_history', 'skin_cancer_history', 'background_father', 'background_mother', 'bleed', 'hurt', 'itch', 'changed', 'grew', 'elevation', 'biopsed', 'diameter_2', 'has_piped_water', 'has_sewage_system', 'pesticide', 'clinical_impression_2', 'clinical_impression_3', 'race', 'distance', 'is_control', 'melanoma_flag', 'pathology_report']

✓ Combined dataset size: 5659
  - PAD-UFES: 2298 rows
  - MIDAS: 3361 rows
  - Total columns (with metadata): 36

Dataset distribution:
dataset_id
B    3361
A    2298
Name: count, dtype: int64

Bin label distribution:
bin_label
1.0    3402
0.0    2257
Name: count, dtype: int64

Subtype distribution:
subtype
bcc                     1466
nevus                    960
ak                       952
benign_other             703
sebor

  combined = pd.concat([


In [177]:
# Replace missing metadata with explicit placeholders:
# -1 for the numeric group, "unknown" for the categorical and boolean groups.
# NOTE(review): assumes every column in the numeric group really holds numbers -- confirm for "distance".

numeric_candidates = ["age", "lesion_size_mm", "diameter_2", "distance"]
categorical_candidates = [
    "sex", "fitzpatrick", "ethnicity", "anatomical_site",
    "clinical_impression", "clinical_impression_2", "clinical_impression_3",
    "race", "pathology_report",
]
boolean_candidates = [
    "smoking", "alcohol_consumption", "cancer_history", "skin_cancer_history",
    "background_father", "background_mother", "bleed", "hurt", "itch",
    "changed", "grew", "elevation", "biopsed", "has_piped_water",
    "has_sewage_system", "pesticide", "is_control", "melanoma_flag",
]

# Numeric columns
numeric_cols = [name for name in numeric_candidates if name in combined.columns]
if numeric_cols:
    combined[numeric_cols] = combined[numeric_cols].fillna(-1)
    print(f"✓ Filled numeric columns with -1: {numeric_cols}")

# Categorical demographic/clinical columns
cat_cols = [name for name in categorical_candidates if name in combined.columns]
if cat_cols:
    combined[cat_cols] = combined[cat_cols].fillna("unknown")
    print(f"✓ Filled categorical columns with 'unknown': {cat_cols}")

# Boolean/binary columns
bool_cols = [name for name in boolean_candidates if name in combined.columns]
if bool_cols:
    combined[bool_cols] = combined[bool_cols].fillna("unknown")
    print(f"✓ Filled boolean columns with 'unknown': {bool_cols}")

# Confirm that nothing is left unfilled
print("\nMissing values after filling:")
missing_counts = combined.isna().sum()
still_missing = missing_counts[missing_counts > 0]
if still_missing.empty:
    print("✓ No missing values in entire dataset!")
else:
    print("⚠ Columns with missing values:")
    print(still_missing)

# The 5 subtracted below are the two label and three identifier columns.
print(f"\n✓ Final combined dataset shape: {combined.shape}")
print(f"Total columns: {len(combined.columns)}")
print(f"Metadata columns: {len(combined.columns) - 5} (excluding labels and identifiers)")

✓ Filled numeric columns with -1: ['age', 'lesion_size_mm', 'diameter_2', 'distance']
✓ Filled categorical columns with 'unknown': ['sex', 'fitzpatrick', 'ethnicity', 'anatomical_site', 'clinical_impression', 'clinical_impression_2', 'clinical_impression_3', 'race', 'pathology_report']
✓ Filled boolean columns with 'unknown': ['smoking', 'alcohol_consumption', 'cancer_history', 'skin_cancer_history', 'background_father', 'background_mother', 'bleed', 'hurt', 'itch', 'changed', 'grew', 'elevation', 'biopsed', 'has_piped_water', 'has_sewage_system', 'pesticide', 'is_control', 'melanoma_flag']

Missing values after filling:
✓ No missing values in entire dataset!

✓ Final combined dataset shape: (5659, 36)
Total columns: 36
Metadata columns: 31 (excluding labels and identifiers)


In [178]:
# Show, per dataset, how many rows carry a real value for each metadata column
print("Metadata Availability Summary (ALL metadata columns):")
print("="*80)

# All metadata columns (excluding labels and identifiers)
all_metadata_cols = [col for col in combined.columns
                     if col not in ['bin_label', 'subtype', 'dataset_id', 'patient_global', 'img_path']]

for dataset_name, dataset_id in [("PAD-UFES", "A"), ("MIDAS", "B")]:
    print(f"\n{dataset_name} (Dataset {dataset_id}):")
    dataset_subset = combined[combined['dataset_id'] == dataset_id]
    total = len(dataset_subset)

    for col in all_metadata_cols:
        col_values = dataset_subset[col]
        # A value is a placeholder if it equals either fill value (-1 or
        # "unknown"). Testing both, instead of choosing one from the column
        # dtype, also catches -1 placeholders in a column that ended up with
        # object dtype and works for any numeric dtype.
        is_placeholder = (col_values == -1) | (col_values == "unknown")
        valid_count = int((~is_placeholder).sum())

        percentage = (valid_count / total * 100) if total > 0 else 0
        print(f"  {col:30s}: {valid_count:4d}/{total:4d} ({percentage:5.1f}%) available")

print("\n" + "="*80)
print(f"Total metadata columns: {len(all_metadata_cols)}")
print(f"Total dataset columns: {len(combined.columns)}")

Metadata Availability Summary (ALL metadata columns):

PAD-UFES (Dataset A):
  age                           : 2298/2298 (100.0%) available
  sex                           : 1494/2298 ( 65.0%) available
  ethnicity                     :    0/2298 (  0.0%) available
  fitzpatrick                   : 1494/2298 ( 65.0%) available
  lesion_size_mm                : 1494/2298 ( 65.0%) available
  anatomical_site               : 2298/2298 (100.0%) available
  clinical_impression           :    0/2298 (  0.0%) available
  smoking                       : 1494/2298 ( 65.0%) available
  alcohol_consumption           : 1494/2298 ( 65.0%) available
  cancer_history                : 1494/2298 ( 65.0%) available
  skin_cancer_history           : 1494/2298 ( 65.0%) available
  background_father             : 1480/2298 ( 64.4%) available
  background_mother             : 1476/2298 ( 64.2%) available
  bleed                         : 2298/2298 (100.0%) available
  hurt                          : 2298/22

In [179]:
# Print an overview of the combined dataset: composition, labels, metadata groups, sample rows
banner = "="*80
print(banner)
print("DATASET SUMMARY")
print(banner)

n_pad_rows = (combined['dataset_id'] == 'A').sum()
n_midas_rows = (combined['dataset_id'] == 'B').sum()
print("\nDataset Composition:")
print(f"  Total samples: {len(combined)}")
print(f"  PAD-UFES samples: {n_pad_rows}")
print(f"  MIDAS samples: {n_midas_rows}")

n_benign = (combined['bin_label'] == 0).sum()
n_malignant = (combined['bin_label'] == 1).sum()
print("\nLabel Distribution:")
print(f"  Benign samples: {n_benign}")
print(f"  Malignant samples: {n_malignant}")

subtype_counts = combined['subtype'].value_counts()
print(f"\nSubtype Distribution ({combined['subtype'].nunique()} unique subtypes):")
for subtype, count in subtype_counts.items():
    print(f"  {subtype:25s}: {count:4d} samples")

# Everything that is neither a label nor an identifier counts as metadata.
label_and_id_cols = ['bin_label', 'subtype', 'dataset_id', 'patient_global', 'img_path']
print("\nMetadata Included:")
all_metadata_cols = [col for col in combined.columns if col not in label_and_id_cols]
print(f"  Total columns: {len(combined.columns)}")
print(f"  Metadata columns: {len(all_metadata_cols)}")

print("\n  Shared metadata (both datasets):")
shared_metadata = ["age", "sex", "ethnicity", "fitzpatrick", "lesion_size_mm",
                   "anatomical_site", "clinical_impression", "smoking", "alcohol_consumption"]
for col in (name for name in shared_metadata if name in combined.columns):
    print(f"    - {col}")

midas_specific = ["clinical_impression_2", "clinical_impression_3", "race", "distance",
                  "is_control", "melanoma_flag", "pathology_report"]
# PAD-specific = metadata columns that are in neither of the two lists above.
pad_specific_count = sum(
    1 for c in all_metadata_cols
    if not (c in shared_metadata or c in midas_specific)
)
print(f"\n  PAD-specific metadata ({pad_specific_count} columns)")
print("  MIDAS-specific metadata (7 columns): clinical_impression_2, clinical_impression_3, race, distance, is_control, melanoma_flag, pathology_report")

print("\n" + banner)
print("\nFirst 5 rows of combined dataset:")
print(combined.head())

print("\n Dataset successfully combined with all available metadata!")


DATASET SUMMARY

Dataset Composition:
  Total samples: 5659
  PAD-UFES samples: 2298
  MIDAS samples: 3361

Label Distribution:
  Benign samples: 2257
  Malignant samples: 3402

Subtype Distribution (7 unique subtypes):
  bcc                      : 1466 samples
  nevus                    :  960 samples
  ak                       :  952 samples
  benign_other             :  703 samples
  seborrheic_keratosis     :  594 samples
  scc                      :  575 samples
  melanoma                 :  409 samples

Metadata Included:
  Total columns: 36
  Metadata columns: 31

  Shared metadata (both datasets):
    - age
    - sex
    - ethnicity
    - fitzpatrick
    - lesion_size_mm
    - anatomical_site
    - clinical_impression
    - smoking
    - alcohol_consumption

  PAD-specific metadata (15 columns)
  MIDAS-specific metadata (7 columns): clinical_impression_2, clinical_impression_3, race, distance, is_control, melanoma_flag, pathology_report


First 5 rows of combined dataset:
   bi

In [180]:
# Number of rows per dataset-qualified patient id, most frequent first.
patient_ids = combined['patient_global']
patient_counts = patient_ids.value_counts()
print(patient_counts)

patient_global
B_13          42
B_220         30
B_90          27
B_174         25
B_27          21
              ..
A_PAT_84       1
A_PAT_967      1
A_PAT_1995     1
A_PAT_117      1
A_PAT_1549     1
Name: count, Length: 2099, dtype: int64


In [181]:
# Count the distinct dataset-qualified patient ids in the combined table.
distinct_patients = combined['patient_global'].unique()
num_patients = len(distinct_patients)
print(f"Total unique patients: {num_patients}")

Total unique patients: 2099


In [184]:
# Give the two label columns their final export names.
# Note: once this has run, earlier cells that read combined["bin_label"] or
# combined["subtype"] can no longer be re-run against `combined`.
export_label_names = {'bin_label': 'is_malignant', 'subtype': 'diagnosis'}
combined = combined.rename(columns=export_label_names)


In [185]:
# Write the combined table to disk without the row index.
output_csv = "combined_dataset.csv"
combined.to_csv(output_csv, index=False)
print("\nCombined dataset saved to 'combined_dataset.csv'")


Combined dataset saved to 'combined_dataset.csv'
