
# Data preparation


In [5]:
import pandas as pd
from pathlib import Path



In [6]:
# Resolve the project root: use the working directory when it contains a
# 'data' folder, otherwise fall back to its parent directory.
nb_dir = Path.cwd()
if (nb_dir / 'data').exists():
    project_root = nb_dir
else:
    project_root = nb_dir.parent

# Read the raw CSV table of each split.
train_df = pd.read_csv(str(project_root.joinpath('data', 'raw', 'train', 'train.csv')))
val_df = pd.read_csv(str(project_root.joinpath('data', 'raw', 'val', 'val.csv')))

# Quick sanity check: (rows, columns) of each split.
print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)


Train shape: (10015, 15)
Validation shape: (193, 15)


In [7]:
# Keep only the rows whose `diagnosis_1` value is one of the two classes
# used for the binary task; every other row is dropped.
valid_classes = ["Benign", "Malignant"]

# `.copy()` makes each filtered frame an independent object. Without it,
# adding a column to the filtered result later can raise pandas'
# SettingWithCopyWarning (on versions without Copy-on-Write), because the
# result is still tracked as derived from the unfiltered frame.
train_df = train_df.loc[train_df["diagnosis_1"].isin(valid_classes)].copy()
val_df = val_df.loc[val_df["diagnosis_1"].isin(valid_classes)].copy()

# Report the shapes again so the number of dropped rows is visible.
print("After filtering:")
print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)


After filtering:
Train shape: (9885, 15)
Validation shape: (193, 15)


In [8]:
# Integer encoding of the two diagnosis classes.
label_map = {
    "Benign": 0,
    "Malignant": 1
}

# Build the `label` column with `.assign`, which returns a new frame instead
# of writing into the existing (filtered) one. This avoids pandas'
# SettingWithCopyWarning on versions without Copy-on-Write and keeps the cell
# safe to re-run. Any `diagnosis_1` value missing from `label_map` maps to NaN.
train_df = train_df.assign(label=train_df["diagnosis_1"].map(label_map))
val_df = val_df.assign(label=val_df["diagnosis_1"].map(label_map))


In [9]:
# Columns carried into the cleaned tables: the id, the binary label, and
# three descriptive columns. Everything else is dropped.
keep_cols = ["isic_id", "label", "sex", "age_approx", "anatom_site_general"]

# Select those columns (in this order) from each split.
train_clean = train_df.loc[:, keep_cols]
val_clean = val_df.loc[:, keep_cols]


In [10]:
# Per-class row counts for each split (0 = Benign, 1 = Malignant per label_map).
train_labels = train_clean["label"]
val_labels = val_clean["label"]

print("\nTrain label distribution:")
print(train_labels.value_counts())

print("\nValidation label distribution:")
print(val_labels.value_counts())

# Every row must have a label; a missing one means a diagnosis value was
# not covered by the label mapping.
assert train_labels.notna().all()
assert val_labels.notna().all()



Train label distribution:
label
0    8061
1    1824
Name: count, dtype: int64

Validation label distribution:
label
0    149
1     44
Name: count, dtype: int64


In [12]:
# Resolve the project root again so this cell does not rely on the loading
# cell's variables: use the working directory when it contains a 'data'
# folder, otherwise its parent.
nb_dir = Path.cwd()
project_root = nb_dir if (nb_dir / 'data').exists() else nb_dir.parent

# Create the output folders. `parents=True` also creates any missing
# intermediate folder (e.g. 'data/processed'); without it, mkdir raises
# FileNotFoundError on a fresh checkout. `exist_ok=True` keeps re-runs quiet.
processed_train = project_root / 'data' / 'processed' / 'train'
processed_train.mkdir(parents=True, exist_ok=True)

processed_val = project_root / 'data' / 'processed' / 'val'
processed_val.mkdir(parents=True, exist_ok=True)

# Write the cleaned tables without the pandas index column.
train_clean.to_csv(processed_train / "train_binary.csv", index=False)
val_clean.to_csv(processed_val / "val_binary.csv", index=False)

print("Saved:")
print(" - root/data/processed/train/train_binary.csv")
print(" - root/data/processed/val/val_binary.csv")


Saved:
 - root/data/processed/train/train_binary.csv
 - root/data/processed/val/val_binary.csv
