In [5]:
import os
from urllib.parse import urlparse
import h5py
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt

from kaggleisic import config
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# === Paths ===
image_folder = urlparse(str(config.RAW_DATA_DIR / "ISIC-images")).path
metadata_file = urlparse(str(config.RAW_DATA_DIR / "ISIC-images/metadata.csv")).path
train_meta_file = config.RAW_DATA_DIR / "new-train-metadata.csv"
train_hdf5_file = urlparse(str(config.RAW_DATA_DIR / "train-image.hdf5")).path

output_hdf5 = urlparse(str(config.INTERIM_DATA_DIR / "combined-image.hdf5")).path
output_hdf5_train = urlparse(str(config.INTERIM_DATA_DIR / "train-image.hdf5")).path
output_meta = urlparse(str(config.INTERIM_DATA_DIR / "combined-metadata.csv")).path

resize_shape = (224, 224)

In [7]:
# === 1. Load ISIC metadata and assign cancer label (1) ===
print("üì• Loading new cancer metadata...")
isic_metadata = pd.read_csv(metadata_file)[["isic_id"]]
isic_metadata["target"] = 1

# === 2. Load original benign metadata (target = 0) ===
print("üì• Loading benign metadata from existing train file...")
original_metadata = pd.read_csv(train_meta_file)[["isic_id", "target"]]
benign_metadata = original_metadata[original_metadata["target"] == 0]

üì• Loading new cancer metadata...
üì• Loading benign metadata from existing train file...


  original_metadata = pd.read_csv(train_meta_file)[["isic_id", "target"]]


In [8]:
# === 3. Combine metadata and save to CSV ===
combined_metadata = pd.concat([benign_metadata, isic_metadata], ignore_index=True)
combined_metadata.to_csv(output_meta, index=False)
print(f"‚úÖ Saved combined metadata: {output_meta}")

‚úÖ Saved combined metadata: /Users/eduardfer/Desktop/Big Data Analytics - UC3M/Machine Learning/KaggleISIC/data/interim/combined-metadata.csv


In [11]:
# === 4. Append new cancer images into HDF5 ===
print(f"üì¶ Appending cancer images to {output_hdf5_train}...")

with h5py.File(output_hdf5_train, "a") as hf:
    for _, row in isic_metadata.iterrows():
        isic_id = row["isic_id"]
        img_path = Path(image_folder) / f"{isic_id}.jpg"

        if isic_id in hf:
            print(f"‚ö†Ô∏è Skipping duplicate: {isic_id}")
            continue
        if not img_path.exists():
            print(f"‚ùå Missing file: {img_path}")
            continue

        try:
            img = Image.open(img_path).convert("RGB").resize(resize_shape)
            img_np = np.asarray(img)
            if img_np.shape != (224, 224, 3):
                print(f"‚ùå Invalid shape for {isic_id}: {img_np.shape}")
                continue
            hf.create_dataset(isic_id, data=img_np, compression="gzip")
        except Exception as e:
            print(f"‚ùå Error processing {isic_id}: {e}")

print("‚úÖ Done updating HDF5 with new images.")

üì¶ Appending cancer images to /Users/eduardfer/Desktop/Big Data Analytics - UC3M/Machine Learning/KaggleISIC/data/interim/train-image.hdf5...
‚ö†Ô∏è Skipping duplicate: ISIC_0000002
‚ö†Ô∏è Skipping duplicate: ISIC_0000004
‚ö†Ô∏è Skipping duplicate: ISIC_0000013
‚ö†Ô∏è Skipping duplicate: ISIC_0096034
‚ö†Ô∏è Skipping duplicate: ISIC_0104229
‚ö†Ô∏è Skipping duplicate: ISIC_0119495
‚ö†Ô∏è Skipping duplicate: ISIC_0157834
‚ö†Ô∏è Skipping duplicate: ISIC_0190307
‚ö†Ô∏è Skipping duplicate: ISIC_0211092
‚ö†Ô∏è Skipping duplicate: ISIC_0220459
‚ö†Ô∏è Skipping duplicate: ISIC_0238218
‚ö†Ô∏è Skipping duplicate: ISIC_0279372
‚ö†Ô∏è Skipping duplicate: ISIC_0287900
‚ö†Ô∏è Skipping duplicate: ISIC_0293670
‚ùå Missing file: /Users/eduardfer/Desktop/Big Data Analytics - UC3M/Machine Learning/KaggleISIC/data/raw/ISIC-images/ISIC_0302225.jpg
‚ö†Ô∏è Skipping duplicate: ISIC_0321944
‚ö†Ô∏è Skipping duplicate: ISIC_0330452
‚ö†Ô∏è Skipping duplicate: ISIC_0338720
‚ö†Ô∏è Skipping duplicate: ISIC_0386460
‚