## [ISIC 2024 - Skin Cancer Detection with 3D-TBP](https://www.kaggle.com/competitions/isic-2024-challenge/overview)
##### Lingfeng (Ling) Su, Manal Amarsaikhan, Yuxin (Katy) Chen, Keelan Gan

In [8]:
import tensorflow as tf
import pandas as pd
import os
import numpy as np
import cv2
from tqdm import tqdm


In [9]:
def load_and_preprocess_image(image_path, label, target_size=(100, 100)):
    """Read a JPEG from disk and return it as a normalized float image.

    Args:
        image_path: Path of the JPEG file (string or scalar string tensor).
        label: Label paired with the image; returned untouched.
        target_size: ``(height, width)`` of the output image, in the order
            ``tf.image.resize`` expects. Defaults to ``(100, 100)``.

    Returns:
        ``(image, label)`` where ``image`` is a float32 tensor of shape
        ``(height, width, 3)`` with values scaled to ``[0, 1]``.
    """
    raw_bytes = tf.io.read_file(image_path)
    # channels=3 requests a 3-channel (RGB) decode.
    image = tf.image.decode_jpeg(raw_bytes, channels=3)

    # tf.image.resize takes [new_height, new_width] and yields float32 values
    # still in the 0-255 range, so the rescale to [0, 1] happens afterwards.
    image = tf.image.resize(image, target_size)
    image = image / 255.0

    return image, label

In [10]:
def batch_dataset(batch_size, dataset):
    """Group consecutive elements of ``dataset`` into batches.

    Args:
        batch_size: Number of elements per batch, forwarded to ``dataset.batch``.
        dataset: Object exposing a ``batch(batch_size)`` method
            (here, a ``tf.data.Dataset``).

    Returns:
        Whatever ``dataset.batch(batch_size)`` returns.
    """
    batched = dataset.batch(batch_size)
    return batched

#### Process Image Dataset

In [11]:
def preprocess_and_save(image_path, save_path, target_size=(64, 64)):
    """Decode a JPEG, scale it to [0, 1], resize it, and save it as a .npy file.

    Args:
        image_path: Path of the source JPEG.
        save_path: Destination path handed to ``np.save``.
        target_size: ``(height, width)`` passed to ``tf.image.resize``.

    Returns:
        None. Any exception is caught and reported with ``print`` so that one
        bad file does not abort a long preprocessing loop.
    """
    try:
        image_raw = tf.io.read_file(image_path)
        image = tf.image.decode_jpeg(image_raw, channels=3)
        # Normalize BEFORE resizing: convert_image_dtype only rescales when the
        # dtype actually changes (uint8 -> float32 maps 0-255 onto [0, 1]).
        # tf.image.resize already returns float32, so converting after it was a
        # no-op and the arrays were saved in the 0-255 range.
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image = tf.image.resize(image, target_size)

        # NOTE: .npy files written before this fix hold 0-255 values and
        # should be regenerated.
        np.save(save_path, image.numpy())  # Save as NumPy array
    except Exception as e:
        print(f"Error processing {image_path}: {e}")

def load_preprocessed_image(npy_path, label):
    """Read one cached ``.npy`` image back from disk.

    Calls ``.numpy()`` on ``npy_path``, so it must receive an eager tensor
    (for example when invoked through ``tf.py_function``).

    Args:
        npy_path: Eager tensor holding the path of the ``.npy`` file.
        label: Label paired with the image; returned untouched.

    Returns:
        ``(image, label)`` with ``image`` converted to a float32 tensor.
    """
    path_value = npy_path.numpy()  # unwrap the tensor into a value np.load can open
    pixels = np.load(path_value)
    image_tensor = tf.convert_to_tensor(pixels, dtype=tf.float32)
    return image_tensor, label

def load_preprocessed_wrapper(npy_path, label):
    """Graph-compatible wrapper around ``load_preprocessed_image`` for ``Dataset.map``.

    Args:
        npy_path: String tensor with the path of a cached ``.npy`` image.
        label: Integer label paired with the image.

    Returns:
        ``(image, label)``: ``image`` is float32 with static shape
        ``(64, 64, 3)``; ``label`` is int32 with the static shape of the input.
    """
    # The py_function below declares its second output as int32; casting up
    # front keeps that declaration true for labels that arrive with another
    # integer dtype (the cast is a no-op for int32 input).
    label = tf.cast(label, tf.int32)
    image, out_label = tf.py_function(load_preprocessed_image, [npy_path, label], [tf.float32, tf.int32])
    # py_function outputs carry no static shape information; restore it so
    # downstream batching and model code can rely on known shapes.
    image.set_shape((64, 64, 3))  # Set fixed shape
    out_label.set_shape(label.shape)
    return image, out_label

In [None]:
# Load metadata
# Paths are assembled with os.path.join so the cell runs on any OS; hard-coded
# "\" separators only resolve correctly on Windows.
DATA_ROOT = "isic-2024-challenge"
metadata = pd.read_csv(os.path.join(DATA_ROOT, "train-metadata.csv"), low_memory=False)
image_dir = os.path.join(DATA_ROOT, "train-image", "image")
processed_dir = os.path.join(DATA_ROOT, "processed_images")

# Directory for the optional .npy cache (preprocess once, reuse on later runs)
os.makedirs(processed_dir, exist_ok=True)

# Create a mapping from image filenames ("<isic_id>.jpg") to labels
labels = metadata[['isic_id', 'target']].copy()
labels['isic_id'] = labels['isic_id'] + '.jpg'  # vectorized string concat
image_to_label = dict(zip(labels['isic_id'], labels['target']))

# Create a list of image file paths and their corresponding labels
image_paths = [os.path.join(image_dir, img_id) for img_id in labels['isic_id']]
image_labels = [image_to_label[img_id] for img_id in labels['isic_id']]

# Pipeline selector:
#   False (default) -> decode and resize the JPEGs on the fly.
#   True            -> preprocess every image once into processed_dir as .npy,
#                      then stream the cached arrays instead.
USE_DISK_CACHE = False

if USE_DISK_CACHE:
    # Preprocess all images and save them
    for img_path in tqdm(image_paths):
        save_path = os.path.join(processed_dir, os.path.basename(img_path).replace(".jpg", ".npy"))
        if not os.path.exists(save_path):  # Skip if already processed
            preprocess_and_save(img_path, save_path)

    # Create lists of preprocessed file paths and labels
    processed_paths = [os.path.join(processed_dir, img_id.replace(".jpg", ".npy")) for img_id in labels['isic_id']]

    # Create a TensorFlow Dataset over the cached arrays
    dataset = tf.data.Dataset.from_tensor_slices((processed_paths, image_labels))
    # Parallelize image loading
    dataset = dataset.map(load_preprocessed_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
else:
    # Create a TensorFlow Dataset over the raw JPEG paths
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, image_labels))
    # Parallelize image loading and preprocessing
    # (tf.data.AUTOTUNE replaces the deprecated tf.data.experimental alias)
    dataset = dataset.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)

# Shuffle and repeat the dataset
# NOTE(review): a 1000-element buffer only shuffles locally relative to the
# file order in the metadata CSV - confirm this is sufficient.
dataset = dataset.shuffle(buffer_size=1000)
dataset = dataset.repeat()

# Batch the dataset
batch_size = 1000
num_batches = 10
dataset = batch_dataset(batch_size, dataset)
# take() bounds the otherwise infinite repeated stream to num_batches batches
dataset = dataset.take(num_batches)

In [13]:
# Sanity check: pull a single batch and report the tensor shapes it carries
# NOTE(review): the loop variable `labels` rebinds the name that held the
# metadata DataFrame in the dataset-building cell - confirm nothing later
# expects the DataFrame under that name.
for images, labels in dataset.take(1):
    print(f"Images shape: {images.shape}")
    print(f"Labels shape: {labels.shape}")
    

Images shape: (1000, 100, 100, 3)
Labels shape: (1000,)
