## [ISIC 2024 - Skin Cancer Detection with 3D-TBP](https://www.kaggle.com/competitions/isic-2024-challenge/overview)
##### Lingfeng (Ling) Su, Manal Amarsaikhan, Yuxin (Katy) Chen, Keelan Gan

In [46]:
import tensorflow as tf
import pandas as pd
import os

In [47]:
def load_and_preprocess_image(image_path, label, width=100, height=100):
    """Read a JPEG file and return it resized and scaled, paired with its label.

    Args:
        image_path: Path of the JPEG file, passed to ``tf.io.read_file``.
        label: Label belonging to the image; returned unchanged.
        width: Target width in pixels (defaults to 100).
        height: Target height in pixels (defaults to 100).

    Returns:
        A tuple ``(image, label)`` where ``image`` is the decoded 3-channel
        image resized to ``height`` x ``width`` and divided by 255.
    """
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)

    # tf.image.resize takes the size as [new_height, new_width]; passing
    # [width, height] only works by accident when the target is square.
    image = tf.image.resize(image, [height, width])
    # Scale 8-bit pixel values down by 255 so they land in [0, 1].
    image = image / 255.0

    return image, label

In [48]:
def batch_dataset(batch_size, dataset):
    """Return ``dataset`` batched via its ``batch`` method.

    Args:
        batch_size: Value forwarded to ``dataset.batch``.
        dataset: Object exposing a ``batch(batch_size)`` method.

    Returns:
        Whatever ``dataset.batch(batch_size)`` returns.
    """
    batched = dataset.batch(batch_size)
    return batched

#### Process Image Dataset

In [None]:
# Locations of the image folder and the metadata table
image_dir = r"\train-image\image"
metadata = pd.read_csv(r"\train-metadata.csv", low_memory=False)

# Keep only the id/target columns and turn each id into its JPEG filename
labels = metadata[['isic_id', 'target']].copy()
labels['isic_id'] = [isic_id + '.jpg' for isic_id in labels['isic_id']]
image_to_label = {
    filename: target
    for filename, target in zip(labels['isic_id'], labels['target'])
}

# Parallel lists: full path of every image and the label looked up by filename
image_paths = [os.path.join(image_dir, filename) for filename in labels['isic_id']]
image_labels = list(map(image_to_label.__getitem__, labels['isic_id']))

# Sizes used when batching below
batch_size = 1000
num_batches = 10

# Build the pipeline: (path, label) pairs -> decoded images (loaded in
# parallel) -> shuffled with a 1000-element buffer -> repeated indefinitely
dataset = (
    tf.data.Dataset.from_tensor_slices((image_paths, image_labels))
    .map(load_and_preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .repeat()
)

# Batch the stream and keep only the first `num_batches` batches
dataset = batch_dataset(batch_size, dataset).take(num_batches)

In [50]:
# Sanity check: pull one batch from the pipeline and report its tensor shapes.
# NOTE: the loop variable `labels` rebinds the module-level name `labels`.
for images, labels in dataset.take(1):
    image_dims, label_dims = images.shape, labels.shape
    print("Images shape:", image_dims)
    print("Labels shape:", label_dims)
    

Images shape: (1000, 100, 100, 3)
Labels shape: (1000,)


In [None]:
import cv2
import os
import numpy as np
from tqdm import tqdm

# Source folder, destination folder, and the size handed to cv2.resize.
input_folder = "isic-2024-challenge/train-image/image"
output_folder = "resized_images"
target_size = (64, 64)

# Upper bound on how many not-yet-converted files a single run handles.
# Not elegant, but there is not enough compute to process every image at once.
max_new_images = 200

os.makedirs(output_folder, exist_ok=True)

counter = 0

for img_name in tqdm(os.listdir(input_folder)):
    input_path = os.path.join(input_folder, img_name)
    output_path = os.path.join(output_folder, img_name)

    # Skip files whose resized copy already exists; these do not count
    # toward the per-run limit.
    if os.path.exists(output_path):
        continue

    img = cv2.imread(input_path)
    if img is not None:
        resized_img = cv2.resize(img, target_size)
        normalized_img = resized_img.astype(np.float32) / 255.0  # Normalize to [0,1]
        cv2.imwrite(output_path, resized_img)

        # Swap only the file extension. A plain str.replace(".jpg", ".npy")
        # would also rewrite ".jpg" occurring elsewhere in the path and would
        # leave other extensions (e.g. ".jpeg", ".JPG") untouched.
        npy_path = os.path.splitext(output_path)[0] + ".npy"
        np.save(npy_path, normalized_img)

    # Every file that reaches this point counts toward the limit, whether or
    # not it was written.
    counter += 1
    if counter >= max_new_images:
        break