# 1. Dataset Setup and Preprocessing

In [2]:
import pandas as pd
import glob
import os

# Locations of the metadata CSV and of the directory that holds the image folders
metadata_path = '/kaggle/input/data/Data_Entry_2017.csv'
images_root_path = '/kaggle/input/data'

# Read the metadata table
metadata = pd.read_csv(metadata_path)

# Collect every PNG stored under subfolders of the form images_*/images
all_image_paths = glob.glob(
    os.path.join(images_root_path, 'images_*/images/*.png')
)

# Index the full paths by bare filename so rows can be matched by name
image_paths_dict = dict(
    zip(map(os.path.basename, all_image_paths), all_image_paths)
)

# Look up each row's 'Image Index' filename to attach its full path
metadata['image_path'] = metadata['Image Index'].map(image_paths_dict)

# Keep only the rows whose image was actually found on disk
metadata = metadata[metadata['image_path'].notna()]

# Preview the filename -> path join
print(metadata[['Image Index', 'image_path']].head())

        Image Index                                         image_path
0  00000001_000.png  /kaggle/input/data/images_001/images/00000001_...
1  00000001_001.png  /kaggle/input/data/images_001/images/00000001_...
2  00000001_002.png  /kaggle/input/data/images_001/images/00000001_...
3  00000002_000.png  /kaggle/input/data/images_001/images/00000002_...
4  00000003_000.png  /kaggle/input/data/images_001/images/00000003_...


In [3]:
import tensorflow as tf

# Create a tf.data.Dataset from the image paths
def load_and_preprocess_image(image_path):
    """Load one PNG file and return it as a 224x224, 3-channel image tensor.

    The file at ``image_path`` is read, decoded as a PNG with three
    channels, resized to 224x224, and divided by 255.0.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)  # force 3 channels
    resized = tf.image.resize(decoded, [224, 224])  # CNN input size
    return resized / 255.0  # scale pixel values by 1/255

# Build the input pipeline: file paths -> preprocessed images -> batches of 32
image_paths = metadata['image_path'].values

dataset = (
    tf.data.Dataset.from_tensor_slices(image_paths)
    # Images are loaded and preprocessed in parallel
    .map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    # Prefetch so upcoming batches are prepared ahead of consumption
    .prefetch(tf.data.AUTOTUNE)
)

# Sanity check: pull a single batch and show its shape
for batch in dataset.take(1):
    print(batch.shape)  # Should print (32, 224, 224, 3)

(32, 224, 224, 3)


# 2. Handling Labels

In [4]:
# Example: Convert labels from the metadata (assuming binary labels)
def get_labels(metadata):
    """Return a binary label for every row of ``metadata``.

    A row whose 'Finding Labels' value is exactly 'No Finding' is labelled 0;
    any other value (a single finding name, or several joined together such
    as 'Effusion|Infiltration') is labelled 1.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must contain a 'Finding Labels' column.

    Returns
    -------
    numpy.ndarray
        1-D int64 array of 0/1 labels, one per row, in row order.
    """
    # A dict lookup via Series.map yields NaN for every value that is not a
    # key of the dict, so mapping {'No Finding': 0, 'Disease': 1} left all
    # rows with an actual finding name unlabelled. Comparing against
    # 'No Finding' labels every row.
    has_finding = metadata['Finding Labels'] != 'No Finding'
    labels = has_finding.astype('int64').values
    return labels

labels = get_labels(metadata)

# One label column per example: shape (num_examples, 1)
labels = labels.reshape(-1, 1)

# Build the label dataset and batch it with the same batch size (32) that the
# image dataset was batched with. Dataset.zip pairs datasets element by
# element, and the image dataset's elements are whole batches; leaving the
# labels unbatched would pair each batch of 32 images with a single label.
# NOTE: keep this 32 in sync with the .batch(32) applied to the image dataset.
label_dataset = tf.data.Dataset.from_tensor_slices(labels).batch(32)

# Zip the batched image dataset with the batched label dataset so each
# element is (image_batch, label_batch) with matching leading dimensions
dataset = tf.data.Dataset.zip((dataset, label_dataset))

# Now, the dataset will yield tuples of (image, label) batches