In [1]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

# Read the HAM10000 metadata CSV into a DataFrame
metadata_path = "Dataset/HAM10000_metadata.csv"
metadata = pd.read_csv(metadata_path)

# Distinct labels present in the 'dx' column; each one becomes a class folder
unique_diagnoses = metadata['dx'].unique()

# Make the output root, then one sub-folder per label.
# exist_ok=True keeps re-runs of this cell from failing on existing folders.
output_base_dir = "Classified_Images"
os.makedirs(output_base_dir, exist_ok=True)
for dx in unique_diagnoses:
    os.makedirs(
        os.path.join(output_base_dir, dx),
        exist_ok=True,
    )

# Function to copy images based on dx classification
def copy_images_to_classes(input_folders, metadata, output_base_dir):
    """Copy each image into the sub-folder named after its 'dx' label.

    Every entry of every folder in ``input_folders`` is looked up in
    ``metadata`` by its file name without extension (matched against the
    'image_id' column). Matched files are copied to
    ``<output_base_dir>/<dx>/<file name>``; unmatched names are collected.

    Args:
        input_folders: Iterable of directory paths to scan.
        metadata: DataFrame with at least 'image_id' and 'dx' columns.
        output_base_dir: Root directory that receives the class sub-folders.

    Returns:
        A tuple ``(image_count, not_found_images)``: a dict mapping each
        'dx' value in ``metadata`` to the number of files copied for it,
        and a list of file names that had no matching 'image_id'.
    """
    # Build the image_id -> dx table once instead of scanning the whole
    # DataFrame for every file. setdefault keeps the FIRST row for a
    # duplicated image_id, matching the previous `.values[0]` behaviour.
    dx_by_image_id = {}
    for known_id, known_dx in zip(metadata['image_id'], metadata['dx']):
        dx_by_image_id.setdefault(known_id, known_dx)

    # Seed the counters from the metadata that was actually passed in, so the
    # function does not depend on a module-level variable.
    image_count = {dx: 0 for dx in metadata['dx'].unique()}
    not_found_images = []
    prepared_folders = set()  # destination folders already ensured to exist

    # Process images in the input folders
    for input_folder in input_folders:
        for image_name in tqdm(os.listdir(input_folder)):
            image_id = os.path.splitext(image_name)[0]  # name without extension

            if image_id not in dx_by_image_id:
                not_found_images.append(image_name)
                continue

            dx_class = dx_by_image_id[image_id]
            dest_folder = os.path.join(output_base_dir, dx_class)
            if dest_folder not in prepared_folders:
                # Do not rely on the caller having created the class folder.
                os.makedirs(dest_folder, exist_ok=True)
                prepared_folders.add(dest_folder)

            # Copy the image to the corresponding folder
            shutil.copy(
                os.path.join(input_folder, image_name),
                os.path.join(dest_folder, image_name),
            )
            image_count[dx_class] += 1

    return image_count, not_found_images

# Folders to scan for images
input_folders = ["HAM_10000_Hair_Removed_1", "HAM_10000_Hair_Removed_2"]

# Sort the images into the class folders and keep the tallies for the report
image_count, not_found_images = copy_images_to_classes(
    input_folders,
    metadata,
    output_base_dir,
)

# Report per-class totals first
print("\nImage counts by class:")
for dx, count in image_count.items():
    print(f"{dx}: {count} images")

# Then report any file names that had no metadata entry
print(f"\nTotal images not found in metadata: {len(not_found_images)}")
if not_found_images:
    print("Images not found:", not_found_images)


100%|██████████| 5000/5000 [01:39<00:00, 50.43it/s]
100%|██████████| 5015/5015 [01:51<00:00, 44.87it/s]


Image counts by class:
bkl: 1099 images
nv: 6705 images
df: 115 images
mel: 1113 images
vasc: 142 images
bcc: 514 images
akiec: 327 images

Total images not found in metadata: 0



