# Pull the missing images listed in valid-small.csv from the large zip archive (archive.zip)

1. Viewing the Folder Names Inside a Zip File

To view the folder names and structure inside a zip file, you can use the zipfile module to list its contents.

Explanation

- `zip_ref.namelist()` returns a list of all file and folder paths inside the zip file.
- The `if item.endswith('/')` check ensures that only folders (paths ending with a slash) are included.
- Using `set()` removes duplicate folder names.

In [21]:
import pandas as pd
import zipfile

# Read the CSV file whose "Image" column lists the image file names to pull
valid_df = pd.read_csv("data/nih/valid-small.csv")

# Plain Python list of the image names from the "Image" column
missing_image_names = list(valid_df["Image"])

# Remove a specific image from the list. The membership guard keeps the cell
# re-runnable: list.remove() raises ValueError when the name is absent.
excluded_image = "00008760_004.png"
if excluded_image in missing_image_names:
    missing_image_names.remove(excluded_image)
print(f"Missing image names (first 3): {missing_image_names[:3]}")
print(f"{len(missing_image_names) = }")

# Path to the zip file
zip_file_path = 'nih_zip_file/archive.zip'

# A set gives O(1) membership tests, so the archive listing is scanned once
# instead of comparing every archive path against every wanted name.
wanted_names = set(missing_image_names)

# Open the zip file and get its contents
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_contents = zip_ref.namelist()
    print(f"\nZip contents (first 10): {zip_contents[:10]}")

    # Keep the full archive paths whose final component is one of the wanted
    # names, in archive order. Archive member names use "/" as the separator,
    # so the text after the last "/" is the file name. Comparing the exact
    # file name (rather than a substring test) avoids matching a path that
    # merely contains a wanted name.
    # NOTE(review): assumes the "Image" column holds bare file names with no
    # directory part -- confirm against the CSV.
    matched_image_paths = [
        path for path in zip_contents
        if path.rsplit("/", 1)[-1] in wanted_names
    ]

# Print the matched image paths
print(f"\nMatched image paths (first 10): {matched_image_paths[:10]}")
print(f"{len(matched_image_paths) = }")

Missing image names (first 3): ['00027623_007.png', '00028214_000.png', '00022764_014.png']
len(missing_image_names) = 108

Zip contents (first 10): ['ARXIV_V5_CHESTXRAY.pdf', 'BBox_List_2017.csv', 'Data_Entry_2017.csv', 'FAQ_CHESTXRAY.pdf', 'LOG_CHESTXRAY.pdf', 'README_CHESTXRAY.pdf', 'images_001/images/00000001_000.png', 'images_001/images/00000001_001.png', 'images_001/images/00000001_002.png', 'images_001/images/00000002_000.png']

Matched image paths (first 10): ['images_001/images/00001005_001.png', 'images_002/images/00001890_001.png', 'images_002/images/00002206_003.png', 'images_002/images/00002274_000.png', 'images_002/images/00002274_002.png', 'images_002/images/00002345_007.png', 'images_002/images/00003098_000.png', 'images_002/images/00003386_000.png', 'images_002/images/00003527_007.png', 'images_002/images/00003585_001.png']
len(matched_image_paths) = 108


In [22]:
import zipfile
import os

# Define the path to the zip file and the destination folder
zip_file_path = 'nih_zip_file/archive.zip'
output_folder = 'data/nih/images-small/'

# Make sure the destination folder exists: open(..., "wb") creates the file
# but not its parent directories. exist_ok=True makes this a no-op when the
# folder is already there.
os.makedirs(output_folder, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    for image_path in matched_image_paths:
        # Flatten the archive layout: keep only the file name so every image
        # lands directly in output_folder. Two archive members with the same
        # file name would overwrite each other here.
        destination_path = os.path.join(output_folder, os.path.basename(image_path))

        # Copy the member's bytes out of the archive into the destination file
        with zip_ref.open(image_path) as source, open(destination_path, "wb") as target:
            target.write(source.read())

print(f"Extracted {len(matched_image_paths)} images to {output_folder}.")

Extracted 108 images to data/nih/images-small/.
