In [1]:
# find_bad_images.py

import os
from PIL import Image

# --- Configuration ---
# Set this to the root directory of your dataset
dataset_root_dir =  r"C:\Users\tsili\Documents\Meme_cleaner\dataset"
# The name of the file where the list of bad files will be saved
output_file_path = "bad_files.txt"
# File extensions (lower-case, with leading dot) that are treated as images
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
# Exceptions that mark a file as unreadable. OSError covers truncated and
# unidentified files (IOError is only an alias of it); some Pillow decoders
# report malformed data as SyntaxError or ValueError instead, and letting
# those propagate would abort the whole scan on the first such file.
UNREADABLE_IMAGE_ERRORS = (OSError, SyntaxError, ValueError)
# ---


def find_bad_images(root_dir, extensions=IMAGE_EXTENSIONS):
    """Recursively scan ``root_dir`` and return the images that fail to decode.

    Args:
        root_dir: Directory to walk (all subdirectories are included).
        extensions: Iterable of lower-case filename suffixes to check; files
            with any other suffix are skipped.

    Returns:
        A list of file paths, in walk order, for which opening or fully
        decoding the image raised one of ``UNREADABLE_IMAGE_ERRORS``. Each
        hit is also printed as it is found.
    """
    suffixes = tuple(extensions)  # str.endswith only accepts a str or a tuple
    unreadable = []
    # os.walk will go through all subdirectories recursively
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.lower().endswith(suffixes):
                continue
            filepath = os.path.join(dirpath, filename)
            try:
                # The context manager closes the underlying file even when
                # decoding fails, so no handle is left open on a bad file
                # (an open handle can block its deletion on Windows).
                with Image.open(filepath) as img:
                    # Force PIL to read the entire image to check for truncation
                    img.load()
            except UNREADABLE_IMAGE_ERRORS as e:
                print(f"Found bad file: {filepath}  |  Reason: {e}")
                unreadable.append(filepath)
    return unreadable


print(f"Scanning for corrupted images in: {dataset_root_dir}")
if not os.path.isdir(dataset_root_dir):
    # os.walk silently yields nothing for a missing root, which would
    # otherwise look exactly like a clean scan.
    print(f"Warning: '{dataset_root_dir}' is not an existing directory; nothing will be scanned.")

bad_files = find_bad_images(dataset_root_dir)

# Always rewrite the list, even when it is empty, so a stale list left over
# from an earlier run can never be picked up by the deletion step.
# (Default text encoding is kept so the reader of this file sees the same bytes.)
with open(output_file_path, "w") as f:
    f.writelines(f"{path}\n" for path in bad_files)

print("\n" + "="*50)
if bad_files:
    print(f"Scan complete. Found {len(bad_files)} corrupted files.")
    print(f"A list of these files has been saved to: {output_file_path}")
    print("You can now review this list and then run the 'delete_files_from_list.py' script.")
else:
    print("Scan complete. No corrupted images were found!")

Scanning for corrupted images in: C:\Users\tsili\Documents\Meme_cleaner\dataset
Found bad file: C:\Users\tsili\Documents\Meme_cleaner\dataset\meme\image_5119.png  |  Reason: image file is truncated

Scan complete. Found 1 corrupted files.
A list of these files has been saved to: bad_files.txt
You can now review this list and then run the 'delete_files_from_list.py' script.


In [4]:
# delete_files_from_list.py

import os

# --- Configuration ---
# This file should have been created by the find_bad_images.py script
input_file_path = "bad_files.txt"
# ---


def load_paths_to_delete(list_path):
    """Read ``list_path`` and return its non-blank lines as a list of paths.

    Leading/trailing whitespace (including the newline) is stripped from each
    line; lines that are empty after stripping are dropped.
    """
    with open(list_path, "r") as f:
        stripped_lines = (line.strip() for line in f)
        return [path for path in stripped_lines if path]


def delete_files(paths):
    """Delete every file in ``paths`` and return how many were removed.

    A missing file produces a warning and any other OSError is reported;
    neither stops the remaining deletions.
    """
    deleted_count = 0
    for filepath in paths:
        try:
            os.remove(filepath)
        except FileNotFoundError:
            print(f"Warning: File not found (may have been deleted already): {filepath}")
        except OSError as e:
            print(f"Error deleting {filepath}: {e}")
        else:
            print(f"Deleted: {filepath}")
            deleted_count += 1
    return deleted_count


# Plain branching is used instead of exit(): exit() is an interactive helper
# injected by the `site` module, and inside a Jupyter kernel it shuts the
# kernel down instead of just ending the cell.
if not os.path.exists(input_file_path):
    # Safety check: the file list must exist
    print(f"Error: The file '{input_file_path}' was not found.")
    print("Please run the 'find_bad_images.py' script first to generate it.")
    files_to_delete = []
else:
    files_to_delete = load_paths_to_delete(input_file_path)
    if not files_to_delete:
        print(f"The file '{input_file_path}' is empty. Nothing to do.")

if files_to_delete:
    # --- SAFETY CONFIRMATION STEP ---
    print("="*50)
    print(f"The following {len(files_to_delete)} files are scheduled for deletion:")
    for filepath in files_to_delete:
        print(f"  - {filepath}")

    print("="*50)
    # strip() so stray whitespace around the answer ("yes ") is still accepted
    user_input = input("\nAre you absolutely sure you want to PERMANENTLY delete these files? (yes/no): ").strip().lower()

    if user_input == 'yes':
        deleted_count = delete_files(files_to_delete)
        print(f"\nDeletion complete. {deleted_count} of {len(files_to_delete)} files were deleted.")
        # Clean up the list file after deletion
        # os.remove(input_file_path)
    else:
        print("Deletion canceled by user. No files were touched.")

The following 1 files are scheduled for deletion:
  - C:\Users\tsili\Documents\Meme_cleaner\dataset\meme\image_5119.png
Deleted: C:\Users\tsili\Documents\Meme_cleaner\dataset\meme\image_5119.png

Deletion complete. 1 of 1 files were deleted.
