# GenImages Sample Generator
Note: This notebook is designed to be run from a machine where you have access to the *full* GenImages dataset.

In [1]:
#Imports
import os
import random
import shutil
from PIL import Image
import py7zr

## Run Configuration

These values should be set per-run.
* **runtype** - determines the type of samples produced
    * '' - Normal, does not do anything to the images
    * 'Cropped' - Creates randomly cropped images
    * 'Scaled' - Creates randomly scaled images
* **sample_size** - The number of samples per model for each AI and Real
    * Note this will go through a 70/15/15 train/validation/test split, which can be configured later.
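    * So if you set this value to '3000' with all 8 models you will get 24,000 AI and 24,000 Real images, i.e. 16,800 train / 3,600 validation / 3,600 test images for each of AI and Real.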
* **models** - a list of models to include.
    * Select which you want from: 'adm','big_gan','glide','midjourney','sd4','sd5','vqdm','wukong'
* **scale_type** - The type of scaling used.
    * Select From: BICUBIC, BILINEAR, LANCZOS
    * Future support (maybe) for RANDOM - doesn't currently work
* **cleanup** - When true, removes all non-zipped files after zipping

In [21]:
# Per-run settings -- review these before every run
runtype = "Scaled"       # '' (leave images as-is), 'Cropped' or 'Scaled'
sample_size = 3000       # images sampled per model, for each of AI and Real
models = [
    "adm",
    "big_gan",
    "glide",
    "midjourney",
    "sd5",
    "sd4",
    "vqdm",
    "wukong",
]
scale_type = "LANCZOS"   # scaling filter name: BICUBIC, BILINEAR or LANCZOS
cleanup = True           # when True, the unzipped sample files are deleted after zipping


## Local Setup 
This is where you put your file locations - it should only need to be set up once.

In [3]:
# Machine-specific file locations (edit once per machine)

# Folder that receives the 7z archives
zip_out = "C:\\DL_Temp"

# One working folder per split
out_train = "C:\\DL_Temp\\train"
out_val = "C:\\DL_Temp\\validation"
out_test = "C:\\DL_Temp\\test"

# Each split folder gets an "ai" and a "real" sub-folder
out_train_ai, out_val_ai, out_test_ai = (
    split_dir + "\\ai" for split_dir in (out_train, out_val, out_test)
)
out_train_real, out_val_real, out_test_real = (
    split_dir + "\\real" for split_dir in (out_train, out_val, out_test)
)

# "train" folder of every model's dataset; AI images are read from its "ai"
# sub-folder and real images from its "nature" sub-folder.
_model_train_roots = {
    "adm": "D:\\ADM\\imagenet_ai_0508_adm\\train",
    "big_gan": "D:\\BigGAN\\imagenet_ai_0419_biggan\\train",
    "glide": "D:\\Glide\\imagenet_glide\\train",
    "midjourney": "D:\\midjourney\\imagenet_midjourney\\train",
    "sd5": "D:\\Stable_Diffusionv5\\imagenet_ai_0424_sdv5\\train",
    "sd4": "D:\\Stable_Diffusionv4\\imagenet_ai_0419_sdv4\\train",
    "vqdm": "D:\\VQDM\\imagenet_ai_0419_vqdm\\train",
    "wukong": "D:\\WuKong\\imagenet_ai_0424_wukong\\train",
}

# Keys are "<model>_ai" / "<model>_real"; all AI entries come first, then all Real entries
file_loc = {
    model + "_" + label: root + "\\" + subdir
    for label, subdir in (("ai", "ai"), ("real", "nature"))
    for model, root in _model_train_roots.items()
}



## Useful Function Library 

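* create_image_sample_list(source_dir, sample_size, *...others*) -- creates three text files (`<prefix>_train.txt`, `<prefix>_val.txt`, `<prefix>_test.txt`) listing a random train/validation/test split of the images; the split defaults to 70/15/15 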
* copy_images_from_list(list_file_path, target_dir) -- copies the files named in a list file into a target folder, where the sample can be picked up and zipped
* delete_all_files(folder) - Deletes all files in a folder - leaves folder
* delete_txt_files(folder) - Deletes all text files in a folder
* delete_folder(folder) - Deletes everything, including the folder itself
* convert_png_to_jpg(folder, subfolder='original_pngs', quality=95) - Converts a folder of pngs to jpgs (used for AI images) and moves the original pngs into the subfolder
* random_crops (input_folder, archive_folder='archive', crops_per_image=1, min_scale=.5, max_scale=.9, seed=42) - Creates a number of randomly cropped images from each image in a folder
* random_scale (input_folder, archive_folder='archive',rescale='LANCZOS', min_scale=.5, max_scale=1.5, seed=42) - creates one randomly scaled jpg from each image in a folder

In [4]:
# Create Sample Lists
def create_image_sample_list(
    source_dir,
    sample_size,
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
    output_prefix="dataset",
    extensions={".jpg", ".jpeg", ".png", ".webp"},
    seed=42
):
    """Sample images from a folder tree and write train/val/test list files.

    Every file under ``source_dir`` (searched recursively) whose extension,
    compared case-insensitively, is in ``extensions`` is a candidate. Up to
    ``sample_size`` candidates are drawn without replacement, shuffled, and
    split into three lists which are written, one path per line, to
    ``{output_prefix}_train.txt``, ``{output_prefix}_val.txt`` and
    ``{output_prefix}_test.txt``.

    Parameters:
        source_dir: root folder to search for images.
        sample_size: number of images to draw; capped at the number found.
        train_ratio: fraction of the sample written to the train list.
        val_ratio: fraction of the sample written to the validation list.
        test_ratio: not used in the computation; the test list always
            receives whatever remains after train and validation.
        output_prefix: prefix (may include a folder) of the three list files.
        extensions: file extensions, including the dot, treated as images.
        seed: seed for the sampling and shuffling.

    Raises:
        ValueError: if a ratio is negative, if ``train_ratio + val_ratio``
            exceeds 1, or if no image file is found under ``source_dir``.
    """
    # Reject splits that would silently leave the validation/test lists short or empty
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            f"Invalid split ratios: train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    wanted_extensions = {ext.lower() for ext in extensions}

    all_images = []
    for root, dirs, files in os.walk(source_dir):
        for f in files:
            if os.path.splitext(f.lower())[1] in wanted_extensions:
                all_images.append(os.path.join(root, f))

    if not all_images:
        raise ValueError(f"No image files in: {source_dir}")

    # Directory enumeration order is not guaranteed; sorting makes the seeded
    # sample depend only on the set of files and the seed.
    all_images.sort()

    ## Make the sample
    n_available = len(all_images)
    actual_sample_size = min(sample_size, n_available)

    # A private generator keeps the result reproducible without reseeding
    # the module-level random state.
    rng = random.Random(seed)
    sampled_images = rng.sample(all_images, actual_sample_size)

    #  Split sampled images into train / val / test
    rng.shuffle(sampled_images)  # extra shuffle for good measure

    n_total = len(sampled_images)
    n_train = int(n_total * train_ratio)
    n_val   = int(n_total * val_ratio)

    train_list = sampled_images[:n_train]
    val_list   = sampled_images[n_train:n_train + n_val]
    test_list  = sampled_images[n_train + n_val:]  # remainder, so every sampled image is used

    #  Write helper
    def write_list(paths, filepath):
        with open(filepath, "w") as f:
            for p in paths:
                f.write(p + "\n")

    # Output filenames
    train_file = f"{output_prefix}_train.txt"
    val_file   = f"{output_prefix}_val.txt"
    test_file  = f"{output_prefix}_test.txt"

    #   Write files
    write_list(train_list, train_file)
    write_list(val_list, val_file)
    write_list(test_list, test_file)

    print(f"Found {n_available} images in total.")
    print(f"Sampled {n_total} images (requested {sample_size}).")
    print(f"Train: {len(train_list)} → {train_file}")
    print(f"Val:   {len(val_list)} → {val_file}")
    print(f"Test:  {len(test_list)} → {test_file}")

 


In [5]:
# Get Files From List
def copy_images_from_list(list_file_path, target_dir):
    """Copy every file named in a list file into ``target_dir``.

    The list file holds one path per line; blank lines are ignored. All files
    are copied flat into ``target_dir`` (created if needed), so two listed
    files with the same base name end up as one file, the later copy
    replacing the earlier one.

    Parameters:
        list_file_path: text file with one source path per line.
        target_dir: folder that receives the copies.

    Prints one line for each listed path that is not a file, then a summary
    of how many files were actually copied, how many were missing, and how
    many replaced an already existing file of the same name.
    """
    os.makedirs(target_dir, exist_ok=True)

    with open(list_file_path, "r") as f:
        image_paths = [line.strip() for line in f if line.strip()]

    copied = 0
    missing = 0
    replaced = 0
    for img_path in image_paths:
        if not os.path.isfile(img_path):
            print(f"File not found -> {img_path}")
            missing += 1
            continue

        # Detect name clashes before copying so silent overwrites are reported
        if os.path.exists(os.path.join(target_dir, os.path.basename(img_path))):
            replaced += 1

        shutil.copy(img_path, target_dir)
        copied += 1

    # Report what really happened rather than the length of the list
    print(f"Copied {copied} files into {target_dir}.")
    if missing:
        print(f"Skipped {missing} listed files that were not found.")
    if replaced:
        print(f"Warning: {replaced} files replaced an existing file with the same name in {target_dir}.")

In [6]:
# File Management Helpers
def delete_all_files(folder):
    """Remove every file below ``folder`` (recursively), keeping the folders.

    A file that cannot be removed is reported on stdout and skipped.
    """
    for current_dir, _subdirs, filenames in os.walk(folder):
        for name in filenames:
            target = os.path.join(current_dir, name)
            try:
                os.remove(target)
            except Exception as err:
                print(f"Error deleting {name}: {err}")

def delete_txt_files(folder):
    """Delete the ``.txt`` files located directly in ``folder``.

    The suffix check is case-insensitive; sub-folders are not searched and
    are never removed, even if their name ends in ``.txt``.
    """
    for entry in os.listdir(folder):
        candidate = os.path.join(folder, entry)
        if not os.path.isfile(candidate):
            continue
        if entry.lower().endswith(".txt"):
            os.remove(candidate)
    print(f"Deleted all text files in: {folder}")
            
def delete_folder(folder):
    """Delete ``folder`` and everything in it; report on stdout if it does not exist."""
    if not os.path.exists(folder):
        print(f"Folder not found: {folder}")
        return

    shutil.rmtree(folder)
    print(f"Deleted folder: {folder}")

In [7]:
# Convert png to jpg
def convert_png_to_jpg(folder, subfolder="original_pngs", quality=95):
    """Convert every PNG file directly inside ``folder`` to a JPEG next to it.

    Each ``name.png`` is saved as ``name.jpg`` in RGB mode, and the original
    PNG is then moved into ``folder/subfolder`` (created if needed).

    Parameters:
        folder: folder whose PNG files are converted (not recursive).
        subfolder: name of the sub-folder that receives the original PNGs.
        quality: JPEG quality passed to the image save call.

    Prints the number of converted images.
    """
    # Create subfolder for PNG backups
    move_path = os.path.join(folder, subfolder)
    os.makedirs(move_path, exist_ok=True)

    counter = 0
    for f in os.listdir(folder):
        png_path = os.path.join(folder, f)
        # Only real files count: a folder whose name ends in .png is left alone
        if not f.lower().endswith(".png") or not os.path.isfile(png_path):
            continue

        jpg_name = f.rsplit(".", 1)[0] + ".jpg"
        jpg_path = os.path.join(folder, jpg_name)

        # Convert PNG → JPG; the context manager releases the file handle
        # before the source file is moved below.
        with Image.open(png_path) as img:
            img.convert("RGB").save(jpg_path, "JPEG", quality=quality)

        # Move original PNG to the subfolder
        shutil.move(png_path, os.path.join(move_path, f))
        counter += 1
    print("Converted ", counter, " images")

In [8]:
# Function for randomly cropping images 
def random_crops(
    input_folder,
    archive_folder = "archive", # naming to not overlap with convert_png_to_jpg
    crops_per_image=1,
    min_scale=0.5,
    max_scale=0.9,
    seed=42
):
    """Replace each image in ``input_folder`` with randomly cropped JPEG copies.

    For every image file directly inside ``input_folder``, ``crops_per_image``
    crops are saved as ``<name>_crop<i>.jpg`` in the same folder. Width and
    height of each crop are chosen independently as a random fraction
    (between ``min_scale`` and ``max_scale``) of the image size, and the crop
    is placed at a random position inside the image. The source image is then
    moved to ``input_folder/archive_folder``.

    Images in a mode Pillow's JPEG writer does not accept (for example RGBA
    or P) are converted to RGB first instead of failing at save time.

    An image that cannot be processed is reported and deleted from
    ``input_folder``.

    Parameters:
        input_folder: folder holding the images to crop (not recursive).
        archive_folder: sub-folder name that receives the processed originals.
        crops_per_image: number of crops written per image.
        min_scale: smallest crop side as a fraction of the image side.
        max_scale: largest crop side as a fraction of the image side.
        seed: value passed to ``random.seed`` before processing.
    """
    archive_path = os.path.join(input_folder, archive_folder)
    os.makedirs(archive_path, exist_ok=True)

    random.seed(seed)
    img_counter = 0

    # Image modes that can be written as JPEG without conversion
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    print(f"Starting {input_folder}.")

    # Snapshot the listing first so crops written below are not picked up
    files = [f for f in os.listdir(input_folder)
             if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
             and f != archive_folder]

    for filename in files:
        img_path = os.path.join(input_folder, filename)

        try:
            # The context manager closes the file before it is moved below
            with Image.open(img_path) as source:
                w, h = source.size
                img = source if source.mode in jpeg_modes else source.convert("RGB")

                for i in range(crops_per_image):
                    # Choose a random crop size between min_scale and max_scale for each direction
                    scale_w = random.uniform(min_scale, max_scale)
                    scale_h = random.uniform(min_scale, max_scale)
                    # Keep the crop at least 1 pixel and never larger than the image
                    crop_size_w = min(w, max(1, int(w * scale_w)))
                    crop_size_h = min(h, max(1, int(h * scale_h)))

                    # Max for Upper Left Corner
                    max_x = w - crop_size_w
                    max_y = h - crop_size_h

                    x1 = random.randint(0, max_x)
                    y1 = random.randint(0, max_y)
                    x2 = x1 + crop_size_w
                    y2 = y1 + crop_size_h

                    cropped = img.crop((x1, y1, x2, y2))

                    # Save output
                    base = os.path.splitext(filename)[0]
                    out_name = f"{base}_crop{i+1}.jpg"
                    out_path = os.path.join(input_folder, out_name)

                    cropped.save(out_path, "JPEG", quality=95)

            img_counter = img_counter + 1
            shutil.move(img_path, os.path.join(archive_path, filename))
        except Exception as e:
            print(f"Error reading {filename} → deleting it. Reason: {e}")
            try:
                os.remove(img_path)
            except Exception:
                print(f"Could not delete {img_path}, skipping.")
            continue  # move to the next file

    print(f"Processed {img_counter} images in {input_folder}, created {crops_per_image} samples each.")


In [9]:
#Function for randomly scaling images
# NOTE: the next cell defines random_scale again; after "Run All" that later
# definition is the one in effect.
def random_scale(
    input_folder,
    archive_folder="archive",
    rescale='LANCZOS', # BICUBIC, BILINEAR, LANCZOS, RANDOM
    min_scale=0.5,
    max_scale=1.5,
    seed=42
):
    """Replace each image in ``input_folder`` with one randomly scaled JPEG.

    Every image file directly inside ``input_folder`` is resized by a single
    random factor between ``min_scale`` and ``max_scale`` (applied to both
    width and height, each at least 1 pixel), the original is moved to
    ``input_folder/archive_folder``, and the result is saved as
    ``<name>_resize.jpg`` in ``input_folder``.

    Images in a mode Pillow's JPEG writer does not accept (for example RGBA
    or P) are converted to RGB before resizing instead of failing at save.

    An image that cannot be read or archived is reported and deleted from
    ``input_folder``.

    Parameters:
        input_folder: folder holding the images to scale (not recursive).
        archive_folder: sub-folder name that receives the processed originals.
        rescale: 'BICUBIC' or 'BILINEAR' select that filter; any other value
            (including 'LANCZOS' and 'RANDOM') uses LANCZOS.
        min_scale: smallest scale factor.
        max_scale: largest scale factor.
        seed: value passed to ``random.seed`` before processing.
    """
    archive_path = os.path.join(input_folder, archive_folder)
    os.makedirs(archive_path, exist_ok=True)

    random.seed(seed)
    img_counter = 0

    # Image modes that can be written as JPEG without conversion
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    filter_name = rescale if rescale in ('BICUBIC', 'BILINEAR') else 'LANCZOS'

    print(f"Processing folder: {input_folder}")

    # Snapshot of files so we don't pick up newly written files
    files = [
        f for f in os.listdir(input_folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
        and f != archive_folder
    ]

    for filename in files:
        img_path = os.path.join(input_folder, filename)

        # Try to open the image
        try:
            # The context manager closes the file before it is moved below
            with Image.open(img_path) as source:
                w, h = source.size

                # Pick a random scale size
                scale = random.uniform(min_scale, max_scale)
                new_w = max(1, int(w * scale))
                new_h = max(1, int(h * scale))

                img = source if source.mode in jpeg_modes else source.convert("RGB")

                # Resize with a filter
                resized = img.resize((new_w, new_h), getattr(Image, filter_name))

            archived_file_path = os.path.join(archive_path, filename)

            try:
                shutil.move(img_path, archived_file_path)
            except Exception as e:
                print(f"Could not archive {filename}, deleting instead. Reason: {e}")
                try:
                    os.remove(img_path)
                except Exception:
                    print(f"Could not delete {img_path}, skipping.")
                continue

            # Save the scaled image next to where the original was
            base = os.path.splitext(filename)[0]
            out_name = f"{base}_resize.jpg"
            out_path = os.path.join(input_folder, out_name)

            resized.save(out_path, "JPEG", quality=95)

            img_counter += 1

        except Exception as e:
            print(f"Error reading {filename} → deleting it. Reason: {e}")
            try:
                os.remove(img_path)
            except Exception:
                print(f"Could not delete {img_path}, skipping.")
            continue  # move to the next file

    # The summary must only use names that exist in this function
    print(f"Processed {img_counter} images in {input_folder}.")


In [10]:
#Function for randomly scaling images
# NOTE: this cell redefines random_scale from the previous cell; after
# "Run All" this definition is the one in effect.
def random_scale(
    input_folder,
    archive_folder="archive",
    rescale='LANCZOS', # BICUBIC, BILINEAR, LANCZOS, RANDOM
    min_scale=0.5,
    max_scale=1.5,
    seed=42
):
    """Replace each image in ``input_folder`` with one randomly scaled JPEG.

    Every image file directly inside ``input_folder`` is resized by a single
    random factor between ``min_scale`` and ``max_scale`` (applied to both
    width and height, each at least 1 pixel), the original is moved to
    ``input_folder/archive_folder``, and the result is saved as
    ``<name>_resize.jpg`` in ``input_folder``.

    Images in a mode Pillow's JPEG writer does not accept (for example RGBA
    or P) are converted to RGB before resizing; otherwise the original would
    already be archived when the save fails and the sample would be lost.

    An image that cannot be read or archived is reported and deleted from
    ``input_folder``.

    Parameters:
        input_folder: folder holding the images to scale (not recursive).
        archive_folder: sub-folder name that receives the processed originals.
        rescale: 'BICUBIC' or 'BILINEAR' select that filter; any other value
            (including 'LANCZOS' and 'RANDOM') uses LANCZOS.
        min_scale: smallest scale factor.
        max_scale: largest scale factor.
        seed: value passed to ``random.seed`` before processing.
    """
    archive_path = os.path.join(input_folder, archive_folder)
    os.makedirs(archive_path, exist_ok=True)

    random.seed(seed)
    img_counter = 0

    # Image modes that can be written as JPEG without conversion
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    filter_name = rescale if rescale in ('BICUBIC', 'BILINEAR') else 'LANCZOS'

    print(f"Processing folder: {input_folder}")

    # Snapshot of files so we don't pick up newly written files
    files = [
        f for f in os.listdir(input_folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
        and f != archive_folder
    ]

    for filename in files:
        img_path = os.path.join(input_folder, filename)

        # Try to open the image
        try:
            # The context manager closes the file before it is moved below
            with Image.open(img_path) as source:
                w, h = source.size

                # Pick a random scale size
                scale = random.uniform(min_scale, max_scale)
                new_w = max(1, int(w * scale))
                new_h = max(1, int(h * scale))

                img = source if source.mode in jpeg_modes else source.convert("RGB")

                # Resize with a filter
                resized = img.resize((new_w, new_h), getattr(Image, filter_name))

            archived_file_path = os.path.join(archive_path, filename)

            try:
                shutil.move(img_path, archived_file_path)
            except Exception as e:
                print(f"Could not archive {filename}, deleting instead. Reason: {e}")
                try:
                    os.remove(img_path)
                except Exception:
                    print(f"Could not delete {img_path}, skipping.")
                continue

            # Save the scaled image next to where the original was
            base = os.path.splitext(filename)[0]
            out_name = f"{base}_resize.jpg"
            out_path = os.path.join(input_folder, out_name)

            resized.save(out_path, "JPEG", quality=95)

            img_counter += 1

        except Exception as e:
            print(f"Error reading {filename} → deleting it. Reason: {e}")
            try:
                os.remove(img_path)
            except Exception:
                print(f"Could not delete {img_path}, skipping.")
            continue  # move to the next file

    print(f"Processed {img_counter} images in {input_folder}.")


## Generate Samples

In [11]:
# Clean-up to run before generating a new sample

# Empty every split folder (the folders themselves are kept)
for _stale_dir in (
    out_train_ai, out_test_ai, out_val_ai,
    out_train_real, out_test_real, out_val_real,
):
    delete_all_files(_stale_dir)

# Removes every .txt file in the current working directory
delete_txt_files(".")

Deleted all text files in: .


In [12]:
# Make Sample File List
# For each model, write the train/val/test lists for its AI images, then for its Real images
for m in models:
    print("---- Starting " + m + " ----")
    for kind in ("ai", "real"):
        source_key = m + "_" + kind
        create_image_sample_list(
            source_dir=file_loc[source_key],
            sample_size=sample_size,
            output_prefix=source_key,
        )

---- Starting adm ----
Found 162000 images in total.
Sampled 3000 images (requested 3000).
Train: 2100 → adm_ai_train.txt
Val:   450 → adm_ai_val.txt
Test:  450 → adm_ai_test.txt
Found 157453 images in total.
Sampled 3000 images (requested 3000).
Train: 2100 → adm_real_train.txt
Val:   450 → adm_real_val.txt
Test:  450 → adm_real_test.txt
---- Starting big_gan ----
Found 162000 images in total.
Sampled 3000 images (requested 3000).
Train: 2100 → big_gan_ai_train.txt
Val:   450 → big_gan_ai_val.txt
Test:  450 → big_gan_ai_test.txt
Found 162000 images in total.
Sampled 3000 images (requested 3000).
Train: 2100 → big_gan_real_train.txt
Val:   450 → big_gan_real_val.txt
Test:  450 → big_gan_real_test.txt
---- Starting glide ----
Found 162000 images in total.
Sampled 3000 images (requested 3000).
Train: 2100 → glide_ai_train.txt
Val:   450 → glide_ai_val.txt
Test:  450 → glide_ai_test.txt
Found 162000 images in total.
Sampled 3000 images (requested 3000).
Train: 2100 → glide_real_train.txt


In [13]:
# Copy Files
# (list-file suffix, AI target folder, Real target folder), in processing order
_split_targets = (
    ("train", out_train_ai, out_train_real),
    ("test", out_test_ai, out_test_real),
    ("val", out_val_ai, out_val_real),
)

for m in models:
    print("---- Starting " + m + " ----")
    for split, ai_dir, real_dir in _split_targets:
        copy_images_from_list(m + "_ai_" + split + ".txt", ai_dir)
        copy_images_from_list(m + "_real_" + split + ".txt", real_dir)

---- Starting adm ----
Copied 2100 files into C:\DL_Temp\train\ai.
Copied 2100 files into C:\DL_Temp\train\real.
Copied 450 files into C:\DL_Temp\test\ai.
Copied 450 files into C:\DL_Temp\test\real.
Copied 450 files into C:\DL_Temp\validation\ai.
Copied 450 files into C:\DL_Temp\validation\real.
---- Starting big_gan ----
Copied 2100 files into C:\DL_Temp\train\ai.
Copied 2100 files into C:\DL_Temp\train\real.
Copied 450 files into C:\DL_Temp\test\ai.
Copied 450 files into C:\DL_Temp\test\real.
Copied 450 files into C:\DL_Temp\validation\ai.
Copied 450 files into C:\DL_Temp\validation\real.
---- Starting glide ----
Copied 2100 files into C:\DL_Temp\train\ai.
Copied 2100 files into C:\DL_Temp\train\real.
Copied 450 files into C:\DL_Temp\test\ai.
Copied 450 files into C:\DL_Temp\test\real.
Copied 450 files into C:\DL_Temp\validation\ai.
Copied 450 files into C:\DL_Temp\validation\real.
---- Starting midjourney ----
Copied 2100 files into C:\DL_Temp\train\ai.
Copied 2100 files into C:\DL_

In [14]:
# Convert PNG to JPG
# Only the AI folders are converted; the original PNGs go to an "original" sub-folder
for ai_dir in (out_test_ai, out_val_ai, out_train_ai):
    convert_png_to_jpg(ai_dir, "original")

Converted  3600  images
Converted  3600  images
Converted  16800  images


In [15]:
# Delete Original PNGs
# Drops the "original" sub-folder inside each AI folder
for ai_dir in (out_test_ai, out_val_ai, out_train_ai):
    delete_folder(ai_dir + "\\original")

Deleted folder: C:\DL_Temp\test\ai\original
Deleted folder: C:\DL_Temp\validation\ai\original
Deleted folder: C:\DL_Temp\train\ai\original


## Perform Cropping/Resizing etc. 

In [16]:
# If Cropping - Crop Images
if runtype == 'Cropped':
    # Optional step: create the random crops, split by split
    for split_dir in (
        out_test_ai, out_test_real,
        out_val_ai, out_val_real,
        out_train_ai, out_train_real,
    ):
        random_crops(split_dir)

In [17]:
# If Cropping - Delete Archives
if runtype == 'Cropped':
    # The archived originals must not end up in the zip files
    for split_dir in (
        out_test_ai, out_val_ai, out_train_ai,
        out_test_real, out_val_real, out_train_real,
    ):
        delete_folder(split_dir + "\\archive")

In [18]:
# If Scaling - Scale Images
if runtype == 'Scaled':
    # Pass the configured filter through: without it the `scale_type` value
    # from the run configuration is ignored and the function default is used.
    for split_dir in (
        out_test_ai, out_test_real,
        out_val_ai, out_val_real,
        out_train_ai, out_train_real,
    ):
        random_scale(split_dir, rescale=scale_type)

Processing folder: C:\DL_Temp\test\ai
Processed 3600 images in C:\DL_Temp\test\ai.
Processing folder: C:\DL_Temp\test\real
Error reading n03447721_40894.JPEG → deleting it. Reason: cannot identify image file 'C:\\DL_Temp\\test\\real\\n03447721_40894.JPEG'
Error reading n03450230_1303.JPEG → deleting it. Reason: cannot identify image file 'C:\\DL_Temp\\test\\real\\n03450230_1303.JPEG'
Error reading n03452741_20019.JPEG → deleting it. Reason: cannot identify image file 'C:\\DL_Temp\\test\\real\\n03452741_20019.JPEG'
Error reading n03457902_14560.JPEG → deleting it. Reason: cannot identify image file 'C:\\DL_Temp\\test\\real\\n03457902_14560.JPEG'
Error reading n03476684_9581.JPEG → deleting it. Reason: cannot identify image file 'C:\\DL_Temp\\test\\real\\n03476684_9581.JPEG'
Error reading n03482405_21043.JPEG → deleting it. Reason: cannot identify image file 'C:\\DL_Temp\\test\\real\\n03482405_21043.JPEG'
Error reading n03483316_23597.JPEG → deleting it. Reason: cannot identify image fil

In [19]:
# If Scaled - Delete Archives
if runtype == 'Scaled':
    # The archived originals must not end up in the zip files
    for split_dir in (
        out_test_ai, out_val_ai, out_train_ai,
        out_test_real, out_val_real, out_train_real,
    ):
        delete_folder(split_dir + "\\archive")

Deleted folder: C:\DL_Temp\test\ai\archive
Deleted folder: C:\DL_Temp\validation\ai\archive
Deleted folder: C:\DL_Temp\train\ai\archive
Deleted folder: C:\DL_Temp\test\real\archive
Deleted folder: C:\DL_Temp\validation\real\archive
Deleted folder: C:\DL_Temp\train\real\archive


## Zipping into a nice package
Remember to save these before you run next time!

In [20]:
# Zip Up Results 

# (archive base name, label used in the progress message, AI folder, Real folder)
_archives = (
    ("test", "Test", out_test_ai, out_test_real),
    ("validation", "Validation", out_val_ai, out_val_real),
    ("train", "Train", out_train_ai, out_train_real),
)

for archive_name, label, ai_dir, real_dir in _archives:
    # One 7z archive per split, holding that split's AI and Real folders
    with py7zr.SevenZipFile(zip_out + '\\' + archive_name + '.7z', mode='w') as z:
        z.writeall(ai_dir)
        z.writeall(real_dir)
    print("Finished Zipping " + label + "!")

Finished Zipping Test!
Finished Zipping Validation!
Finished Zipping Train!


In [None]:
if cleanup:
    # Remove the unzipped sample files; the folders themselves are kept
    for split_dir in (
        out_train_ai, out_test_ai, out_val_ai,
        out_train_real, out_test_real, out_val_real,
    ):
        delete_all_files(split_dir)