Images are scraped from Danbooru with gallery-dl: https://github.com/mikf/gallery-dl

Data preprocessing:
- Resize all images to 400px wide
- Convert any JPEG (.jpg/.jpeg) to PNG, deleting the original JPEG

In [None]:
# Install gallery-dl (or upgrade it to the latest release) using the pip of
# whichever `python` is first on the shell PATH.
!python -m pip install -U gallery-dl

In [None]:
# Download the Danbooru posts matching the tag search "torino_aqua rating:general".
!gallery-dl "https://danbooru.donmai.us/posts?tags=torino_aqua+rating:general"

In [None]:
# Download the Danbooru posts matching the tag search "yukie_(kusaka_shi) rating:general".
!gallery-dl "https://danbooru.donmai.us/posts?tags=yukie_(kusaka_shi)+rating:general"

In [None]:
# Download the Danbooru posts matching the tag search "mery_(yangmalgage) rating:general".
!gallery-dl "https://danbooru.donmai.us/posts?tags=mery_(yangmalgage)+rating:general"

In [None]:
# Shrink all images to 400 px wide

import os
from PIL import Image

def resize_images_in_folder(folder_path, output_path, target_width=400):
    """Resize every image in ``folder_path`` to ``target_width`` pixels wide.

    The height is scaled by the same factor as the width so the aspect ratio
    is preserved. Each resized image is written to ``output_path`` under its
    original filename. A file that cannot be processed is reported on stdout
    and skipped; it does not abort the run.

    Args:
        folder_path: Directory containing the source images. Only regular
            files whose name ends in .png, .jpg, .jpeg, .bmp or .gif
            (case-insensitive) are processed; sub-directories are ignored.
        output_path: Directory that receives the resized images. It is
            created if it does not exist.
        target_width: Width, in pixels, of every output image.
    """
    # Create output folder
    os.makedirs(output_path, exist_ok=True)

    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    # Loop through all files in the folder
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        if not (os.path.isfile(filepath) and filename.lower().endswith(image_extensions)):
            continue

        try:
            with Image.open(filepath) as img:
                source_width, source_height = img.size

                # Scale the height by the same factor as the width. Clamp to
                # 1 px so an extremely wide image cannot truncate to height 0.
                width_ratio = target_width / source_width
                new_height = max(1, int(source_height * width_ratio))

                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                resized_img = img.resize((target_width, new_height), Image.LANCZOS)

                # Save into the directory passed as a parameter, not into a
                # module-level variable that happens to be defined by a caller.
                resized_img.save(os.path.join(output_path, filename))
                print(f"Resized: {filename}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"Failed to process {filename}: {e}")

# Each pair is (folder written by gallery-dl, dataset folder for the resized copies).
# The loop variables are module-level names, so `folder_path` and
# `output_folder` remain defined after this cell has run.
for folder_path, output_folder in (
    (r'gallery-dl\danbooru\mery_(yangmalgage) rating_general', r'dataset\mery'),
    (r'gallery-dl\danbooru\torino_aqua rating_general', r'dataset\torino'),
    (r'gallery-dl\danbooru\yukie_(kusaka_shi) rating_general', r'dataset\yukien'),
):
    resize_images_in_folder(folder_path, output_folder)

In [1]:
import os
from PIL import Image

def convert_and_delete_jpgs(folder_path, output_folder=None):
    """Convert every JPEG in ``folder_path`` to PNG, then delete the JPEG.

    Files whose name ends in .jpg or .jpeg (case-insensitive) are opened,
    converted to RGB and saved as ``<stem>.png``. The source file is removed
    only after the PNG has been written. A failure on one file is printed
    and the loop continues with the next file.

    Args:
        folder_path: Directory that is scanned for JPEG files.
        output_folder: Directory that receives the PNG files. Defaults to
            ``folder_path`` when None. It is created if it does not exist.
    """
    # Without an explicit destination the PNGs are written next to their sources.
    target_dir = folder_path if output_folder is None else output_folder
    os.makedirs(target_dir, exist_ok=True)

    jpeg_names = [
        name for name in os.listdir(folder_path)
        if name.lower().endswith(('.jpg', '.jpeg'))
    ]

    for name in jpeg_names:
        source = os.path.join(folder_path, name)
        stem, _ = os.path.splitext(name)
        png_name = stem + '.png'
        destination = os.path.join(target_dir, png_name)

        try:
            with Image.open(source) as jpeg:
                jpeg.convert('RGB').save(destination, 'PNG')
                print(f"Converted: {name} -> {png_name}")

            # Reached only when the PNG was written without raising.
            os.remove(source)
            print(f"Deleted original: {name}")

        except Exception as err:
            print(f"Failed to process {name}: {err}")

# Convert the JPEGs of each dataset folder in place.
for dataset_dir in (r'dataset\mery', r'dataset\torino', r'dataset\yukien'):
    convert_and_delete_jpgs(dataset_dir)


Converted: danbooru_1978743_4bacfbeb262201658c6c8620ddad8259.jpg -> danbooru_1978743_4bacfbeb262201658c6c8620ddad8259.png
Deleted original: danbooru_1978743_4bacfbeb262201658c6c8620ddad8259.jpg
Converted: danbooru_2309208_5d6447c7b81696ebc3c32b3060b29f00.jpg -> danbooru_2309208_5d6447c7b81696ebc3c32b3060b29f00.png
Deleted original: danbooru_2309208_5d6447c7b81696ebc3c32b3060b29f00.jpg
Converted: danbooru_2309215_d0c5956ddf9bd8b3643af10e24f7c034.jpg -> danbooru_2309215_d0c5956ddf9bd8b3643af10e24f7c034.png
Deleted original: danbooru_2309215_d0c5956ddf9bd8b3643af10e24f7c034.jpg
Converted: danbooru_2356475_1c4317e059297156702b0c82614c9d98.jpg -> danbooru_2356475_1c4317e059297156702b0c82614c9d98.png
Deleted original: danbooru_2356475_1c4317e059297156702b0c82614c9d98.jpg
Converted: danbooru_2356477_5d5524df60249b25dc3cefd2b9d052f1.jpg -> danbooru_2356477_5d5524df60249b25dc3cefd2b9d052f1.png
Deleted original: danbooru_2356477_5d5524df60249b25dc3cefd2b9d052f1.jpg
Converted: danbooru_2364746_8b