<a href="https://colab.research.google.com/github/gwhitez/Lora-Trainer-XL/blob/main/Waifu_Diffusion_V3_Dataser_Maker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

| | GitHub | Trainer XL | Tagger WDV3 (kohya)  | Tagger WDV3 | Old Trainer XL |
| :--- | :--- | :--- | :--- | :--- | :---|
| 🏠 **Original Project** | [![GitHub](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/github.svg)](https://github.com/hollowstrawberry/kohya-colab) | | | | |
| **Modified By WhiteZ** | [![GitHub](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/github.svg)](https://github.com/gwhitez/Lora-Trainer-XL) | [![Open in Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge.svg)](https://colab.research.google.com/github/gwhitez/Lora-Trainer-XL/blob/main/Fix_Lora_Trainer_XL.ipynb) | [![Open in Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge.svg)](https://colab.research.google.com/github/gwhitez/Lora-Trainer-XL/blob/main/Dataset_Maker_By_WhiteZ.ipynb) | [![Open in Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge.svg)](https://colab.research.google.com/github/gwhitez/Lora-Trainer-XL/blob/main/Waifu_Diffusion_V3_Dataser_Maker.ipynb) | [![Open in Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge.svg)](https://colab.research.google.com/github/gwhitez/Lora-Trainer-XL/blob/main/Old_Fix_Lora_Trainer_XL.ipynb) |

In [None]:
# @title ## **1. Install dependencies** ✅
# Setup cell: optionally mounts Google Drive, clones the standalone WD14 tagger
# into /content/wd14_tagger and installs its requirements with uv.
import os
import shutil
from subprocess import getoutput
from google.colab import drive
from IPython.display import clear_output

# Reload the variables previously persisted with `%store`.
%store -r

# root_dir

# NOTE(review): the annotation below lacks the "@" of "@param", so it is
# presumably not rendered as a form field; the value is edited in code.
output_to_drive   = True  # param {type: "boolean"}
if output_to_drive:
  from google.colab import drive
  drive.mount('/content/drive')


# Install dependencies
    # Custom =====================
!pip install uv
!uv venv tagger
!git clone https://github.com/corkborg/wd14-tagger-standalone /content/wd14_tagger
# The requirements file is resolved relative to the cloned repository, so
# switch the working directory before installing.
%cd /content/wd14_tagger
!uv pip install -r requirements.txt -q
    # ============================
# Hide the installer output once the commands above have finished.
clear_output()

# Environment variables set for the rest of the session (log level and
# Python warnings are silenced).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ["PYTHONWARNINGS"] = "ignore"

(print(f"DONE ✅"))

In [None]:
# @title ## **2. Directory Config** 📁
# @markdown Specify the location of your training data in the following cell. A folder with the same name as your input will be created.
#@markdown You can tag multiple folders by entering their paths.
import os

%store -r

train_data_dir = ""  # @param {'type' : 'string'}
%store train_data_dir

os.makedirs(train_data_dir, exist_ok=True)
print(f"Your train data directory : {train_data_dir}")

In [None]:
# @title ## **3. Data Cleaning 🔨**
# @markdown #### Delete Unnecessary Files
import os
import random
import concurrent.futures
from tqdm import tqdm
from PIL import Image

%store -r

os.chdir(train_data_dir)

test = os.listdir(train_data_dir)
# @markdown This section will delete unnecessary files and unsupported media such as `.mp4`, `.webm`, and `.gif`. These files are not used and may cause errors when training your lora.

supported_types = [
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".bmp",
    ".caption",
    ".npz",
    ".txt",
    ".json",
]

for item in test:
    file_ext = os.path.splitext(item)[1]
    if file_ext not in supported_types:
        print(f"Deleting file {item} from {train_data_dir}")
        os.remove(os.path.join(train_data_dir, item))

# @markdown #### <br> Convert Transparent Images
# @markdown This code will convert your transparent dataset with alpha channel (RGBA) to RGB and give it a white background.

convert = True  # @param {type:"boolean"}
random_color = False  # @param {type:"boolean"}

# Number of images grouped into one batch by the conversion loop below.
batch_size = 32

# Conversion candidates: every PNG or WebP file directly inside the dataset.
images = [
    name
    for name in os.listdir(train_data_dir)
    if name.endswith((".png", ".webp"))
]

# RGB backgrounds to pick from when `random_color` is enabled.
background_colors = [
    (255, 255, 255), (0, 0, 0),
    (255, 0, 0), (0, 255, 0), (0, 0, 255),
    (255, 255, 0), (255, 0, 255), (0, 255, 255),
]


def process_image(image_name):
    """Flatten transparency and convert WebP for one image of the dataset.

    Reads ``train_data_dir/image_name``.  Images in mode ``RGBA`` or ``LA``
    are composited onto an opaque background: white, or a random entry of
    ``background_colors`` when ``random_color`` is set.  ``.webp`` files are
    written as ``.jpg`` and the original file is deleted; every other file is
    re-saved in place as PNG.

    Args:
        image_name: File name (not a full path) inside ``train_data_dir``.
    """
    source_path = f"{train_data_dir}/{image_name}"
    is_webp = image_name.endswith(".webp")
    # Swap only the trailing extension; str.replace would also rewrite a
    # ".webp" that happens to appear earlier in the file name.
    target_name = (image_name[: -len(".webp")] + ".jpg") if is_webp else image_name
    target_path = f"{train_data_dir}/{target_name}"

    # The context manager closes the underlying file handle, which the
    # original left open for every processed image.
    with Image.open(source_path) as img:
        has_alpha = img.mode in ("RGBA", "LA")
        if has_alpha:
            if random_color:
                background_color = random.choice(background_colors)
            else:
                background_color = (255, 255, 255)
            # The last band is the alpha channel; use it as the paste mask.
            # The canvas is already RGB, so no further conversion is needed.
            output = Image.new("RGB", img.size, background_color)
            output.paste(img, mask=img.split()[-1])
        else:
            output = img
        output.save(target_path, "JPEG" if is_webp else "PNG")

    if is_webp:
        # Remove the source only after the JPEG has been written and the
        # source handle has been closed.
        os.remove(source_path)
        print(f" Converted image: {image_name} to {target_name}")
    elif has_alpha:
        print(f" Converted image: {image_name}")


# Ceiling division: the "// batch_size + 1" form produced an extra empty batch
# whenever len(images) was an exact multiple of batch_size.
num_batches = (len(images) + batch_size - 1) // batch_size

if convert:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for i in tqdm(range(num_batches)):
            start = i * batch_size
            batch = images[start : start + batch_size]
            pending = {
                executor.submit(process_image, name): name for name in batch
            }
            # Wait for the batch so the progress bar reflects finished work,
            # and read every result: executor.map() only raises a worker's
            # exception when its results are iterated, which never happened,
            # so failed conversions went unnoticed.
            for future in concurrent.futures.as_completed(pending):
                try:
                    future.result()
                except Exception as err:
                    # Report and continue; one bad image should not abort
                    # the whole dataset.
                    print(f" Failed to convert {pending[future]}: {err}")

    print("\033[96mAll images have been converted\033[0m")

In [None]:
from IPython.display import clear_output
#@markdown ### **4. Tag your images**
#@markdown ##### **Waifu Diffusion 1.4 Tagger V3, V2, Z3D and ML Models**
%store -r
from IPython.display import clear_output

tag_dir = "/content/wd14_tagger"
os.chdir(tag_dir)

#@markdown the **WD** **v1** and **v2** models work well in general use (anime, cartoon etc.), the **z3d** models are more focused on **furry** tags and the **ml** models are trained with **danbooru tags** (much more than the WD models) are more accurate but slower, I recommend using them with a **threshold** of `0.4` to `0.6`.
MODEL = "wd-v1-4-swinv2-tagger.v3" # @param ["wd-v1-4-vit-tagger.v3","wd-v1-4-convnext-tagger.v3","wd-v1-4-swinv2-tagger.v3","wd-vit-large-tagger-v3","wd-eva02-large-tagger-v3","wd-v1-4-moat-tagger.v2","wd14-vit.v2","wd14-convnext.v2","z3d-e621-convnext-toynya","z3d-e621-convnext-silveroxides","mld-caformer.dec-5-97527","mld-tresnetd.6-30000"]
#@markdown More theshold lesser tags.
threshold = 0.3 #@param {type:"number", min:0, max:1, step:0.05}
#@markdown `undesire_tag` remove caption that you don't want it from the image(s). eg: `black shirt, vtuber` separated by comma.
undesire_caption = "bangs, breasts, multicolored hair, two-tone hair, gradient hair, virtual youtuber, parody, style parody, official alternate costume, official alternate hairstyle, official alternate hair length, alternate costume, alternate hairstyle, alternate hair length, alternate hair color, watermark, text, bubble text, signature, artist name" # @param {type:"string"}
extension = ".txt" # @param [".txt", ".caption"]
recursive = False # @param {type:"boolean"}
overwrite_existing_caption = False # @param {type:"boolean"}

#missing dependencies
print("installing dependencies please wait")
!uv pip install torch==2.5.1 torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html -q
!uv pip install fairscale==0.4.13 timm==0.6.12 -q
!uv pip install onnxruntime-gpu==1.18.1 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ -q
clear_output()
print("\033[96mtagging images please wait...\033[0m")

config = {
    "dir"         : train_data_dir,
    "model"       : MODEL,
    "threshold"   : threshold,
    "ext"         : extension,
    "exclude-tag" : undesire_caption,
}

ow = ""
if overwrite_existing_caption:
  ow = "--overwrite"

recursive = ""
if recursive:
  recursive = "--recursive"

args = ""
for k, v in config.items():
    if k.startswith("_"):
        args += f'"{v}" '
    elif isinstance(v, str):
        args += f'--{k}="{v}" '
    elif isinstance(v, bool) and v:
        args += f"--{k} "
    elif isinstance(v, float) and not isinstance(v, bool):
        args += f"--{k}={v} "
    elif isinstance(v, int) and not isinstance(v, bool):
        args += f"--{k}={v} "

final_args = f"python run.py {ow} {recursive} {args}"

os.chdir(tag_dir)
!{final_args}
#clear_output()
print("\033[96mDONE ✅\033[0m")

In [None]:
# @title ### **5. Custom Caption/Tag** 📑
import os

%store -r

os.chdir(train_data_dir)

# @markdown Add or remove custom tags here.
extension   = ".txt"  # @param [".txt", ".caption"]
custom_tag  = ""  # @param {type:"string"}
# @markdown Use `sub_folder` option to specify a subfolder for multi-concept training.
# @markdown > Specify `--all` to process all subfolders/`recursive`
sub_folder  = "" #@param {type: "string"}
# @markdown Enable this to append custom tags at the end of lines.
append      = False  # @param {type:"boolean"}
# @markdown Enable this if you want to remove captions/tags instead.
remove_tag  = False  # @param {type:"boolean"}
recursive   = False

if sub_folder == "":
    image_dir = train_data_dir
elif sub_folder == "--all":
    image_dir = train_data_dir
    recursive = True
elif sub_folder.startswith("/content"):
    image_dir = sub_folder
else:
    image_dir = os.path.join(train_data_dir, sub_folder)
    os.makedirs(image_dir, exist_ok=True)

def read_file(filename):
    """Return the entire text content of the file at ``filename``."""
    with open(filename, mode="r") as handle:
        return handle.read()

def write_file(filename, contents):
    """Overwrite the file at ``filename`` with the text ``contents``."""
    with open(filename, mode="w") as handle:
        handle.write(contents)

def process_tags(filename, custom_tag, append, remove_tag):
    """Add or remove custom tags in one comma-separated caption file.

    The file is read, edited and rewritten in place with tags joined by
    ``", "``.

    Args:
        filename: Path of the caption file.
        custom_tag: Comma-separated tags to add or remove.  Underscores in
            each tag are replaced by spaces before it is compared or stored.
        append: When adding, put new tags at the end instead of the front.
        remove_tag: Remove the given tags instead of adding them.
    """
    contents = read_file(filename)
    # Drop blank entries so an empty caption file or a stray comma does not
    # leave a dangling ", " separator in the rewritten caption.
    tags = [tag.strip() for tag in contents.split(",") if tag.strip()]
    requested_tags = [
        tag.strip().replace("_", " ") for tag in custom_tag.split(",") if tag.strip()
    ]

    for requested in requested_tags:
        if remove_tag:
            # Remove every occurrence, not only the first one.
            tags = [tag for tag in tags if tag != requested]
        elif requested not in tags:
            if append:
                tags.append(requested)
            else:
                # NOTE: each tag is inserted at the very front, so several new
                # tags end up in the reverse of the order they were typed in.
                tags.insert(0, requested)

    write_file(filename, ", ".join(tags))

def process_directory(image_dir, tag, append, remove_tag, recursive):
    """Run ``process_tags`` on every caption file found in ``image_dir``.

    Caption files are the entries whose name ends with the module-level
    ``extension``.  Sub-directories are descended into only when
    ``recursive`` is true.

    Args:
        image_dir: Folder to scan.
        tag: Comma-separated tags, forwarded to ``process_tags``.
        append: Forwarded to ``process_tags``.
        remove_tag: Forwarded to ``process_tags``.
        recursive: Whether to process sub-directories as well.
    """
    for entry in os.listdir(image_dir):
        entry_path = os.path.join(image_dir, entry)

        if recursive and os.path.isdir(entry_path):
            process_directory(entry_path, tag, append, remove_tag, recursive)
            continue

        if entry.endswith(extension):
            process_tags(entry_path, tag, append, remove_tag)

tag = custom_tag

# When the folder holds no caption file at all, create an empty one next to
# every image so there is something to add the custom tags to.
has_captions = any(name.endswith(extension) for name in os.listdir(image_dir))
if not has_captions:
    for filename in os.listdir(image_dir):
        if filename.endswith((".png", ".jpg", ".jpeg", ".webp", ".bmp")):
            # splitext strips only the final extension, so "img.01.png" maps
            # to "img.01.txt"; split(".")[0] truncated it to "img.txt".
            caption_name = os.path.splitext(filename)[0] + extension
            open(os.path.join(image_dir, caption_name), "w").close()

if custom_tag:
    process_directory(image_dir, tag, append, remove_tag, recursive)
print("finished ✅")

In [None]:
#@markdown ### **6. Analyze Tags 📝**
#@markdown Perhaps you need another look at your dataset.
#@markdown **You can see the tags of various folders like this:** `my_lora/dataset/subfolder`
from collections import Counter
import os

%store -r

# Define the project path
proyect_dir = train_data_dir  #param {type:"string"}

# Defines the path of the dataset
dataset_folder = f"{proyect_dir}/"

show_top_tags = 50 #@param {type:"number"}

# Check if the route exists
if not os.path.exists(dataset_folder):
  print("The dataset path does not exist")
  exit()

top_tags = Counter()

for txt in [f for f in os.listdir(dataset_folder) if f.lower().endswith(".txt")]:
  with open(os.path.join(dataset_folder, txt), 'r') as f:
    top_tags.update([s.strip() for s in f.read().split(",")])

top_tags = Counter(top_tags)
print(f"Top {show_top_tags} tags:")
for k, v in top_tags.most_common(show_top_tags):
  print(f"{k} ({v})")


In [None]:
#@markdown ### **7. 📂 Unzip dataset**
#@markdown It's much slower to upload individual files to your Drive, so you may want to upload a zip if you have your dataset in your computer.
zip = "/content/drive/MyDrive/Loras/example.zip" #@param {type:"string"}
extract_to = "/content/drive/MyDrive/Loras/example/dataset" #@param {type:"string"}

import os
import zipfile

# Mount Google Drive only when its mount point is not present yet.
drive_mounted = os.path.exists('/content/drive')
if not drive_mounted:
  from google.colab import drive
  print("📂 Connecting to Google Drive...")
  drive.mount('/content/drive')

# Make sure the destination folder exists before extracting into it.
os.makedirs(extract_to, exist_ok=True)

# NOTE(review): the form variable `zip` shadows the builtin zip() for the rest
# of the session; it is kept because it is the name of the form field.
archive = zipfile.ZipFile(zip, 'r')
with archive:
  archive.extractall(extract_to)

print("✅ Done")

In [None]:
#@markdown ### **8. 🔢 Count datasets**
#@markdown Google Drive makes it impossible to count the files in a folder, so this will show you the file counts in all folders and subfolders.
folder = "/content/drive/MyDrive/Loras/example/dataset" #@param {type:"string"}

import os
from google.colab import drive

if not os.path.exists('/content/drive'):
    print("📂 Connecting to Google Drive...\n")
    drive.mount('/content/drive')

# Same image extensions the tagging cells of this notebook treat as images.
image_exts = (".png", ".jpg", ".jpeg", ".webp", ".bmp")

tree = {}
# NOTE(review): entries are matched against bare directory names, which never
# contain "/", so "/output" presumably never excludes anything - confirm intent.
exclude = ("_logs", "/output")

# Ignore a trailing slash: with one, rfind("/") pointed at the end of the
# string and the top-level folder was listed with an empty label.
base = folder.rstrip("/") or folder
label_start = base.rfind("/") + 1

for root, dirs, files in os.walk(base, topdown=True):
  # Prune excluded folders in place so os.walk does not descend into them.
  dirs[:] = [d for d in dirs if all(ex not in d for ex in exclude)]
  images = len([f for f in files if f.lower().endswith(image_exts)])
  captions = len([f for f in files if f.lower().endswith(".txt")])
  others = len(files) - images - captions
  # Label each folder by its path starting at the selected folder's own name.
  path = root[label_start:]
  tree[path] = None if not images else f"{images:>4} images | {captions:>4} captions |"
  if tree[path] and others:
    tree[path] += f" {others:>4} other files"

# default=0 keeps max() from raising when the folder is missing or empty.
pad = max((len(k) for k in tree), default=0)
print("\n".join(f"📁{k.ljust(pad)} | {v}" for k, v in tree.items() if v))