# 📊 Dataset Maker by Hollowstrawberry

In [1]:
!cat /etc/os-release
!export FIFTYONE_DATABASE_DIR=/storage/fiftyone/db
!echo "FIFTYONE_DATABASE_DIR ${FIFTYONE_DATABASE_DIR}"

NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal
FIFTYONE_DATABASE_DIR 


## *️⃣ Install

In [2]:
#import os
#os.chdir("/notebooks")

!pwd
!cd /notebooks/
!pip install accelerate==0.15.0
!pip install transformers==4.26.0
!pip install ftfy==6.1.1
!pip install albumentations==1.3.0
!pip install opencv-python==4.7.0.68
!pip install einops==0.6.0
!pip install diffusers==0.10.2
!pip install pytorch-lightning==1.9.0
!pip install bitsandbytes==0.35.0
!pip install tensorflow==2.11.0
!pip install safetensors==0.2.6
!pip install toml==0.10.2
!pip install voluptuous==0.13.1
!pip install xformers==0.0.20
!pip install lycoris_lora==0.1.4
!pip install dadaptation==1.5
!pip install lion_pytorch==0.0.6
!pip -q install fiftyone ftfy
!pip -q install fiftyone-db-ubuntu2004


#!ls
#!pip install -r requirements.txt

/notebooks
Collecting accelerate==0.15.0
  Downloading accelerate-0.15.0-py3-none-any.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.5/191.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.15.0
[0mCollecting transformers==4.26.0
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.21.3
    Uninstalling transformers-4.21.3:
      Successfully uninstalled transformers-4.21.3
Successfully installed transformers-4.26.0
[0mCollecting ftfy==6.1.1
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
I

## *️⃣ Copy Images & Create Folder Structure

In [3]:
my_project_name="donabull"
!pip show torchvision

Name: torchvision
Version: 0.13.1+cu116
Summary: image and video datasets and models for torch deep learning
Home-page: https://github.com/pytorch/vision
Author: PyTorch Core Team
Author-email: soumith@pytorch.org
License: BSD
Location: /usr/local/lib/python3.9/dist-packages
Requires: numpy, pillow, requests, torch, typing-extensions
Required-by: sentence-transformers


In [11]:
import os
import shutil
from IPython import get_ipython
from IPython.display import display, Markdown

COLAB = False
GRAD = False

if COLAB:
  from google.colab.output import clear as clear_output
else:
  from IPython.display import clear_output


#@title ## 🚩 Start Here

#@markdown ### 1️⃣ Setup
#@markdown This cell will load some requirements and create the necessary folders in your Google Drive. <p>
#@markdown Your project name can't contain spaces but it can contain a single / to make a subfolder in your dataset.
project_name = my_project_name #@param {type:"string"}
project_name = project_name.strip()
#@markdown The folder structure doesn't matter and is purely for comfort. Make sure to always pick the same one. I like organizing by project.
folder_structure = "Organize by project (MyDrive/Loras/project_name/dataset)" #@param ["Organize by category (MyDrive/lora_training/datasets/project_name)", "Organize by project (MyDrive/Loras/project_name/dataset)"]

if not project_name or any(c in project_name for c in " .()\"'\\") or project_name.count("/") > 1:
  print("Please write a valid project_name.")
else:
  if COLAB and not os.path.exists('/content/drive'):
    from google.colab import drive
    print("📂 Connecting to Google Drive...")
    drive.mount('/content/drive')

  project_base = project_name if "/" not in project_name else project_name[:project_name.rfind("/")]
  project_subfolder = project_name if "/" not in project_name else project_name[project_name.rfind("/")+1:]

  root_dir = "/storage/loras"
  deps_dir = os.path.join(root_dir, "deps")
  main_dir      =  root_dir
  config_folder = os.path.join(main_dir, project_base)
  images_folder = os.path.join(main_dir, project_base, "dataset")
  if "/" in project_name:
        images_folder = os.path.join(images_folder, project_subfolder)


  print(f"delet old images if exists")
  shutil.rmtree(images_folder,ignore_errors=True)

    
  print(f"Root Dirctory for {project_name} is {root_dir}!")
  for dir in [main_dir, deps_dir, images_folder, config_folder]:
    #print(f" Making {dir}")
    os.makedirs(dir, exist_ok=True)

  print(f"Copy all files from! /datasets/"+project_base +" to "+ images_folder)  
  shutil.copytree("/datasets/"+project_base +"/", images_folder+"/",dirs_exist_ok = True)

  print(f"✅ Project {project_name} is ready!")
  step1_installed_flag = True
    

#@markdown ### 🚮 Clean folder
#@markdown Careful! Deletes all non-image files in the project folder.

!find {images_folder} -type f ! \( -name '*.png' -o -name '*.jpg' -o -name '*.jpeg' \) -delete 
    

delet old images if exists
Root Dirctory for donabull is /storage/loras!
Copy all files from! /datasets/donabull to /storage/loras/donabull/dataset
✅ Project donabull is ready!


## *️⃣ Image Cleaning

In [12]:
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")

#@markdown ### 3️⃣ Curate your images
#@markdown We will find duplicate images with the FiftyOne AI, and mark them with `delete`. <p>
#@markdown Then, an interactive area will appear below this cell that lets you visualize all your images and manually mark with `delete` to the ones you don't like. <p>
#@markdown If the interactive area appears blank for over a minute, try enabling cookies and removing tracking protection for the Google Colab website, as they may break it.
#@markdown Regardless, you can save your changes by sending Enter in the input box above the interactive area.<p>
#@markdown This is how similar 2 images must be to be marked for deletion. I recommend 0.97 to 0.99:
similarity_threshold = 0.985 #@param {type:"number"}

print("root_dir "+root_dir)
print("images_folder "+images_folder)
os.chdir(root_dir)
model_name = "clip-vit-base32-torch"
supported_types = (".png", ".jpg", ".jpeg")
img_count = len(os.listdir(images_folder))
batch_size = min(250, img_count)

if "step3_installed_flag" not in globals():
  print("🏭 Installing dependencies...\n")
  !pip -q install fiftyone ftfy
  !pip -q install fiftyone-db-ubuntu2004
  if not get_ipython().__dict__['user_ns']['_exit_code']:
    clear_output()
    step3_installed_flag = True
  else:
    print("❌ Error installing dependencies, attempting to continue anyway...")

import numpy as np
import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F
from sklearn.metrics.pairwise import cosine_similarity

non_images = [f for f in os.listdir(images_folder) if not f.lower().endswith(supported_types)]
if non_images:
  print(f"💥 Error: Found non-image file {non_images[0]} - This program doesn't allow it. Sorry! Use the Extras at the bottom to clean the folder.")
elif img_count == 0:
  print(f"💥 Error: No images found in {images_folder}")
else:
  print("\n💿 Analyzing dataset...\n")
  dataset = fo.Dataset.from_dir(images_folder, dataset_type=fo.types.ImageDirectory)
  model = foz.load_zoo_model(model_name)
  embeddings = dataset.compute_embeddings(model, batch_size=batch_size)

  batch_embeddings = np.array_split(embeddings, batch_size)
  similarity_matrices = []
  max_size_x = max(array.shape[0] for array in batch_embeddings)
  max_size_y = max(array.shape[1] for array in batch_embeddings)

  for i, batch_embedding in enumerate(batch_embeddings):
    similarity = cosine_similarity(batch_embedding)
    #Pad 0 for np.concatenate
    padded_array = np.zeros((max_size_x, max_size_y))
    padded_array[0:similarity.shape[0], 0:similarity.shape[1]] = similarity
    similarity_matrices.append(padded_array)

  similarity_matrix = np.concatenate(similarity_matrices, axis=0)
  similarity_matrix = similarity_matrix[0:embeddings.shape[0], 0:embeddings.shape[0]]

  similarity_matrix = cosine_similarity(embeddings)
  similarity_matrix -= np.identity(len(similarity_matrix))

  dataset.match(F("max_similarity") > similarity_threshold)
  dataset.tags = ["delete", "has_duplicates"]

  id_map = [s.id for s in dataset.select_fields(["id"])]
  samples_to_remove = set()
  samples_to_keep = set()

  for idx, sample in enumerate(dataset):
    if sample.id not in samples_to_remove:
      # Keep the first instance of two duplicates
      samples_to_keep.add(sample.id)
      
      dup_idxs = np.where(similarity_matrix[idx] > similarity_threshold)[0]
      for dup in dup_idxs:
          # We kept the first instance so remove all other duplicates
          samples_to_remove.add(id_map[dup])

      if len(dup_idxs) > 0:
          sample.tags.append("has_duplicates")
          sample.save()
    else:
      sample.tags.append("delete")
      sample.save()

  clear_output()

  sidebar_groups = fo.DatasetAppConfig.default_sidebar_groups(dataset)
  for group in sidebar_groups[1:]:
    group.expanded = False
  dataset.app_config.sidebar_groups = sidebar_groups
  dataset.save()
  session = fo.launch_app(dataset)

  print("❗ Wait a minute for the session to load. If it doesn't, read above.")
  print("❗ When it's ready, you'll see a grid of your images.")
  print("❗ On the left side enable \"sample tags\" to visualize the images marked for deletion.")
  print("❗ You can mark your own images with the \"delete\" label by selecting them and pressing the tag icon at the top.")
  input("⭕ When you're done, enter something here to save your changes: ")

  print("💾 Saving...")

  kys = [s for s in dataset if "delete" in s.tags]
  dataset.remove_samples(kys)
  previous_folder = images_folder[:images_folder.rfind("/")]
  dataset.export(export_dir=os.path.join(images_folder, project_subfolder), dataset_type=fo.types.ImageDirectory)
  
  temp_suffix = "_temp"
  !mv {images_folder} {images_folder}{temp_suffix}
  !mv {images_folder}{temp_suffix}/{project_subfolder} {images_folder}
  !rm -r {images_folder}{temp_suffix}

  session.refresh()
  fo.close_app()
  clear_output()

  print(f"\n✅ Removed {len(kys)} images from dataset. You now have {len(os.listdir(images_folder))} images.")



✅ Removed 2 images from dataset. You now have 183 images.


## *️⃣ Tagging

In [None]:
!pwd
!bash /notebooks/tagger.sh {project_name}
!find {images_folder} -type f ! \( -name '*.png' -o -name '*.jpg' -o -name '*.jpeg' -o -name '*.txt'  -o -name '*.JPG'  -o -name '*.PNG'  -o -name '*.JPEG'   \) -delete


/storage/loras
images_folder /storage/loras/donabull/dataset
2023-08-10 18:40:39.955911: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-10 18:40:40.227440: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-08-10 18:40:41.318381: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /storage/env/tagger/lib/python3.9/site-packages/cv2/../../lib64:/usr/local/lib/python3.9/dist-packages/cv2/../../lib64:/usr/local/cuda-11.6/lib64
2023-

In [1]:
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")

#@markdown ### 4️⃣ Tag your images
#@markdown We will be using AI to automatically tag your images, specifically [Waifu Diffusion](https://huggingface.co/SmilingWolf/wd-v1-4-swinv2-tagger-v2) in the case of anime and [BLIP](https://huggingface.co/spaces/Salesforce/BLIP) in the case of photos.
#@markdown Giving tags/captions to your images allows for much better training. This process should take a couple minutes. <p>
method = "Photo captions" #@param ["Anime tags", "Photo captions"]
is_tag_enabled = True #@param ["Anime tags", "Photo captions"]
is_caption_enabled = True
#@markdown **Anime:** The threshold is the minimum level of confidence the tagger must have in order to include a tag. Lower threshold = More tags. Recommended 0.35 to 0.5
tag_threshold = 0.35 #@param {type:"slider", min:0.0, max:1.0, step:0.01}
blacklist_tags = "bangs, multicolored hair, two-tone hair, gradient hair, virtual youtuber, official alternate costume, official alternate hairstyle, official alternate hair length, alternate costume, alternate hairstyle, alternate hair length, alternate hair color" #@param {type:"string"}
#@markdown **Photos:** The minimum and maximum length of tokens/words in each caption.
caption_min = 10 #@param {type:"number"}
caption_max = 75 #@param {type:"number"}

%env PYTHONPATH=/env/python
os.chdir(root_dir)
kohya = "/storage/content/kohya-trainer"
if not os.path.exists(kohya):
  !git clone https://github.com/kohya-ss/sd-scripts {kohya}
  os.chdir(kohya)
  !git reset --hard 5050971ac687dca70ba0486a583d283e8ae324e2
  os.chdir(root_dir)  
    

if is_tag_enabled == True:
    print("\n🏭 Installing dependencies...\n")
    !pip uninstall -y torch
    !pip install torch==1.12.0
    !pip install tensorflow==2.11.0 
    !pip install huggingface-hub==0.12.0 
    !pip install accelerate==0.15.0 
    !pip install transformers==4.26.0 
    !pip install diffusers[torch]==0.10.2 
    !pip install einops==0.6.0 
    !pip install safetensors==0.2.6
    !pip install torchvision
    !pip install albumentations
    #!pip -q install tensorflow==2.11.0 huggingface-hub==0.12.0 accelerate==0.15.0 transformers==4.26.0 diffusers[torch]==0.10.2 einops==0.6.0 safetensors==0.2.6 torchvision albumentations
    if not get_ipython().__dict__['user_ns']['_exit_code']:
      clear_output()
      step4a_installed_flag = True
    else:
      print("❌ Error installing dependencies, trying to continue anyway...")

    print("\n🚶‍♂️ Launching program...\n")

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
    %env PYTHONPATH={kohya}
    !python {kohya}/finetune/tag_images_by_wd14_tagger.py \
        {images_folder} \
        --repo_id=SmilingWolf/wd-v1-4-swinv2-tagger-v2 \
        --model_dir={root_dir} \
        --thresh={tag_threshold} \
        --batch_size=8 \
        --caption_extension=.tags \
        --force_download

    if not get_ipython().__dict__['user_ns']['_exit_code']:
        print("removing underscores and blacklist...")
        blacklisted_tags = [t.strip() for t in blacklist_tags.split(",")]
        from collections import Counter
        top_tags = Counter()
        for txt in [f for f in os.listdir(images_folder) if f.lower().endswith(".txt")]:
          with open(os.path.join(images_folder, txt), 'r') as f:
            tags = [t.strip() for t in f.read().split(",")]
            tags = [t.replace("_", " ") if len(t) > 3 else t for t in tags]
            tags = [t for t in tags if t not in blacklisted_tags]
          top_tags.update(tags)
          with open(os.path.join(images_folder, txt), 'w') as f:
            f.write(", ".join(tags))

        %env PYTHONPATH=/env/python
        clear_output()
        print(f"📊 Tagging complete. Here are the top 50 tags in your dataset:")
        print("\n".join(f"{k} ({v})" for k, v in top_tags.most_common(50)))
        
        
if is_caption_enabled == True: # Photos
  if "step4a_installed_flag" not in globals():
    print("\n🏭 Installing dependencies...\n")
    !pip uninstall -y torch
    !pip install torch==1.12.0 
    !pip install timm==0.6.12 
    !pip install fairscale==0.4.13 
    !pip install transformers==4.26.0 
    !pip install requests==2.28.2 
    !pip install accelerate==0.15.0 
    !pip install diffusers[torch]==0.10.2 
    !pip install einops==0.6.0 
    !pip install safetensors==0.2.6
    #!pip -q install timm==0.6.12 fairscale==0.4.13 transformers==4.26.0 requests==2.28.2 accelerate==0.15.0 diffusers[torch]==0.10.2 einops==0.6.0 safetensors==0.2.6
    if not get_ipython().__dict__['user_ns']['_exit_code']:
      clear_output()
      step4a_installed_flag = True
    else:
      print("❌ Error installing dependencies, trying to continue anyway...")

  print("\n🚶‍♂️ Launching program...\n")
  os.chdir(kohya)
  %env PYTHONPATH={kohya}
  !python {kohya}/finetune/make_captions.py \
    {images_folder} \
    --beam_search \
    --max_data_loader_n_workers=2 \
    --batch_size=8 \
    --min_length={caption_min} \
    --max_length={caption_max} \
    --caption_extension=.caption

  if not get_ipython().__dict__['user_ns']['_exit_code']:
    import random
    captions = [f for f in os.listdir(images_folder) if f.lower().endswith(".txt")]
    sample = []
    for txt in random.sample(captions, min(10, len(captions))):
      with open(os.path.join(images_folder, txt), 'r') as f:
        sample.append(f.read())

    os.chdir(root_dir)
    %env PYTHONPATH=/env/python
    clear_output()
    print(f"📊 Captioning complete. Here are {len(sample)} example captions from your dataset:")
    print("".join(sample))

    print(f"train_data_dir = {args.train_data_dir}")
    train_data_dir_path = Path(args.train_data_dir)
    print(f"train_data_dir_path= {train_data_dir_path}")

# fix the seed for reproducibility
for txt in [f for f in os.listdir(images_folder) if f.lower().endswith(".caption")]:
  image_name,image_ext =  os.path.splitext(txt)
  #print(f"image_path={os.path.join(images_folder,image_name + ".caption")}")
  tags=""
  caption=""

  with open(os.path.join(images_folder,image_name + args.caption_extension), "r") as f:
    caption=f.read().strip() 
  with open(os.path.join(images_folder, image_name + args.tags_extention), "r") as f:
    tags = f.read().strip() 
  with open(os.path.join(images_folder, image_name + args.txt_extention), "wt", encoding="utf-8") as f:
    print(f"{caption}")
    f.write(caption + ", " + tags)

!find {images_folder} -type f ! \( -name '*.png' -o -name '*.jpg' -o -name '*.jpeg' -o -name '*.txt'  -o -name '*.JPG'  -o -name '*.PNG'  -o -name '*.JPEG'   \) -delete

print("done!")
    


🚶‍♂️ Launching program...

env: PYTHONPATH=/content/kohya-trainer
Traceback (most recent call last):
  File "/content/kohya-trainer/finetune/tag_images_by_wd14_tagger.py", line 14, in <module>
    import library.train_util as train_util
  File "/content/kohya-trainer/library/train_util.py", line 21, in <module>
    from accelerate import Accelerator
  File "/usr/local/lib/python3.9/dist-packages/accelerate/__init__.py", line 7, in <module>
    from .accelerator import Accelerator
  File "/usr/local/lib/python3.9/dist-packages/accelerate/accelerator.py", line 27, in <module>
    from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
  File "/usr/local/lib/python3.9/dist-packages/accelerate/checkpointing.py", line 22, in <module>
    from torch.cuda.amp import GradScaler
ModuleNotFoundError: No module named 'torch.cuda'


## *️⃣ Global Activation

In [14]:
!find {images_folder} -type f ! \( -name '*.png' -o -name '*.jpg' -o -name '*.jpeg' \) -delete
!rm -r .ipynb_checkpoints


if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")
  
#@markdown ### 5️⃣ Curate your tags
#@markdown Modify your dataset's tags. You can run this cell multiple times with different parameters. <p>

#@markdown Put an activation tag at the start of every text file. This is useful to make learning better and activate your Lora easier. Set `keep_tokens` to 1 when training.<p>
#@markdown Common tags that are removed such as hair color, etc. will be "absorbed" by your activation tag.
global_activation_tag = project_name #@param {type:"string"}
remove_tags = "" #@param {type:"string"}
#@markdown &nbsp;

#@markdown In this advanced section, you can search text files containing matching tags, and replace them with less/more/different tags. If you select the checkbox below, any extra tags will be put at the start of the file, letting you assign different activation tags to different parts of your dataset. Still, you may want a more advanced tool for this.
search_tags = "" #@param {type:"string"}
replace_with = "" #@param {type:"string"}
search_mode = "OR" #@param ["OR", "AND"]
new_becomes_activation_tag = False #@param {type:"boolean"}
#@markdown These may be useful sometimes. Will remove existing activation tags, be careful.
sort_alphabetically = False #@param {type:"boolean"}
remove_duplicates = False #@param {type:"boolean"}

def split_tags(tagstr):
  return [s.strip() for s in tagstr.split(",") if s.strip()]

activation_tag_list = split_tags(global_activation_tag)
remove_tags_list = split_tags(remove_tags)
search_tags_list = split_tags(search_tags)
replace_with_list = split_tags(replace_with)
replace_new_list = [t for t in replace_with_list if t not in search_tags_list]

replace_with_list = [t for t in replace_with_list if t not in replace_new_list]
replace_new_list.reverse()
activation_tag_list.reverse()

remove_count = 0
replace_count = 0

for txt in [f for f in os.listdir(images_folder) if f.lower().endswith(".txt")]:

  with open(os.path.join(images_folder, txt), 'r') as f:
    tags = [s.strip() for s in f.read().split(",")]

  if remove_duplicates:
    tags = list(set(tags))
  if sort_alphabetically:
    tags.sort()

  for rem in remove_tags_list:
    if rem in tags:
      remove_count += 1
      tags.remove(rem)

  if "AND" in search_mode and all(r in tags for r in search_tags_list) \
      or "OR" in search_mode and any(r in tags for r in search_tags_list):
    replace_count += 1
    for rem in search_tags_list:
      if rem in tags:
        tags.remove(rem)
    for add in replace_with_list:
      if add not in tags:
        tags.append(add)
    for new in replace_new_list:
      if new_becomes_activation_tag:
        if new in tags:
          tags.remove(new)
        tags.insert(0, new)
      else:
        if new not in tags:
          tags.append(new)

  for act in activation_tag_list:
    if act in tags:
      tags.remove(act)
    tags.insert(0, act)

  with open(os.path.join(images_folder, txt), 'w') as f:
    f.write(", ".join(tags))

if global_activation_tag:
  print(f"\n📎 Applied new activation tag(s): {', '.join(activation_tag_list)}")
if remove_tags:
  print(f"\n🚮 Removed {remove_count} tags.")
if search_tags:
  print(f"\n💫 Replaced in {replace_count} files.")
print("\n✅ Done! Check your updated tags in the Extras below.")


rm: cannot remove '.ipynb_checkpoints': No such file or directory

📎 Applied new activation tag(s): donabull

✅ Done! Check your updated tags in the Extras below.


## *️⃣ Training

In [24]:
# ImageCount x Repeat x Epoch < 20000
my_image_count=len(os.listdir(images_folder))/2
my_per_image_repeat=5
my_net_epoch=20
print(f"For {my_image_count} files",str(my_image_count*my_per_image_repeat*my_net_epoch))

For 47.0 files 4700.0


In [None]:
import os
import re
import toml
import shutil
import zipfile
from time import time
from IPython.display import Markdown, display


# These carry information from past executions
if "model_url" in globals():
  old_model_url = model_url
else:
  old_model_url = None
if "dependencies_installed" not in globals():
  dependencies_installed = False
if "model_file" not in globals():
  model_file = None

# These may be set by other cells, some are legacy
if "custom_dataset" not in globals():
  custom_dataset = None
if "override_dataset_config_file" not in globals():
  override_dataset_config_file = None
if "override_config_file" not in globals():
  override_config_file = None
if "optimizer" not in globals():
  optimizer = "AdamW8bit"
if "optimizer_args" not in globals():
  optimizer_args = None
if "continue_from_lora" not in globals():
  continue_from_lora = ""
if "weighted_captions" not in globals():
  weighted_captions = False
if "adjust_tags" not in globals():
  adjust_tags = False
if "keep_tokens_weight" not in globals():
  keep_tokens_weight = 1.0


print("project_name "+project_name)
print("folder_structure "+folder_structure)
print("main_dir "+main_dir)
print("config_folder "+config_folder)
print("images_folder "+images_folder)
#print("output_folder "+output_folder)



COLAB = False # low ram
COMMIT = "e6ad3cbc66130fdc3bf9ecd1e0272969b1d613f7"
BETTER_EPOCH_NAMES = True
LOAD_TRUNCATED_IMAGES = True

#@title ## 🚩 Start Here

#@markdown ### ▶️ Setup
#@markdown Your project name will be the same as the folder containing your images. Spaces aren't allowed.
#project_name = "maxicanlust-maritza-mendez" #@param {type:"string"}
#@markdown The folder structure doesn't matter and is purely for comfort. Make sure to always pick the same one. I like organizing by project.
#folder_structure = "Organize by project (MyDrive/Loras/project_name/dataset)" #@param ["Organize by category (MyDrive/lora_training/datasets/project_name)", "Organize by project (MyDrive/Loras/project_name/dataset)"]
#@markdown Decide the model that will be downloaded and used for training. These options should produce clean and consistent results. You can also choose your own by pasting its download link.
training_model = "Stable Diffusion (sd-v1-5-pruned-noema-fp16.safetensors)" #@param ["Anime (animefull-final-pruned-fp16.safetensors)", "AnyLora (AnyLoRA_noVae_fp16-pruned.ckpt)", "Stable Diffusion (sd-v1-5-pruned-noema-fp16.safetensors)"]
optional_custom_training_model_url = "" #@param {type:"string"}
custom_model_is_based_on_sd2 = False #@param {type:"boolean"}

if optional_custom_training_model_url:
  model_url = optional_custom_training_model_url
elif "AnyLora" in training_model:
  model_url = "https://huggingface.co/Lykon/AnyLoRA/resolve/main/AnyLoRA_noVae_fp16-pruned.ckpt"
elif "Anime" in training_model:
  model_url = "https://huggingface.co/hollowstrawberry/stable-diffusion-guide/resolve/main/models/animefull-final-pruned-fp16.safetensors"
else:
  model_url = "https://huggingface.co/hollowstrawberry/stable-diffusion-guide/resolve/main/models/sd-v1-5-pruned-noema-fp16.safetensors"

#@markdown ### ▶️ Processing
#@markdown Resolution of 512 is standard for Stable Diffusion 1.5. Higher resolution training is much slower but can lead to better details. <p>
#@markdown Images will be automatically scaled while training to produce the best results, so you don't need to crop or resize anything yourself.
resolution = 512 #@param {type:"slider", min:512, max:1024, step:128}
#@markdown This option will train your images both normally and flipped, for no extra cost, to learn more from them. Turn it on specially if you have less than 20 images. <p>
#@markdown **Turn it off if you care about asymmetrical elements in your Lora**.
flip_aug = False #@param {type:"boolean"}
#markdown Leave empty for no captions.
caption_extension = ".txt" #param {type:"string"}
#@markdown Shuffling anime tags in place improves learning and prompting. An activation tag goes at the start of every text file and will not be shuffled.
shuffle_tags = True #@param {type:"boolean"}
shuffle_caption = shuffle_tags
activation_tags = "1" #@param [0,1,2,3]
keep_tokens = int(activation_tags)

#@markdown ### ▶️ Steps <p>
#@markdown Your images will repeat this number of times during training. I recommend that your images multiplied by their repeats is between 200 and 400.
num_repeats = my_per_image_repeat #@param {type:"number"}
#@markdown Choose how long you want to train for. A good starting point is around 10 epochs or around 2000 steps.<p>
#@markdown One epoch is a number of steps equal to: your number of images multiplied by their repeats, divided by batch size. <p>
preferred_unit = "Epochs" #@param ["Epochs", "Steps"]
how_many = my_net_epoch #@param {type:"number"}
max_train_epochs = how_many if preferred_unit == "Epochs" else None
max_train_steps = how_many if preferred_unit == "Steps" else None
#@markdown Saving more epochs will let you compare your Lora's progress better.
save_every_n_epochs = 1 #@param {type:"number"}
keep_only_last_n_epochs = 10 #@param {type:"number"}
if not save_every_n_epochs:
  save_every_n_epochs = max_train_epochs
if not keep_only_last_n_epochs:
  keep_only_last_n_epochs = max_train_epochs
#@markdown Increasing the batch size makes training faster, but may make learning worse. Recommended 2 or 3.
train_batch_size = 2 #@param {type:"slider", min:1, max:8, step:1}

#@markdown ### ▶️ Learning
#@markdown The learning rate is the most important for your results. If you want to train slower with lots of images, or if your dim and alpha are high, move the unet to 2e-4 or lower. <p>
#@markdown The text encoder helps your Lora learn concepts slightly better. It is recommended to make it half or a fifth of the unet. If you're training a style you can even set it to 0.
unet_lr = 5e-4 #@param {type:"number"}
text_encoder_lr = 1e-4 #@param {type:"number"}
#@markdown The scheduler is the algorithm that guides the learning rate. If you're not sure, pick `constant` and ignore the number. I personally recommend `cosine_with_restarts` with 3 restarts.
lr_scheduler = "cosine_with_restarts" #@param ["constant", "cosine", "cosine_with_restarts", "constant_with_warmup", "linear", "polynomial"]
lr_scheduler_number = 3 #@param {type:"number"}
lr_scheduler_num_cycles = lr_scheduler_number if lr_scheduler == "cosine_with_restarts" else 0
lr_scheduler_power = lr_scheduler_number if lr_scheduler == "polynomial" else 0
#@markdown Steps spent "warming up" the learning rate during training for efficiency. I recommend leaving it at 5%.
lr_warmup_ratio = 0.05 #@param {type:"slider", min:0.0, max:0.5, step:0.01}
lr_warmup_steps = 0
#@markdown New feature that adjusts loss over time, makes learning much more efficient, and training can be done with about half as many epochs. Uses a value of 5.0 as recommended by [the paper](https://arxiv.org/abs/2303.09556).
min_snr_gamma = True #@param {type:"boolean"}
min_snr_gamma_value = 5.0 if min_snr_gamma else None

#@markdown ### ▶️ Structure
#@markdown LoRA is the classic type, while LoCon is good with styles. Lycoris require [this extension](https://github.com/KohakuBlueleaf/a1111-sd-webui-lycoris) for webui to work like normal loras. More info [here](https://github.com/KohakuBlueleaf/Lycoris).
lora_type = "LoRA" #@param ["LoRA", "LoCon Lycoris", "LoHa Lycoris"]

#@markdown Below are some recommended values for the following settings:

#@markdown | type | network_dim | network_alpha | conv_dim | conv_alpha |
#@markdown | :---: | :---: | :---: | :---: | :---: |
#@markdown | LoRA | 32 | 16 |   |   |
#@markdown | LoCon | 16 | 8 | 8 | 1 |
#@markdown | LoHa | 8 | 4 | 4 | 1 |

#@markdown More dim means larger Lora, it can hold more information but more isn't always better. A dim between 8-32 is recommended, and alpha equal to half the dim.
network_dim = 16 #@param {type:"slider", min:1, max:128, step:1}
network_alpha = 8 #@param {type:"slider", min:1, max:128, step:1}
#@markdown The following values don't affect LoRA. They work like dim/alpha but only for the additional learning layers of Lycoris.
conv_dim = 8 #@param {type:"slider", min:1, max:64, step:1}
conv_alpha = 1 #@param {type:"slider", min:1, max:64, step:1}
conv_compression = False #@param {type:"boolean"}

network_module = "lycoris.kohya" if "Lycoris" in lora_type else "networks.lora"
network_args = None if lora_type == "LoRA" else [
  f"conv_dim={conv_dim}",
  f"conv_alpha={conv_alpha}",
]
if "Lycoris" in lora_type:
  network_args.append(f"algo={'loha' if 'LoHa' in lora_type else 'lora'}")
  network_args.append(f"disable_conv_cp={str(not conv_compression)}")

#markdown ### ▶️ Experimental
#markdown Save additional data equaling ~1 GB allowing you to resume training later.
save_state = False #param {type:"boolean"}
#markdown Resume training if a save state is found.
resume = False #param {type:"boolean"}

#@markdown ### ▶️ Ready
#@markdown You can now run this cell to cook your Lora. Good luck! <p>


# 👩‍💻 Cool code goes here

if optimizer == "DAdaptation":
  optimizer_args = ["decouple=True","weight_decay=0.02","betas=[0.9,0.99]"]
  unet_lr = 0.5
  text_encoder_lr = 0.5
  lr_scheduler = "constant_with_warmup"
  network_alpha = network_dim

#root_dir = "/content" if COLAB else "~/Loras"
deps_dir = os.path.join(root_dir, "deps")
repo_dir = os.path.join(root_dir, "kohya-trainer")

#main_dir      = os.path.join(root_dir, "drive/MyDrive/Loras") if COLAB else root_dir
log_folder    = os.path.join(main_dir, "_logs")
#config_folder = os.path.join(main_dir, project_name)
#images_folder = os.path.join(main_dir, project_name, "dataset")
output_folder = os.path.join(main_dir, project_name, "output")

config_file = os.path.join(config_folder, "training_config.toml")
dataset_config_file = os.path.join(config_folder, "dataset_config.toml")
accelerate_config_file = os.path.join(repo_dir, "accelerate_config/config.yaml")

print("project_name "+project_name)
print("folder_structure "+folder_structure)
print("main_dir "+main_dir)
print("config_folder "+config_folder)
print("images_folder "+images_folder)
print("output_folder "+output_folder)

def clone_repo():
  os.chdir(root_dir)
  !git clone https://github.com/kohya-ss/sd-scripts {repo_dir}
  os.chdir(repo_dir)
  if COMMIT:
    !git reset --hard {COMMIT}
  !wget https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/requirements.txt -q -O requirements.txt

def install_dependencies():
  clone_repo()
  !apt -y update -qq
  !apt -y install aria2 -qq
  !pip -q install --upgrade -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu118

  # patch kohya for minor stuff
  if COLAB:
    !sed -i "s@cpu@cuda@" library/model_util.py # low ram
  if LOAD_TRUNCATED_IMAGES:
    !sed -i 's/from PIL import Image/from PIL import Image, ImageFile\nImageFile.LOAD_TRUNCATED_IMAGES=True/g' library/train_util.py # fix truncated jpegs error
  if BETTER_EPOCH_NAMES:
    !sed -i 's/{:06d}/{:02d}/g' library/train_util.py # make epoch names shorter
    !sed -i 's/"." + args.save_model_as)/"-{:02d}.".format(num_train_epochs) + args.save_model_as)/g' train_network.py # name of the last epoch will match the rest

  from accelerate.utils import write_basic_config
  if not os.path.exists(accelerate_config_file):
    write_basic_config(save_location=accelerate_config_file)

  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
  os.environ["BITSANDBYTES_NOWELCOME"] = "1"
  os.environ["SAFETENSORS_FAST_GPU"] = "1"

def validate_dataset():
  global lr_warmup_steps, lr_warmup_ratio, caption_extension, keep_tokens, keep_tokens_weight, weighted_captions, adjust_tags
  supported_types = (".png", ".jpg", ".jpeg", ".webp", ".bmp")

  print("\n💿 Checking dataset...")
  if not project_name.strip() or any(c in project_name for c in " .()\"'\\/"):
    print("💥 Error: Please choose a valid project name.")
    return

  if custom_dataset:
    try:
      datconf = toml.loads(custom_dataset)
      datasets = [d for d in datconf["datasets"][0]["subsets"]]
    except:
      print(f"💥 Error: Your custom dataset is invalid or contains an error! Please check the original template.")
      return
    reg = [d for d in datasets if d.get("is_reg", False)]
    for r in reg:
      print("📁"+r["image_dir"].replace("/content/drive/", "") + " (Regularization)")
    datasets = [d for d in datasets if d not in reg]
    datasets_dict = {d["image_dir"]: d["num_repeats"] for d in datasets}
    folders = datasets_dict.keys()
    files = [f for folder in folders for f in os.listdir(folder)]
    images_repeats = {folder: (len([f for f in os.listdir(folder) if f.lower().endswith(supported_types)]), datasets_dict[folder]) for folder in folders}
  else:
    folders = [images_folder]
    files = os.listdir(images_folder)
    images_repeats = {images_folder: (len([f for f in files if f.lower().endswith(supported_types)]), num_repeats)}

  for folder in folders:
    if not os.path.exists(folder):
      print(f"💥 Error: The folder {folder.replace('/content/drive/', '')} doesn't exist.")
      return
  for folder, (img, rep) in images_repeats.items():
    if not img:
      print(f"💥 Error: Your {folder.replace('/content/drive/', '')} folder is empty.")
      return
  for f in files:
    if not f.lower().endswith(".txt") and not f.lower().endswith(supported_types):
      print(f"💥 Error: Invalid file in dataset: \"{f}\". Aborting.")
      return

  if not [txt for txt in files if txt.lower().endswith(".txt")]:
    caption_extension = ""
  if continue_from_lora and not (continue_from_lora.endswith(".safetensors") and os.path.exists(continue_from_lora)):
    print(f"💥 Error: Invalid path to existing Lora. Example: /content/drive/MyDrive/Loras/example.safetensors")
    return

  pre_steps_per_epoch = sum(img*rep for (img, rep) in images_repeats.values())
  steps_per_epoch = pre_steps_per_epoch/train_batch_size
  total_steps = max_train_steps or int(max_train_epochs*steps_per_epoch)
  estimated_epochs = int(total_steps/steps_per_epoch)
  lr_warmup_steps = int(total_steps*lr_warmup_ratio)

  for folder, (img, rep) in images_repeats.items():
    print("📁"+folder.replace("/content/drive/", ""))
    print(f"📈 Found {img} images with {rep} repeats, equaling {img*rep} steps.")
  print(f"📉 Divide {pre_steps_per_epoch} steps by {train_batch_size} batch size to get {steps_per_epoch} steps per epoch.")
  if max_train_epochs:
    print(f"🔮 There will be {max_train_epochs} epochs, for around {total_steps} total training steps.")
  else:
    print(f"🔮 There will be {total_steps} steps, divided into {estimated_epochs} epochs and then some.")

  if total_steps > 10000:
    print("💥 Error: Your total steps are too high. You probably made a mistake. Aborting...")
    return

  if adjust_tags:
    print(f"\n📎 Weighted tags: {'ON' if weighted_captions else 'OFF'}")
    if weighted_captions:
      print(f"📎 Will use {keep_tokens_weight} weight on {keep_tokens} activation tag(s)")
    print("📎 Adjusting tags...")
    adjust_weighted_tags(folders, keep_tokens, keep_tokens_weight, weighted_captions)

  return True

def adjust_weighted_tags(folders, keep_tokens: int, keep_tokens_weight: float, weighted_captions: bool):
  weighted_tag = re.compile(r"\((.+?):[.\d]+\)(,|$)")
  for folder in folders:
    for txt in [f for f in os.listdir(folder) if f.lower().endswith(".txt")]:
      with open(os.path.join(folder, txt), 'r') as f:
        content = f.read()
      # reset previous changes
      content = content.replace('\\', '')
      content = weighted_tag.sub(r'\1\2', content)
      if weighted_captions:
        # re-apply changes
        content = content.replace(r'(', r'\(').replace(r')', r'\)').replace(r':', r'\:')
        if keep_tokens_weight > 1:
          tags = [s.strip() for s in content.split(",")]
          for i in range(min(keep_tokens, len(tags))):
            tags[i] = f'({tags[i]}:{keep_tokens_weight})'
          content = ", ".join(tags)
      with open(os.path.join(folder, txt), 'w') as f:
        f.write(content)

def create_config():
  global dataset_config_file, config_file, model_file

  if resume:
    resume_points = [f.path for f in os.scandir(output_folder) if f.is_dir()]
    resume_points.sort()
    last_resume_point = resume_points[-1] if resume_points else None
  else:
    last_resume_point = None

  if override_config_file:
    config_file = override_config_file
    print(f"\n⭕ Using custom config file {config_file}")
  else:
    config_dict = {
      "additional_network_arguments": {
        "unet_lr": unet_lr,
        "text_encoder_lr": text_encoder_lr,
        "network_dim": network_dim,
        "network_alpha": network_alpha,
        "network_module": network_module,
        "network_args": network_args,
        "network_train_unet_only": True if text_encoder_lr == 0 else None,
        "network_weights": continue_from_lora if continue_from_lora else None
      },
      "optimizer_arguments": {
        "learning_rate": unet_lr,
        "lr_scheduler": lr_scheduler,
        "lr_scheduler_num_cycles": lr_scheduler_num_cycles if lr_scheduler == "cosine_with_restarts" else None,
        "lr_scheduler_power": lr_scheduler_power if lr_scheduler == "polynomial" else None,
        "lr_warmup_steps": lr_warmup_steps if lr_scheduler != "constant" else None,
        "optimizer_type": optimizer,
        "optimizer_args": optimizer_args if optimizer_args else None,
      },
      "training_arguments": {
        "max_train_steps": max_train_steps,
        "max_train_epochs": max_train_epochs,
        "save_every_n_epochs": save_every_n_epochs,
        "save_last_n_epochs": keep_only_last_n_epochs,
        "train_batch_size": train_batch_size,
        "noise_offset": None,
        "clip_skip": 2,
        "min_snr_gamma": min_snr_gamma_value,
        "weighted_captions": weighted_captions,
        "seed": 42,
        "max_token_length": 225,
        "xformers": True,
        "lowram": COLAB,
        "max_data_loader_n_workers": 8,
        "persistent_data_loader_workers": True,
        "save_precision": "fp16",
        "mixed_precision": "fp16",
        "output_dir": output_folder,
        "logging_dir": log_folder,
        "output_name": project_name,
        "log_prefix": project_name,
        "save_state": save_state,
        "save_last_n_epochs_state": 1 if save_state else None,
        "resume": last_resume_point
      },
      "model_arguments": {
        "pretrained_model_name_or_path": model_file,
        "v2": custom_model_is_based_on_sd2,
        "v_parameterization": True if custom_model_is_based_on_sd2 else None,
      },
      "saving_arguments": {
        "save_model_as": "safetensors",
      },
      "dreambooth_arguments": {
        "prior_loss_weight": 1.0,
      },
      "dataset_arguments": {
        "cache_latents": True,
      },
    }

    for key in config_dict:
      if isinstance(config_dict[key], dict):
        config_dict[key] = {k: v for k, v in config_dict[key].items() if v is not None}

    with open(config_file, "w") as f:
      f.write(toml.dumps(config_dict))
    print(f"\n📄 Config saved to {config_file}")

  if override_dataset_config_file:
    dataset_config_file = override_dataset_config_file
    print(f"⭕ Using custom dataset config file {dataset_config_file}")
  else:
    dataset_config_dict = {
      "general": {
        "resolution": resolution,
        "shuffle_caption": shuffle_caption,
        "keep_tokens": keep_tokens,
        "flip_aug": flip_aug,
        "caption_extension": caption_extension,
        "enable_bucket": True,
        "bucket_reso_steps": 64,
        "bucket_no_upscale": False,
        "min_bucket_reso": 320 if resolution > 640 else 256,
        "max_bucket_reso": 1280 if resolution > 640 else 1024,
      },
      "datasets": toml.loads(custom_dataset)["datasets"] if custom_dataset else [
        {
          "subsets": [
            {
              "num_repeats": num_repeats,
              "image_dir": images_folder,
              "class_tokens": None if caption_extension else project_name
            }
          ]
        }
      ]
    }

    for key in dataset_config_dict:
      if isinstance(dataset_config_dict[key], dict):
        dataset_config_dict[key] = {k: v for k, v in dataset_config_dict[key].items() if v is not None}

    with open(dataset_config_file, "w") as f:
      f.write(toml.dumps(dataset_config_dict))
    print(f"📄 Dataset config saved to {dataset_config_file}")

def download_model():
  global old_model_url, model_url, model_file
  real_model_url = model_url.strip()

  if real_model_url.lower().endswith((".ckpt", ".safetensors")):
    model_file = f"/content{real_model_url[real_model_url.rfind('/'):]}"
  else:
    model_file = "/content/downloaded_model.safetensors"
    if os.path.exists(model_file):
      !rm "{model_file}"

  if m := re.search(r"(?:https?://)?(?:www\.)?huggingface\.co/[^/]+/[^/]+/blob", model_url):
    real_model_url = real_model_url.replace("blob", "resolve")
  elif m := re.search(r"(?:https?://)?(?:www\.)?civitai\.com/models/([0-9]+)", model_url):
    real_model_url = f"https://civitai.com/api/download/models/{m.group(1)}"

  !aria2c "{real_model_url}" --console-log-level=warn -c -s 16 -x 16 -k 10M -d / -o "{model_file}"

  if model_file.lower().endswith(".safetensors"):
    from safetensors.torch import load_file as load_safetensors
    try:
      test = load_safetensors(model_file)
      del test
    except Exception as e:
      #if "HeaderTooLarge" in str(e):
      new_model_file = os.path.splitext(model_file)[0]+".ckpt"
      !mv "{model_file}" "{new_model_file}"
      model_file = new_model_file
      print(f"Renamed model to {os.path.splitext(model_file)[0]}.ckpt")

  if model_file.lower().endswith(".ckpt"):
    from torch import load as load_ckpt
    try:
      test = load_ckpt(model_file)
      del test
    except Exception as e:
      return False

  return True

def main():
  global dependencies_installed

  for dir in (main_dir, deps_dir, repo_dir, log_folder, images_folder, output_folder, config_folder):
    os.makedirs(dir, exist_ok=True)

  if not validate_dataset():
    return

  if not dependencies_installed:
    print("\n🏭 Installing dependencies...\n")
    t0 = time()
    install_dependencies()
    t1 = time()
    dependencies_installed = True
    print(f"\n✅ Installation finished in {int(t1-t0)} seconds.")
  else:
    print("\n✅ Dependencies already installed.")

  if old_model_url != model_url or not model_file or not os.path.exists(model_file):
    print("\n🔄 Downloading model...")
    if not download_model():
      print("\n💥 Error: The model you selected is invalid or corrupted, or couldn't be downloaded. You can use a civitai or huggingface link, or any direct download link.")
      return
    print()
  else:
    print("\n🔄 Model already downloaded.\n")

  create_config()

  print("\n⭐ Starting trainer...\n")
  os.chdir(repo_dir)

  !accelerate launch --config_file={accelerate_config_file} --num_cpu_threads_per_process=1 train_network.py --dataset_config={dataset_config_file} --config_file={config_file}

  if not get_ipython().__dict__['user_ns']['_exit_code']:
    display(Markdown("### ✅ Done! [Go download your Lora(s) from Google Drive](https://drive.google.com/drive/my-drive)"))

main()


project_name desipose
folder_structure Organize by project (MyDrive/Loras/project_name/dataset)
main_dir /storage/loras
config_folder /storage/loras/desipose
images_folder /storage/loras/desipose/dataset
project_name desipose
folder_structure Organize by project (MyDrive/Loras/project_name/dataset)
main_dir /storage/loras
config_folder /storage/loras/desipose
images_folder /storage/loras/desipose/dataset
output_folder /storage/loras/desipose/output

💿 Checking dataset...
📁/storage/loras/desipose/dataset
📈 Found 94 images with 5 repeats, equaling 470 steps.
📉 Divide 470 steps by 2 batch size to get 235.0 steps per epoch.
🔮 There will be 20 epochs, for around 4700 total training steps.

🏭 Installing dependencies...

fatal: destination path '/storage/loras/kohya-trainer' already exists and is not an empty directory.
HEAD is now at e6ad3cb Merge pull request #478 from rockerBOO/patch-1
143 packages can be upgraded. Run 'apt list --upgradable' to see them.
The following additional packages 

2023-08-09 22:35:23.885203: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



✅ Installation finished in 67 seconds.

🔄 Downloading model...
[35m[[0m#f0d436 1.7GiB/1.9GiB[36m(88%)[0m CN:16 DL:[32m212MiB[0m ETA:[33m1s[0m[35m][0m[0m
Download Results:
gid   |stat|avg speed  |path/URI
f0d436|[1;32mOK[0m  |   214MiB/s|//content/sd-v1-5-pruned-noema-fp16.safetensors

Status Legend:
(OK):download completed.


📄 Config saved to /storage/loras/desipose/training_config.toml
📄 Dataset config saved to /storage/loras/desipose/dataset_config.toml

⭐ Starting trainer...



  with safe_open(filename, framework="pt", device=device) as f:


  warn(f"Failed to load image Python extension: {e}")
Loading settings from /storage/loras/desipose/training_config.toml...
/storage/loras/desipose/training_config
prepare tokenizer
Downloading (…)olve/main/vocab.json: 100%|███| 961k/961k [00:00<00:00, 19.9MB/s]
Downloading (…)olve/main/merges.txt: 100%|███| 525k/525k [00:00<00:00, 15.8MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████| 389/389 [00:00<00:00, 107kB/s]
Downloading (…)okenizer_config.json: 100%|██████| 905/905 [00:00<00:00, 265kB/s]
update token length: 225
Load dataset config from /storage/loras/desipose/dataset_config.toml
prepare images.
found directory /storage/loras/desipose/dataset contains 94 image files
470 train images with repeating.
0 reg images.
no regularization images / 正則化画像が見つかりませんでした
[Dataset 0]
  batch_size: 2
  resolution: (512, 512)
  enable_bucket: True
  min_bucket_reso: 256
  max_bucket_reso: 1024
  bucket_reso_steps: 64
  bucket_no_upscale: False

  [Subset 0 of Dataset 0]
    image_dir: "/stor

## *️⃣ Extras

In [16]:
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")
  
#@markdown ### 📈 Analyze Tags
#@markdown Perhaps you need another look at your dataset.
show_top_tags = 50 #@param {type:"number"}

from collections import Counter
top_tags = Counter()

for txt in [f for f in os.listdir(images_folder) if f.lower().endswith(".txt")]:
  with open(os.path.join(images_folder, txt), 'r') as f:
    top_tags.update([s.strip() for s in f.read().split(",")])

top_tags = Counter(top_tags)
print(f"📊 Top {show_top_tags} tags:")
for k, v in top_tags.most_common(show_top_tags):
  print(f"{k} ({v})")

📊 Top 50 tags:
