# Train Stable Diffusion with CodeFlare

In [20]:
# Print the kernel's current working directory (shell escape).
!pwd

/opt/app-root/src/text-to-image-demo/dim-dreambooth/dreambooth


In [43]:
# !pip install codeflare-sdk
# !pip install -Ur requirements.txt

## Prep Infra

In [4]:
# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication

In [5]:
import os

# Create authentication object for user permissions
# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config
# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually
#
# The token and API server URL are read from the environment so that credentials
# are never typed into (or saved with) the notebook. When a variable is unset the
# value falls back to the empty string, which is what this cell passed before.
auth = TokenAuthentication(
    token=os.environ.get("OPENSHIFT_TOKEN", ""),
    server=os.environ.get("OPENSHIFT_API_SERVER", ""),
    skip_tls=False,
)
auth.login()

'Logged into https://api.aisrhods-dim.u1hh.p1.openshiftapps.com:6443'

In [65]:
# Describe the requested cluster first, then build the Cluster handle from it.
# NOTE(review): the recorded outputs in this notebook show name 'stab-diff-mod',
# worker_mem 16 and worker_cpu 4, which do not match the values below — the saved
# outputs appear to come from an earlier version of this cell; confirm.
cluster_config = ClusterConfiguration(
    name='stab-diff-model',
    namespace='default',
    # two workers, each with a fixed (min == max) CPU and memory request
    num_workers=2,
    min_cpus=16,
    max_cpus=16,
    min_memory=24,
    max_memory=24,
    num_gpus=1,
    instascale=True,  #<---instascale enabled
    machine_types=["m5.xlarge", "g5.4xlarge"],
)
cluster = Cluster(cluster_config)

Written to: stab-diff-mod.yaml


In [66]:
# Bring up the cluster configured in the cell above.
cluster.up()

In [64]:
# Tear the cluster down.
# NOTE(review): this cell sits directly after cluster.up(); on a top-to-bottom run
# the cluster is released before the details/status cells that follow — confirm order.
cluster.down()

In [70]:
# Show the cluster summary; the recorded output lists name, status, worker count,
# per-worker resources, namespace and the dashboard route.
cluster.details()

RayCluster(name='stab-diff-mod', status=<CodeFlareClusterStatus.QUEUEING: 4>, workers=2, worker_mem_min=16, worker_mem_max=16, worker_cpu=4, worker_gpu=1, namespace='default', dashboard='Dashboard route not available yet, have you run cluster.up()?')

In [71]:
# Query the cluster state; the recorded output is a (status enum, bool) pair.
# NOTE(review): the bool is presumably a readiness flag — confirm against the SDK.
cluster.status()

(<CodeFlareClusterStatus.QUEUEING: 4>, False)

In [None]:
import codeflare_sdk as cf

In [63]:
# List clusters for the 'default' namespace (the same namespace used in the cluster config).
cf.cluster.cluster.list_all_clusters('default')

## Step 0: Prep

In [26]:
# Print the kernel's current working directory before setting up paths.
!pwd

/opt/app-root/src/text-to-image-demo


In [None]:
%%bash
# THIS IS ORIGINAL
#
# Runs as ONE bash process so every export is visible to the lines after it.
# (Written as separate `!export ...` lines, each command ran in its own
# short-lived shell, so no variable was ever set and `mkdir -p` received
# empty names.) The variables exist only for the duration of this cell;
# they are not visible to later cells.
set -euo pipefail

# TODO: If running on multiple nodes, change this path to a shared directory (ex: NFS)
export DATA_PREFIX="/opt/app-root/src/text-to-image-demo/dim-dreambooth"
export ORIG_MODEL_NAME="CompVis/stable-diffusion-v1-4"
export ORIG_MODEL_HASH="b95be7d6f134c3a9e62ee616f310733567f069ce"
export ORIG_MODEL_DIR="$DATA_PREFIX/model-orig"
# ${VAR/\//--} replaces the first "/" in the model name with "--".
export ORIG_MODEL_PATH="$ORIG_MODEL_DIR/models--${ORIG_MODEL_NAME/\//--}/snapshots/$ORIG_MODEL_HASH"
export TUNED_MODEL_DIR="$DATA_PREFIX/model-tuned"
export IMAGES_REG_DIR="$DATA_PREFIX/images-reg"
export IMAGES_OWN_DIR="$DATA_PREFIX/images-own"
export IMAGES_NEW_DIR="$DATA_PREFIX/images-new"
# TODO: Add more worker nodes and increase NUM_WORKERS for more data-parallelism
export NUM_WORKERS=2

echo "$ORIG_MODEL_DIR"
mkdir -p "$ORIG_MODEL_DIR" "$TUNED_MODEL_DIR" "$IMAGES_REG_DIR" "$IMAGES_OWN_DIR" "$IMAGES_NEW_DIR"

In [30]:
# THIS IS ORIGINAL
#
# Defines the data/model locations as Python variables, mirrors them into the
# kernel's environment (so `!command $VAR` lines and child processes can read
# them), and creates the working directories.
import os

# TODO: If running on multiple nodes, change this path to a shared directory (ex: NFS)
DATA_PREFIX = "/opt/app-root/src/text-to-image-demo/dim-dreambooth"
ORIG_MODEL_NAME = "CompVis/stable-diffusion-v1-4"
ORIG_MODEL_HASH = "b95be7d6f134c3a9e62ee616f310733567f069ce"
ORIG_MODEL_DIR = os.path.join(DATA_PREFIX, "model-orig")
# Python equivalent of the bash expression ${ORIG_MODEL_NAME/\//--}:
# only the first "/" in the model name is replaced by "--".
ORIG_MODEL_PATH = os.path.join(
    ORIG_MODEL_DIR,
    "models--" + ORIG_MODEL_NAME.replace("/", "--", 1),
    "snapshots",
    ORIG_MODEL_HASH,
)
TUNED_MODEL_DIR = os.path.join(DATA_PREFIX, "model-tuned")
IMAGES_REG_DIR = os.path.join(DATA_PREFIX, "images-reg")
IMAGES_OWN_DIR = os.path.join(DATA_PREFIX, "images-own")
IMAGES_NEW_DIR = os.path.join(DATA_PREFIX, "images-new")
# TODO: Add more worker nodes and increase NUM_WORKERS for more data-parallelism
NUM_WORKERS = 2

# `!export NAME=value` only affects a throwaway subshell; writing to os.environ
# is what makes the values visible to every later shell command.
os.environ.update(
    {
        "DATA_PREFIX": DATA_PREFIX,
        "ORIG_MODEL_NAME": ORIG_MODEL_NAME,
        "ORIG_MODEL_HASH": ORIG_MODEL_HASH,
        "ORIG_MODEL_DIR": ORIG_MODEL_DIR,
        "ORIG_MODEL_PATH": ORIG_MODEL_PATH,
        "TUNED_MODEL_DIR": TUNED_MODEL_DIR,
        "IMAGES_REG_DIR": IMAGES_REG_DIR,
        "IMAGES_OWN_DIR": IMAGES_OWN_DIR,
        "IMAGES_NEW_DIR": IMAGES_NEW_DIR,
        "NUM_WORKERS": str(NUM_WORKERS),
    }
)

# exist_ok=True keeps the cell safe to re-run.
for directory in (
    ORIG_MODEL_DIR,
    TUNED_MODEL_DIR,
    IMAGES_REG_DIR,
    IMAGES_OWN_DIR,
    IMAGES_NEW_DIR,
):
    os.makedirs(directory, exist_ok=True)

In [33]:
import os

# Publish the model settings through the kernel's environment. A line such as
# `!export NAME=value` runs in its own short-lived shell and leaves nothing
# behind, whereas os.environ is inherited by every later `!command`.
os.environ["DATA_PREFIX"] = "/opt/app-root/src/text-to-image-demo/dim-dreambooth"
os.environ["ORIG_MODEL_DIR"] = os.environ["DATA_PREFIX"] + "/model-orig"
os.environ["ORIG_MODEL_NAME"] = "CompVis/stable-diffusion-v1-4"
os.environ["ORIG_MODEL_HASH"] = "b95be7d6f134c3a9e62ee616f310733567f069ce"


In [34]:
# Sanity check: print the data prefix, model name and model revision hash
# so an empty or wrong value is noticed before the download step.
!echo $DATA_PREFIX
!echo $ORIG_MODEL_NAME
!echo $ORIG_MODEL_HASH

/opt/app-root/src/text-to-image-demo/dim-dreambooth
CompVis/stable-diffusion-v1-4
b95be7d6f134c3a9e62ee616f310733567f069ce


## Step 1: Download the pre-trained model

In [35]:
# Print the kernel's current working directory before running the download script.
!pwd

/opt/app-root/src/text-to-image-demo/dim-dreambooth/dreambooth


In [42]:
# !python cache_model.py --model_dir=$ORIG_MODEL_DIR --model_name=$ORIG_MODEL_NAME --revision=$ORIG_MODEL_HASH

## Step 2: Supply images of your subject

In [39]:
  # Only uncomment one of the following:

  # Option 1: Use the dog dataset ---------
  # export CLASS_NAME="dog"
  # python download_example_dataset.py ./images/dog
  # export INSTANCE_DIR=./images/dog
  # ---------------------------------------

  # Option 2: Use the lego car dataset ----
  !export CLASS_NAME="car"
  !export INSTANCE_DIR=./images/lego-car
  # ---------------------------------------

  # Option 3: Use your own images ---------
  # export CLASS_NAME="<class-of-your-subject>"
  # export INSTANCE_DIR="/path/to/images/of/subject"
  # ---------------------------------------

  # Copy own images into IMAGES_OWN_DIR
  # cp -rf $INSTANCE_DIR/* "$IMAGES_OWN_DIR/"

## Step 3: Create the regularization images

In [50]:
import hashlib
from os import path

import time
import torch
import ray

from flags import run_model_flags
from generate_utils import get_pipeline


In [None]:
class StableDiffusionCallable:
    """Generate and save images for batches of prompts.

    An instance loads the pipeline once in ``__init__`` and is then called with
    batches of prompts; each generated image is written under ``output_dir``.
    """

    def __init__(self, model_dir, output_dir, lora_weights_dir=None):
        """Load the pipeline from ``model_dir`` and remember ``output_dir``.

        Args:
            model_dir: Location handed to ``get_pipeline``.
            output_dir: Directory that ``__call__`` saves images into.
            lora_weights_dir: Optional second argument for ``get_pipeline``.
        """
        print(f"Loading model from {model_dir}")
        self.pipeline = get_pipeline(model_dir, lora_weights_dir)
        self.pipeline.set_progress_bar_config(disable=True)
        if torch.cuda.is_available():
            self.pipeline.to("cuda")
        self.output_dir = output_dir

    def __call__(self, batch):
        """Generate images for every prompt in ``batch`` and save them.

        Args:
            batch: Mapping with parallel sequences under ``"idx"`` and ``"prompt"``.

        Returns:
            ``{"filename": [...]}`` listing every file written, in generation order.
        """
        filenames = []
        for i, prompt in zip(batch["idx"], batch["prompt"]):
            # Generate 1 image at a time to reduce memory consumption.
            for image in self.pipeline(prompt).images:
                # The content hash keeps file names unique per sample index.
                hash_image = hashlib.sha1(image.tobytes()).hexdigest()
                image_filename = path.join(self.output_dir, f"{i}-{hash_image}.jpg")
                image.save(image_filename)
                print(f"Saved {image_filename}")
                filenames.append(image_filename)
        return {"filename": filenames}


def run(args):
    """Generate ``num_samples_per_prompt`` images for each comma-separated prompt.

    Args:
        args: Object exposing ``prompts`` (comma-separated string),
            ``num_samples_per_prompt``, ``use_ray_data``, ``model_dir``,
            ``output_dir`` and ``lora_weights_dir``.
            NOTE(review): presumably built with the imported ``run_model_flags``
            helper — confirm against flags.py.
    """
    prompts = args.prompts.split(",")

    start_time = time.time()
    num_samples = len(prompts) * args.num_samples_per_prompt

    if args.use_ray_data:
        # Use Ray Data to perform batch inference to generate many images in parallel
        prompts_with_idxs = []
        for prompt in prompts:
            prompts_with_idxs.extend(
                [
                    {"idx": i, "prompt": prompt}
                    for i in range(args.num_samples_per_prompt)
                ]
            )

        prompt_ds = ray.data.from_items(prompts_with_idxs)
        num_workers = 4

        # Run the batch inference by consuming output with `take_all`.
        prompt_ds.map_batches(
            StableDiffusionCallable,
            compute=ray.data.ActorPoolStrategy(size=num_workers),
            fn_constructor_args=(args.model_dir, args.output_dir),
            num_gpus=1,
            # Never request a zero-sized batch when there are fewer samples
            # than workers.
            batch_size=max(1, num_samples // num_workers),
        ).take_all()

    else:
        # Generate images one by one
        stable_diffusion_predictor = StableDiffusionCallable(
            args.model_dir, args.output_dir, args.lora_weights_dir
        )
        for prompt in prompts:
            for i in range(args.num_samples_per_prompt):
                stable_diffusion_predictor({"idx": [i], "prompt": [prompt]})

    elapsed = time.time() - start_time
    print(
        f"Generated and saved {num_samples} images to {args.output_dir} in "
        f"{elapsed} seconds."
    )


# Nothing is generated when this cell runs; call `run(args)` from a later cell
# once an `args` object with the attributes listed above is available.

In [51]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-13.0.0-cp39-cp39-manylinux_2_28_x86_64.whl (40.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m282.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-13.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


The following cell is the shell-script version of Step 2 (supplying images of your subject):

In [None]:
%%bash
# Shell version of the dataset-selection step. The `%%bash` header makes the
# cell runnable: the body is shell, not Python, and the exports must share one
# process with the `cp` that reads them.
set -euo pipefail

# Only uncomment one of the following:

# Option 1: Use the dog dataset ---------
export CLASS_NAME="dog"
python download_example_dataset.py ./images/dog
export INSTANCE_DIR=./images/dog
# ---------------------------------------

# Option 2: Use the lego car dataset ----
# export CLASS_NAME="car"
# export INSTANCE_DIR=./images/lego-car
# ---------------------------------------

# Option 3: Use your own images ---------
# export CLASS_NAME="<class-of-your-subject>"
# export INSTANCE_DIR="/path/to/images/of/subject"
# ---------------------------------------

# Copy own images into IMAGES_OWN_DIR
# The ${VAR:?} guard aborts instead of copying into "/" when IMAGES_OWN_DIR is
# missing from the environment.
cp -rf "$INSTANCE_DIR"/* "${IMAGES_OWN_DIR:?IMAGES_OWN_DIR must be set}/"

In [52]:
# Create the regularization images: run generate.py for the single prompt
# "photo of a <CLASS_NAME>", 200 samples, with the Ray Data code path enabled.
# ORIG_MODEL_PATH, IMAGES_REG_DIR and CLASS_NAME must already be defined when
# this cell runs.
# NOTE(review): the recorded output of this cell ends in a traceback — confirm
# the three variables resolved to the intended values.
!python generate.py \
    --model_dir=$ORIG_MODEL_PATH \
    --output_dir=$IMAGES_REG_DIR \
    --prompts="photo of a $CLASS_NAME" \
    --num_samples_per_prompt=200 \
    --use_ray_data

2023-10-06 18:55:10,033	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m

Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m
2023-10-06 18:55:11,373	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(StableDiffusionCallable)]
2023-10-06 18:55:11,373	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-06 18:55:11,374	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[31m╭─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m─────────────────────[0m[31m─╮[0m
[31m│

In [None]:
# Bring the cluster up before submitting the job defined in the cells below.
cluster.up()

In [None]:
# Wait for the cluster before continuing.
# NOTE(review): presumably blocks until the cluster reports ready — confirm against the SDK.
cluster.wait_ready()

In [None]:
# Show the cluster summary again now that the cluster has been started.
cluster.details()

In [None]:
from codeflare_sdk.job.jobs import DDPJobDefinition

In [None]:
# Flat list of command-line tokens passed to the job script as script_args.
# Each option/value pair is grouped on its own line.
arg_list = [
    *("--model_name_or_path", "gpt2"),
    *("--dataset_name", "wikitext"),
    *("--dataset_config_name", "wikitext-2-raw-v1"),
    *("--per_device_train_batch_size", "2"),
    *("--per_device_eval_batch_size", "2"),
    # value-less switches
    "--do_train",
    "--do_eval",
    *("--output_dir", "/tmp/test-clm"),
    "--overwrite_output_dir",
]

In [None]:
# Define the job: which script to run, the arguments to pass it, and the
# requirements file handed to the scheduler.
# NOTE(review): the script and arguments target gpt2/wikitext rather than the
# Stable Diffusion workflow in the rest of this notebook — confirm this is intended.
jobdef = DDPJobDefinition(
    script="gpt_og.py",
    name="gpttest",
    scheduler_args={"requirements": "requirements_gpt.txt"},
    script_args=arg_list,
)

# Submit the definition to the cluster; `job` is the handle used by the
# status/log cells that follow.
job = jobdef.submit(cluster)

In [None]:
# Check the state of the job submitted above.
job.status()

Retrieve the raw log output at any time with:

In [None]:
# Fetch the raw log output of the submitted job.
job.logs()

View live updates for status, logs, and other information with:

In [None]:
# Show the dashboard URI for the cluster, where status and logs can be followed live.
cluster.cluster_dashboard_uri()

In [None]:
# Re-check the job state (same call as the earlier status cell).
job.status()

In [None]:
# Tear the cluster down once the job is finished.
cluster.down()

In [None]:
# End the session opened by auth.login() near the top of the notebook.
auth.logout()