# Train Stable Diffusion with CodeFlare

In [10]:
!pwd

/opt/app-root/src/text-to-image-demo


In [58]:
# !pip install -Ur dim-dreambooth/dreambooth/requirements.txt

## Prep Infra

In [63]:
# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication

In [64]:
# Create authentication object for user permissions
# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config
# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually
auth = TokenAuthentication(
    token = "sha256~4ExAoqZIHMRd89RZjrnNF8yUODITzIzNh3mn65MBAWE",
    server = "https://api.aisrhods-dim.u1hh.p1.openshiftapps.com:6443",
    skip_tls=False
)
auth.login()

'Logged into https://api.aisrhods-dim.u1hh.p1.openshiftapps.com:6443'

In [None]:
cluster = Cluster(ClusterConfiguration(
    name='stab-diff',
    namespace='default',
    num_workers=2,
    min_cpus=2,
    max_cpus=2,
    min_memory=10,
    max_memory=20,
    num_gpus=2,
    instascale=True, #<---instascale enabled
    machine_types=["g4dn.xlarge", "g4dn.xlarge"],
))

In [None]:
cluster.details()

In [None]:
cluster.status()

In [None]:
# cluster.up()

## Step 0: Prep

In [26]:
!pwd

/opt/app-root/src/text-to-image-demo


In [67]:
# THIS IS ORIGINAL

# TODO: If running on multiple nodes, change this path to a shared directory (ex: NFS)
!export DATA_PREFIX="/opt/app-root/src/text-to-image-demo/dim-dreambooth"
!export ORIG_MODEL_NAME="CompVis/stable-diffusion-v1-4"
!export ORIG_MODEL_HASH="b95be7d6f134c3a9e62ee616f310733567f069ce"
!export ORIG_MODEL_DIR="$DATA_PREFIX/model-orig"
!export ORIG_MODEL_PATH="$ORIG_MODEL_DIR/models--${ORIG_MODEL_NAME/\//--}/snapshots/$ORIG_MODEL_HASH"
!export TUNED_MODEL_DIR="$DATA_PREFIX/model-tuned"
!export IMAGES_REG_DIR="$DATA_PREFIX/images-reg"
!export IMAGES_OWN_DIR="$DATA_PREFIX/images-own"
!export IMAGES_NEW_DIR="$DATA_PREFIX/images-new"
# TODO: Add more worker nodes and increase NUM_WORKERS for more data-parallelism
!export NUM_WORKERS=2

# !mkdir -p 
!echo $ORIG_MODEL_DIR 
# $TUNED_MODEL_DIR $IMAGES_REG_DIR $IMAGES_OWN_DIR $IMAGES_NEW_DIR
!mkdir -p $ORIG_MODEL_DIR $TUNED_MODEL_DIR $IMAGES_REG_DIR $IMAGES_OWN_DIR $IMAGES_NEW_DIR

/model-orig
mkdir: cannot create directory ‘/model-orig’: Permission denied
mkdir: cannot create directory ‘/model-tuned’: Permission denied
mkdir: cannot create directory ‘/images-reg’: Permission denied
mkdir: cannot create directory ‘/images-own’: Permission denied
mkdir: cannot create directory ‘/images-new’: Permission denied


In [46]:
# fix this
import os
# TODO: If running on multiple nodes, change this path to a shared directory (ex: NFS)
os.environ["DATA_PREFIX"]="/opt/app-root/src/text-to-image-demo/dim-dreambooth"
os.environ["ORIG_MODEL_NAME"]="CompVis/stable-diffusion-v1-4"
os.environ["ORIG_MODEL_HASH"]="b95be7d6f134c3a9e62ee616f310733567f069ce"
os.environ["ORIG_MODEL_DIR"]="$DATA_PREFIX/model-orig"
os.environ["ORIG_MODEL_PATH"]="$ORIG_MODEL_DIR/models--${ORIG_MODEL_NAME/\//--}/snapshots/$ORIG_MODEL_HASH"
TUNED_MODEL_DIR="$DATA_PREFIX/model-tuned"
IMAGES_REG_DIR="$DATA_PREFIX/images-reg"
IMAGES_OWN_DIR="$DATA_PREFIX/images-own"
IMAGES_NEW_DIR="$DATA_PREFIX/images-new"
# TODO: Add more worker nodes and increase NUM_WORKERS for more data-parallelism
NUM_WORKERS=2

# !mkdir -p 
# !echo $ORIG_MODEL_DIR 
# $TUNED_MODEL_DIR $IMAGES_REG_DIR $IMAGES_OWN_DIR $IMAGES_NEW_DIR
os.makedirs(ORIG_MODEL_DIR, exist_ok=True)
os.makedirs(TUNED_MODEL_DIR, exist_ok=True)
os.makedirs(IMAGES_REG_DIR, exist_ok=True)
os.makedirs(IMAGES_OWN_DIR, exist_ok=True)
os.makedirs(IMAGES_NEW_DIR, exist_ok=True)


In [53]:
!echo $DATA_PREFIX
!echo $ORIG_MODEL_NAME
!echo $ORIG_MODEL_HASH

/opt/app-root/src/text-to-image-demo/dim-dreambooth
CompVis/stable-diffusion-v1-4
b95be7d6f134c3a9e62ee616f310733567f069ce


## Step 1: Download the pre-trained model

In [55]:
!pwd

/opt/app-root/src/text-to-image-demo


In [57]:
!python dim-dreambooth/dreambooth/cache_model.py --model_dir=$ORIG_MODEL_DIR --model_name=$ORIG_MODEL_NAME --revision=$ORIG_MODEL_HASH

Traceback (most recent call last):
  File "/opt/app-root/src/text-to-image-demo/dim-dreambooth/dreambooth/cache_model.py", line 20, in <module>
    cache(args)
  File "/opt/app-root/src/text-to-image-demo/dim-dreambooth/dreambooth/cache_model.py", line 11, in cache
    os.makedirs(args.model_dir, exist_ok=True)
  File "/usr/lib64/python3.9/os.py", line 225, in makedirs
    mkdir(name, mode)
PermissionError: [Errno 13] Permission denied: '/model-orig'


## Step 2: Supply images of your subject

In [None]:
  # Only uncomment one of the following:

  # Option 1: Use the dog dataset ---------
  export CLASS_NAME="dog"
  python download_example_dataset.py ./images/dog
  export INSTANCE_DIR=./images/dog
  # ---------------------------------------

  # Option 2: Use the lego car dataset ----
  # export CLASS_NAME="car"
  # export INSTANCE_DIR=./images/lego-car
  # ---------------------------------------

  # Option 3: Use your own images ---------
  # export CLASS_NAME="<class-of-your-subject>"
  # export INSTANCE_DIR="/path/to/images/of/subject"
  # ---------------------------------------

  # Copy own images into IMAGES_OWN_DIR
  cp -rf $INSTANCE_DIR/* "$IMAGES_OWN_DIR/"

## Step 3: Create the regularization images

In [None]:
  python generate.py \
    --model_dir=$ORIG_MODEL_PATH \
    --output_dir=$IMAGES_REG_DIR \
    --prompts="photo of a $CLASS_NAME" \
    --num_samples_per_prompt=200 \
    --use_ray_data

In [None]:
cluster.up()

In [None]:
cluster.wait_ready()

In [None]:
cluster.details()

In [None]:
from codeflare_sdk.job.jobs import DDPJobDefinition

In [None]:
arg_list = [
    "--model_name_or_path", "gpt2",
    "--dataset_name", "wikitext",
    "--dataset_config_name", "wikitext-2-raw-v1",
    "--per_device_train_batch_size", "2",
    "--per_device_eval_batch_size", "2",
    "--do_train",
    "--do_eval",
    "--output_dir", "/tmp/test-clm",
    "--overwrite_output_dir"
]

In [None]:
jobdef = DDPJobDefinition(
    name="gpttest",
    script="gpt_og.py",
    script_args=arg_list,
    scheduler_args={"requirements": "requirements_gpt.txt"}
)
job = jobdef.submit(cluster)

In [None]:
job.status()

Retrieve raw log output at anytime with:

In [None]:
job.logs()

View live updates for status, logs, and other information with:

In [None]:
cluster.cluster_dashboard_uri()

In [None]:
job.status()

In [None]:
cluster.down()

In [None]:
auth.logout()