In [1]:
import os
import sys
from getpass import getpass

from codeflare_sdk.cluster.auth import TokenAuthentication
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration

In [2]:
# Create the training and evaluation datasets.
# This can be run only once.
!{sys.executable} -m pip install datasets
import create_dataset
create_dataset.main()

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pyarrow-hotfix
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting huggingface-hub>=0.21.2
  Downloading huggingface_hub-0.23.4-py3-none-any.whl (402 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m402.6/402.6 kB[0m [31m281.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.8/193.8 kB[0m [31m271.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m251.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhas

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

In [13]:
# Authenticate the CodeFlare SDK
# On OpenShift, you can retrieve the token by running `oc whoami -t`,
# and the server with `oc cluster-info`.
# The token is a credential and must never be stored in the notebook: it is read
# from the OPENSHIFT_TOKEN environment variable, or prompted for without echo.
openshift_token = os.environ.get("OPENSHIFT_TOKEN") or getpass("OpenShift API token: ")
# The API server URL can be overridden with the OPENSHIFT_SERVER environment variable.
openshift_server = os.environ.get(
    "OPENSHIFT_SERVER",
    "https://api.cluster-m5t2c.m5t2c.sandbox3238.opentlc.com:6443",
)
auth = TokenAuthentication(
    token=openshift_token,
    server=openshift_server,
    # NOTE(review): skip_tls=True presumably disables TLS certificate verification;
    # confirm this is only used against sandbox clusters.
    skip_tls=True,
)
# Last expression of the cell, so the login result is shown as the cell output.
auth.login()



'Logged into https://api.cluster-m5t2c.m5t2c.sandbox3238.opentlc.com:6443'

In [10]:
# Configure the Ray cluster
# The configuration is built first, then handed to the Cluster object that the
# following cells operate on.
ray_cluster_config = ClusterConfiguration(
    # Identity and placement
    name='ray',
    namespace='ray-finetune-llm-deepspeed',
    local_queue="local-queue",
    image="quay.io/rhoai/ray:2.23.0-py39-cu121-torch",
    # Head node resources
    head_cpus=16,
    head_memory=48,
    head_gpus=1,
    # Worker resources (min and max are set to the same value)
    num_workers=5,
    min_cpus=8,
    max_cpus=8,
    min_memory=48,
    max_memory=48,
    num_gpus=1,
)
cluster = Cluster(ray_cluster_config)

Yaml resources loaded for ray


In [14]:
# Create the Ray cluster
# Uses the `cluster` object built from the ClusterConfiguration.
cluster.up()

No instances found, nothing to be done.


In [15]:
# Wait for the requested cluster resources to be set up before continuing.
cluster.wait_ready()

Waiting for requested resources to be set up...


KeyboardInterrupt: 

In [None]:
# Presumably reports the details of the Ray cluster; as the last expression of the
# cell, its result is rendered as the cell output.
cluster.details()

In [40]:
# Initialize the Job Submission Client
# `client` is used in the following cells to submit and to stop the job.
client = cluster.job_client

In [None]:
# The S3 bucket in which the checkpoints are stored.
# Set it manually below; when left empty, it is retrieved from the configured
# data connection through the AWS_S3_BUCKET environment variable.
s3_bucket = ""
s3_bucket = s3_bucket or os.environ.get('AWS_S3_BUCKET')
assert s3_bucket, "An S3 bucket must be provided to store checkpoints"

In [None]:
# Forward only the AWS settings that are actually defined: os.environ.get() yields
# None for an unset variable, and Ray expects every "env_vars" value to be a string,
# so passing None would make the submission fail.
aws_env_vars = {
    name: os.environ[name]
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION")
    if name in os.environ
}

# Submit the fine-tuning job; the returned submission id identifies it afterwards.
submission_id = client.submit_job(
    # NOTE(review): --num-devices=8, while the cluster configuration requests
    # 5 workers + 1 head with one GPU each; confirm the cluster can provide 8 devices.
    entrypoint="python ray_finetune_llm_deepspeed.py "
               "--model-name=TinyLlama/TinyLlama_v1.1 "
               "--lora "
               "--num-devices=8 "
               "--num-epochs=3 "
               "--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json "
               f"--storage-path=s3://{s3_bucket}/ray_finetune_llm_deepspeed/ "
               "--batch-size-per-device=32 "
               "--eval-batch-size-per-device=32 ",
    runtime_env={
        "env_vars": aws_env_vars,
        # Dependencies are taken from the requirements file in the working directory.
        "pip": "requirements.txt",
        # The current directory is the job's working directory; documentation and
        # notebooks are excluded from it.
        "working_dir": "./",
        "excludes": ["/docs/", "*.ipynb", "*.md"]
    },
)
print(submission_id)

In [None]:
# Stop the job identified by `submission_id` (the value returned by client.submit_job).
client.stop_job(submission_id)

In [42]:
# Counterpart of cluster.up(): presumably tears the Ray cluster down and releases
# its resources. TODO confirm against the CodeFlare SDK documentation.
cluster.down()