From bc15998ca0a9834e552585c6df66dd2b0fd0f754 Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Sun, 26 Jun 2022 02:09:11 +0000 Subject: [PATCH 1/2] modified wandb related files --- README.md | 44 +++++++++++++----- pipeline/train_wandb.py | 99 +++++++++++++++++++++++++++++++++++++++++ scripts/jl_exp_wandb.sh | 51 +++++++++++++++++++++ 3 files changed, 182 insertions(+), 12 deletions(-) create mode 100644 pipeline/train_wandb.py create mode 100644 scripts/jl_exp_wandb.sh diff --git a/README.md b/README.md index fe83a0e..a2d67da 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,12 @@ This project shows how to realize MLOps in Git/GitHub. In order to achieve this 4. Run `dvc add [ADDED FILE OR DIRECTORY]` to track your data with DVC 5. Run `dvc remote add -d gdrive_storage gdrive://[ID of specific folder in gdrive]` to add Google Drive as the remote data storage 6. Run `dvc push`, then URL to auth is provided. Copy and paste it to the browser, and autheticate -7. Copy the content of `.dvc/tmp/gdrive-user-credentials.json` and put it as in [GitHub Secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository) with the name of `GDRIVE_CREDENTIALS` +7. Copy the content of `.dvc/tmp/gdrive-user-credentials.json` and put it as in [GitHub Secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository) with the name of `GDRIVE_CREDENTIAL` 8. Run `git add . && git commit -m "initial commit" && git push origin main` to keep the initial setup 9. Write your own pipeline under `pipeline` directory. Codes for basic image classification in TensorFlow are provided initially. 10. 
Run the following `dvc stage add` for training stage ```bash +# if you want to use Iterative Studio / DVCLive for tracking training progress $ dvc stage add -n train \ -p train.train_size,train.batch_size,train.epoch,train.lr \ -d pipeline/modeling.py -d pipeline/train.py -d data \ @@ -35,25 +36,44 @@ $ dvc stage add -n train \ --plots-no-cache dvclive/scalars/eval/sparse_categorical_accuracy.tsv \ -o outputs/model \ python pipeline/train.py outputs/model + +# if you want to use W&B for tracking training progress +$ dvc stage add -n train \ + -p train.train_size,train.batch_size,train.epoch,train.lr \ + -d pipeline/modeling.py -d pipeline/train.py -d data \ + -o outputs/model \ + python pipeline/train.py outputs/model ``` -10. Run the following `dvc stage add` for evaluate stage +11. Run the following `dvc stage add` for evaluate stage ```bash +# if you want to use Iterative Studio / DVCLive for tracking training progress $ dvc stage add -n evaluate \ -p evaluate.test,evaluate.batch_size \ -d pipeline/evaluate.py -d data/test -d outputs/model \ -M outputs/metrics.json \ python pipeline/evaluate.py outputs/model + +# if you want to use W&B for tracking training progress +$ dvc stage add -n evaluate \ + -p evaluate.test,evaluate.batch_size \ + -d pipeline/evaluate.py -d data/test -d outputs/model \ + python pipeline/evaluate.py outputs/model ``` -11. Update `params.yaml` as you need. -12. Run `git add . && git commit -m "add initial pipeline setup" && git push origin main` -13. Run `dvc repro` to run the pipeline initially -14. Run `dvc add outputs/model.tar.gz` to add compressed version of model -15. Run `dvc push outputs/model.tar.gz` -16. Run `echo "/pipeline/__pycache__" >> .gitignore` to ignore unnecessary directory -17. Run `git add . && git commit -m "add initial pipeline run" && git push origin main` -18. Add access token and user email of [JarvisLabs.ai](https://jarvislabs.ai/) to GitHub Secret as `JARVISLABS_ACCESS_TOKEN` and `JARVISLABS_USER_EMAIL` -19. 
Add GitHub access token to GitHub Secret as `GH_ACCESS_TOKEN` -20. Create a PR and write `#train` as in comment (you have to be the onwer of the repo) +12. Update `params.yaml` as you need. +13. Run `git add . && git commit -m "add initial pipeline setup" && git push origin main` +14. Run `dvc repro` to run the pipeline initially +15. Run `dvc add outputs/model.tar.gz` to add compressed version of model +16. Run `dvc push outputs/model.tar.gz` +17. Run `echo "/pipeline/__pycache__" >> .gitignore` to ignore unnecessary directory +18. Run `git add . && git commit -m "add initial pipeline run" && git push origin main` +19. Add access token and user email of [JarvisLabs.ai](https://jarvislabs.ai/) to GitHub Secret as `JARVISLABS_ACCESS_TOKEN` and `JARVISLABS_USER_EMAIL` +20. Add GitHub access token to GitHub Secret as `GH_ACCESS_TOKEN` +21. Create a PR and write `#train` as a comment (you have to be the owner of the repo) + +### W&B Integration Setup + +1. Add W&B's project name to GitHub Secret as `WANDB_PROJECT` +2. Add W&B's API KEY to GitHub Secret as `WANDB_API_KEY` ### HuggingFace Integration Setup diff --git a/pipeline/train_wandb.py b/pipeline/train_wandb.py new file mode 100644 index 0000000..cbb0830 --- /dev/null +++ b/pipeline/train_wandb.py @@ -0,0 +1,99 @@ +import os +import sys +import glob +import yaml +import json +import random +import tarfile +from pathlib import Path + +import tensorflow as tf +from tensorflow.keras.applications import resnet50 + +import modeling + +import wandb +from wandb.keras import WandbCallback + +if len(sys.argv) != 2: + sys.stderr.write("Arguments error. 
Usage:\n") + sys.stderr.write("\tpython prepare.py data-file\n") + sys.exit(1) + +params = yaml.safe_load(open("params.yaml"))["train"] +print(params) + +train = 'data'/Path(params['train']) +test = 'data'/Path(params['test']) +output = Path(sys.argv[1]) + +_image_feature_description = { + 'image': tf.io.FixedLenFeature([], tf.string), + 'label': tf.io.FixedLenFeature([], tf.int64), +} + +def _parse_image_function(example_proto): + features = tf.io.parse_single_example(example_proto, _image_feature_description) + image = tf.io.decode_png(features['image'], channels=3) # tf.io.decode_raw(features['image'], tf.uint8) + image = tf.image.resize(image, [224, 224]) + image = resnet50.preprocess_input(image) + + label = tf.cast(features['label'], tf.int32) + + return image, label + +def _read_dataset(epochs, batch_size, channel): + filenames = glob.glob(str(channel/'*.tfrecord')) + dataset = tf.data.TFRecordDataset(filenames) + + dataset = dataset.map(_parse_image_function, num_parallel_calls=4) + dataset = dataset.prefetch(tf.data.AUTOTUNE) + dataset = dataset.repeat(epochs) + dataset = dataset.shuffle(buffer_size=10 * batch_size) + dataset = dataset.batch(batch_size, drop_remainder=True) + + return dataset + +def make_tarfile(output_filename, source_dir): + with tarfile.open(output_filename, "w:gz") as tar: + tar.add(source_dir, arcname=os.path.basename(source_dir)) + +def run_train(): + project_name = os.environ["WANDB_PROJECT"] + wandb_key = os.environ["WANDB_API_KEY"] + wandb_run_name = os.environ["WANDB_RUN_NAME"] + + wandb.login( + anonymous="never", + key=wandb_key + ) + _ = wandb.init(project=project_name, + config=params, + name=wandb_run_name) + + train_size = params['train_size'] + train_step_size = train_size // params['batch_size'] + + train_ds = _read_dataset(params['epoch'], params['batch_size'], train) + test_ds = _read_dataset(params['epoch'], params['batch_size'], test) + + wandbCallback = WandbCallback(training_data=train_ds, + log_weights=(True), 
log_gradients=(True)) + + m = modeling._build_keras_model() + m = modeling._compile(m, float(params['lr'])) + + m.fit( + train_ds, + epochs=params['epoch'], + steps_per_epoch=train_step_size, + validation_data=test_ds, + callbacks=[wandbCallback]) + + m.save(output, + save_format='tf', + signatures=modeling._get_signature(m)) + + make_tarfile(f'{output}.tar.gz', output) + +run_train() \ No newline at end of file diff --git a/scripts/jl_exp_wandb.sh b/scripts/jl_exp_wandb.sh new file mode 100644 index 0000000..7590114 --- /dev/null +++ b/scripts/jl_exp_wandb.sh @@ -0,0 +1,51 @@ +#!/bin/sh + +# install gh cli +apt install sudo +curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | sudo dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg +echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null +sudo apt update +sudo apt install gh + +# grant gh access +export GH_TOKEN='$GH_ACCESS_TOKEN' +git config --global user.name "chansung" +git config --global user.email "deep.diver.csp@gmail.com" + +# set W&B specific keys +export WANDB_PROJECT='$WANDB_PROJECT' +export WANDB_API_KEY='$WANDB_API_KEY' + +# move to the repo +git clone https://github.com/codingpot/git-mlops.git + +# install dependencies +cd git-mlops +gh auth setup-git +git checkout $CUR_BRANCH +pip install -r requirements.txt +pip install git+https://github.com/jarvislabsai/jlclient.git + +# set Gdrive credential +mkdir .dvc/tmp +echo '$GDRIVE_CREDENTIAL' > .dvc/tmp/gdrive-user-credentials.json + +# pull data +dvc pull + +export WANDB_RUN_NAME=$CUR_BRANCH +dvc repro + +exp_result=$(dvc exp show --only-changed --md) +wandb_url="https://wandb.ai/codingpot/git-mlops" +gh pr comment $CUR_PR_ID --body "[Visit W&B Log Page for this Pull Request]($wandb_url)" + +git reset --hard + +echo ${exp_ids[$idx]} +echo ${exp_names[$idx]} +dvc add 
outputs/model.tar.gz +dvc push outputs/model.tar.gz + +VM_ID=$(tail -n 2 /home/.jarviscloud/jarvisconfig | head -n 1) +python clouds/jarvislabs.py vm destroy $CLOUD_AT $CLOUD_ID $VM_ID From 0ec7e02ddf9713c7ab96ee0baf028808d8104e99 Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Sun, 26 Jun 2022 02:44:01 +0000 Subject: [PATCH 2/2] split experiment action file --- .github/actions/exp/jl-dvc/action.yml | 67 ++++++++++++ .github/actions/exp/jl-wandb/action.yml | 68 ++++++++++++ .github/workflows/experiments.yml | 82 +++++++++++++++ .github/workflows/jarvislabs-experiments.yml | 105 ------------------- 4 files changed, 217 insertions(+), 105 deletions(-) create mode 100644 .github/actions/exp/jl-dvc/action.yml create mode 100644 .github/actions/exp/jl-wandb/action.yml create mode 100644 .github/workflows/experiments.yml delete mode 100644 .github/workflows/jarvislabs-experiments.yml diff --git a/.github/actions/exp/jl-dvc/action.yml b/.github/actions/exp/jl-dvc/action.yml new file mode 100644 index 0000000..ceaacfa --- /dev/null +++ b/.github/actions/exp/jl-dvc/action.yml @@ -0,0 +1,67 @@ +name: "Run experiments with JarvisLabs and DVCLive" +description: "Run experiments with JarvisLabs and DVCLive" +inputs: + CUR_BRANCH: + required: true + CUR_PR_ID: + required: true + GDRIVE_CREDENTIAL: + required: true + GH_ACCESS_TOKEN: + required: true + JARVISLABS_ID: + required: true + JARVISLABS_ACCESS_TOKEN: + required: true +runs: + using: "composite" + steps: + - name: prepare script + if: steps.check.outputs.triggered == 'true' + env: + CUR_BRANCH: ${{ steps.pr_data.outputs.branch }} + CUR_PR_ID: ${{ steps.pr_data.outputs.number }} + GDRIVE_CREDENTIAL: ${{ secrets.GDRIVE_CREDENTIAL }} + GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} + CLOUD_ID: ${{ env.JARVISLABS_ID }} + CLOUD_AT: ${{ env.JARVISLABS_ACCESS_TOKEN }} + run: | + envsubst \ + '$CUR_BRANCH, \ + $CUR_PR_ID, \ + $GDRIVE_CREDENTIAL, \ + $GH_ACCESS_TOKEN, \ + $CLOUD_ID, \ + $CLOUD_AT' \ + < scripts/experiments.sh 
\ + > scripts/experiments_tmp.sh + + - name: install jarvislabs client + if: steps.check.outputs.triggered == 'true' + run: | + pip install typer + pip install git+https://github.com/jarvislabsai/jlclient.git + + - name: add script to jarvislabs + id: add_script + if: steps.check.outputs.triggered == 'true' + run: | + python clouds/jarvislabs.py \ + script add \ + ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} scripts/experiments_tmp.sh \ + > outputs.txt + echo "::set-output name=script_id::$(cat outputs.txt)" + + - name: create vm on jarvislabs + if: steps.check.outputs.triggered == 'true' + run: | + python clouds/jarvislabs.py \ + vm create \ + ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} ${{ steps.add_script.outputs.script_id }} + + - name: remove script from jarvislabs + if: steps.check.outputs.triggered == 'true' + run: | + python clouds/jarvislabs.py \ + script remove \ + ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} ${{ steps.add_script.outputs.script_id }} \ No newline at end of file diff --git a/.github/actions/exp/jl-wandb/action.yml b/.github/actions/exp/jl-wandb/action.yml new file mode 100644 index 0000000..4296546 --- /dev/null +++ b/.github/actions/exp/jl-wandb/action.yml @@ -0,0 +1,68 @@ +name: "Run experiments with JarvisLabs and W&B" +description: "Run experiments with JarvisLabs and W&B" +inputs: + CUR_BRANCH: + required: true + CUR_PR_ID: + required: true + GDRIVE_CREDENTIAL: + required: true + GH_ACCESS_TOKEN: + required: true + JARVISLABS_ID: + required: true + JARVISLABS_ACCESS_TOKEN: + required: true + WANDB_PROJECT: + required: true + WANDB_API_KEY: + required: true +runs: + using: "composite" + steps: + - name: prepare script + env: + CUR_BRANCH: $CUR_BRANCH + CUR_PR_ID: $CUR_PR_ID + GDRIVE_CREDENTIAL: $GDRIVE_CREDENTIAL + GH_ACCESS_TOKEN: $GH_ACCESS_TOKEN + CLOUD_ID: $JARVISLABS_ID + CLOUD_AT: $JARVISLABS_ACCESS_TOKEN + WANDB_PROJECT: $WANDB_PROJECT + WANDB_API_KEY: $WANDB_API_KEY + run: | + envsubst \ + '$CUR_BRANCH, \ + $CUR_PR_ID, \ + 
$GDRIVE_CREDENTIAL, \ + $GH_ACCESS_TOKEN, \ + $CLOUD_ID, \ + $CLOUD_AT' \ + < scripts/jl_exp_wandb.sh \ + > scripts/jl_exp_wandb_tmp.sh + + - name: install jarvislabs client + run: | + pip install typer + pip install git+https://github.com/jarvislabsai/jlclient.git + + - name: add script to jarvislabs + id: add_script + run: | + python clouds/jarvislabs.py \ + script add \ + ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} scripts/jl_exp_wandb_tmp.sh \ + > outputs.txt + echo "::set-output name=script_id::$(cat outputs.txt)" + + - name: create vm on jarvislabs + run: | + python clouds/jarvislabs.py \ + vm create \ + ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} ${{ steps.add_script.outputs.script_id }} + + - name: remove script from jarvislabs + run: | + python clouds/jarvislabs.py \ + script remove \ + ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} ${{ steps.add_script.outputs.script_id }} \ No newline at end of file diff --git a/.github/workflows/experiments.yml b/.github/workflows/experiments.yml new file mode 100644 index 0000000..e28e93d --- /dev/null +++ b/.github/workflows/experiments.yml @@ -0,0 +1,82 @@ +name: Train Trigger + +on: + pull_request: + types: [opened] + + issue_comment: + types: [created] + +jobs: + experiments-on-jarvislabs: + runs-on: ubuntu-latest + steps: + - uses: octokit/request-action@v2.0.0 + name: GitHub API Request + id: request + with: + route: ${{ github.event.issue.pull_request.url }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Get PR information + id: pr_data + run: | + echo "::set-output name=number::${{ fromJson(steps.request.outputs.data).number }}" + echo "::set-output name=branch::${{ fromJson(steps.request.outputs.data).head.ref }}" + echo "::set-output name=repo_owner::${{ github.event.repository.owner.login }}" + echo "::set-output name=comment_owner::${{ github.event.sender.login }}" + echo "::set-output name=comment::${{ github.event.comment.body }}" + + - name: Extract comment + if: ${{ steps.pr_data.outputs.repo_owner == 
steps.pr_data.outputs.comment_owner }} + run: | + echo "Eligible!!" + + - uses: khan/pull-request-comment-trigger@v1.1.0 + name: Listen to comment on PR (training) + id: check_dvclive + if: ${{ steps.pr_data.outputs.repo_owner == steps.pr_data.outputs.comment_owner }} + with: + trigger: '#train --with dvclive' + env: + GITHUB_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + + - uses: khan/pull-request-comment-trigger@v1.1.0 + name: Listen to comment on PR (training) + id: check_wandb + if: ${{ steps.pr_data.outputs.repo_owner == steps.pr_data.outputs.comment_owner }} + with: + trigger: '#train --with wandb' + env: + GITHUB_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + + - uses: actions/checkout@v3 + name: clone branch of PR + with: + token: ${{ secrets.GITHUB_TOKEN }} + ref: ${{ steps.pr_data.outputs.branch }} + + - name: clone branch of PR + if: steps.check_dvclive.outputs.triggered == 'true' + uses: ./.github/actions/exp/jl-dvc + with: + CUR_BRANCH: ${{ steps.pr_data.outputs.branch }} + CUR_PR_ID: ${{ steps.pr_data.outputs.number }} + GDRIVE_CREDENTIAL: ${{ secrets.GDRIVE_CREDENTIAL }} + GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} + JARVISLABS_ID: ${{ secrets.JARVISLABS_USER_EMAIL }} + JARVISLABS_ACCESS_TOKEN: ${{ secrets.JARVISLABS_ACCESS_TOKEN }} + + - name: clone branch of PR + if: steps.check_wandb.outputs.triggered == 'true' + uses: ./.github/actions/exp/jl-wandb + with: + CUR_BRANCH: ${{ steps.pr_data.outputs.branch }} + CUR_PR_ID: ${{ steps.pr_data.outputs.number }} + GDRIVE_CREDENTIAL: ${{ secrets.GDRIVE_CREDENTIAL }} + GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} + JARVISLABS_ID: ${{ secrets.JARVISLABS_USER_EMAIL }} + JARVISLABS_ACCESS_TOKEN: ${{ secrets.JARVISLABS_ACCESS_TOKEN }} + WANDB_PROJECT: ${{ secrets.WANDB_PROJECT }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} \ No newline at end of file diff --git a/.github/workflows/jarvislabs-experiments.yml b/.github/workflows/jarvislabs-experiments.yml deleted file mode 100644 index ca8ab3d..0000000 --- 
a/.github/workflows/jarvislabs-experiments.yml +++ /dev/null @@ -1,105 +0,0 @@ -name: Train Trigger - -env: - CLOUD_ID: ${{ secrets.JARVISLABS_USER_EMAIL }} - CLOUD_AT: ${{ secrets.JARVISLABS_ACCESS_TOKEN }} - -on: - pull_request: - types: [opened] - - issue_comment: - types: [created] - -jobs: - experiments-on-jarvislabs: - runs-on: ubuntu-latest - steps: - - uses: octokit/request-action@v2.0.0 - name: GitHub API Request - id: request - with: - route: ${{ github.event.issue.pull_request.url }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Get PR informations - id: pr_data - run: | - echo "::set-output name=number::${{ fromJson(steps.request.outputs.data).number }}" - echo "::set-output name=branch::${{ fromJson(steps.request.outputs.data).head.ref }}" - echo "::set-output name=repo_owner::${{ github.event.repository.owner.login }}" - echo "::set-output name=comment_owner::${{ github.event.sender.login }}" - echo "::set-output name=comment::${{ github.event.comment.body }}" - - - name: Extract comment - if: ${{ steps.pr_data.outputs.repo_owner == steps.pr_data.outputs.comment_owner }} - run: | - echo "Eligible!!" 
- - - uses: khan/pull-request-comment-trigger@v1.1.0 - name: Listen to comment on PR (training) - id: check - if: ${{ steps.pr_data.outputs.repo_owner == steps.pr_data.outputs.comment_owner }} - with: - trigger: '#train' - reaction: rocket - env: - GITHUB_TOKEN: '${{ secrets.GITHUB_TOKEN }}' - - - uses: actions/checkout@v3 - name: clone branch of PR - if: steps.check.outputs.triggered == 'true' - with: - token: ${{ secrets.GITHUB_TOKEN }} - ref: ${{ steps.pr_data.outputs.branch }} - - - name: prepare script - if: steps.check.outputs.triggered == 'true' - env: - CUR_BRANCH: ${{ steps.pr_data.outputs.branch }} - CUR_PR_ID: ${{ steps.pr_data.outputs.number }} - GDRIVE_CREDENTIAL: ${{ secrets.GDRIVE_CREDENTIAL }} - GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} - CLOUD_ID: ${{ env.CLOUD_ID }} - CLOUD_AT: ${{ env.CLOUD_AT }} - run: | - envsubst \ - '$CUR_BRANCH, \ - $CUR_PR_ID, \ - $GDRIVE_CREDENTIAL, \ - $GH_ACCESS_TOKEN, \ - $CLOUD_ID, \ - $CLOUD_AT' \ - < scripts/experiments.sh \ - > scripts/experiments_tmp.sh - - - name: install jarvislabs client - if: steps.check.outputs.triggered == 'true' - run: | - pip install typer - pip install git+https://github.com/jarvislabsai/jlclient.git - - - name: add script to jarvislabs - id: add_script - if: steps.check.outputs.triggered == 'true' - run: | - python clouds/jarvislabs.py \ - script add \ - ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} scripts/experiments_tmp.sh \ - > outputs.txt - echo "::set-output name=script_id::$(cat outputs.txt)" - - - name: create vm on jarvislabs - if: steps.check.outputs.triggered == 'true' - run: | - python clouds/jarvislabs.py \ - vm create \ - ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} ${{ steps.add_script.outputs.script_id }} - - - name: remove script from jarvislabs - if: steps.check.outputs.triggered == 'true' - run: | - python clouds/jarvislabs.py \ - script remove \ - ${{ env.CLOUD_AT }} ${{ env.CLOUD_ID }} ${{ steps.add_script.outputs.script_id }} \ No newline at end of file