From ad7c2eef7a5d1c4785715553dc801c1db317c46e Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 13:59:14 -0700 Subject: [PATCH 01/32] [CI] Add GPU pytest + Submit jobs to AWS Batch through GitHub Actions --- .github/workflows/unittests.yml | 9 +----- test.sh | 11 ++++++++ tests/test_models.py | 5 ++-- tests/test_optimizer.py | 50 +++++++++++++++++---------------- 4 files changed, 41 insertions(+), 34 deletions(-) create mode 100644 test.sh diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 7ff2b0dfd0..5eca4cf107 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -30,16 +30,9 @@ jobs: with: python-version: ${{ matrix.python-version }} architecture: x64 - - name: Install Other Dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install --user setuptools pytest pytest-cov contextvars - python -m pip install --upgrade cython - python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python - python -m pip install --user -e .[extras] - name: Test project run: | - python -m pytest --cov=./ --cov-report=xml --durations=50 tests/ + python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "chmod +x test.sh | ./test.sh" --wait - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: diff --git a/test.sh b/test.sh new file mode 100644 index 0000000000..0f7d3204c6 --- /dev/null +++ b/test.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Shell script for installing dependencies and running test on AWS Batch + +# alias python3='/usr/bin/python3' + +python3 -m pip install --user -upgrade pip +python3 -m pip install --user setuptools pytest pytest-cov contextvars +python3 -m pip install --upgrade cython +python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python +python3 -m pip install --user -e .[extras] +python3 -m pytest --cov=./ --cov-report=xml --durations=50 tests/ \ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py index 0d35330387..61e1068bc6 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,8 +12,9 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) -def test_get_backbone(name): - with tempfile.TemporaryDirectory() as root: +@pytest.mark.parametrize('use_gpu', [mx.cpu(), mx.gpu()]) +def test_get_backbone(name, use_gpu): + with tempfile.TemporaryDirectory() as root, use_gpu: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 85a704f98c..6d2907d4d5 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -6,27 +6,29 @@ mx.npx.reset_np() -def test_adam(): - opt1 = AdamW - opt2 = AdamW - shapes = [(3, 4, 5), (10, 4), (7,)] - beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] - beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] - rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{'multi_precision': False}] # TODO(sxjscience) Test for FP16 - agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, - {'aggregate_num': 4}, {'aggregate_num': np.inf}] - correct_bias_options 
= [{'correct_bias': True}, {'correct_bias': False}] - for dtype in [np.float16, np.float32]: - for params in itertools.product(beta1_options, beta2_options, cg_options, - rg_options, wd_options, mp_options, - agg_options, correct_bias_options): - kwarg = {k: v for param in params for k, v in param.items()} - if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - compare_optimizer(opt1(use_fused_step=False, **kwarg), - opt2(use_fused_step=True, **kwarg), shapes, dtype, - rtol=1e-4, atol=2e-5) +@pytest.mark.parametrize('use_gpu', [mx.cpu(), mx.gpu()]) +def test_adam(use_gpu): + with use_gpu: + opt1 = AdamW + opt2 = AdamW + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{'multi_precision': False}] # TODO(sxjscience) Test for FP16 + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + correct_bias_options = [{'correct_bias': True}, {'correct_bias': False}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, + agg_options, correct_bias_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-4, atol=2e-5) From e9901c29e42837d1c22b5a7276a99cdb2d8cf4ec Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 15:19:34 -0700 Subject: [PATCH 02/32] [CI] Update GPU tests and parameters use --- conftest.py | 7 +++++++ tests/test_models.py | 1 - tests/test_optimizer.py | 1 - 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index 4ff9b028a8..ed4654697b 100644 --- a/conftest.py +++ b/conftest.py @@ -206,3 +206,10 @@ def doctest(doctest_namespace): doctest_namespace['gluon'] = mx.gluon import doctest doctest.ELLIPSIS_MARKER = '-etc-' + +def pytest_addoption(parser): + parser.addoption("--device", action="append", default=[], help="list of device choices to run the tests. 
ex: mx.gpu() (For GPU test only)") + +def pytest_generate_tests(metafunc): + if 'use_gpu' in metafunc.fixturenames: + metafunc.parametrize("use_gpu", metafunc.config.option.device) diff --git a/tests/test_models.py b/tests/test_models.py index 61e1068bc6..e1be3e0dcb 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,7 +12,6 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) -@pytest.mark.parametrize('use_gpu', [mx.cpu(), mx.gpu()]) def test_get_backbone(name, use_gpu): with tempfile.TemporaryDirectory() as root, use_gpu: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 6d2907d4d5..1de36c2f55 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -6,7 +6,6 @@ mx.npx.reset_np() -@pytest.mark.parametrize('use_gpu', [mx.cpu(), mx.gpu()]) def test_adam(use_gpu): with use_gpu: opt1 = AdamW From 84fac9110d845e0ee0e1907a81344646dd284264 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 15:44:52 -0700 Subject: [PATCH 03/32] [CI] Update CI pipeline --- .github/workflows/unittests.yml | 37 ++++++++++++++++++++++++++++++++- test.sh | 3 ++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 5eca4cf107..f657fe5056 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -30,10 +30,45 @@ jobs: with: python-version: ${{ matrix.python-version }} architecture: x64 + - name: Install Other Dependencies + run: | + python -m pip install --user --upgrade pip + python -m pip install --user setuptools pytest pytest-cov contextvars + python -m pip install --upgrade cython + python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python + python -m pip install --user -e .[extras] - name: Test project run: | - python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "chmod +x test.sh | ./test.sh" --wait + python -m pytest --cov=./ --cov-report=xml --durations=50 tests/ - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: env_vars: OS,PYTHON + + unittest-gpu: + runs-on: ubuntu-latest + strategy: + fail-fast: false + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Install Linux dependencies + run: sudo apt-get install libopenblas-dev + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Test project on AWS Batch + run: | + python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "chmod +x test.sh | ./test.sh" --wait + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1.0.10 + with: + env_vars: OS,PYTHON + diff --git a/test.sh b/test.sh index 0f7d3204c6..8160f86cab 100644 --- a/test.sh +++ b/test.sh @@ -3,9 +3,10 @@ # alias python3='/usr/bin/python3' +sudo apt-get install libopenblas-dev python3 -m pip install --user -upgrade pip python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user 
"mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 tests/ \ No newline at end of file +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="mx.gpu()" tests/ \ No newline at end of file From c2f80d9d95b7c457e50fb9825909595c51824660 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 15:48:59 -0700 Subject: [PATCH 04/32] [CI] Add new line --- test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 8160f86cab..3b8eea5a8a 100644 --- a/test.sh +++ b/test.sh @@ -9,4 +9,4 @@ python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="mx.gpu()" tests/ \ No newline at end of file +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="mx.gpu()" tests/ From e5ab220e50f649888d7f7cc8e932f062c3066d9a Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 16:08:20 -0700 Subject: [PATCH 05/32] [CI] Update pytest command for cpu test --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index f657fe5056..20bccb2456 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -39,7 +39,7 @@ jobs: python -m pip install --user -e .[extras] - name: Test project run: | - python -m pytest --cov=./ --cov-report=xml --durations=50 tests/ + python -m pytest --cov=./ --cov-report=xml --device="mx.cpu()" --durations=50 tests/ - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: From 0a6a1d3d0ebc62179c99ad555122f7a6aff2b98c Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 16:45:35 -0700 Subject: [PATCH 06/32] [CI] Update use_gpu to ctx + add permissions to test.sh --- conftest.py | 4 ++-- tests/test_models.py | 4 ++-- tests/test_optimizer.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index ed4654697b..bb84ce0051 100644 --- a/conftest.py +++ b/conftest.py @@ -211,5 +211,5 @@ def pytest_addoption(parser): parser.addoption("--device", action="append", default=[], help="list of device choices to run the tests. 
ex: mx.gpu() (For GPU test only)") def pytest_generate_tests(metafunc): - if 'use_gpu' in metafunc.fixturenames: - metafunc.parametrize("use_gpu", metafunc.config.option.device) + if 'ctx' in metafunc.fixturenames: + metafunc.parametrize("ctx", metafunc.config.option.device) diff --git a/tests/test_models.py b/tests/test_models.py index e1be3e0dcb..03491b6272 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,8 +12,8 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) -def test_get_backbone(name, use_gpu): - with tempfile.TemporaryDirectory() as root, use_gpu: +def test_get_backbone(name, ctx): + with tempfile.TemporaryDirectory() as root, ctx: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 1de36c2f55..48c2331a7a 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -6,8 +6,8 @@ mx.npx.reset_np() -def test_adam(use_gpu): - with use_gpu: +def test_adam(ctx): + with ctx: opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From 92b9e85665aad334859dd724643e7348a09b5fff Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 16:47:03 -0700 Subject: [PATCH 07/32] [CI] Update submitted command --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 20bccb2456..9502d0d490 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -65,7 +65,7 @@ jobs: - name: Test project on AWS Batch run: | - python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "chmod +x test.sh | ./test.sh" --wait + python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 From 749acece178e2137ed341e0ebe8437782263339c Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 20:58:55 -0700 Subject: [PATCH 08/32] [CI] De-stringify input to mxnet attribute --- .github/workflows/unittests.yml | 2 +- test.sh | 2 +- tests/test_models.py | 2 +- tests/test_optimizer.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) mode change 100644 => 100755 test.sh diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 9502d0d490..98a85dabd4 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -39,7 +39,7 @@ jobs: python -m pip install --user -e .[extras] - name: Test project run: | - python -m pytest --cov=./ --cov-report=xml --device="mx.cpu()" --durations=50 tests/ + python -m pytest --cov=./ --cov-report=xml --device="cpu" --durations=50 tests/ - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: diff --git a/test.sh b/test.sh old mode 100644 new mode 100755 index 3b8eea5a8a..9ebba20abd --- a/test.sh +++ b/test.sh @@ -9,4 +9,4 @@ python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ 
--cov-report=xml --durations=50 --device="mx.gpu()" tests/ +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" tests/ diff --git a/tests/test_models.py b/tests/test_models.py index 03491b6272..413941250b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -13,7 +13,7 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) def test_get_backbone(name, ctx): - with tempfile.TemporaryDirectory() as root, ctx: + with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 48c2331a7a..f5935fbef4 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -7,7 +7,7 @@ def test_adam(ctx): - with ctx: + with getattr(mx, ctx)(): opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From 44d0c5b6c295ee9c956db6fec61791c2f330e929 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 23 Aug 2020 12:27:14 -0700 Subject: [PATCH 09/32] [CI] Change pull_request event to pull_request_target event --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 98a85dabd4..2a054527b3 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -1,6 +1,6 @@ name: continuous build -on: [push, pull_request] +on: [push, pull_request_target] defaults: run: From 3e02d5fc2529a4838458d5cdad02b540a982cd1f Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 23 Aug 2020 13:26:34 -0700 Subject: [PATCH 10/32] [CI] Add new workflow for GPU unit tests --- .github/workflows/unittests-gpu.yml | 35 +++++++++++++++++++++++++++++ .github/workflows/unittests.yml | 31 ++----------------------- 2 files changed, 37 insertions(+), 29 deletions(-) create mode 100644 .github/workflows/unittests-gpu.yml diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml new file mode 100644 index 0000000000..36490b63ee --- /dev/null +++ b/.github/workflows/unittests-gpu.yml @@ -0,0 +1,35 @@ +name: continuous build - gpu + +on: [push, pull_request_target] + +defaults: + run: + shell: bash + +jobs: + unittest-gpu: + runs-on: ubuntu-latest + strategy: + fail-fast: false + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Install Linux dependencies + run: sudo apt-get install libopenblas-dev + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Test project on AWS Batch + run: | + python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1.0.10 + with: + env_vars: OS,PYTHON \ No newline at end of file diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 2a054527b3..ced8f9a1c8 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -1,6 +1,6 @@ name: continuous build -on: [push, pull_request_target] +on: [push, pull_request] defaults: run: @@ -43,32 +43,5 @@ jobs: - name: Upload coverage to Codecov uses: 
codecov/codecov-action@v1.0.10 with: - env_vars: OS,PYTHON - - unittest-gpu: - runs-on: ubuntu-latest - strategy: - fail-fast: false - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Install Linux dependencies - run: sudo apt-get install libopenblas-dev - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-1 - - - name: Test project on AWS Batch - run: | - python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1.0.10 - with: - env_vars: OS,PYTHON + env_vars: OS,PYTHON From d174fcf33a8f742df829f74d9415b7b292a2d0af Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 23 Aug 2020 22:18:43 -0700 Subject: [PATCH 11/32] [CI] Update unittests-gpu.yml --- .github/workflows/unittests-gpu.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 36490b63ee..3d840af095 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -18,6 +18,20 @@ jobs: - name: Install Linux dependencies run: sudo apt-get install libopenblas-dev + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: 3.7 + architecture: x64 + + - name: Install Other Dependencies + run: | + python -m pip install --user --upgrade pip + python -m pip install --user setuptools pytest pytest-cov contextvars + python -m pip install --upgrade cython + python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python + python -m pip install --user -e .[extras] + - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v1 with: @@ -32,4 +46,4 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: - env_vars: OS,PYTHON \ No newline at end of file + env_vars: OS,PYTHON From a73161a5e2fc3903f3ba53ee5d11891ff27d61e5 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 23 Aug 2020 22:25:14 -0700 Subject: [PATCH 12/32] [CI] Update unittests-gpu.yml --- .github/workflows/unittests-gpu.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 3d840af095..f278c9a067 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -27,9 +27,6 @@ jobs: - name: Install Other Dependencies run: | python -m pip install --user --upgrade pip - python -m pip install --user setuptools pytest pytest-cov contextvars - python -m pip install --upgrade cython - python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python python -m pip install --user -e .[extras] - name: Configure AWS Credentials @@ -41,7 +38,7 @@ jobs: - name: Test project on AWS Batch run: | - python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait - name: Upload 
coverage to Codecov uses: codecov/codecov-action@v1.0.10 From 994c2c1ee04c73313a7ffb933453e384c28f00e6 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 24 Aug 2020 12:41:52 -0700 Subject: [PATCH 13/32] [CI] Update path of test.sh --- .github/workflows/unittests-gpu.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index f278c9a067..72cac182f3 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,9 +38,5 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1.0.10 - with: - env_vars: OS,PYTHON From 39d23512a279d07f5bae0f4d387b4740b7257b9f Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 24 Aug 2020 12:57:28 -0700 Subject: [PATCH 14/32] [CI] Update path of /test --- test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 9ebba20abd..1053f2ac06 100755 --- a/test.sh +++ b/test.sh @@ -3,10 +3,12 @@ # alias python3='/usr/bin/python3' +echo $PWD + sudo apt-get install libopenblas-dev python3 -m pip install --user -upgrade pip python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" tests/ +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ From 43bb922b17a4faaa47ed124d529aace0025850de Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 24 Aug 2020 13:16:36 -0700 Subject: [PATCH 15/32] [CI] Update remote to barry-jin/gluon-nlp --- .github/workflows/unittests-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 72cac182f3..6e9e41c14b 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,5 +38,5 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait From 006305294e6a44ed9fad4cc9ad68a0eb165a7a47 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 24 Aug 2020 14:35:21 -0700 Subject: [PATCH 16/32] [CI] Update remote to dmlc/gluon-nlp --- .github/workflows/unittests-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 6e9e41c14b..72cac182f3 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,5 +38,5 @@ jobs: - name: Test project on AWS Batch run: | - python 
./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait From 68f814fdfb3e38bcd1111fd1e1085c74ee1bc238 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 10:04:04 -0700 Subject: [PATCH 17/32] [CI] Add gpu tests for attention cells, bert, electra + Update README --- tests/README.md | 16 +- tests/test_attention_cell.py | 634 ++++++++++++++++++----------------- tests/test_models_bert.py | 143 ++++---- tests/test_models_electra.py | 67 ++-- 4 files changed, 439 insertions(+), 421 deletions(-) diff --git a/tests/README.md b/tests/README.md index 69e08e039e..ceeaeaf68f 100644 --- a/tests/README.md +++ b/tests/README.md @@ -3,13 +3,25 @@ To run the unittests, use the following command ```bash -python3 -m pytest . +python3 -m pytest --device="cpu" . ``` To test for certain file, e.g., the `test_models_transformer.py`, use the following command ```bash -python3 -m pytest test_models_transformer +python3 -m pytest --device="cpu" test_models_transformer.py +``` + +To test only for gpu device, use the following command + +```bash +python3 -m pytest --device="gpu" test_models_transformer.py +``` + +To test both for cpu and gpu device, use the following command + +```bash +python3 -m pytest --device="cpu" --device="gpu" test_models_transformer.py ``` Refer to the [official guide of pytest](https://docs.pytest.org/en/latest/) for more details. diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index 3b874b0d55..1964f9db4c 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -17,161 +17,163 @@ @pytest.mark.parametrize('hybridize', [True, False]) @pytest.mark.parametrize('rel_score_type', ['share_head', 'no_share_head', 'no']) @pytest.mark.seed(123) -def test_multi_head_dot_attention_cell(num_heads, scaled, normalized, hybridize, rel_score_type): - batch_size = 5 - query_length, mem_length = 16, 32 - query_head_units = 8 - mem_head_units = 6 - query_units = query_head_units * num_heads - mem_units = mem_head_units * num_heads - seed = 100 - attn_cells = dict() - for layout in ['NKT', 'NTK', 'TNK']: - for use_einsum in [False, True]: - attn_cells[(layout, use_einsum)] = MultiHeadAttentionCell( - query_units=query_units, - num_heads=num_heads, - attention_dropout=0.0, - scaled=scaled, - normalized=normalized, - layout=layout, - use_einsum=use_einsum) - if hybridize: - attn_cells[(layout, use_einsum)].hybridize() - # Generate the data - query_np = np.random.normal(0, 1, (batch_size, num_heads, query_length, query_head_units)) - key_np = np.random.normal(0, 1, (batch_size, num_heads, mem_length, query_head_units)) - value_np = np.random.normal(0, 1, (batch_size, num_heads, mem_length, mem_head_units)) - mask_np = np.random.randint(0, 2, (batch_size, query_length, mem_length)) - if rel_score_type == 'share_head': - rel_scores_np = np.random.normal(0, 1, (query_length, mem_length)) - elif rel_score_type == 'no_share_head': - rel_scores_np = np.random.normal(0, 1, (num_heads, query_length, mem_length)) - else: - rel_scores_np = None - out_np = None - score_np = None - attn_weights_np = None - stored_layout = None - query_grad_np = None - key_grad_np = None - value_grad_np = None - rel_scores_grad_np = None - for 
(layout, use_einsum), attn_cell in attn_cells.items(): - mx.npx.random.seed(seed) - if rel_score_type != 'no': - rel_scores = mx.np.array(rel_scores_np, dtype=np.float32) - else: - rel_scores = None - if layout == 'NKT': - query = mx.np.array(query_np, dtype=np.float32) - key = mx.np.array(key_np, dtype=np.float32) - value = mx.np.array(value_np, dtype=np.float32) - elif layout == 'NTK': - query = mx.np.array(query_np.transpose((0, 2, 1, 3)), dtype=np.float32) - key = mx.np.array(key_np.transpose((0, 2, 1, 3)), dtype=np.float32) - value = mx.np.array(value_np.transpose((0, 2, 1, 3)), dtype=np.float32) - elif layout == 'TNK': - query = mx.np.array(query_np.transpose((2, 0, 1, 3)), dtype=np.float32) - key = mx.np.array(key_np.transpose((2, 0, 1, 3)), dtype=np.float32) - value = mx.np.array(value_np.transpose((2, 0, 1, 3)), dtype=np.float32) - else: - raise NotImplementedError - mask = mx.np.array(mask_np, dtype=np.int32) - query.attach_grad() - key.attach_grad() - value.attach_grad() - if rel_scores is not None: - rel_scores.attach_grad() - with mx.autograd.record(): - out, [score, attn_weights] = attn_cell(query, key, value, mask, rel_scores) - out.backward() - if layout == 'NKT': - assert out.shape == (batch_size, query_length, num_heads * mem_head_units) - elif layout == 'NTK': - assert out.shape == (batch_size, query_length, num_heads * mem_head_units) - elif layout == 'TNK': - assert out.shape == (query_length, batch_size, num_heads * mem_head_units) +def test_multi_head_dot_attention_cell(num_heads, scaled, normalized, hybridize, rel_score_type, ctx): + with getattr(mx, ctx)(): + batch_size = 5 + query_length, mem_length = 16, 32 + query_head_units = 8 + mem_head_units = 6 + query_units = query_head_units * num_heads + mem_units = mem_head_units * num_heads + seed = 100 + attn_cells = dict() + for layout in ['NKT', 'NTK', 'TNK']: + for use_einsum in [False, True]: + attn_cells[(layout, use_einsum)] = MultiHeadAttentionCell( + query_units=query_units, + num_heads=num_heads, + attention_dropout=0.0, + scaled=scaled, + normalized=normalized, + layout=layout, + use_einsum=use_einsum) + if hybridize: + attn_cells[(layout, use_einsum)].hybridize() + # Generate the data + query_np = np.random.normal(0, 1, (batch_size, num_heads, query_length, query_head_units)) + key_np = np.random.normal(0, 1, (batch_size, num_heads, mem_length, query_head_units)) + value_np = np.random.normal(0, 1, (batch_size, num_heads, mem_length, mem_head_units)) + mask_np = np.random.randint(0, 2, (batch_size, query_length, mem_length)) + if rel_score_type == 'share_head': + rel_scores_np = np.random.normal(0, 1, (query_length, mem_length)) + elif rel_score_type == 'no_share_head': + rel_scores_np = np.random.normal(0, 1, (num_heads, query_length, mem_length)) else: - raise NotImplementedError - for i in range(num_heads): - assert_allclose(attn_weights[:, i, :, :][mask == 0].asnumpy(), - mask[mask == 0].astype(np.float32).asnumpy(), 1E-5, 1E-5) - - if stored_layout is None: - out_np = out.asnumpy() - score_np = score.asnumpy() - attn_weights_np = attn_weights.asnumpy() - stored_layout = layout - query_grad_np = query.grad.asnumpy() - key_grad_np = key.grad.asnumpy() - value_grad_np = value.grad.asnumpy() + rel_scores_np = None + out_np = None + score_np = None + attn_weights_np = None + stored_layout = None + query_grad_np = None + key_grad_np = None + value_grad_np = None + rel_scores_grad_np = None + for (layout, use_einsum), attn_cell in attn_cells.items(): + mx.npx.random.seed(seed) if rel_score_type != 'no': - 
rel_scores_grad_np = rel_scores.grad.asnumpy() - else: - assert stored_layout == 'NKT' - # Begin to match the output + rel_scores = mx.np.array(rel_scores_np, dtype=np.float32) + else: + rel_scores = None if layout == 'NKT': - m_out_np = out.asnumpy() - m_score_np = score.asnumpy() - m_attn_weights_np = attn_weights.asnumpy() - m_query_grad_np = query.grad.asnumpy() - m_key_grad_np = key.grad.asnumpy() - m_value_grad_np = value.grad.asnumpy() - if rel_score_type != 'no': - m_rel_scores_grad_np = rel_scores.grad.asnumpy() + query = mx.np.array(query_np, dtype=np.float32) + key = mx.np.array(key_np, dtype=np.float32) + value = mx.np.array(value_np, dtype=np.float32) elif layout == 'NTK': - m_out_np = out.asnumpy() - m_score_np = score.asnumpy() - m_attn_weights_np = attn_weights.asnumpy() - m_query_grad_np = query.grad.asnumpy().transpose((0, 2, 1, 3)) - m_key_grad_np = key.grad.asnumpy().transpose((0, 2, 1, 3)) - m_value_grad_np = value.grad.asnumpy().transpose((0, 2, 1, 3)) - if rel_score_type != 'no': - m_rel_scores_grad_np = rel_scores.grad.asnumpy() + query = mx.np.array(query_np.transpose((0, 2, 1, 3)), dtype=np.float32) + key = mx.np.array(key_np.transpose((0, 2, 1, 3)), dtype=np.float32) + value = mx.np.array(value_np.transpose((0, 2, 1, 3)), dtype=np.float32) elif layout == 'TNK': - m_out_np = out.asnumpy().transpose((1, 0, 2)) - m_score_np = score.asnumpy() - m_attn_weights_np = attn_weights.asnumpy() - m_query_grad_np = query.grad.asnumpy().transpose((1, 2, 0, 3)) - m_key_grad_np = key.grad.asnumpy().transpose((1, 2, 0, 3)) - m_value_grad_np = value.grad.asnumpy().transpose((1, 2, 0, 3)) - if rel_score_type != 'no': - m_rel_scores_grad_np = rel_scores.grad.asnumpy() + query = mx.np.array(query_np.transpose((2, 0, 1, 3)), dtype=np.float32) + key = mx.np.array(key_np.transpose((2, 0, 1, 3)), dtype=np.float32) + value = mx.np.array(value_np.transpose((2, 0, 1, 3)), dtype=np.float32) else: raise NotImplementedError - assert_allclose(m_out_np, out_np, 1E-5, 1E-5) - assert_allclose(m_score_np, score_np, 1E-5, 1E-5) - assert_allclose(m_attn_weights_np, attn_weights_np, 1E-5, 1E-5) - assert_allclose(m_query_grad_np, query_grad_np, 1E-5, 1E-5) - assert_allclose(m_key_grad_np, key_grad_np, 1E-5, 1E-5) - assert_allclose(m_value_grad_np, value_grad_np, 1E-5, 1E-5) - if rel_score_type != 'no': - assert_allclose(m_rel_scores_grad_np, rel_scores_grad_np, 1E-5, 1E-5) + mask = mx.np.array(mask_np, dtype=np.int32) + query.attach_grad() + key.attach_grad() + value.attach_grad() + if rel_scores is not None: + rel_scores.attach_grad() + with mx.autograd.record(): + out, [score, attn_weights] = attn_cell(query, key, value, mask, rel_scores) + out.backward() + if layout == 'NKT': + assert out.shape == (batch_size, query_length, num_heads * mem_head_units) + elif layout == 'NTK': + assert out.shape == (batch_size, query_length, num_heads * mem_head_units) + elif layout == 'TNK': + assert out.shape == (query_length, batch_size, num_heads * mem_head_units) + else: + raise NotImplementedError + for i in range(num_heads): + assert_allclose(attn_weights[:, i, :, :][mask == 0].asnumpy(), + mask[mask == 0].astype(np.float32).asnumpy(), 1E-5, 1E-5) + + if stored_layout is None: + out_np = out.asnumpy() + score_np = score.asnumpy() + attn_weights_np = attn_weights.asnumpy() + stored_layout = layout + query_grad_np = query.grad.asnumpy() + key_grad_np = key.grad.asnumpy() + value_grad_np = value.grad.asnumpy() + if rel_score_type != 'no': + rel_scores_grad_np = rel_scores.grad.asnumpy() + else: + assert 
stored_layout == 'NKT' + # Begin to match the output + if layout == 'NKT': + m_out_np = out.asnumpy() + m_score_np = score.asnumpy() + m_attn_weights_np = attn_weights.asnumpy() + m_query_grad_np = query.grad.asnumpy() + m_key_grad_np = key.grad.asnumpy() + m_value_grad_np = value.grad.asnumpy() + if rel_score_type != 'no': + m_rel_scores_grad_np = rel_scores.grad.asnumpy() + elif layout == 'NTK': + m_out_np = out.asnumpy() + m_score_np = score.asnumpy() + m_attn_weights_np = attn_weights.asnumpy() + m_query_grad_np = query.grad.asnumpy().transpose((0, 2, 1, 3)) + m_key_grad_np = key.grad.asnumpy().transpose((0, 2, 1, 3)) + m_value_grad_np = value.grad.asnumpy().transpose((0, 2, 1, 3)) + if rel_score_type != 'no': + m_rel_scores_grad_np = rel_scores.grad.asnumpy() + elif layout == 'TNK': + m_out_np = out.asnumpy().transpose((1, 0, 2)) + m_score_np = score.asnumpy() + m_attn_weights_np = attn_weights.asnumpy() + m_query_grad_np = query.grad.asnumpy().transpose((1, 2, 0, 3)) + m_key_grad_np = key.grad.asnumpy().transpose((1, 2, 0, 3)) + m_value_grad_np = value.grad.asnumpy().transpose((1, 2, 0, 3)) + if rel_score_type != 'no': + m_rel_scores_grad_np = rel_scores.grad.asnumpy() + else: + raise NotImplementedError + assert_allclose(m_out_np, out_np, 1E-5, 1E-5) + assert_allclose(m_score_np, score_np, 1E-5, 1E-5) + assert_allclose(m_attn_weights_np, attn_weights_np, 1E-5, 1E-5) + assert_allclose(m_query_grad_np, query_grad_np, 1E-5, 1E-5) + assert_allclose(m_key_grad_np, key_grad_np, 1E-5, 1E-5) + assert_allclose(m_value_grad_np, value_grad_np, 1E-5, 1E-5) + if rel_score_type != 'no': + assert_allclose(m_rel_scores_grad_np, rel_scores_grad_np, 1E-5, 1E-5) @pytest.mark.parametrize('scaled', [True, False]) @pytest.mark.parametrize('normalized', [True, False]) @pytest.mark.seed(123) -def test_dot_product_attention(scaled, normalized): - num_heads = 4 - batch_size = 32 - query_length, mem_length = 16, 32 - num_channel = 8 - query = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, num_channel)) - key = mx.np.random.normal(0, 1, (batch_size, num_heads, mem_length, num_channel)) - value = mx.np.random.normal(0, 1, (batch_size, num_heads, mem_length, num_channel)) - mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length)) - out, [score, attn_weights] = multi_head_dot_attn(mx.nd, query, key, value, mask, - scaled=scaled, normalized=normalized) - assert out.shape == (batch_size, query_length, num_heads * num_channel) - for i in range(num_heads): - assert_allclose(attn_weights[:, i, :, :][mask == 0].asnumpy(), - mask[mask == 0].astype(np.float32).asnumpy(), 1E-5, 1E-5) +def test_dot_product_attention(scaled, normalized, ctx): + with getattr(mx, ctx)(): + num_heads = 4 + batch_size = 32 + query_length, mem_length = 16, 32 + num_channel = 8 + query = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, num_channel)) + key = mx.np.random.normal(0, 1, (batch_size, num_heads, mem_length, num_channel)) + value = mx.np.random.normal(0, 1, (batch_size, num_heads, mem_length, num_channel)) + mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length)) + out, [score, attn_weights] = multi_head_dot_attn(mx.nd, query, key, value, mask, + scaled=scaled, normalized=normalized) + assert out.shape == (batch_size, query_length, num_heads * num_channel) + for i in range(num_heads): + assert_allclose(attn_weights[:, i, :, :][mask == 0].asnumpy(), + mask[mask == 0].astype(np.float32).asnumpy(), 1E-5, 1E-5) @pytest.mark.seed(123) -def test_gen_attn_mask(): +def 
test_gen_attn_mask(ctx): class GenSelfAttnMask(HybridBlock): def __init__(self, dtype, layout, attn_type): super().__init__() @@ -195,74 +197,75 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): return gen_mem_attn_mask(F, mem, mem_valid_length, data, valid_length, dtype=self._dtype, layout=self._layout) - batch_size = 4 - query_length = 8 - mem_length = 6 - nchannel = 5 - data = mx.np.random.normal(0, 1, (batch_size, query_length, nchannel), dtype=np.float32) - valid_length = mx.np.random.randint(1, query_length + 1, (batch_size,)) + with getattr(mx, ctx)(): + batch_size = 4 + query_length = 8 + mem_length = 6 + nchannel = 5 + data = mx.np.random.normal(0, 1, (batch_size, query_length, nchannel), dtype=np.float32) + valid_length = mx.np.random.randint(1, query_length + 1, (batch_size,)) - mem = mx.np.random.normal(0, 1, (batch_size, mem_length, nchannel), dtype=np.float32) - mem_valid_length = mx.np.random.randint(1, mem_length + 1, (batch_size,)) + mem = mx.np.random.normal(0, 1, (batch_size, mem_length, nchannel), dtype=np.float32) + mem_valid_length = mx.np.random.randint(1, mem_length + 1, (batch_size,)) - for hybridize in [False, True]: - # Test Full Attention Mask - mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='full') - mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='full') - if hybridize: - mask_gen_nt.hybridize() - mask_gen_tn.hybridize() - mask_nt = mask_gen_nt(data, valid_length) - mask_nt = mask_nt.asnumpy() - mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) - mask_tn = mask_tn.asnumpy() - mask = mask_nt - assert_allclose(mask_nt, mask_tn) - for b in range(batch_size): - v_l = valid_length.asnumpy()[b] - for i in range(v_l): - assert (mask[b, i, :v_l] == 1).all() - assert(mask[b, i, v_l:] == 0).all() - for i in range(v_l, query_length): - assert (mask[b, i, :] == 0).all() + for hybridize in [False, True]: + # Test Full Attention Mask + mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='full') + mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='full') + if hybridize: + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(data, valid_length) + mask_nt = mask_nt.asnumpy() + mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) + mask_tn = mask_tn.asnumpy() + mask = mask_nt + assert_allclose(mask_nt, mask_tn) + for b in range(batch_size): + v_l = valid_length.asnumpy()[b] + for i in range(v_l): + assert (mask[b, i, :v_l] == 1).all() + assert(mask[b, i, v_l:] == 0).all() + for i in range(v_l, query_length): + assert (mask[b, i, :] == 0).all() - # Test Causal Attention Mask - mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='causal') - mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='causal') - if hybridize: - mask_gen_nt.hybridize() - mask_gen_tn.hybridize() - mask_nt = mask_gen_nt(data, valid_length) - mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) - assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) - mask = mask_nt.asnumpy() - for b in range(batch_size): - v_l = valid_length.asnumpy()[b] - for i in range(v_l): - assert (mask[b, i, :(i + 1)] == 1).all() - assert (mask[b, i, (i + 1):] == 0).all() - for i in range(v_l, query_length): - assert (mask[b, i, :] == 0).all() + # Test Causal Attention Mask + mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='causal') + mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', 
attn_type='causal') + if hybridize: + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(data, valid_length) + mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) + assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) + mask = mask_nt.asnumpy() + for b in range(batch_size): + v_l = valid_length.asnumpy()[b] + for i in range(v_l): + assert (mask[b, i, :(i + 1)] == 1).all() + assert (mask[b, i, (i + 1):] == 0).all() + for i in range(v_l, query_length): + assert (mask[b, i, :] == 0).all() - # Test Mem Attention Mask - mask_gen_nt = GenMemAttnMask(dtype=np.float32, layout='NT') - mask_gen_tn = GenMemAttnMask(dtype=np.float32, layout='TN') - if hybridize: - mask_gen_nt.hybridize() - mask_gen_tn.hybridize() - mask_nt = mask_gen_nt(mem, mem_valid_length, data, valid_length) - mask_tn = mask_gen_tn(mx.np.swapaxes(mem, 0, 1), mem_valid_length, - mx.np.swapaxes(data, 0, 1), valid_length) - mask = mask_nt.asnumpy() - assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) - for b in range(batch_size): - data_v_l = valid_length.asnumpy()[b] - mem_v_l = mem_valid_length.asnumpy()[b] - for i in range(data_v_l): - assert (mask[b, i, :mem_v_l] == 1).all() - assert (mask[b, i, mem_v_l:] == 0).all() - for i in range(data_v_l, query_length): - assert (mask[b, i, :] == 0).all() + # Test Mem Attention Mask + mask_gen_nt = GenMemAttnMask(dtype=np.float32, layout='NT') + mask_gen_tn = GenMemAttnMask(dtype=np.float32, layout='TN') + if hybridize: + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(mem, mem_valid_length, data, valid_length) + mask_tn = mask_gen_tn(mx.np.swapaxes(mem, 0, 1), mem_valid_length, + mx.np.swapaxes(data, 0, 1), valid_length) + mask = mask_nt.asnumpy() + assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) + for b in range(batch_size): + data_v_l = valid_length.asnumpy()[b] + mem_v_l = mem_valid_length.asnumpy()[b] + for i in range(data_v_l): + assert (mask[b, i, :mem_v_l] == 1).all() + assert (mask[b, i, mem_v_l:] == 0).all() + for i in range(data_v_l, query_length): + assert (mask[b, i, :] == 0).all() @pytest.mark.parametrize('num_heads', [1, 2, 3]) @@ -270,118 +273,119 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): @pytest.mark.parametrize('bidirectional', [False, True]) @pytest.mark.parametrize('hybridize', [False, True]) @pytest.mark.seed(123) -def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize): - batch_size = 6 - query_length = 25 - mem_length = 20 - query_head_units = 7 +def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize, ctx): + with getattr(mx, ctx)(): + batch_size = 6 + query_length = 25 + mem_length = 20 + query_head_units = 7 - # Initialize the attention cell with relative positional embedding - base_layout = 'NKT' - base_use_einsum = False - if method == 'shaw': - num_buckets = None - max_distance = 20 - elif method == 't5': - num_buckets = 10 - max_distance = 20 - elif method == 'transformer_xl': - num_buckets = None - max_distance = None - else: - raise NotImplementedError - base_score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, - num_heads=num_heads, - dropout=0.0, - method=method, - num_buckets=num_buckets, - max_distance=max_distance, - layout=base_layout, - use_einsum=base_use_einsum) - base_score_cell.initialize() - if hybridize: - base_score_cell.hybridize() - # Generate the data - query = mx.np.random.normal(0, 1, - (batch_size, num_heads, query_length, query_head_units), - dtype=np.float32) - if 
method != 't5': - rel_score_grad = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, mem_length), - dtype=np.float32) - else: - rel_score_grad = mx.np.random.normal(0, 1, - (num_heads, query_length, mem_length), - dtype=np.float32) - query_positions = mx.np.arange(query_length, dtype=np.int32) - mem_positions = mx.np.arange(mem_length, dtype=np.int32) - rel_positions = mx.np.expand_dims(query_positions, axis=-1)\ - - mx.np.expand_dims(mem_positions, axis=0) - mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length), dtype=np.int32) - query.attach_grad() - with mx.autograd.record(): - rel_score = base_score_cell(rel_positions, query) - rel_score.backward(rel_score_grad) - original_rel_score = rel_score.asnumpy() - original_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) - original_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - assert original_grad_norm > 0 - # 1. Test for permutation equivariant - # We can permutate the query, rel_positions and the rel_score_grad and the result should - # always be the same. - query_perm = mx.np.array(np.random.permutation(query_length), dtype=np.int32) - mem_perm = mx.np.array(np.random.permutation(mem_length, ), dtype=np.int32) + # Initialize the attention cell with relative positional embedding + base_layout = 'NKT' + base_use_einsum = False + if method == 'shaw': + num_buckets = None + max_distance = 20 + elif method == 't5': + num_buckets = 10 + max_distance = 20 + elif method == 'transformer_xl': + num_buckets = None + max_distance = None + else: + raise NotImplementedError + base_score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, + num_heads=num_heads, + dropout=0.0, + method=method, + num_buckets=num_buckets, + max_distance=max_distance, + layout=base_layout, + use_einsum=base_use_einsum) + base_score_cell.initialize() + if hybridize: + base_score_cell.hybridize() + # Generate the data + query = mx.np.random.normal(0, 1, + (batch_size, num_heads, query_length, query_head_units), + dtype=np.float32) + if method != 't5': + rel_score_grad = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, mem_length), + dtype=np.float32) + else: + rel_score_grad = mx.np.random.normal(0, 1, + (num_heads, query_length, mem_length), + dtype=np.float32) + query_positions = mx.np.arange(query_length, dtype=np.int32) + mem_positions = mx.np.arange(mem_length, dtype=np.int32) + rel_positions = mx.np.expand_dims(query_positions, axis=-1)\ + - mx.np.expand_dims(mem_positions, axis=0) + mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length), dtype=np.int32) + query.attach_grad() + with mx.autograd.record(): + rel_score = base_score_cell(rel_positions, query) + rel_score.backward(rel_score_grad) + original_rel_score = rel_score.asnumpy() + original_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) + original_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + assert original_grad_norm > 0 + # 1. Test for permutation equivariant + # We can permutate the query, rel_positions and the rel_score_grad and the result should + # always be the same. 
+ query_perm = mx.np.array(np.random.permutation(query_length), dtype=np.int32) + mem_perm = mx.np.array(np.random.permutation(mem_length, ), dtype=np.int32) - query.grad[:] = 0 - with mx.autograd.record(): - rel_score = base_score_cell(rel_positions[query_perm, :][:, mem_perm], - query[:, :, query_perm, :]) + query.grad[:] = 0 + with mx.autograd.record(): + rel_score = base_score_cell(rel_positions[query_perm, :][:, mem_perm], + query[:, :, query_perm, :]) + if method != 't5': + rel_score.backward(rel_score_grad[:, :, query_perm, :][:, :, :, mem_perm]) + else: + rel_score.backward(rel_score_grad[:, query_perm, :][:, :, mem_perm]) + permutated_out = rel_score.asnumpy() + permutated_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) + permutated_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) if method != 't5': - rel_score.backward(rel_score_grad[:, :, query_perm, :][:, :, :, mem_perm]) + assert_allclose( + original_rel_score[:, :, query_perm.asnumpy(), :][:, :, :, mem_perm.asnumpy()], + permutated_out, 1E-4, 1E-4) else: - rel_score.backward(rel_score_grad[:, query_perm, :][:, :, mem_perm]) - permutated_out = rel_score.asnumpy() - permutated_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) - permutated_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - if method != 't5': - assert_allclose( - original_rel_score[:, :, query_perm.asnumpy(), :][:, :, :, mem_perm.asnumpy()], - permutated_out, 1E-4, 1E-4) - else: - assert_allclose(original_rel_score[:, query_perm.asnumpy(), :][:, :, mem_perm.asnumpy()], - permutated_out, 1E-4, 1E-4) - assert_allclose(permutated_grad_norm, original_grad_norm, 1E-4, 1E-4) - assert_allclose(permutated_query_grad_norm, original_query_grad_norm, 1E-4, 1E-4) - # 2. Test for different layout + use/not use einsum - for layout in ['NKT', 'NTK', 'TNK']: - for use_einsum in [False, True]: - if layout == base_layout and use_einsum == base_use_einsum: - continue - score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, - num_heads=num_heads, - dropout=0.0, - method=method, - num_buckets=num_buckets, - max_distance=max_distance, - layout=layout, - use_einsum=use_einsum) - score_cell.initialize() - if hybridize: - score_cell.hybridize() - score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) - query.attach_grad() - query.grad[:] = 0 - with mx.autograd.record(): - if layout == 'NKT': - rel_score = score_cell(rel_positions, query) - rel_score.backward(rel_score_grad) - elif layout == 'NTK': - rel_score = score_cell(rel_positions, query.transpose((0, 2, 1, 3))) - rel_score.backward(rel_score_grad) - elif layout == 'TNK': - rel_score = score_cell(rel_positions, query.transpose((2, 0, 1, 3))) - rel_score.backward(rel_score_grad) - else: - raise NotImplementedError - assert_allclose(rel_score.asnumpy(), original_rel_score, 1E-5, 1E-5) - layout_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - assert_allclose(layout_query_grad_norm, original_query_grad_norm, 1E-5, 1E-5) + assert_allclose(original_rel_score[:, query_perm.asnumpy(), :][:, :, mem_perm.asnumpy()], + permutated_out, 1E-4, 1E-4) + assert_allclose(permutated_grad_norm, original_grad_norm, 1E-4, 1E-4) + assert_allclose(permutated_query_grad_norm, original_query_grad_norm, 1E-4, 1E-4) + # 2. 
Test for different layout + use/not use einsum + for layout in ['NKT', 'NTK', 'TNK']: + for use_einsum in [False, True]: + if layout == base_layout and use_einsum == base_use_einsum: + continue + score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, + num_heads=num_heads, + dropout=0.0, + method=method, + num_buckets=num_buckets, + max_distance=max_distance, + layout=layout, + use_einsum=use_einsum) + score_cell.initialize() + if hybridize: + score_cell.hybridize() + score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) + query.attach_grad() + query.grad[:] = 0 + with mx.autograd.record(): + if layout == 'NKT': + rel_score = score_cell(rel_positions, query) + rel_score.backward(rel_score_grad) + elif layout == 'NTK': + rel_score = score_cell(rel_positions, query.transpose((0, 2, 1, 3))) + rel_score.backward(rel_score_grad) + elif layout == 'TNK': + rel_score = score_cell(rel_positions, query.transpose((2, 0, 1, 3))) + rel_score.backward(rel_score_grad) + else: + raise NotImplementedError + assert_allclose(rel_score.asnumpy(), original_rel_score, 1E-5, 1E-5) + layout_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + assert_allclose(layout_query_grad_norm, original_query_grad_norm, 1E-5, 1E-5) diff --git a/tests/test_models_bert.py b/tests/test_models_bert.py index f2a2ffdfc1..294582239e 100644 --- a/tests/test_models_bert.py +++ b/tests/test_models_bert.py @@ -12,87 +12,88 @@ def test_list_pretrained_bert(): @pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) -def test_bert_small_cfg(compute_layout): - cfg = BertModel.get_cfg() - cfg.defrost() - cfg.MODEL.vocab_size = 100 - cfg.MODEL.units = 12 * 4 - cfg.MODEL.hidden_size = 64 - cfg.MODEL.num_layers = 2 - cfg.MODEL.num_heads = 2 - cfg.MODEL.compute_layout = compute_layout - cfg.freeze() +def test_bert_small_cfg(compute_layout, ctx): + with getattr(mx, ctx)(): + cfg = BertModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.units = 12 * 4 + cfg.MODEL.hidden_size = 64 + cfg.MODEL.num_layers = 2 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() - # Generate TN layout - cfg_tn = cfg.clone() - cfg_tn.defrost() - cfg_tn.MODEL.layout = 'TN' - cfg_tn.freeze() + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() - # Sample data - batch_size = 4 - sequence_length = 8 - num_mask = 3 - inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) - token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) - valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) - masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + # Sample data + batch_size = 4 + sequence_length = 8 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) - # Test for BertModel - bert_model = BertModel.from_cfg(cfg) - bert_model.initialize() - bert_model.hybridize() - contextual_embedding, pooled_out = bert_model(inputs, token_types, valid_length) - bert_model_tn = BertModel.from_cfg(cfg_tn) - bert_model_tn.share_parameters(bert_model.collect_params()) - bert_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn = bert_model_tn(inputs.T, token_types.T, valid_length) - 
assert_allclose(contextual_embedding.asnumpy(), - mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + # Test for BertModel + bert_model = BertModel.from_cfg(cfg) + bert_model.initialize() + bert_model.hybridize() + contextual_embedding, pooled_out = bert_model(inputs, token_types, valid_length) + bert_model_tn = BertModel.from_cfg(cfg_tn) + bert_model_tn.share_parameters(bert_model.collect_params()) + bert_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = bert_model_tn(inputs.T, token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - # Test for BertForMLM - bert_mlm_model = BertForMLM(cfg) - bert_mlm_model.initialize() - bert_mlm_model.hybridize() - contextual_embedding, pooled_out, mlm_score = bert_mlm_model(inputs, token_types, - valid_length, masked_positions) - bert_mlm_model_tn = BertForMLM(cfg_tn) - bert_mlm_model_tn.share_parameters(bert_mlm_model.collect_params()) - bert_mlm_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn, mlm_score_tn =\ - bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) - assert_allclose(contextual_embedding.asnumpy(), - mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + # Test for BertForMLM + bert_mlm_model = BertForMLM(cfg) + bert_mlm_model.initialize() + bert_mlm_model.hybridize() + contextual_embedding, pooled_out, mlm_score = bert_mlm_model(inputs, token_types, + valid_length, masked_positions) + bert_mlm_model_tn = BertForMLM(cfg_tn) + bert_mlm_model_tn.share_parameters(bert_mlm_model.collect_params()) + bert_mlm_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, mlm_score_tn =\ + bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) - # Test for BertForPretrain - bert_pretrain_model = BertForPretrain(cfg) - bert_pretrain_model.initialize() - bert_pretrain_model.hybridize() - contextual_embedding, pooled_out, nsp_score, mlm_scores =\ - bert_pretrain_model(inputs, token_types, valid_length, masked_positions) - bert_pretrain_model_tn = BertForPretrain(cfg_tn) - bert_pretrain_model_tn.share_parameters(bert_pretrain_model.collect_params()) - bert_pretrain_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ - bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) - assert_allclose(contextual_embedding.asnumpy(), - mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + # Test for BertForPretrain + bert_pretrain_model = BertForPretrain(cfg) + bert_pretrain_model.initialize() + bert_pretrain_model.hybridize() + contextual_embedding, pooled_out, nsp_score, 
mlm_scores =\ + bert_pretrain_model(inputs, token_types, valid_length, masked_positions) + bert_pretrain_model_tn = BertForPretrain(cfg_tn) + bert_pretrain_model_tn.share_parameters(bert_pretrain_model.collect_params()) + bert_pretrain_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ + bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_bert()) -def test_bert_get_pretrained(model_name): +def test_bert_get_pretrained(model_name, ctx): assert len(list_pretrained_bert()) > 0 - with tempfile.TemporaryDirectory() as root: + with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): cfg, tokenizer, backbone_params_path, mlm_params_path =\ get_pretrained_bert(model_name, load_backbone=True, load_mlm=True, root=root) assert cfg.MODEL.vocab_size == len(tokenizer.vocab) diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 17f9420a07..6940af717d 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -26,47 +26,48 @@ def get_test_cfg(): @pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) -def test_electra_model(compute_layout): - cfg = get_test_cfg() - cfg.defrost() - cfg.MODEL.compute_layout = compute_layout - cfg.freeze() +def test_electra_model(compute_layout, ctx): + with getattr(mx, ctx)(): + cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() - # Generate TN layout - cfg_tn = cfg.clone() - cfg_tn.defrost() - cfg_tn.MODEL.layout = 'TN' - cfg_tn.freeze() + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() - # Sample data - batch_size = 4 - sequence_length = 16 - num_mask = 3 - inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) - token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) - valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) - masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + # Sample data + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) - electra_model = ElectraModel.from_cfg(cfg) - electra_model.initialize() - electra_model.hybridize() - contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length) - electra_model_tn = ElectraModel.from_cfg(cfg_tn) - electra_model_tn.share_parameters(electra_model.collect_params()) - electra_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn = electra_model_tn(inputs.T, token_types.T, valid_length) - assert_allclose(contextual_embedding.asnumpy(), - np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), - 1E-4, 1E-4) + electra_model = ElectraModel.from_cfg(cfg) + electra_model.initialize() + 
electra_model.hybridize() + contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length) + electra_model_tn = ElectraModel.from_cfg(cfg_tn) + electra_model_tn.share_parameters(electra_model.collect_params()) + electra_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = electra_model_tn(inputs.T, token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), + 1E-4, 1E-4) @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_electra()) -def test_electra_get_pretrained(model_name): +def test_electra_get_pretrained(model_name, ctx): assert len(list_pretrained_electra()) > 0 - with tempfile.TemporaryDirectory() as root: + with tempfile.TemporaryDirectory() as root, getattr(mx, ctx): cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\ get_pretrained_electra(model_name, root=root, load_backbone=True, load_disc=True, load_gen=True) From 76cf1c4787ec4d486cc409b4e9aa92d7c16c4efb Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 10:15:38 -0700 Subject: [PATCH 18/32] [CI] Change remote from dmlc to barry-jin --- .github/workflows/unittests-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 72cac182f3..6e9e41c14b 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,5 +38,5 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait From c0bfc6d761f988ca71f6dddae1e0640ee77a4299 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 12:32:25 -0700 Subject: [PATCH 19/32] [CI] Bug Fix --- tests/test_models_electra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 6940af717d..dcc20a76be 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -67,7 +67,7 @@ def test_electra_model(compute_layout, ctx): @pytest.mark.parametrize('model_name', list_pretrained_electra()) def test_electra_get_pretrained(model_name, ctx): assert len(list_pretrained_electra()) > 0 - with tempfile.TemporaryDirectory() as root, getattr(mx, ctx): + with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\ get_pretrained_electra(model_name, root=root, load_backbone=True, load_disc=True, load_gen=True) From b134ac132e5e51f82f2a5741b2f53d5bb1e43434 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 13:19:57 -0700 Subject: [PATCH 20/32] [CI] Truncate logs + Add failure test --- test.sh | 14 +++++++++++++- tests/test_optimizer.py | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 1053f2ac06..77e19177d1 100755 --- a/test.sh +++ b/test.sh @@ -11,4 +11,16 @@ python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user 
"mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ > output.txt + +flag=false +while IFS= read -r line; do + if $flag; then + echo $line + else + if [ "$line" == "/gluon-nlp/tools/batch" ]; then + echo $line + flag=true + fi + fi +done < output.txt diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index f5935fbef4..eac29f1265 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -8,6 +8,7 @@ def test_adam(ctx): with getattr(mx, ctx)(): + assert False opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From 91cd6f063f9c46880a21d6c3aa2414fa85eba6af Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 14:38:51 -0700 Subject: [PATCH 21/32] [CI] Duplicate script to submit test and get logs --- test.sh | 14 +-- tools/batch/submit-test.py | 174 +++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 13 deletions(-) create mode 100644 tools/batch/submit-test.py diff --git a/test.sh b/test.sh index 77e19177d1..1053f2ac06 100755 --- a/test.sh +++ b/test.sh @@ -11,16 +11,4 @@ python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ > output.txt - -flag=false -while IFS= read -r line; do - if $flag; then - echo $line - else - if [ "$line" == "/gluon-nlp/tools/batch" ]; then - echo $line - flag=true - fi - fi -done < output.txt +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ diff --git a/tools/batch/submit-test.py b/tools/batch/submit-test.py new file mode 100644 index 0000000000..38eb601073 --- /dev/null +++ b/tools/batch/submit-test.py @@ -0,0 +1,174 @@ +import argparse +import random +import re +import sys +import time +from datetime import datetime + +import boto3 +from botocore.compat import total_seconds + +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + +parser.add_argument('--profile', help='profile name of aws account.', type=str, + default=None) +parser.add_argument('--region', help='Default region when creating new connections', type=str, + default=None) +parser.add_argument('--name', help='name of the job', type=str, default='dummy') +parser.add_argument('--job-type', help='type of job to submit.', type=str, + choices=['g4dn.4x', 'g4dn.8x', 'g4dn.12x', 'g4dn.16x', + 'p3.2x', 'p3.8x', 'p3.16x', 'p3dn.24x', + 'c5n.18x'], default='g4dn.4x') +parser.add_argument('--source-ref', + help='ref in GluonNLP main github. e.g. master, refs/pull/500/head', + type=str, default='master') +parser.add_argument('--work-dir', + help='working directory inside the repo. e.g. scripts/preprocess', + type=str, default='scripts/preprocess') +parser.add_argument('--saved-output', + help='output to be saved, relative to working directory. 
' + 'it can be either a single file or a directory', + type=str, default='.') +parser.add_argument('--save-path', + help='s3 path where files are saved.', + type=str, default='batch/temp/{}'.format(datetime.now().isoformat())) +parser.add_argument('--command', help='command to run', type=str, + default='git rev-parse HEAD | tee stdout.log') +parser.add_argument('--remote', + help='git repo address. https://github.com/dmlc/gluon-nlp', + type=str, default="https://github.com/dmlc/gluon-nlp") +parser.add_argument('--wait', help='block wait until the job completes. ' + 'Non-zero exit code if job fails.', action='store_true') +parser.add_argument('--timeout', help='job timeout in seconds', default=None, type=int) + + +args = parser.parse_args() + +session = boto3.Session(profile_name=args.profile, region_name=args.region) +batch, cloudwatch = [session.client(service_name=sn) for sn in ['batch', 'logs']] + + +def printLogs(logGroupName, logStreamName, startTime): + kwargs = {'logGroupName': logGroupName, + 'logStreamName': logStreamName, + 'startTime': startTime, + 'startFromHead': True} + + lastTimestamp = 0 + printMessage = False + while True: + logEvents = cloudwatch.get_log_events(**kwargs) + + for event in logEvents['events']: + lastTimestamp = event['timestamp'] + timestamp = datetime.utcfromtimestamp(lastTimestamp / 1000.0).isoformat() + if printMessage: + print(event['message']) + else: + if event['message']=="/gluon-nlp/tools/batch": + printMessage = True + print(event['message']) + # print('[{}] {}'.format((timestamp + '.000')[:23] + 'Z', event['message'])) + + nextToken = logEvents['nextForwardToken'] + if nextToken and kwargs.get('nextToken') != nextToken: + kwargs['nextToken'] = nextToken + else: + break + return lastTimestamp + + +def nowInMillis(): + endTime = long(total_seconds(datetime.utcnow() - datetime(1970, 1, 1))) * 1000 + return endTime + + +job_definitions = { + 'g4dn.4x': 'gluon-nlp-1-jobs:5', + 'g4dn.8x': 'gluon-nlp-1-jobs:4', + 'g4dn.12x': 'gluon-nlp-1-4gpu-jobs:1', + 'g4dn.16x': 'gluon-nlp-1-jobs:3', + 'p3.2x': 'gluon-nlp-1-jobs:11', + 'p3.8x': 'gluon-nlp-1-4gpu-jobs:2', + 'p3.16x': 'gluon-nlp-1-8gpu-jobs:1', + 'p3dn.24x': 'gluon-nlp-1-8gpu-jobs:2', + 'c5n.18x': 'gluon-nlp-1-cpu-jobs:2', +} + +job_queues = { + 'g4dn.4x': 'g4dn', + 'g4dn.8x': 'g4dn', + 'g4dn.12x': 'g4dn-multi-gpu', + 'g4dn.16x': 'g4dn', + 'p3.2x': 'p3', + 'p3.8x': 'p3-4gpu', + 'p3.16x': 'p3-8gpu', + 'p3dn.24x': 'p3dn-8gpu', + 'c5n.18x': 'c5n', +} + + +def main(): + spin = ['-', '/', '|', '\\', '-', '/', '|', '\\'] + logGroupName = '/aws/batch/job' + + jobName = re.sub('[^A-Za-z0-9_\-]', '', args.name)[:128] # Enforce AWS Batch jobName rules + jobType = args.job_type + jobQueue = job_queues[jobType] + jobDefinition = job_definitions[jobType] + command = args.command.split() + wait = args.wait + + parameters = { + 'SOURCE_REF': args.source_ref, + 'WORK_DIR': args.work_dir, + 'SAVED_OUTPUT': args.saved_output, + 'SAVE_PATH': args.save_path, + 'COMMAND': args.command, + 'REMOTE': args.remote + } + kwargs = dict( + jobName=jobName, + jobQueue=jobQueue, + jobDefinition=jobDefinition, + parameters=parameters, + ) + if args.timeout is not None: + kwargs['timeout'] = {'attemptDurationSeconds': args.timeout} + submitJobResponse = batch.submit_job(**kwargs) + + jobId = submitJobResponse['jobId'] + print('Submitted job [{} - {}] to the job queue [{}]'.format(jobName, jobId, jobQueue)) + + spinner = 0 + running = False + status_set = set() + startTime = 0 + while wait: + time.sleep(random.randint(5, 10)) + 
describeJobsResponse = batch.describe_jobs(jobs=[jobId]) + status = describeJobsResponse['jobs'][0]['status'] + if status == 'SUCCEEDED' or status == 'FAILED': + print('=' * 80) + print('Job [{} - {}] {}'.format(jobName, jobId, status)) + + sys.exit(status == 'FAILED') + + elif status == 'RUNNING': + logStreamName = describeJobsResponse['jobs'][0]['container']['logStreamName'] + if not running: + running = True + print('\rJob [{}, {}] is RUNNING.'.format(jobName, jobId)) + if logStreamName: + print('Output [{}]:\n {}'.format(logStreamName, '=' * 80)) + if logStreamName: + startTime = printLogs(logGroupName, logStreamName, startTime) + 1 + elif status not in status_set: + status_set.add(status) + print('\rJob [%s - %s] is %-9s... %s' % (jobName, jobId, status, spin[spinner % len(spin)]),) + sys.stdout.flush() + spinner += 1 + + +if __name__ == '__main__': + main() From 837903dfa991056c25d3c79a6cb81bcf1d6af174 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 15:09:47 -0700 Subject: [PATCH 22/32] [CI] Update unittest-gpu --- .github/workflows/unittests-gpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 6e9e41c14b..78697f31fa 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,5 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-test.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait + From 074880c099a85f4e5b096dd7559ddbc592adf304 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 17:09:12 -0700 Subject: [PATCH 23/32] [CI] Quiet the pip install + Redirect the logs to script.log --- .github/workflows/unittests-gpu.yml | 6 +++--- test.sh | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 78697f31fa..c11ecdbf4e 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -26,8 +26,8 @@ jobs: - name: Install Other Dependencies run: | - python -m pip install --user --upgrade pip - python -m pip install --user -e .[extras] + python -m pip install --user --quiet --upgrade pip + python -m pip install --user --quiet -e .[extras] - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v1 @@ -38,6 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-test.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait | tee > script.log diff --git a/test.sh b/test.sh index 1053f2ac06..be6b9b6d3b 100755 --- a/test.sh +++ b/test.sh @@ -6,9 +6,9 @@ echo $PWD sudo apt-get install libopenblas-dev -python3 -m pip install --user -upgrade pip -python3 -m pip install --user setuptools pytest pytest-cov contextvars -python3 -m pip install --upgrade cython -python3 -m pip 
install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python -python3 -m pip install --user -e .[extras] +python3 -m pip install --user --quiet -upgrade pip +python3 -m pip install --user --quiet setuptools pytest pytest-cov contextvars +python3 -m pip install --upgrade --quiet cython +python3 -m pip install --pre --user --quiet "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python +python3 -m pip install --user --quiet -e .[extras] python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ From 061cdfb693c229891b400534ddf21f41257f8899 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 17:16:00 -0700 Subject: [PATCH 24/32] [CI] Remove asserts --- tests/test_optimizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index eac29f1265..f5935fbef4 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -8,7 +8,6 @@ def test_adam(ctx): with getattr(mx, ctx)(): - assert False opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From f8b87f47bc2166e843dcefeec0a5207c8bae6b69 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 17:33:32 -0700 Subject: [PATCH 25/32] [CI] Simplify ctx statement --- conftest.py | 2 +- tests/test_attention_cell.py | 8 ++++---- tests/test_models_bert.py | 4 ++-- tests/test_models_electra.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/conftest.py b/conftest.py index bb84ce0051..04efde9756 100644 --- a/conftest.py +++ b/conftest.py @@ -212,4 +212,4 @@ def pytest_addoption(parser): def pytest_generate_tests(metafunc): if 'ctx' in metafunc.fixturenames: - metafunc.parametrize("ctx", metafunc.config.option.device) + metafunc.parametrize("ctx", [getattr(mx, device)() for device in metafunc.config.option.device]) diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index 1964f9db4c..c3ddbcfd10 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -18,7 +18,7 @@ @pytest.mark.parametrize('rel_score_type', ['share_head', 'no_share_head', 'no']) @pytest.mark.seed(123) def test_multi_head_dot_attention_cell(num_heads, scaled, normalized, hybridize, rel_score_type, ctx): - with getattr(mx, ctx)(): + with ctx: batch_size = 5 query_length, mem_length = 16, 32 query_head_units = 8 @@ -155,7 +155,7 @@ def test_multi_head_dot_attention_cell(num_heads, scaled, normalized, hybridize, @pytest.mark.parametrize('normalized', [True, False]) @pytest.mark.seed(123) def test_dot_product_attention(scaled, normalized, ctx): - with getattr(mx, ctx)(): + with ctx: num_heads = 4 batch_size = 32 query_length, mem_length = 16, 32 @@ -197,7 +197,7 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): return gen_mem_attn_mask(F, mem, mem_valid_length, data, valid_length, dtype=self._dtype, layout=self._layout) - with getattr(mx, ctx)(): + with ctx: batch_size = 4 query_length = 8 mem_length = 6 @@ -274,7 +274,7 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): @pytest.mark.parametrize('hybridize', [False, True]) @pytest.mark.seed(123) def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize, ctx): - with getattr(mx, ctx)(): + with ctx: batch_size = 6 query_length = 25 mem_length = 20 diff --git a/tests/test_models_bert.py b/tests/test_models_bert.py index 294582239e..30ae207248 100644 --- a/tests/test_models_bert.py +++ b/tests/test_models_bert.py @@ -13,7 +13,7 @@ def test_list_pretrained_bert(): 
@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) def test_bert_small_cfg(compute_layout, ctx): - with getattr(mx, ctx)(): + with ctx: cfg = BertModel.get_cfg() cfg.defrost() cfg.MODEL.vocab_size = 100 @@ -93,7 +93,7 @@ def test_bert_small_cfg(compute_layout, ctx): @pytest.mark.parametrize('model_name', list_pretrained_bert()) def test_bert_get_pretrained(model_name, ctx): assert len(list_pretrained_bert()) > 0 - with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): + with tempfile.TemporaryDirectory() as root, ctx: cfg, tokenizer, backbone_params_path, mlm_params_path =\ get_pretrained_bert(model_name, load_backbone=True, load_mlm=True, root=root) assert cfg.MODEL.vocab_size == len(tokenizer.vocab) diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index dcc20a76be..998ee72f53 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -27,7 +27,7 @@ def get_test_cfg(): @pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) def test_electra_model(compute_layout, ctx): - with getattr(mx, ctx)(): + with ctx: cfg = get_test_cfg() cfg.defrost() cfg.MODEL.compute_layout = compute_layout @@ -67,7 +67,7 @@ def test_electra_model(compute_layout, ctx): @pytest.mark.parametrize('model_name', list_pretrained_electra()) def test_electra_get_pretrained(model_name, ctx): assert len(list_pretrained_electra()) > 0 - with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): + with tempfile.TemporaryDirectory() as root, ctx: cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\ get_pretrained_electra(model_name, root=root, load_backbone=True, load_disc=True, load_gen=True) From 86a4ff224f41a5bfa3f4175efb6316cce4132f94 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 17:40:20 -0700 Subject: [PATCH 26/32] [CI] Simplify ctx statement --- tests/test_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index f5935fbef4..48c2331a7a 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -7,7 +7,7 @@ def test_adam(ctx): - with getattr(mx, ctx)(): + with ctx: opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From b3c017a5021b30f618e1b9e08d74a62589f38d74 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 19:03:48 -0700 Subject: [PATCH 27/32] [CI] test_multi_head_rel_attn_score failed for gpu test --- tests/test_attention_cell.py | 221 +++++++++++++++++------------------ tests/test_models.py | 2 +- 2 files changed, 111 insertions(+), 112 deletions(-) diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index c3ddbcfd10..c7166f19c9 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -274,118 +274,117 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): @pytest.mark.parametrize('hybridize', [False, True]) @pytest.mark.seed(123) def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize, ctx): - with ctx: - batch_size = 6 - query_length = 25 - mem_length = 20 - query_head_units = 7 + batch_size = 6 + query_length = 25 + mem_length = 20 + query_head_units = 7 - # Initialize the attention cell with relative positional embedding - base_layout = 'NKT' - base_use_einsum = False - if method == 'shaw': - num_buckets = None - max_distance = 20 - elif method == 't5': - num_buckets = 10 - max_distance = 20 - elif method == 'transformer_xl': - num_buckets = None - max_distance = None - else: - raise 
NotImplementedError - base_score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, - num_heads=num_heads, - dropout=0.0, - method=method, - num_buckets=num_buckets, - max_distance=max_distance, - layout=base_layout, - use_einsum=base_use_einsum) - base_score_cell.initialize() - if hybridize: - base_score_cell.hybridize() - # Generate the data - query = mx.np.random.normal(0, 1, - (batch_size, num_heads, query_length, query_head_units), - dtype=np.float32) - if method != 't5': - rel_score_grad = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, mem_length), - dtype=np.float32) - else: - rel_score_grad = mx.np.random.normal(0, 1, - (num_heads, query_length, mem_length), - dtype=np.float32) - query_positions = mx.np.arange(query_length, dtype=np.int32) - mem_positions = mx.np.arange(mem_length, dtype=np.int32) - rel_positions = mx.np.expand_dims(query_positions, axis=-1)\ - - mx.np.expand_dims(mem_positions, axis=0) - mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length), dtype=np.int32) - query.attach_grad() - with mx.autograd.record(): - rel_score = base_score_cell(rel_positions, query) - rel_score.backward(rel_score_grad) - original_rel_score = rel_score.asnumpy() - original_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) - original_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - assert original_grad_norm > 0 - # 1. Test for permutation equivariant - # We can permutate the query, rel_positions and the rel_score_grad and the result should - # always be the same. - query_perm = mx.np.array(np.random.permutation(query_length), dtype=np.int32) - mem_perm = mx.np.array(np.random.permutation(mem_length, ), dtype=np.int32) + # Initialize the attention cell with relative positional embedding + base_layout = 'NKT' + base_use_einsum = False + if method == 'shaw': + num_buckets = None + max_distance = 20 + elif method == 't5': + num_buckets = 10 + max_distance = 20 + elif method == 'transformer_xl': + num_buckets = None + max_distance = None + else: + raise NotImplementedError + base_score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, + num_heads=num_heads, + dropout=0.0, + method=method, + num_buckets=num_buckets, + max_distance=max_distance, + layout=base_layout, + use_einsum=base_use_einsum) + base_score_cell.initialize() + if hybridize: + base_score_cell.hybridize() + # Generate the data + query = mx.np.random.normal(0, 1, + (batch_size, num_heads, query_length, query_head_units), + dtype=np.float32) + if method != 't5': + rel_score_grad = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, mem_length), + dtype=np.float32) + else: + rel_score_grad = mx.np.random.normal(0, 1, + (num_heads, query_length, mem_length), + dtype=np.float32) + query_positions = mx.np.arange(query_length, dtype=np.int32) + mem_positions = mx.np.arange(mem_length, dtype=np.int32) + rel_positions = mx.np.expand_dims(query_positions, axis=-1)\ + - mx.np.expand_dims(mem_positions, axis=0) + mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length), dtype=np.int32) + query.attach_grad() + with mx.autograd.record(): + rel_score = base_score_cell(rel_positions, query) + rel_score.backward(rel_score_grad) + original_rel_score = rel_score.asnumpy() + original_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) + original_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + assert original_grad_norm > 0 + # 1. 
Test for permutation equivariant + # We can permutate the query, rel_positions and the rel_score_grad and the result should + # always be the same. + query_perm = mx.np.array(np.random.permutation(query_length), dtype=np.int32) + mem_perm = mx.np.array(np.random.permutation(mem_length, ), dtype=np.int32) - query.grad[:] = 0 - with mx.autograd.record(): - rel_score = base_score_cell(rel_positions[query_perm, :][:, mem_perm], - query[:, :, query_perm, :]) - if method != 't5': - rel_score.backward(rel_score_grad[:, :, query_perm, :][:, :, :, mem_perm]) - else: - rel_score.backward(rel_score_grad[:, query_perm, :][:, :, mem_perm]) - permutated_out = rel_score.asnumpy() - permutated_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) - permutated_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + query.grad[:] = 0 + with mx.autograd.record(): + rel_score = base_score_cell(rel_positions[query_perm, :][:, mem_perm], + query[:, :, query_perm, :]) if method != 't5': - assert_allclose( - original_rel_score[:, :, query_perm.asnumpy(), :][:, :, :, mem_perm.asnumpy()], - permutated_out, 1E-4, 1E-4) + rel_score.backward(rel_score_grad[:, :, query_perm, :][:, :, :, mem_perm]) else: - assert_allclose(original_rel_score[:, query_perm.asnumpy(), :][:, :, mem_perm.asnumpy()], - permutated_out, 1E-4, 1E-4) - assert_allclose(permutated_grad_norm, original_grad_norm, 1E-4, 1E-4) - assert_allclose(permutated_query_grad_norm, original_query_grad_norm, 1E-4, 1E-4) - # 2. Test for different layout + use/not use einsum - for layout in ['NKT', 'NTK', 'TNK']: - for use_einsum in [False, True]: - if layout == base_layout and use_einsum == base_use_einsum: - continue - score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, - num_heads=num_heads, - dropout=0.0, - method=method, - num_buckets=num_buckets, - max_distance=max_distance, - layout=layout, - use_einsum=use_einsum) - score_cell.initialize() - if hybridize: - score_cell.hybridize() - score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) - query.attach_grad() - query.grad[:] = 0 - with mx.autograd.record(): - if layout == 'NKT': - rel_score = score_cell(rel_positions, query) - rel_score.backward(rel_score_grad) - elif layout == 'NTK': - rel_score = score_cell(rel_positions, query.transpose((0, 2, 1, 3))) - rel_score.backward(rel_score_grad) - elif layout == 'TNK': - rel_score = score_cell(rel_positions, query.transpose((2, 0, 1, 3))) - rel_score.backward(rel_score_grad) - else: - raise NotImplementedError - assert_allclose(rel_score.asnumpy(), original_rel_score, 1E-5, 1E-5) - layout_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - assert_allclose(layout_query_grad_norm, original_query_grad_norm, 1E-5, 1E-5) + rel_score.backward(rel_score_grad[:, query_perm, :][:, :, mem_perm]) + permutated_out = rel_score.asnumpy() + permutated_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) + permutated_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + if method != 't5': + assert_allclose( + original_rel_score[:, :, query_perm.asnumpy(), :][:, :, :, mem_perm.asnumpy()], + permutated_out, 1E-4, 1E-4) + else: + assert_allclose(original_rel_score[:, query_perm.asnumpy(), :][:, :, mem_perm.asnumpy()], + permutated_out, 1E-4, 1E-4) + assert_allclose(permutated_grad_norm, original_grad_norm, 1E-4, 1E-4) + assert_allclose(permutated_query_grad_norm, original_query_grad_norm, 1E-4, 1E-4) + # 2. 
Test for different layout + use/not use einsum + for layout in ['NKT', 'NTK', 'TNK']: + for use_einsum in [False, True]: + if layout == base_layout and use_einsum == base_use_einsum: + continue + score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, + num_heads=num_heads, + dropout=0.0, + method=method, + num_buckets=num_buckets, + max_distance=max_distance, + layout=layout, + use_einsum=use_einsum) + score_cell.initialize() + if hybridize: + score_cell.hybridize() + score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) + query.attach_grad() + query.grad[:] = 0 + with mx.autograd.record(): + if layout == 'NKT': + rel_score = score_cell(rel_positions, query) + rel_score.backward(rel_score_grad) + elif layout == 'NTK': + rel_score = score_cell(rel_positions, query.transpose((0, 2, 1, 3))) + rel_score.backward(rel_score_grad) + elif layout == 'TNK': + rel_score = score_cell(rel_positions, query.transpose((2, 0, 1, 3))) + rel_score.backward(rel_score_grad) + else: + raise NotImplementedError + assert_allclose(rel_score.asnumpy(), original_rel_score, 1E-5, 1E-5) + layout_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + assert_allclose(layout_query_grad_norm, original_query_grad_norm, 1E-5, 1E-5) diff --git a/tests/test_models.py b/tests/test_models.py index 413941250b..03491b6272 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -13,7 +13,7 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) def test_get_backbone(name, ctx): - with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): + with tempfile.TemporaryDirectory() as root, ctx: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) From 5c3a09956ad7574ca453b5c58871229b20a2aaf1 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 20:52:53 -0700 Subject: [PATCH 28/32] [CI] Finalize gpu test - change remote from barry-jin to dmlc --- .github/workflows/unittests-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index c11ecdbf4e..5b328c8da7 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,6 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait | tee > script.log + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait | tee > script.log From c866de3c8b790e46aff326014372e0b52296671f Mon Sep 17 00:00:00 2001 From: barry-jin <69359374+barry-jin@users.noreply.github.com> Date: Tue, 25 Aug 2020 20:57:11 -0700 Subject: [PATCH 29/32] Delete submit-test.py --- tools/batch/submit-test.py | 174 ------------------------------------- 1 file changed, 174 deletions(-) delete mode 100644 tools/batch/submit-test.py diff --git a/tools/batch/submit-test.py b/tools/batch/submit-test.py deleted file mode 100644 index 38eb601073..0000000000 --- a/tools/batch/submit-test.py +++ /dev/null @@ -1,174 +0,0 @@ -import argparse -import random -import re -import sys -import time -from datetime import datetime - -import boto3 -from 
botocore.compat import total_seconds - -parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - -parser.add_argument('--profile', help='profile name of aws account.', type=str, - default=None) -parser.add_argument('--region', help='Default region when creating new connections', type=str, - default=None) -parser.add_argument('--name', help='name of the job', type=str, default='dummy') -parser.add_argument('--job-type', help='type of job to submit.', type=str, - choices=['g4dn.4x', 'g4dn.8x', 'g4dn.12x', 'g4dn.16x', - 'p3.2x', 'p3.8x', 'p3.16x', 'p3dn.24x', - 'c5n.18x'], default='g4dn.4x') -parser.add_argument('--source-ref', - help='ref in GluonNLP main github. e.g. master, refs/pull/500/head', - type=str, default='master') -parser.add_argument('--work-dir', - help='working directory inside the repo. e.g. scripts/preprocess', - type=str, default='scripts/preprocess') -parser.add_argument('--saved-output', - help='output to be saved, relative to working directory. ' - 'it can be either a single file or a directory', - type=str, default='.') -parser.add_argument('--save-path', - help='s3 path where files are saved.', - type=str, default='batch/temp/{}'.format(datetime.now().isoformat())) -parser.add_argument('--command', help='command to run', type=str, - default='git rev-parse HEAD | tee stdout.log') -parser.add_argument('--remote', - help='git repo address. https://github.com/dmlc/gluon-nlp', - type=str, default="https://github.com/dmlc/gluon-nlp") -parser.add_argument('--wait', help='block wait until the job completes. ' - 'Non-zero exit code if job fails.', action='store_true') -parser.add_argument('--timeout', help='job timeout in seconds', default=None, type=int) - - -args = parser.parse_args() - -session = boto3.Session(profile_name=args.profile, region_name=args.region) -batch, cloudwatch = [session.client(service_name=sn) for sn in ['batch', 'logs']] - - -def printLogs(logGroupName, logStreamName, startTime): - kwargs = {'logGroupName': logGroupName, - 'logStreamName': logStreamName, - 'startTime': startTime, - 'startFromHead': True} - - lastTimestamp = 0 - printMessage = False - while True: - logEvents = cloudwatch.get_log_events(**kwargs) - - for event in logEvents['events']: - lastTimestamp = event['timestamp'] - timestamp = datetime.utcfromtimestamp(lastTimestamp / 1000.0).isoformat() - if printMessage: - print(event['message']) - else: - if event['message']=="/gluon-nlp/tools/batch": - printMessage = True - print(event['message']) - # print('[{}] {}'.format((timestamp + '.000')[:23] + 'Z', event['message'])) - - nextToken = logEvents['nextForwardToken'] - if nextToken and kwargs.get('nextToken') != nextToken: - kwargs['nextToken'] = nextToken - else: - break - return lastTimestamp - - -def nowInMillis(): - endTime = long(total_seconds(datetime.utcnow() - datetime(1970, 1, 1))) * 1000 - return endTime - - -job_definitions = { - 'g4dn.4x': 'gluon-nlp-1-jobs:5', - 'g4dn.8x': 'gluon-nlp-1-jobs:4', - 'g4dn.12x': 'gluon-nlp-1-4gpu-jobs:1', - 'g4dn.16x': 'gluon-nlp-1-jobs:3', - 'p3.2x': 'gluon-nlp-1-jobs:11', - 'p3.8x': 'gluon-nlp-1-4gpu-jobs:2', - 'p3.16x': 'gluon-nlp-1-8gpu-jobs:1', - 'p3dn.24x': 'gluon-nlp-1-8gpu-jobs:2', - 'c5n.18x': 'gluon-nlp-1-cpu-jobs:2', -} - -job_queues = { - 'g4dn.4x': 'g4dn', - 'g4dn.8x': 'g4dn', - 'g4dn.12x': 'g4dn-multi-gpu', - 'g4dn.16x': 'g4dn', - 'p3.2x': 'p3', - 'p3.8x': 'p3-4gpu', - 'p3.16x': 'p3-8gpu', - 'p3dn.24x': 'p3dn-8gpu', - 'c5n.18x': 'c5n', -} - - -def main(): - spin = ['-', '/', '|', '\\', '-', '/', '|', 
'\\'] - logGroupName = '/aws/batch/job' - - jobName = re.sub('[^A-Za-z0-9_\-]', '', args.name)[:128] # Enforce AWS Batch jobName rules - jobType = args.job_type - jobQueue = job_queues[jobType] - jobDefinition = job_definitions[jobType] - command = args.command.split() - wait = args.wait - - parameters = { - 'SOURCE_REF': args.source_ref, - 'WORK_DIR': args.work_dir, - 'SAVED_OUTPUT': args.saved_output, - 'SAVE_PATH': args.save_path, - 'COMMAND': args.command, - 'REMOTE': args.remote - } - kwargs = dict( - jobName=jobName, - jobQueue=jobQueue, - jobDefinition=jobDefinition, - parameters=parameters, - ) - if args.timeout is not None: - kwargs['timeout'] = {'attemptDurationSeconds': args.timeout} - submitJobResponse = batch.submit_job(**kwargs) - - jobId = submitJobResponse['jobId'] - print('Submitted job [{} - {}] to the job queue [{}]'.format(jobName, jobId, jobQueue)) - - spinner = 0 - running = False - status_set = set() - startTime = 0 - while wait: - time.sleep(random.randint(5, 10)) - describeJobsResponse = batch.describe_jobs(jobs=[jobId]) - status = describeJobsResponse['jobs'][0]['status'] - if status == 'SUCCEEDED' or status == 'FAILED': - print('=' * 80) - print('Job [{} - {}] {}'.format(jobName, jobId, status)) - - sys.exit(status == 'FAILED') - - elif status == 'RUNNING': - logStreamName = describeJobsResponse['jobs'][0]['container']['logStreamName'] - if not running: - running = True - print('\rJob [{}, {}] is RUNNING.'.format(jobName, jobId)) - if logStreamName: - print('Output [{}]:\n {}'.format(logStreamName, '=' * 80)) - if logStreamName: - startTime = printLogs(logGroupName, logStreamName, startTime) + 1 - elif status not in status_set: - status_set.add(status) - print('\rJob [%s - %s] is %-9s... %s' % (jobName, jobId, status, spin[spinner % len(spin)]),) - sys.stdout.flush() - spinner += 1 - - -if __name__ == '__main__': - main() From 67f2e385c66ee984b69eac5cf6da10c65764de4d Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 26 Aug 2020 12:23:01 -0700 Subject: [PATCH 30/32] [CI] Update test working directory --- .github/workflows/unittests-gpu.yml | 2 +- test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 5b328c8da7..7f91a7bdf5 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,6 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait | tee > script.log + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/gluon-nlp/test.sh" --wait | tee > script.log diff --git a/test.sh b/test.sh index be6b9b6d3b..0da2fefd51 100755 --- a/test.sh +++ b/test.sh @@ -11,4 +11,4 @@ python3 -m pip install --user --quiet setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade --quiet cython python3 -m pip install --pre --user --quiet "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user --quiet -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" /gluon-nlp/tests/ From 0792deaeea8bcd516c39f449944efbb2672925c9 Mon Sep 17 00:00:00 2001 From: 
barry-jin Date: Thu, 27 Aug 2020 09:22:38 -0700 Subject: [PATCH 31/32] [CI] Update AWS Batch job type --- .github/workflows/unittests-gpu.yml | 2 +- test.sh | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 7f91a7bdf5..fb6fb28674 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,6 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/gluon-nlp/test.sh" --wait | tee > script.log + python ./tools/batch/submit-job.py --region us-east-1 --job-type g4dn.4x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/gluon-nlp/test.sh" --wait | tee > script.log diff --git a/test.sh b/test.sh index 0da2fefd51..2b6c2d36ba 100755 --- a/test.sh +++ b/test.sh @@ -1,8 +1,6 @@ #!/bin/bash # Shell script for installing dependencies and running test on AWS Batch -# alias python3='/usr/bin/python3' - echo $PWD sudo apt-get install libopenblas-dev From 9d4c45981213a1b54196f8ea191e0054fc4754e4 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 27 Aug 2020 16:18:16 -0700 Subject: [PATCH 32/32] [CI] Allow test logs downloading --- .github/workflows/unittests-gpu.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index fb6fb28674..2d1f06d9ec 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -39,5 +39,15 @@ jobs: - name: Test project on AWS Batch run: | python ./tools/batch/submit-job.py --region us-east-1 --job-type g4dn.4x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/gluon-nlp/test.sh" --wait | tee > script.log - + + - name: Upload log file for AWS Batch test results + uses: actions/upload-artifact@v2 + with: + name: GPU_Test_Results + path: script.log + + - name: Download log file for AWS Batch test results + uses: actions/download-artifact@v2 + with: + name: GPU_Test_Results
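
For completeness, the ctx fixture used by the updated tests works as follows: test.sh invokes pytest with --device="gpu", and the conftest.py hunk in PATCH 25/32 turns each requested device name into an MXNet context that gets injected into any test declaring a ctx argument. The registration of the --device option itself is not shown in these patches, so the sketch below only illustrates one plausible wiring; the option's action, default value, and cpu fallback are assumptions rather than the project's exact code.

# conftest.py -- minimal sketch of the device selection assumed by the patches above
import mxnet as mx


def pytest_addoption(parser):
    # Assumed registration: test.sh passes --device="gpu", and the hunk in
    # PATCH 25/32 iterates over config.option.device, so the option is treated
    # as a repeatable list of device names here.
    parser.addoption('--device', action='append', default=[],
                     help="run the test suite on the given device(s), e.g. 'cpu' or 'gpu'")


def pytest_generate_tests(metafunc):
    # Mirrors the conftest.py hunk in PATCH 25/32: every test that declares a
    # ctx fixture is parametrized with real MXNet device objects, so its body
    # can simply open `with ctx:` as the updated bert/electra/attention tests do.
    if 'ctx' in metafunc.fixturenames:
        devices = metafunc.config.option.device or ['cpu']  # cpu fallback is an assumption
        metafunc.parametrize('ctx', [getattr(mx, device)() for device in devices])

With a conftest along these lines, the invocation built up in test.sh (python3 -m pytest --device="gpu" ... tests/) runs each ctx-aware test once per requested device.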