From ad7c2eef7a5d1c4785715553dc801c1db317c46e Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 13:59:14 -0700 Subject: [PATCH 01/32] [CI] Add GPU pytest + Submit jobs to AWS Batch through GitHub Actions --- .github/workflows/unittests.yml | 9 +----- test.sh | 11 ++++++++ tests/test_models.py | 5 ++-- tests/test_optimizer.py | 50 +++++++++++++++++---------------- 4 files changed, 41 insertions(+), 34 deletions(-) create mode 100644 test.sh diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 7ff2b0dfd0..5eca4cf107 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -30,16 +30,9 @@ jobs: with: python-version: ${{ matrix.python-version }} architecture: x64 - - name: Install Other Dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install --user setuptools pytest pytest-cov contextvars - python -m pip install --upgrade cython - python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python - python -m pip install --user -e .[extras] - name: Test project run: | - python -m pytest --cov=./ --cov-report=xml --durations=50 tests/ + python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "chmod +x test.sh | ./test.sh" --wait - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: diff --git a/test.sh b/test.sh new file mode 100644 index 0000000000..0f7d3204c6 --- /dev/null +++ b/test.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Shell script for installing dependencies and running test on AWS Batch + +# alias python3='/usr/bin/python3' + +python3 -m pip install --user -upgrade pip +python3 -m pip install --user setuptools pytest pytest-cov contextvars +python3 -m pip install --upgrade cython +python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python +python3 -m pip install --user -e .[extras] +python3 -m pytest --cov=./ --cov-report=xml --durations=50 tests/ \ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py index 0d35330387..61e1068bc6 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,8 +12,9 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) -def test_get_backbone(name): - with tempfile.TemporaryDirectory() as root: +@pytest.mark.parametrize('use_gpu', [mx.cpu(), mx.gpu()]) +def test_get_backbone(name, use_gpu): + with tempfile.TemporaryDirectory() as root, use_gpu: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 85a704f98c..6d2907d4d5 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -6,27 +6,29 @@ mx.npx.reset_np() -def test_adam(): - opt1 = AdamW - opt2 = AdamW - shapes = [(3, 4, 5), (10, 4), (7,)] - beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] - beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] - rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{'multi_precision': False}] # TODO(sxjscience) Test for FP16 - agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, - {'aggregate_num': 4}, {'aggregate_num': np.inf}] - correct_bias_options 
= [{'correct_bias': True}, {'correct_bias': False}] - for dtype in [np.float16, np.float32]: - for params in itertools.product(beta1_options, beta2_options, cg_options, - rg_options, wd_options, mp_options, - agg_options, correct_bias_options): - kwarg = {k: v for param in params for k, v in param.items()} - if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - compare_optimizer(opt1(use_fused_step=False, **kwarg), - opt2(use_fused_step=True, **kwarg), shapes, dtype, - rtol=1e-4, atol=2e-5) +@pytest.mark.parametrize('use_gpu', [mx.cpu(), mx.gpu()]) +def test_adam(use_gpu): + with use_gpu: + opt1 = AdamW + opt2 = AdamW + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{'multi_precision': False}] # TODO(sxjscience) Test for FP16 + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + correct_bias_options = [{'correct_bias': True}, {'correct_bias': False}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, + agg_options, correct_bias_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-4, atol=2e-5) From e9901c29e42837d1c22b5a7276a99cdb2d8cf4ec Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 15:19:34 -0700 Subject: [PATCH 02/32] [CI] Update GPU tests and parameters use --- conftest.py | 7 +++++++ tests/test_models.py | 1 - tests/test_optimizer.py | 1 - 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index 4ff9b028a8..ed4654697b 100644 --- a/conftest.py +++ b/conftest.py @@ -206,3 +206,10 @@ def doctest(doctest_namespace): doctest_namespace['gluon'] = mx.gluon import doctest doctest.ELLIPSIS_MARKER = '-etc-' + +def pytest_addoption(parser): + parser.addoption("--device", action="append", default=[], help="list of device choices to run the tests. 
ex: mx.gpu() (For GPU test only)") + +def pytest_generate_tests(metafunc): + if 'use_gpu' in metafunc.fixturenames: + metafunc.parametrize("use_gpu", metafunc.config.option.device) diff --git a/tests/test_models.py b/tests/test_models.py index 61e1068bc6..e1be3e0dcb 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,7 +12,6 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) -@pytest.mark.parametrize('use_gpu', [mx.cpu(), mx.gpu()]) def test_get_backbone(name, use_gpu): with tempfile.TemporaryDirectory() as root, use_gpu: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 6d2907d4d5..1de36c2f55 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -6,7 +6,6 @@ mx.npx.reset_np() -@pytest.mark.parametrize('use_gpu', [mx.cpu(), mx.gpu()]) def test_adam(use_gpu): with use_gpu: opt1 = AdamW From 84fac9110d845e0ee0e1907a81344646dd284264 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 15:44:52 -0700 Subject: [PATCH 03/32] [CI] Update CI pipeline --- .github/workflows/unittests.yml | 37 ++++++++++++++++++++++++++++++++- test.sh | 3 ++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 5eca4cf107..f657fe5056 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -30,10 +30,45 @@ jobs: with: python-version: ${{ matrix.python-version }} architecture: x64 + - name: Install Other Dependencies + run: | + python -m pip install --user --upgrade pip + python -m pip install --user setuptools pytest pytest-cov contextvars + python -m pip install --upgrade cython + python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python + python -m pip install --user -e .[extras] - name: Test project run: | - python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "chmod +x test.sh | ./test.sh" --wait + python -m pytest --cov=./ --cov-report=xml --durations=50 tests/ - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: env_vars: OS,PYTHON + + unittest-gpu: + runs-on: ubuntu-latest + strategy: + fail-fast: false + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Install Linux dependencies + run: sudo apt-get install libopenblas-dev + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Test project on AWS Batch + run: | + python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "chmod +x test.sh | ./test.sh" --wait + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1.0.10 + with: + env_vars: OS,PYTHON + diff --git a/test.sh b/test.sh index 0f7d3204c6..8160f86cab 100644 --- a/test.sh +++ b/test.sh @@ -3,9 +3,10 @@ # alias python3='/usr/bin/python3' +sudo apt-get install libopenblas-dev python3 -m pip install --user -upgrade pip python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user 
"mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 tests/ \ No newline at end of file +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="mx.gpu()" tests/ \ No newline at end of file From c2f80d9d95b7c457e50fb9825909595c51824660 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 15:48:59 -0700 Subject: [PATCH 04/32] [CI] Add new line --- test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 8160f86cab..3b8eea5a8a 100644 --- a/test.sh +++ b/test.sh @@ -9,4 +9,4 @@ python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="mx.gpu()" tests/ \ No newline at end of file +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="mx.gpu()" tests/ From e5ab220e50f649888d7f7cc8e932f062c3066d9a Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 16:08:20 -0700 Subject: [PATCH 05/32] [CI] Update pytest command for cpu test --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index f657fe5056..20bccb2456 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -39,7 +39,7 @@ jobs: python -m pip install --user -e .[extras] - name: Test project run: | - python -m pytest --cov=./ --cov-report=xml --durations=50 tests/ + python -m pytest --cov=./ --cov-report=xml --device="mx.cpu()" --durations=50 tests/ - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: From 0a6a1d3d0ebc62179c99ad555122f7a6aff2b98c Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 16:45:35 -0700 Subject: [PATCH 06/32] [CI] Update use_gpu to ctx + add permissions to test.sh --- conftest.py | 4 ++-- tests/test_models.py | 4 ++-- tests/test_optimizer.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index ed4654697b..bb84ce0051 100644 --- a/conftest.py +++ b/conftest.py @@ -211,5 +211,5 @@ def pytest_addoption(parser): parser.addoption("--device", action="append", default=[], help="list of device choices to run the tests. 
ex: mx.gpu() (For GPU test only)") def pytest_generate_tests(metafunc): - if 'use_gpu' in metafunc.fixturenames: - metafunc.parametrize("use_gpu", metafunc.config.option.device) + if 'ctx' in metafunc.fixturenames: + metafunc.parametrize("ctx", metafunc.config.option.device) diff --git a/tests/test_models.py b/tests/test_models.py index e1be3e0dcb..03491b6272 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,8 +12,8 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) -def test_get_backbone(name, use_gpu): - with tempfile.TemporaryDirectory() as root, use_gpu: +def test_get_backbone(name, ctx): + with tempfile.TemporaryDirectory() as root, ctx: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 1de36c2f55..48c2331a7a 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -6,8 +6,8 @@ mx.npx.reset_np() -def test_adam(use_gpu): - with use_gpu: +def test_adam(ctx): + with ctx: opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From 92b9e85665aad334859dd724643e7348a09b5fff Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 16:47:03 -0700 Subject: [PATCH 07/32] [CI] Update submitted command --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 20bccb2456..9502d0d490 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -65,7 +65,7 @@ jobs: - name: Test project on AWS Batch run: | - python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "chmod +x test.sh | ./test.sh" --wait + python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 From 749acece178e2137ed341e0ebe8437782263339c Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 21 Aug 2020 20:58:55 -0700 Subject: [PATCH 08/32] [CI] De-stringify input to mxnet attribute --- .github/workflows/unittests.yml | 2 +- test.sh | 2 +- tests/test_models.py | 2 +- tests/test_optimizer.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) mode change 100644 => 100755 test.sh diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 9502d0d490..98a85dabd4 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -39,7 +39,7 @@ jobs: python -m pip install --user -e .[extras] - name: Test project run: | - python -m pytest --cov=./ --cov-report=xml --device="mx.cpu()" --durations=50 tests/ + python -m pytest --cov=./ --cov-report=xml --device="cpu" --durations=50 tests/ - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: diff --git a/test.sh b/test.sh old mode 100644 new mode 100755 index 3b8eea5a8a..9ebba20abd --- a/test.sh +++ b/test.sh @@ -9,4 +9,4 @@ python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ 
--cov-report=xml --durations=50 --device="mx.gpu()" tests/ +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" tests/ diff --git a/tests/test_models.py b/tests/test_models.py index 03491b6272..413941250b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -13,7 +13,7 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) def test_get_backbone(name, ctx): - with tempfile.TemporaryDirectory() as root, ctx: + with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 48c2331a7a..f5935fbef4 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -7,7 +7,7 @@ def test_adam(ctx): - with ctx: + with getattr(mx, ctx)(): opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From 44d0c5b6c295ee9c956db6fec61791c2f330e929 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 23 Aug 2020 12:27:14 -0700 Subject: [PATCH 09/32] [CI] Change pull_request event to pull_request_target event --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 98a85dabd4..2a054527b3 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -1,6 +1,6 @@ name: continuous build -on: [push, pull_request] +on: [push, pull_request_target] defaults: run: From 3e02d5fc2529a4838458d5cdad02b540a982cd1f Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 23 Aug 2020 13:26:34 -0700 Subject: [PATCH 10/32] [CI] Add new workflow for GPU unit tests --- .github/workflows/unittests-gpu.yml | 35 +++++++++++++++++++++++++++++ .github/workflows/unittests.yml | 31 ++----------------------- 2 files changed, 37 insertions(+), 29 deletions(-) create mode 100644 .github/workflows/unittests-gpu.yml diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml new file mode 100644 index 0000000000..36490b63ee --- /dev/null +++ b/.github/workflows/unittests-gpu.yml @@ -0,0 +1,35 @@ +name: continuous build - gpu + +on: [push, pull_request_target] + +defaults: + run: + shell: bash + +jobs: + unittest-gpu: + runs-on: ubuntu-latest + strategy: + fail-fast: false + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Install Linux dependencies + run: sudo apt-get install libopenblas-dev + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Test project on AWS Batch + run: | + python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1.0.10 + with: + env_vars: OS,PYTHON \ No newline at end of file diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 2a054527b3..ced8f9a1c8 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -1,6 +1,6 @@ name: continuous build -on: [push, pull_request_target] +on: [push, pull_request] defaults: run: @@ -43,32 +43,5 @@ jobs: - name: Upload coverage to Codecov uses: 
codecov/codecov-action@v1.0.10 with: - env_vars: OS,PYTHON - - unittest-gpu: - runs-on: ubuntu-latest - strategy: - fail-fast: false - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Install Linux dependencies - run: sudo apt-get install libopenblas-dev - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-1 - - - name: Test project on AWS Batch - run: | - python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1.0.10 - with: - env_vars: OS,PYTHON + env_vars: OS,PYTHON From d174fcf33a8f742df829f74d9415b7b292a2d0af Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 23 Aug 2020 22:18:43 -0700 Subject: [PATCH 11/32] [CI] Update unittests-gpu.yml --- .github/workflows/unittests-gpu.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 36490b63ee..3d840af095 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -18,6 +18,20 @@ jobs: - name: Install Linux dependencies run: sudo apt-get install libopenblas-dev + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: 3.7 + architecture: x64 + + - name: Install Other Dependencies + run: | + python -m pip install --user --upgrade pip + python -m pip install --user setuptools pytest pytest-cov contextvars + python -m pip install --upgrade cython + python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python + python -m pip install --user -e .[extras] + - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v1 with: @@ -32,4 +46,4 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v1.0.10 with: - env_vars: OS,PYTHON \ No newline at end of file + env_vars: OS,PYTHON From a73161a5e2fc3903f3ba53ee5d11891ff27d61e5 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 23 Aug 2020 22:25:14 -0700 Subject: [PATCH 12/32] [CI] Update unittests-gpu.yml --- .github/workflows/unittests-gpu.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 3d840af095..f278c9a067 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -27,9 +27,6 @@ jobs: - name: Install Other Dependencies run: | python -m pip install --user --upgrade pip - python -m pip install --user setuptools pytest pytest-cov contextvars - python -m pip install --upgrade cython - python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python python -m pip install --user -e .[extras] - name: Configure AWS Credentials @@ -41,7 +38,7 @@ jobs: - name: Test project on AWS Batch run: | - python3 ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait - name: Upload 
coverage to Codecov uses: codecov/codecov-action@v1.0.10 From 994c2c1ee04c73313a7ffb933453e384c28f00e6 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 24 Aug 2020 12:41:52 -0700 Subject: [PATCH 13/32] [CI] Update path of test.sh --- .github/workflows/unittests-gpu.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index f278c9a067..72cac182f3 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,9 +38,5 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "./test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1.0.10 - with: - env_vars: OS,PYTHON From 39d23512a279d07f5bae0f4d387b4740b7257b9f Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 24 Aug 2020 12:57:28 -0700 Subject: [PATCH 14/32] [CI] Update path of /test --- test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 9ebba20abd..1053f2ac06 100755 --- a/test.sh +++ b/test.sh @@ -3,10 +3,12 @@ # alias python3='/usr/bin/python3' +echo $PWD + sudo apt-get install libopenblas-dev python3 -m pip install --user -upgrade pip python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" tests/ +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ From 43bb922b17a4faaa47ed124d529aace0025850de Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 24 Aug 2020 13:16:36 -0700 Subject: [PATCH 15/32] [CI] Update remote to barry-jin/gluon-nlp --- .github/workflows/unittests-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 72cac182f3..6e9e41c14b 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,5 +38,5 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait From 006305294e6a44ed9fad4cc9ad68a0eb165a7a47 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 24 Aug 2020 14:35:21 -0700 Subject: [PATCH 16/32] [CI] Update remote to dmlc/gluon-nlp --- .github/workflows/unittests-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 6e9e41c14b..72cac182f3 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,5 +38,5 @@ jobs: - name: Test project on AWS Batch run: | - python 
./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait From 68f814fdfb3e38bcd1111fd1e1085c74ee1bc238 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 10:04:04 -0700 Subject: [PATCH 17/32] [CI] Add gpu tests for attention cells, bert, electra + Update README --- tests/README.md | 16 +- tests/test_attention_cell.py | 634 ++++++++++++++++++----------------- tests/test_models_bert.py | 143 ++++---- tests/test_models_electra.py | 67 ++-- 4 files changed, 439 insertions(+), 421 deletions(-) diff --git a/tests/README.md b/tests/README.md index 69e08e039e..ceeaeaf68f 100644 --- a/tests/README.md +++ b/tests/README.md @@ -3,13 +3,25 @@ To run the unittests, use the following command ```bash -python3 -m pytest . +python3 -m pytest --device="cpu" . ``` To test for certain file, e.g., the `test_models_transformer.py`, use the following command ```bash -python3 -m pytest test_models_transformer +python3 -m pytest --device="cpu" test_models_transformer.py +``` + +To test only for gpu device, use the following command + +```bash +python3 -m pytest --device="gpu" test_models_transformer.py +``` + +To test both for cpu and gpu device, use the following command + +```bash +python3 -m pytest --device="cpu" --device="gpu" test_models_transformer.py ``` Refer to the [official guide of pytest](https://docs.pytest.org/en/latest/) for more details. diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index 3b874b0d55..1964f9db4c 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -17,161 +17,163 @@ @pytest.mark.parametrize('hybridize', [True, False]) @pytest.mark.parametrize('rel_score_type', ['share_head', 'no_share_head', 'no']) @pytest.mark.seed(123) -def test_multi_head_dot_attention_cell(num_heads, scaled, normalized, hybridize, rel_score_type): - batch_size = 5 - query_length, mem_length = 16, 32 - query_head_units = 8 - mem_head_units = 6 - query_units = query_head_units * num_heads - mem_units = mem_head_units * num_heads - seed = 100 - attn_cells = dict() - for layout in ['NKT', 'NTK', 'TNK']: - for use_einsum in [False, True]: - attn_cells[(layout, use_einsum)] = MultiHeadAttentionCell( - query_units=query_units, - num_heads=num_heads, - attention_dropout=0.0, - scaled=scaled, - normalized=normalized, - layout=layout, - use_einsum=use_einsum) - if hybridize: - attn_cells[(layout, use_einsum)].hybridize() - # Generate the data - query_np = np.random.normal(0, 1, (batch_size, num_heads, query_length, query_head_units)) - key_np = np.random.normal(0, 1, (batch_size, num_heads, mem_length, query_head_units)) - value_np = np.random.normal(0, 1, (batch_size, num_heads, mem_length, mem_head_units)) - mask_np = np.random.randint(0, 2, (batch_size, query_length, mem_length)) - if rel_score_type == 'share_head': - rel_scores_np = np.random.normal(0, 1, (query_length, mem_length)) - elif rel_score_type == 'no_share_head': - rel_scores_np = np.random.normal(0, 1, (num_heads, query_length, mem_length)) - else: - rel_scores_np = None - out_np = None - score_np = None - attn_weights_np = None - stored_layout = None - query_grad_np = None - key_grad_np = None - value_grad_np = None - rel_scores_grad_np = None - for 
(layout, use_einsum), attn_cell in attn_cells.items(): - mx.npx.random.seed(seed) - if rel_score_type != 'no': - rel_scores = mx.np.array(rel_scores_np, dtype=np.float32) - else: - rel_scores = None - if layout == 'NKT': - query = mx.np.array(query_np, dtype=np.float32) - key = mx.np.array(key_np, dtype=np.float32) - value = mx.np.array(value_np, dtype=np.float32) - elif layout == 'NTK': - query = mx.np.array(query_np.transpose((0, 2, 1, 3)), dtype=np.float32) - key = mx.np.array(key_np.transpose((0, 2, 1, 3)), dtype=np.float32) - value = mx.np.array(value_np.transpose((0, 2, 1, 3)), dtype=np.float32) - elif layout == 'TNK': - query = mx.np.array(query_np.transpose((2, 0, 1, 3)), dtype=np.float32) - key = mx.np.array(key_np.transpose((2, 0, 1, 3)), dtype=np.float32) - value = mx.np.array(value_np.transpose((2, 0, 1, 3)), dtype=np.float32) - else: - raise NotImplementedError - mask = mx.np.array(mask_np, dtype=np.int32) - query.attach_grad() - key.attach_grad() - value.attach_grad() - if rel_scores is not None: - rel_scores.attach_grad() - with mx.autograd.record(): - out, [score, attn_weights] = attn_cell(query, key, value, mask, rel_scores) - out.backward() - if layout == 'NKT': - assert out.shape == (batch_size, query_length, num_heads * mem_head_units) - elif layout == 'NTK': - assert out.shape == (batch_size, query_length, num_heads * mem_head_units) - elif layout == 'TNK': - assert out.shape == (query_length, batch_size, num_heads * mem_head_units) +def test_multi_head_dot_attention_cell(num_heads, scaled, normalized, hybridize, rel_score_type, ctx): + with getattr(mx, ctx)(): + batch_size = 5 + query_length, mem_length = 16, 32 + query_head_units = 8 + mem_head_units = 6 + query_units = query_head_units * num_heads + mem_units = mem_head_units * num_heads + seed = 100 + attn_cells = dict() + for layout in ['NKT', 'NTK', 'TNK']: + for use_einsum in [False, True]: + attn_cells[(layout, use_einsum)] = MultiHeadAttentionCell( + query_units=query_units, + num_heads=num_heads, + attention_dropout=0.0, + scaled=scaled, + normalized=normalized, + layout=layout, + use_einsum=use_einsum) + if hybridize: + attn_cells[(layout, use_einsum)].hybridize() + # Generate the data + query_np = np.random.normal(0, 1, (batch_size, num_heads, query_length, query_head_units)) + key_np = np.random.normal(0, 1, (batch_size, num_heads, mem_length, query_head_units)) + value_np = np.random.normal(0, 1, (batch_size, num_heads, mem_length, mem_head_units)) + mask_np = np.random.randint(0, 2, (batch_size, query_length, mem_length)) + if rel_score_type == 'share_head': + rel_scores_np = np.random.normal(0, 1, (query_length, mem_length)) + elif rel_score_type == 'no_share_head': + rel_scores_np = np.random.normal(0, 1, (num_heads, query_length, mem_length)) else: - raise NotImplementedError - for i in range(num_heads): - assert_allclose(attn_weights[:, i, :, :][mask == 0].asnumpy(), - mask[mask == 0].astype(np.float32).asnumpy(), 1E-5, 1E-5) - - if stored_layout is None: - out_np = out.asnumpy() - score_np = score.asnumpy() - attn_weights_np = attn_weights.asnumpy() - stored_layout = layout - query_grad_np = query.grad.asnumpy() - key_grad_np = key.grad.asnumpy() - value_grad_np = value.grad.asnumpy() + rel_scores_np = None + out_np = None + score_np = None + attn_weights_np = None + stored_layout = None + query_grad_np = None + key_grad_np = None + value_grad_np = None + rel_scores_grad_np = None + for (layout, use_einsum), attn_cell in attn_cells.items(): + mx.npx.random.seed(seed) if rel_score_type != 'no': - 
rel_scores_grad_np = rel_scores.grad.asnumpy() - else: - assert stored_layout == 'NKT' - # Begin to match the output + rel_scores = mx.np.array(rel_scores_np, dtype=np.float32) + else: + rel_scores = None if layout == 'NKT': - m_out_np = out.asnumpy() - m_score_np = score.asnumpy() - m_attn_weights_np = attn_weights.asnumpy() - m_query_grad_np = query.grad.asnumpy() - m_key_grad_np = key.grad.asnumpy() - m_value_grad_np = value.grad.asnumpy() - if rel_score_type != 'no': - m_rel_scores_grad_np = rel_scores.grad.asnumpy() + query = mx.np.array(query_np, dtype=np.float32) + key = mx.np.array(key_np, dtype=np.float32) + value = mx.np.array(value_np, dtype=np.float32) elif layout == 'NTK': - m_out_np = out.asnumpy() - m_score_np = score.asnumpy() - m_attn_weights_np = attn_weights.asnumpy() - m_query_grad_np = query.grad.asnumpy().transpose((0, 2, 1, 3)) - m_key_grad_np = key.grad.asnumpy().transpose((0, 2, 1, 3)) - m_value_grad_np = value.grad.asnumpy().transpose((0, 2, 1, 3)) - if rel_score_type != 'no': - m_rel_scores_grad_np = rel_scores.grad.asnumpy() + query = mx.np.array(query_np.transpose((0, 2, 1, 3)), dtype=np.float32) + key = mx.np.array(key_np.transpose((0, 2, 1, 3)), dtype=np.float32) + value = mx.np.array(value_np.transpose((0, 2, 1, 3)), dtype=np.float32) elif layout == 'TNK': - m_out_np = out.asnumpy().transpose((1, 0, 2)) - m_score_np = score.asnumpy() - m_attn_weights_np = attn_weights.asnumpy() - m_query_grad_np = query.grad.asnumpy().transpose((1, 2, 0, 3)) - m_key_grad_np = key.grad.asnumpy().transpose((1, 2, 0, 3)) - m_value_grad_np = value.grad.asnumpy().transpose((1, 2, 0, 3)) - if rel_score_type != 'no': - m_rel_scores_grad_np = rel_scores.grad.asnumpy() + query = mx.np.array(query_np.transpose((2, 0, 1, 3)), dtype=np.float32) + key = mx.np.array(key_np.transpose((2, 0, 1, 3)), dtype=np.float32) + value = mx.np.array(value_np.transpose((2, 0, 1, 3)), dtype=np.float32) else: raise NotImplementedError - assert_allclose(m_out_np, out_np, 1E-5, 1E-5) - assert_allclose(m_score_np, score_np, 1E-5, 1E-5) - assert_allclose(m_attn_weights_np, attn_weights_np, 1E-5, 1E-5) - assert_allclose(m_query_grad_np, query_grad_np, 1E-5, 1E-5) - assert_allclose(m_key_grad_np, key_grad_np, 1E-5, 1E-5) - assert_allclose(m_value_grad_np, value_grad_np, 1E-5, 1E-5) - if rel_score_type != 'no': - assert_allclose(m_rel_scores_grad_np, rel_scores_grad_np, 1E-5, 1E-5) + mask = mx.np.array(mask_np, dtype=np.int32) + query.attach_grad() + key.attach_grad() + value.attach_grad() + if rel_scores is not None: + rel_scores.attach_grad() + with mx.autograd.record(): + out, [score, attn_weights] = attn_cell(query, key, value, mask, rel_scores) + out.backward() + if layout == 'NKT': + assert out.shape == (batch_size, query_length, num_heads * mem_head_units) + elif layout == 'NTK': + assert out.shape == (batch_size, query_length, num_heads * mem_head_units) + elif layout == 'TNK': + assert out.shape == (query_length, batch_size, num_heads * mem_head_units) + else: + raise NotImplementedError + for i in range(num_heads): + assert_allclose(attn_weights[:, i, :, :][mask == 0].asnumpy(), + mask[mask == 0].astype(np.float32).asnumpy(), 1E-5, 1E-5) + + if stored_layout is None: + out_np = out.asnumpy() + score_np = score.asnumpy() + attn_weights_np = attn_weights.asnumpy() + stored_layout = layout + query_grad_np = query.grad.asnumpy() + key_grad_np = key.grad.asnumpy() + value_grad_np = value.grad.asnumpy() + if rel_score_type != 'no': + rel_scores_grad_np = rel_scores.grad.asnumpy() + else: + assert 
stored_layout == 'NKT' + # Begin to match the output + if layout == 'NKT': + m_out_np = out.asnumpy() + m_score_np = score.asnumpy() + m_attn_weights_np = attn_weights.asnumpy() + m_query_grad_np = query.grad.asnumpy() + m_key_grad_np = key.grad.asnumpy() + m_value_grad_np = value.grad.asnumpy() + if rel_score_type != 'no': + m_rel_scores_grad_np = rel_scores.grad.asnumpy() + elif layout == 'NTK': + m_out_np = out.asnumpy() + m_score_np = score.asnumpy() + m_attn_weights_np = attn_weights.asnumpy() + m_query_grad_np = query.grad.asnumpy().transpose((0, 2, 1, 3)) + m_key_grad_np = key.grad.asnumpy().transpose((0, 2, 1, 3)) + m_value_grad_np = value.grad.asnumpy().transpose((0, 2, 1, 3)) + if rel_score_type != 'no': + m_rel_scores_grad_np = rel_scores.grad.asnumpy() + elif layout == 'TNK': + m_out_np = out.asnumpy().transpose((1, 0, 2)) + m_score_np = score.asnumpy() + m_attn_weights_np = attn_weights.asnumpy() + m_query_grad_np = query.grad.asnumpy().transpose((1, 2, 0, 3)) + m_key_grad_np = key.grad.asnumpy().transpose((1, 2, 0, 3)) + m_value_grad_np = value.grad.asnumpy().transpose((1, 2, 0, 3)) + if rel_score_type != 'no': + m_rel_scores_grad_np = rel_scores.grad.asnumpy() + else: + raise NotImplementedError + assert_allclose(m_out_np, out_np, 1E-5, 1E-5) + assert_allclose(m_score_np, score_np, 1E-5, 1E-5) + assert_allclose(m_attn_weights_np, attn_weights_np, 1E-5, 1E-5) + assert_allclose(m_query_grad_np, query_grad_np, 1E-5, 1E-5) + assert_allclose(m_key_grad_np, key_grad_np, 1E-5, 1E-5) + assert_allclose(m_value_grad_np, value_grad_np, 1E-5, 1E-5) + if rel_score_type != 'no': + assert_allclose(m_rel_scores_grad_np, rel_scores_grad_np, 1E-5, 1E-5) @pytest.mark.parametrize('scaled', [True, False]) @pytest.mark.parametrize('normalized', [True, False]) @pytest.mark.seed(123) -def test_dot_product_attention(scaled, normalized): - num_heads = 4 - batch_size = 32 - query_length, mem_length = 16, 32 - num_channel = 8 - query = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, num_channel)) - key = mx.np.random.normal(0, 1, (batch_size, num_heads, mem_length, num_channel)) - value = mx.np.random.normal(0, 1, (batch_size, num_heads, mem_length, num_channel)) - mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length)) - out, [score, attn_weights] = multi_head_dot_attn(mx.nd, query, key, value, mask, - scaled=scaled, normalized=normalized) - assert out.shape == (batch_size, query_length, num_heads * num_channel) - for i in range(num_heads): - assert_allclose(attn_weights[:, i, :, :][mask == 0].asnumpy(), - mask[mask == 0].astype(np.float32).asnumpy(), 1E-5, 1E-5) +def test_dot_product_attention(scaled, normalized, ctx): + with getattr(mx, ctx)(): + num_heads = 4 + batch_size = 32 + query_length, mem_length = 16, 32 + num_channel = 8 + query = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, num_channel)) + key = mx.np.random.normal(0, 1, (batch_size, num_heads, mem_length, num_channel)) + value = mx.np.random.normal(0, 1, (batch_size, num_heads, mem_length, num_channel)) + mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length)) + out, [score, attn_weights] = multi_head_dot_attn(mx.nd, query, key, value, mask, + scaled=scaled, normalized=normalized) + assert out.shape == (batch_size, query_length, num_heads * num_channel) + for i in range(num_heads): + assert_allclose(attn_weights[:, i, :, :][mask == 0].asnumpy(), + mask[mask == 0].astype(np.float32).asnumpy(), 1E-5, 1E-5) @pytest.mark.seed(123) -def test_gen_attn_mask(): +def 
test_gen_attn_mask(ctx): class GenSelfAttnMask(HybridBlock): def __init__(self, dtype, layout, attn_type): super().__init__() @@ -195,74 +197,75 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): return gen_mem_attn_mask(F, mem, mem_valid_length, data, valid_length, dtype=self._dtype, layout=self._layout) - batch_size = 4 - query_length = 8 - mem_length = 6 - nchannel = 5 - data = mx.np.random.normal(0, 1, (batch_size, query_length, nchannel), dtype=np.float32) - valid_length = mx.np.random.randint(1, query_length + 1, (batch_size,)) + with getattr(mx, ctx)(): + batch_size = 4 + query_length = 8 + mem_length = 6 + nchannel = 5 + data = mx.np.random.normal(0, 1, (batch_size, query_length, nchannel), dtype=np.float32) + valid_length = mx.np.random.randint(1, query_length + 1, (batch_size,)) - mem = mx.np.random.normal(0, 1, (batch_size, mem_length, nchannel), dtype=np.float32) - mem_valid_length = mx.np.random.randint(1, mem_length + 1, (batch_size,)) + mem = mx.np.random.normal(0, 1, (batch_size, mem_length, nchannel), dtype=np.float32) + mem_valid_length = mx.np.random.randint(1, mem_length + 1, (batch_size,)) - for hybridize in [False, True]: - # Test Full Attention Mask - mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='full') - mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='full') - if hybridize: - mask_gen_nt.hybridize() - mask_gen_tn.hybridize() - mask_nt = mask_gen_nt(data, valid_length) - mask_nt = mask_nt.asnumpy() - mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) - mask_tn = mask_tn.asnumpy() - mask = mask_nt - assert_allclose(mask_nt, mask_tn) - for b in range(batch_size): - v_l = valid_length.asnumpy()[b] - for i in range(v_l): - assert (mask[b, i, :v_l] == 1).all() - assert(mask[b, i, v_l:] == 0).all() - for i in range(v_l, query_length): - assert (mask[b, i, :] == 0).all() + for hybridize in [False, True]: + # Test Full Attention Mask + mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='full') + mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='full') + if hybridize: + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(data, valid_length) + mask_nt = mask_nt.asnumpy() + mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) + mask_tn = mask_tn.asnumpy() + mask = mask_nt + assert_allclose(mask_nt, mask_tn) + for b in range(batch_size): + v_l = valid_length.asnumpy()[b] + for i in range(v_l): + assert (mask[b, i, :v_l] == 1).all() + assert(mask[b, i, v_l:] == 0).all() + for i in range(v_l, query_length): + assert (mask[b, i, :] == 0).all() - # Test Causal Attention Mask - mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='causal') - mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='causal') - if hybridize: - mask_gen_nt.hybridize() - mask_gen_tn.hybridize() - mask_nt = mask_gen_nt(data, valid_length) - mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) - assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) - mask = mask_nt.asnumpy() - for b in range(batch_size): - v_l = valid_length.asnumpy()[b] - for i in range(v_l): - assert (mask[b, i, :(i + 1)] == 1).all() - assert (mask[b, i, (i + 1):] == 0).all() - for i in range(v_l, query_length): - assert (mask[b, i, :] == 0).all() + # Test Causal Attention Mask + mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='causal') + mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', 
attn_type='causal') + if hybridize: + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(data, valid_length) + mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) + assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) + mask = mask_nt.asnumpy() + for b in range(batch_size): + v_l = valid_length.asnumpy()[b] + for i in range(v_l): + assert (mask[b, i, :(i + 1)] == 1).all() + assert (mask[b, i, (i + 1):] == 0).all() + for i in range(v_l, query_length): + assert (mask[b, i, :] == 0).all() - # Test Mem Attention Mask - mask_gen_nt = GenMemAttnMask(dtype=np.float32, layout='NT') - mask_gen_tn = GenMemAttnMask(dtype=np.float32, layout='TN') - if hybridize: - mask_gen_nt.hybridize() - mask_gen_tn.hybridize() - mask_nt = mask_gen_nt(mem, mem_valid_length, data, valid_length) - mask_tn = mask_gen_tn(mx.np.swapaxes(mem, 0, 1), mem_valid_length, - mx.np.swapaxes(data, 0, 1), valid_length) - mask = mask_nt.asnumpy() - assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) - for b in range(batch_size): - data_v_l = valid_length.asnumpy()[b] - mem_v_l = mem_valid_length.asnumpy()[b] - for i in range(data_v_l): - assert (mask[b, i, :mem_v_l] == 1).all() - assert (mask[b, i, mem_v_l:] == 0).all() - for i in range(data_v_l, query_length): - assert (mask[b, i, :] == 0).all() + # Test Mem Attention Mask + mask_gen_nt = GenMemAttnMask(dtype=np.float32, layout='NT') + mask_gen_tn = GenMemAttnMask(dtype=np.float32, layout='TN') + if hybridize: + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(mem, mem_valid_length, data, valid_length) + mask_tn = mask_gen_tn(mx.np.swapaxes(mem, 0, 1), mem_valid_length, + mx.np.swapaxes(data, 0, 1), valid_length) + mask = mask_nt.asnumpy() + assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) + for b in range(batch_size): + data_v_l = valid_length.asnumpy()[b] + mem_v_l = mem_valid_length.asnumpy()[b] + for i in range(data_v_l): + assert (mask[b, i, :mem_v_l] == 1).all() + assert (mask[b, i, mem_v_l:] == 0).all() + for i in range(data_v_l, query_length): + assert (mask[b, i, :] == 0).all() @pytest.mark.parametrize('num_heads', [1, 2, 3]) @@ -270,118 +273,119 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): @pytest.mark.parametrize('bidirectional', [False, True]) @pytest.mark.parametrize('hybridize', [False, True]) @pytest.mark.seed(123) -def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize): - batch_size = 6 - query_length = 25 - mem_length = 20 - query_head_units = 7 +def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize, ctx): + with getattr(mx, ctx)(): + batch_size = 6 + query_length = 25 + mem_length = 20 + query_head_units = 7 - # Initialize the attention cell with relative positional embedding - base_layout = 'NKT' - base_use_einsum = False - if method == 'shaw': - num_buckets = None - max_distance = 20 - elif method == 't5': - num_buckets = 10 - max_distance = 20 - elif method == 'transformer_xl': - num_buckets = None - max_distance = None - else: - raise NotImplementedError - base_score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, - num_heads=num_heads, - dropout=0.0, - method=method, - num_buckets=num_buckets, - max_distance=max_distance, - layout=base_layout, - use_einsum=base_use_einsum) - base_score_cell.initialize() - if hybridize: - base_score_cell.hybridize() - # Generate the data - query = mx.np.random.normal(0, 1, - (batch_size, num_heads, query_length, query_head_units), - dtype=np.float32) - if 
method != 't5': - rel_score_grad = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, mem_length), - dtype=np.float32) - else: - rel_score_grad = mx.np.random.normal(0, 1, - (num_heads, query_length, mem_length), - dtype=np.float32) - query_positions = mx.np.arange(query_length, dtype=np.int32) - mem_positions = mx.np.arange(mem_length, dtype=np.int32) - rel_positions = mx.np.expand_dims(query_positions, axis=-1)\ - - mx.np.expand_dims(mem_positions, axis=0) - mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length), dtype=np.int32) - query.attach_grad() - with mx.autograd.record(): - rel_score = base_score_cell(rel_positions, query) - rel_score.backward(rel_score_grad) - original_rel_score = rel_score.asnumpy() - original_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) - original_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - assert original_grad_norm > 0 - # 1. Test for permutation equivariant - # We can permutate the query, rel_positions and the rel_score_grad and the result should - # always be the same. - query_perm = mx.np.array(np.random.permutation(query_length), dtype=np.int32) - mem_perm = mx.np.array(np.random.permutation(mem_length, ), dtype=np.int32) + # Initialize the attention cell with relative positional embedding + base_layout = 'NKT' + base_use_einsum = False + if method == 'shaw': + num_buckets = None + max_distance = 20 + elif method == 't5': + num_buckets = 10 + max_distance = 20 + elif method == 'transformer_xl': + num_buckets = None + max_distance = None + else: + raise NotImplementedError + base_score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, + num_heads=num_heads, + dropout=0.0, + method=method, + num_buckets=num_buckets, + max_distance=max_distance, + layout=base_layout, + use_einsum=base_use_einsum) + base_score_cell.initialize() + if hybridize: + base_score_cell.hybridize() + # Generate the data + query = mx.np.random.normal(0, 1, + (batch_size, num_heads, query_length, query_head_units), + dtype=np.float32) + if method != 't5': + rel_score_grad = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, mem_length), + dtype=np.float32) + else: + rel_score_grad = mx.np.random.normal(0, 1, + (num_heads, query_length, mem_length), + dtype=np.float32) + query_positions = mx.np.arange(query_length, dtype=np.int32) + mem_positions = mx.np.arange(mem_length, dtype=np.int32) + rel_positions = mx.np.expand_dims(query_positions, axis=-1)\ + - mx.np.expand_dims(mem_positions, axis=0) + mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length), dtype=np.int32) + query.attach_grad() + with mx.autograd.record(): + rel_score = base_score_cell(rel_positions, query) + rel_score.backward(rel_score_grad) + original_rel_score = rel_score.asnumpy() + original_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) + original_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + assert original_grad_norm > 0 + # 1. Test for permutation equivariant + # We can permutate the query, rel_positions and the rel_score_grad and the result should + # always be the same. 
+ query_perm = mx.np.array(np.random.permutation(query_length), dtype=np.int32) + mem_perm = mx.np.array(np.random.permutation(mem_length, ), dtype=np.int32) - query.grad[:] = 0 - with mx.autograd.record(): - rel_score = base_score_cell(rel_positions[query_perm, :][:, mem_perm], - query[:, :, query_perm, :]) + query.grad[:] = 0 + with mx.autograd.record(): + rel_score = base_score_cell(rel_positions[query_perm, :][:, mem_perm], + query[:, :, query_perm, :]) + if method != 't5': + rel_score.backward(rel_score_grad[:, :, query_perm, :][:, :, :, mem_perm]) + else: + rel_score.backward(rel_score_grad[:, query_perm, :][:, :, mem_perm]) + permutated_out = rel_score.asnumpy() + permutated_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) + permutated_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) if method != 't5': - rel_score.backward(rel_score_grad[:, :, query_perm, :][:, :, :, mem_perm]) + assert_allclose( + original_rel_score[:, :, query_perm.asnumpy(), :][:, :, :, mem_perm.asnumpy()], + permutated_out, 1E-4, 1E-4) else: - rel_score.backward(rel_score_grad[:, query_perm, :][:, :, mem_perm]) - permutated_out = rel_score.asnumpy() - permutated_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) - permutated_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - if method != 't5': - assert_allclose( - original_rel_score[:, :, query_perm.asnumpy(), :][:, :, :, mem_perm.asnumpy()], - permutated_out, 1E-4, 1E-4) - else: - assert_allclose(original_rel_score[:, query_perm.asnumpy(), :][:, :, mem_perm.asnumpy()], - permutated_out, 1E-4, 1E-4) - assert_allclose(permutated_grad_norm, original_grad_norm, 1E-4, 1E-4) - assert_allclose(permutated_query_grad_norm, original_query_grad_norm, 1E-4, 1E-4) - # 2. Test for different layout + use/not use einsum - for layout in ['NKT', 'NTK', 'TNK']: - for use_einsum in [False, True]: - if layout == base_layout and use_einsum == base_use_einsum: - continue - score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, - num_heads=num_heads, - dropout=0.0, - method=method, - num_buckets=num_buckets, - max_distance=max_distance, - layout=layout, - use_einsum=use_einsum) - score_cell.initialize() - if hybridize: - score_cell.hybridize() - score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) - query.attach_grad() - query.grad[:] = 0 - with mx.autograd.record(): - if layout == 'NKT': - rel_score = score_cell(rel_positions, query) - rel_score.backward(rel_score_grad) - elif layout == 'NTK': - rel_score = score_cell(rel_positions, query.transpose((0, 2, 1, 3))) - rel_score.backward(rel_score_grad) - elif layout == 'TNK': - rel_score = score_cell(rel_positions, query.transpose((2, 0, 1, 3))) - rel_score.backward(rel_score_grad) - else: - raise NotImplementedError - assert_allclose(rel_score.asnumpy(), original_rel_score, 1E-5, 1E-5) - layout_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - assert_allclose(layout_query_grad_norm, original_query_grad_norm, 1E-5, 1E-5) + assert_allclose(original_rel_score[:, query_perm.asnumpy(), :][:, :, mem_perm.asnumpy()], + permutated_out, 1E-4, 1E-4) + assert_allclose(permutated_grad_norm, original_grad_norm, 1E-4, 1E-4) + assert_allclose(permutated_query_grad_norm, original_query_grad_norm, 1E-4, 1E-4) + # 2. 
Test for different layout + use/not use einsum + for layout in ['NKT', 'NTK', 'TNK']: + for use_einsum in [False, True]: + if layout == base_layout and use_einsum == base_use_einsum: + continue + score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, + num_heads=num_heads, + dropout=0.0, + method=method, + num_buckets=num_buckets, + max_distance=max_distance, + layout=layout, + use_einsum=use_einsum) + score_cell.initialize() + if hybridize: + score_cell.hybridize() + score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) + query.attach_grad() + query.grad[:] = 0 + with mx.autograd.record(): + if layout == 'NKT': + rel_score = score_cell(rel_positions, query) + rel_score.backward(rel_score_grad) + elif layout == 'NTK': + rel_score = score_cell(rel_positions, query.transpose((0, 2, 1, 3))) + rel_score.backward(rel_score_grad) + elif layout == 'TNK': + rel_score = score_cell(rel_positions, query.transpose((2, 0, 1, 3))) + rel_score.backward(rel_score_grad) + else: + raise NotImplementedError + assert_allclose(rel_score.asnumpy(), original_rel_score, 1E-5, 1E-5) + layout_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + assert_allclose(layout_query_grad_norm, original_query_grad_norm, 1E-5, 1E-5) diff --git a/tests/test_models_bert.py b/tests/test_models_bert.py index f2a2ffdfc1..294582239e 100644 --- a/tests/test_models_bert.py +++ b/tests/test_models_bert.py @@ -12,87 +12,88 @@ def test_list_pretrained_bert(): @pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) -def test_bert_small_cfg(compute_layout): - cfg = BertModel.get_cfg() - cfg.defrost() - cfg.MODEL.vocab_size = 100 - cfg.MODEL.units = 12 * 4 - cfg.MODEL.hidden_size = 64 - cfg.MODEL.num_layers = 2 - cfg.MODEL.num_heads = 2 - cfg.MODEL.compute_layout = compute_layout - cfg.freeze() +def test_bert_small_cfg(compute_layout, ctx): + with getattr(mx, ctx)(): + cfg = BertModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.units = 12 * 4 + cfg.MODEL.hidden_size = 64 + cfg.MODEL.num_layers = 2 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() - # Generate TN layout - cfg_tn = cfg.clone() - cfg_tn.defrost() - cfg_tn.MODEL.layout = 'TN' - cfg_tn.freeze() + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() - # Sample data - batch_size = 4 - sequence_length = 8 - num_mask = 3 - inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) - token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) - valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) - masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + # Sample data + batch_size = 4 + sequence_length = 8 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) - # Test for BertModel - bert_model = BertModel.from_cfg(cfg) - bert_model.initialize() - bert_model.hybridize() - contextual_embedding, pooled_out = bert_model(inputs, token_types, valid_length) - bert_model_tn = BertModel.from_cfg(cfg_tn) - bert_model_tn.share_parameters(bert_model.collect_params()) - bert_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn = bert_model_tn(inputs.T, token_types.T, valid_length) - 
assert_allclose(contextual_embedding.asnumpy(), - mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + # Test for BertModel + bert_model = BertModel.from_cfg(cfg) + bert_model.initialize() + bert_model.hybridize() + contextual_embedding, pooled_out = bert_model(inputs, token_types, valid_length) + bert_model_tn = BertModel.from_cfg(cfg_tn) + bert_model_tn.share_parameters(bert_model.collect_params()) + bert_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = bert_model_tn(inputs.T, token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - # Test for BertForMLM - bert_mlm_model = BertForMLM(cfg) - bert_mlm_model.initialize() - bert_mlm_model.hybridize() - contextual_embedding, pooled_out, mlm_score = bert_mlm_model(inputs, token_types, - valid_length, masked_positions) - bert_mlm_model_tn = BertForMLM(cfg_tn) - bert_mlm_model_tn.share_parameters(bert_mlm_model.collect_params()) - bert_mlm_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn, mlm_score_tn =\ - bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) - assert_allclose(contextual_embedding.asnumpy(), - mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + # Test for BertForMLM + bert_mlm_model = BertForMLM(cfg) + bert_mlm_model.initialize() + bert_mlm_model.hybridize() + contextual_embedding, pooled_out, mlm_score = bert_mlm_model(inputs, token_types, + valid_length, masked_positions) + bert_mlm_model_tn = BertForMLM(cfg_tn) + bert_mlm_model_tn.share_parameters(bert_mlm_model.collect_params()) + bert_mlm_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, mlm_score_tn =\ + bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) - # Test for BertForPretrain - bert_pretrain_model = BertForPretrain(cfg) - bert_pretrain_model.initialize() - bert_pretrain_model.hybridize() - contextual_embedding, pooled_out, nsp_score, mlm_scores =\ - bert_pretrain_model(inputs, token_types, valid_length, masked_positions) - bert_pretrain_model_tn = BertForPretrain(cfg_tn) - bert_pretrain_model_tn.share_parameters(bert_pretrain_model.collect_params()) - bert_pretrain_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ - bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) - assert_allclose(contextual_embedding.asnumpy(), - mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + # Test for BertForPretrain + bert_pretrain_model = BertForPretrain(cfg) + bert_pretrain_model.initialize() + bert_pretrain_model.hybridize() + contextual_embedding, pooled_out, nsp_score, 
mlm_scores =\ + bert_pretrain_model(inputs, token_types, valid_length, masked_positions) + bert_pretrain_model_tn = BertForPretrain(cfg_tn) + bert_pretrain_model_tn.share_parameters(bert_pretrain_model.collect_params()) + bert_pretrain_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ + bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_bert()) -def test_bert_get_pretrained(model_name): +def test_bert_get_pretrained(model_name, ctx): assert len(list_pretrained_bert()) > 0 - with tempfile.TemporaryDirectory() as root: + with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): cfg, tokenizer, backbone_params_path, mlm_params_path =\ get_pretrained_bert(model_name, load_backbone=True, load_mlm=True, root=root) assert cfg.MODEL.vocab_size == len(tokenizer.vocab) diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 17f9420a07..6940af717d 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -26,47 +26,48 @@ def get_test_cfg(): @pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) -def test_electra_model(compute_layout): - cfg = get_test_cfg() - cfg.defrost() - cfg.MODEL.compute_layout = compute_layout - cfg.freeze() +def test_electra_model(compute_layout, ctx): + with getattr(mx, ctx)(): + cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() - # Generate TN layout - cfg_tn = cfg.clone() - cfg_tn.defrost() - cfg_tn.MODEL.layout = 'TN' - cfg_tn.freeze() + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() - # Sample data - batch_size = 4 - sequence_length = 16 - num_mask = 3 - inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) - token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) - valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) - masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + # Sample data + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) - electra_model = ElectraModel.from_cfg(cfg) - electra_model.initialize() - electra_model.hybridize() - contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length) - electra_model_tn = ElectraModel.from_cfg(cfg_tn) - electra_model_tn.share_parameters(electra_model.collect_params()) - electra_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn = electra_model_tn(inputs.T, token_types.T, valid_length) - assert_allclose(contextual_embedding.asnumpy(), - np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), - 1E-4, 1E-4) + electra_model = ElectraModel.from_cfg(cfg) + electra_model.initialize() + 
electra_model.hybridize() + contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length) + electra_model_tn = ElectraModel.from_cfg(cfg_tn) + electra_model_tn.share_parameters(electra_model.collect_params()) + electra_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = electra_model_tn(inputs.T, token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), + 1E-4, 1E-4) @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_electra()) -def test_electra_get_pretrained(model_name): +def test_electra_get_pretrained(model_name, ctx): assert len(list_pretrained_electra()) > 0 - with tempfile.TemporaryDirectory() as root: + with tempfile.TemporaryDirectory() as root, getattr(mx, ctx): cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\ get_pretrained_electra(model_name, root=root, load_backbone=True, load_disc=True, load_gen=True) From 76cf1c4787ec4d486cc409b4e9aa92d7c16c4efb Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 10:15:38 -0700 Subject: [PATCH 18/32] [CI] Change remote from dmlc to barry-jin --- .github/workflows/unittests-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 72cac182f3..6e9e41c14b 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,5 +38,5 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait From c0bfc6d761f988ca71f6dddae1e0640ee77a4299 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 12:32:25 -0700 Subject: [PATCH 19/32] [CI] Bug Fix --- tests/test_models_electra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 6940af717d..dcc20a76be 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -67,7 +67,7 @@ def test_electra_model(compute_layout, ctx): @pytest.mark.parametrize('model_name', list_pretrained_electra()) def test_electra_get_pretrained(model_name, ctx): assert len(list_pretrained_electra()) > 0 - with tempfile.TemporaryDirectory() as root, getattr(mx, ctx): + with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\ get_pretrained_electra(model_name, root=root, load_backbone=True, load_disc=True, load_gen=True) From b134ac132e5e51f82f2a5741b2f53d5bb1e43434 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 13:19:57 -0700 Subject: [PATCH 20/32] [CI] Truncate logs + Add failure test --- test.sh | 14 +++++++++++++- tests/test_optimizer.py | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 1053f2ac06..77e19177d1 100755 --- a/test.sh +++ b/test.sh @@ -11,4 +11,16 @@ python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user 
"mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ > output.txt + +flag=false +while IFS= read -r line; do + if $flag; then + echo $line + else + if [ "$line" == "/gluon-nlp/tools/batch" ]; then + echo $line + flag=true + fi + fi +done < output.txt diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index f5935fbef4..eac29f1265 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -8,6 +8,7 @@ def test_adam(ctx): with getattr(mx, ctx)(): + assert False opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From 91cd6f063f9c46880a21d6c3aa2414fa85eba6af Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 14:38:51 -0700 Subject: [PATCH 21/32] [CI] Duplicate script to submit test and get logs --- test.sh | 14 +-- tools/batch/submit-test.py | 174 +++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 13 deletions(-) create mode 100644 tools/batch/submit-test.py diff --git a/test.sh b/test.sh index 77e19177d1..1053f2ac06 100755 --- a/test.sh +++ b/test.sh @@ -11,16 +11,4 @@ python3 -m pip install --user setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade cython python3 -m pip install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ > output.txt - -flag=false -while IFS= read -r line; do - if $flag; then - echo $line - else - if [ "$line" == "/gluon-nlp/tools/batch" ]; then - echo $line - flag=true - fi - fi -done < output.txt +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ diff --git a/tools/batch/submit-test.py b/tools/batch/submit-test.py new file mode 100644 index 0000000000..38eb601073 --- /dev/null +++ b/tools/batch/submit-test.py @@ -0,0 +1,174 @@ +import argparse +import random +import re +import sys +import time +from datetime import datetime + +import boto3 +from botocore.compat import total_seconds + +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + +parser.add_argument('--profile', help='profile name of aws account.', type=str, + default=None) +parser.add_argument('--region', help='Default region when creating new connections', type=str, + default=None) +parser.add_argument('--name', help='name of the job', type=str, default='dummy') +parser.add_argument('--job-type', help='type of job to submit.', type=str, + choices=['g4dn.4x', 'g4dn.8x', 'g4dn.12x', 'g4dn.16x', + 'p3.2x', 'p3.8x', 'p3.16x', 'p3dn.24x', + 'c5n.18x'], default='g4dn.4x') +parser.add_argument('--source-ref', + help='ref in GluonNLP main github. e.g. master, refs/pull/500/head', + type=str, default='master') +parser.add_argument('--work-dir', + help='working directory inside the repo. e.g. scripts/preprocess', + type=str, default='scripts/preprocess') +parser.add_argument('--saved-output', + help='output to be saved, relative to working directory. 
' + 'it can be either a single file or a directory', + type=str, default='.') +parser.add_argument('--save-path', + help='s3 path where files are saved.', + type=str, default='batch/temp/{}'.format(datetime.now().isoformat())) +parser.add_argument('--command', help='command to run', type=str, + default='git rev-parse HEAD | tee stdout.log') +parser.add_argument('--remote', + help='git repo address. https://github.com/dmlc/gluon-nlp', + type=str, default="https://github.com/dmlc/gluon-nlp") +parser.add_argument('--wait', help='block wait until the job completes. ' + 'Non-zero exit code if job fails.', action='store_true') +parser.add_argument('--timeout', help='job timeout in seconds', default=None, type=int) + + +args = parser.parse_args() + +session = boto3.Session(profile_name=args.profile, region_name=args.region) +batch, cloudwatch = [session.client(service_name=sn) for sn in ['batch', 'logs']] + + +def printLogs(logGroupName, logStreamName, startTime): + kwargs = {'logGroupName': logGroupName, + 'logStreamName': logStreamName, + 'startTime': startTime, + 'startFromHead': True} + + lastTimestamp = 0 + printMessage = False + while True: + logEvents = cloudwatch.get_log_events(**kwargs) + + for event in logEvents['events']: + lastTimestamp = event['timestamp'] + timestamp = datetime.utcfromtimestamp(lastTimestamp / 1000.0).isoformat() + if printMessage: + print(event['message']) + else: + if event['message']=="/gluon-nlp/tools/batch": + printMessage = True + print(event['message']) + # print('[{}] {}'.format((timestamp + '.000')[:23] + 'Z', event['message'])) + + nextToken = logEvents['nextForwardToken'] + if nextToken and kwargs.get('nextToken') != nextToken: + kwargs['nextToken'] = nextToken + else: + break + return lastTimestamp + + +def nowInMillis(): + endTime = long(total_seconds(datetime.utcnow() - datetime(1970, 1, 1))) * 1000 + return endTime + + +job_definitions = { + 'g4dn.4x': 'gluon-nlp-1-jobs:5', + 'g4dn.8x': 'gluon-nlp-1-jobs:4', + 'g4dn.12x': 'gluon-nlp-1-4gpu-jobs:1', + 'g4dn.16x': 'gluon-nlp-1-jobs:3', + 'p3.2x': 'gluon-nlp-1-jobs:11', + 'p3.8x': 'gluon-nlp-1-4gpu-jobs:2', + 'p3.16x': 'gluon-nlp-1-8gpu-jobs:1', + 'p3dn.24x': 'gluon-nlp-1-8gpu-jobs:2', + 'c5n.18x': 'gluon-nlp-1-cpu-jobs:2', +} + +job_queues = { + 'g4dn.4x': 'g4dn', + 'g4dn.8x': 'g4dn', + 'g4dn.12x': 'g4dn-multi-gpu', + 'g4dn.16x': 'g4dn', + 'p3.2x': 'p3', + 'p3.8x': 'p3-4gpu', + 'p3.16x': 'p3-8gpu', + 'p3dn.24x': 'p3dn-8gpu', + 'c5n.18x': 'c5n', +} + + +def main(): + spin = ['-', '/', '|', '\\', '-', '/', '|', '\\'] + logGroupName = '/aws/batch/job' + + jobName = re.sub('[^A-Za-z0-9_\-]', '', args.name)[:128] # Enforce AWS Batch jobName rules + jobType = args.job_type + jobQueue = job_queues[jobType] + jobDefinition = job_definitions[jobType] + command = args.command.split() + wait = args.wait + + parameters = { + 'SOURCE_REF': args.source_ref, + 'WORK_DIR': args.work_dir, + 'SAVED_OUTPUT': args.saved_output, + 'SAVE_PATH': args.save_path, + 'COMMAND': args.command, + 'REMOTE': args.remote + } + kwargs = dict( + jobName=jobName, + jobQueue=jobQueue, + jobDefinition=jobDefinition, + parameters=parameters, + ) + if args.timeout is not None: + kwargs['timeout'] = {'attemptDurationSeconds': args.timeout} + submitJobResponse = batch.submit_job(**kwargs) + + jobId = submitJobResponse['jobId'] + print('Submitted job [{} - {}] to the job queue [{}]'.format(jobName, jobId, jobQueue)) + + spinner = 0 + running = False + status_set = set() + startTime = 0 + while wait: + time.sleep(random.randint(5, 10)) + 
describeJobsResponse = batch.describe_jobs(jobs=[jobId]) + status = describeJobsResponse['jobs'][0]['status'] + if status == 'SUCCEEDED' or status == 'FAILED': + print('=' * 80) + print('Job [{} - {}] {}'.format(jobName, jobId, status)) + + sys.exit(status == 'FAILED') + + elif status == 'RUNNING': + logStreamName = describeJobsResponse['jobs'][0]['container']['logStreamName'] + if not running: + running = True + print('\rJob [{}, {}] is RUNNING.'.format(jobName, jobId)) + if logStreamName: + print('Output [{}]:\n {}'.format(logStreamName, '=' * 80)) + if logStreamName: + startTime = printLogs(logGroupName, logStreamName, startTime) + 1 + elif status not in status_set: + status_set.add(status) + print('\rJob [%s - %s] is %-9s... %s' % (jobName, jobId, status, spin[spinner % len(spin)]),) + sys.stdout.flush() + spinner += 1 + + +if __name__ == '__main__': + main() From 837903dfa991056c25d3c79a6cb81bcf1d6af174 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 15:09:47 -0700 Subject: [PATCH 22/32] [CI] Update unittest-gpu --- .github/workflows/unittests-gpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 6e9e41c14b..78697f31fa 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,5 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-test.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait + From 074880c099a85f4e5b096dd7559ddbc592adf304 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 17:09:12 -0700 Subject: [PATCH 23/32] [CI] Quiet the pip install + Redirect the logs to script.log --- .github/workflows/unittests-gpu.yml | 6 +++--- test.sh | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 78697f31fa..c11ecdbf4e 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -26,8 +26,8 @@ jobs: - name: Install Other Dependencies run: | - python -m pip install --user --upgrade pip - python -m pip install --user -e .[extras] + python -m pip install --user --quiet --upgrade pip + python -m pip install --user --quiet -e .[extras] - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v1 @@ -38,6 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-test.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait | tee > script.log diff --git a/test.sh b/test.sh index 1053f2ac06..be6b9b6d3b 100755 --- a/test.sh +++ b/test.sh @@ -6,9 +6,9 @@ echo $PWD sudo apt-get install libopenblas-dev -python3 -m pip install --user -upgrade pip -python3 -m pip install --user setuptools pytest pytest-cov contextvars -python3 -m pip install --upgrade cython -python3 -m pip 
install --pre --user "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python -python3 -m pip install --user -e .[extras] +python3 -m pip install --user --quiet -upgrade pip +python3 -m pip install --user --quiet setuptools pytest pytest-cov contextvars +python3 -m pip install --upgrade --quiet cython +python3 -m pip install --pre --user --quiet "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python +python3 -m pip install --user --quiet -e .[extras] python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ From 061cdfb693c229891b400534ddf21f41257f8899 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 17:16:00 -0700 Subject: [PATCH 24/32] [CI] Remove asserts --- tests/test_optimizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index eac29f1265..f5935fbef4 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -8,7 +8,6 @@ def test_adam(ctx): with getattr(mx, ctx)(): - assert False opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From f8b87f47bc2166e843dcefeec0a5207c8bae6b69 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 17:33:32 -0700 Subject: [PATCH 25/32] [CI] Simplify ctx statement --- conftest.py | 2 +- tests/test_attention_cell.py | 8 ++++---- tests/test_models_bert.py | 4 ++-- tests/test_models_electra.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/conftest.py b/conftest.py index bb84ce0051..04efde9756 100644 --- a/conftest.py +++ b/conftest.py @@ -212,4 +212,4 @@ def pytest_addoption(parser): def pytest_generate_tests(metafunc): if 'ctx' in metafunc.fixturenames: - metafunc.parametrize("ctx", metafunc.config.option.device) + metafunc.parametrize("ctx", [getattr(mx, device)() for device in metafunc.config.option.device]) diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index 1964f9db4c..c3ddbcfd10 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -18,7 +18,7 @@ @pytest.mark.parametrize('rel_score_type', ['share_head', 'no_share_head', 'no']) @pytest.mark.seed(123) def test_multi_head_dot_attention_cell(num_heads, scaled, normalized, hybridize, rel_score_type, ctx): - with getattr(mx, ctx)(): + with ctx: batch_size = 5 query_length, mem_length = 16, 32 query_head_units = 8 @@ -155,7 +155,7 @@ def test_multi_head_dot_attention_cell(num_heads, scaled, normalized, hybridize, @pytest.mark.parametrize('normalized', [True, False]) @pytest.mark.seed(123) def test_dot_product_attention(scaled, normalized, ctx): - with getattr(mx, ctx)(): + with ctx: num_heads = 4 batch_size = 32 query_length, mem_length = 16, 32 @@ -197,7 +197,7 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): return gen_mem_attn_mask(F, mem, mem_valid_length, data, valid_length, dtype=self._dtype, layout=self._layout) - with getattr(mx, ctx)(): + with ctx: batch_size = 4 query_length = 8 mem_length = 6 @@ -274,7 +274,7 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): @pytest.mark.parametrize('hybridize', [False, True]) @pytest.mark.seed(123) def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize, ctx): - with getattr(mx, ctx)(): + with ctx: batch_size = 6 query_length = 25 mem_length = 20 diff --git a/tests/test_models_bert.py b/tests/test_models_bert.py index 294582239e..30ae207248 100644 --- a/tests/test_models_bert.py +++ b/tests/test_models_bert.py @@ -13,7 +13,7 @@ def test_list_pretrained_bert(): 
@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) def test_bert_small_cfg(compute_layout, ctx): - with getattr(mx, ctx)(): + with ctx: cfg = BertModel.get_cfg() cfg.defrost() cfg.MODEL.vocab_size = 100 @@ -93,7 +93,7 @@ def test_bert_small_cfg(compute_layout, ctx): @pytest.mark.parametrize('model_name', list_pretrained_bert()) def test_bert_get_pretrained(model_name, ctx): assert len(list_pretrained_bert()) > 0 - with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): + with tempfile.TemporaryDirectory() as root, ctx: cfg, tokenizer, backbone_params_path, mlm_params_path =\ get_pretrained_bert(model_name, load_backbone=True, load_mlm=True, root=root) assert cfg.MODEL.vocab_size == len(tokenizer.vocab) diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index dcc20a76be..998ee72f53 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -27,7 +27,7 @@ def get_test_cfg(): @pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) def test_electra_model(compute_layout, ctx): - with getattr(mx, ctx)(): + with ctx: cfg = get_test_cfg() cfg.defrost() cfg.MODEL.compute_layout = compute_layout @@ -67,7 +67,7 @@ def test_electra_model(compute_layout, ctx): @pytest.mark.parametrize('model_name', list_pretrained_electra()) def test_electra_get_pretrained(model_name, ctx): assert len(list_pretrained_electra()) > 0 - with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): + with tempfile.TemporaryDirectory() as root, ctx: cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\ get_pretrained_electra(model_name, root=root, load_backbone=True, load_disc=True, load_gen=True) From 86a4ff224f41a5bfa3f4175efb6316cce4132f94 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 17:40:20 -0700 Subject: [PATCH 26/32] [CI] Simplify ctx statement --- tests/test_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index f5935fbef4..48c2331a7a 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -7,7 +7,7 @@ def test_adam(ctx): - with getattr(mx, ctx)(): + with ctx: opt1 = AdamW opt2 = AdamW shapes = [(3, 4, 5), (10, 4), (7,)] From b3c017a5021b30f618e1b9e08d74a62589f38d74 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 19:03:48 -0700 Subject: [PATCH 27/32] [CI] test_multi_head_rel_attn_score failed for gpu test --- tests/test_attention_cell.py | 221 +++++++++++++++++------------------ tests/test_models.py | 2 +- 2 files changed, 111 insertions(+), 112 deletions(-) diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index c3ddbcfd10..c7166f19c9 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -274,118 +274,117 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): @pytest.mark.parametrize('hybridize', [False, True]) @pytest.mark.seed(123) def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize, ctx): - with ctx: - batch_size = 6 - query_length = 25 - mem_length = 20 - query_head_units = 7 + batch_size = 6 + query_length = 25 + mem_length = 20 + query_head_units = 7 - # Initialize the attention cell with relative positional embedding - base_layout = 'NKT' - base_use_einsum = False - if method == 'shaw': - num_buckets = None - max_distance = 20 - elif method == 't5': - num_buckets = 10 - max_distance = 20 - elif method == 'transformer_xl': - num_buckets = None - max_distance = None - else: - raise 
NotImplementedError - base_score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, - num_heads=num_heads, - dropout=0.0, - method=method, - num_buckets=num_buckets, - max_distance=max_distance, - layout=base_layout, - use_einsum=base_use_einsum) - base_score_cell.initialize() - if hybridize: - base_score_cell.hybridize() - # Generate the data - query = mx.np.random.normal(0, 1, - (batch_size, num_heads, query_length, query_head_units), - dtype=np.float32) - if method != 't5': - rel_score_grad = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, mem_length), - dtype=np.float32) - else: - rel_score_grad = mx.np.random.normal(0, 1, - (num_heads, query_length, mem_length), - dtype=np.float32) - query_positions = mx.np.arange(query_length, dtype=np.int32) - mem_positions = mx.np.arange(mem_length, dtype=np.int32) - rel_positions = mx.np.expand_dims(query_positions, axis=-1)\ - - mx.np.expand_dims(mem_positions, axis=0) - mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length), dtype=np.int32) - query.attach_grad() - with mx.autograd.record(): - rel_score = base_score_cell(rel_positions, query) - rel_score.backward(rel_score_grad) - original_rel_score = rel_score.asnumpy() - original_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) - original_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - assert original_grad_norm > 0 - # 1. Test for permutation equivariant - # We can permutate the query, rel_positions and the rel_score_grad and the result should - # always be the same. - query_perm = mx.np.array(np.random.permutation(query_length), dtype=np.int32) - mem_perm = mx.np.array(np.random.permutation(mem_length, ), dtype=np.int32) + # Initialize the attention cell with relative positional embedding + base_layout = 'NKT' + base_use_einsum = False + if method == 'shaw': + num_buckets = None + max_distance = 20 + elif method == 't5': + num_buckets = 10 + max_distance = 20 + elif method == 'transformer_xl': + num_buckets = None + max_distance = None + else: + raise NotImplementedError + base_score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, + num_heads=num_heads, + dropout=0.0, + method=method, + num_buckets=num_buckets, + max_distance=max_distance, + layout=base_layout, + use_einsum=base_use_einsum) + base_score_cell.initialize() + if hybridize: + base_score_cell.hybridize() + # Generate the data + query = mx.np.random.normal(0, 1, + (batch_size, num_heads, query_length, query_head_units), + dtype=np.float32) + if method != 't5': + rel_score_grad = mx.np.random.normal(0, 1, (batch_size, num_heads, query_length, mem_length), + dtype=np.float32) + else: + rel_score_grad = mx.np.random.normal(0, 1, + (num_heads, query_length, mem_length), + dtype=np.float32) + query_positions = mx.np.arange(query_length, dtype=np.int32) + mem_positions = mx.np.arange(mem_length, dtype=np.int32) + rel_positions = mx.np.expand_dims(query_positions, axis=-1)\ + - mx.np.expand_dims(mem_positions, axis=0) + mask = mx.np.random.randint(0, 2, (batch_size, query_length, mem_length), dtype=np.int32) + query.attach_grad() + with mx.autograd.record(): + rel_score = base_score_cell(rel_positions, query) + rel_score.backward(rel_score_grad) + original_rel_score = rel_score.asnumpy() + original_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) + original_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + assert original_grad_norm > 0 + # 1. 
Test for permutation equivariant + # We can permutate the query, rel_positions and the rel_score_grad and the result should + # always be the same. + query_perm = mx.np.array(np.random.permutation(query_length), dtype=np.int32) + mem_perm = mx.np.array(np.random.permutation(mem_length, ), dtype=np.int32) - query.grad[:] = 0 - with mx.autograd.record(): - rel_score = base_score_cell(rel_positions[query_perm, :][:, mem_perm], - query[:, :, query_perm, :]) - if method != 't5': - rel_score.backward(rel_score_grad[:, :, query_perm, :][:, :, :, mem_perm]) - else: - rel_score.backward(rel_score_grad[:, query_perm, :][:, :, mem_perm]) - permutated_out = rel_score.asnumpy() - permutated_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) - permutated_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + query.grad[:] = 0 + with mx.autograd.record(): + rel_score = base_score_cell(rel_positions[query_perm, :][:, mem_perm], + query[:, :, query_perm, :]) if method != 't5': - assert_allclose( - original_rel_score[:, :, query_perm.asnumpy(), :][:, :, :, mem_perm.asnumpy()], - permutated_out, 1E-4, 1E-4) + rel_score.backward(rel_score_grad[:, :, query_perm, :][:, :, :, mem_perm]) else: - assert_allclose(original_rel_score[:, query_perm.asnumpy(), :][:, :, mem_perm.asnumpy()], - permutated_out, 1E-4, 1E-4) - assert_allclose(permutated_grad_norm, original_grad_norm, 1E-4, 1E-4) - assert_allclose(permutated_query_grad_norm, original_query_grad_norm, 1E-4, 1E-4) - # 2. Test for different layout + use/not use einsum - for layout in ['NKT', 'NTK', 'TNK']: - for use_einsum in [False, True]: - if layout == base_layout and use_einsum == base_use_einsum: - continue - score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, - num_heads=num_heads, - dropout=0.0, - method=method, - num_buckets=num_buckets, - max_distance=max_distance, - layout=layout, - use_einsum=use_einsum) - score_cell.initialize() - if hybridize: - score_cell.hybridize() - score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) - query.attach_grad() - query.grad[:] = 0 - with mx.autograd.record(): - if layout == 'NKT': - rel_score = score_cell(rel_positions, query) - rel_score.backward(rel_score_grad) - elif layout == 'NTK': - rel_score = score_cell(rel_positions, query.transpose((0, 2, 1, 3))) - rel_score.backward(rel_score_grad) - elif layout == 'TNK': - rel_score = score_cell(rel_positions, query.transpose((2, 0, 1, 3))) - rel_score.backward(rel_score_grad) - else: - raise NotImplementedError - assert_allclose(rel_score.asnumpy(), original_rel_score, 1E-5, 1E-5) - layout_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) - assert_allclose(layout_query_grad_norm, original_query_grad_norm, 1E-5, 1E-5) + rel_score.backward(rel_score_grad[:, query_perm, :][:, :, mem_perm]) + permutated_out = rel_score.asnumpy() + permutated_grad_norm = grad_global_norm(base_score_cell.collect_params().values()) + permutated_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + if method != 't5': + assert_allclose( + original_rel_score[:, :, query_perm.asnumpy(), :][:, :, :, mem_perm.asnumpy()], + permutated_out, 1E-4, 1E-4) + else: + assert_allclose(original_rel_score[:, query_perm.asnumpy(), :][:, :, mem_perm.asnumpy()], + permutated_out, 1E-4, 1E-4) + assert_allclose(permutated_grad_norm, original_grad_norm, 1E-4, 1E-4) + assert_allclose(permutated_query_grad_norm, original_query_grad_norm, 1E-4, 1E-4) + # 2. 
Test for different layout + use/not use einsum + for layout in ['NKT', 'NTK', 'TNK']: + for use_einsum in [False, True]: + if layout == base_layout and use_einsum == base_use_einsum: + continue + score_cell = RelAttentionScoreCell(query_units=num_heads * query_head_units, + num_heads=num_heads, + dropout=0.0, + method=method, + num_buckets=num_buckets, + max_distance=max_distance, + layout=layout, + use_einsum=use_einsum) + score_cell.initialize() + if hybridize: + score_cell.hybridize() + score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) + query.attach_grad() + query.grad[:] = 0 + with mx.autograd.record(): + if layout == 'NKT': + rel_score = score_cell(rel_positions, query) + rel_score.backward(rel_score_grad) + elif layout == 'NTK': + rel_score = score_cell(rel_positions, query.transpose((0, 2, 1, 3))) + rel_score.backward(rel_score_grad) + elif layout == 'TNK': + rel_score = score_cell(rel_positions, query.transpose((2, 0, 1, 3))) + rel_score.backward(rel_score_grad) + else: + raise NotImplementedError + assert_allclose(rel_score.asnumpy(), original_rel_score, 1E-5, 1E-5) + layout_query_grad_norm = np.linalg.norm(query.grad.asnumpy()) + assert_allclose(layout_query_grad_norm, original_query_grad_norm, 1E-5, 1E-5) diff --git a/tests/test_models.py b/tests/test_models.py index 413941250b..03491b6272 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -13,7 +13,7 @@ def test_list_backbone_names(): @pytest.mark.parametrize('name', list_backbone_names()) def test_get_backbone(name, ctx): - with tempfile.TemporaryDirectory() as root, getattr(mx, ctx)(): + with tempfile.TemporaryDirectory() as root, ctx: model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root) net = model_cls.from_cfg(cfg) net.load_parameters(local_params_path) From 5c3a09956ad7574ca453b5c58871229b20a2aaf1 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 25 Aug 2020 20:52:53 -0700 Subject: [PATCH 28/32] [CI] Finalize gpu test - change remote from barry-jin to dmlc --- .github/workflows/unittests-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index c11ecdbf4e..5b328c8da7 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,6 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/barry-jin/gluon-nlp --command "../../test.sh" --wait | tee > script.log + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait | tee > script.log From c866de3c8b790e46aff326014372e0b52296671f Mon Sep 17 00:00:00 2001 From: barry-jin <69359374+barry-jin@users.noreply.github.com> Date: Tue, 25 Aug 2020 20:57:11 -0700 Subject: [PATCH 29/32] Delete submit-test.py --- tools/batch/submit-test.py | 174 ------------------------------------- 1 file changed, 174 deletions(-) delete mode 100644 tools/batch/submit-test.py diff --git a/tools/batch/submit-test.py b/tools/batch/submit-test.py deleted file mode 100644 index 38eb601073..0000000000 --- a/tools/batch/submit-test.py +++ /dev/null @@ -1,174 +0,0 @@ -import argparse -import random -import re -import sys -import time -from datetime import datetime - -import boto3 -from 
botocore.compat import total_seconds - -parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - -parser.add_argument('--profile', help='profile name of aws account.', type=str, - default=None) -parser.add_argument('--region', help='Default region when creating new connections', type=str, - default=None) -parser.add_argument('--name', help='name of the job', type=str, default='dummy') -parser.add_argument('--job-type', help='type of job to submit.', type=str, - choices=['g4dn.4x', 'g4dn.8x', 'g4dn.12x', 'g4dn.16x', - 'p3.2x', 'p3.8x', 'p3.16x', 'p3dn.24x', - 'c5n.18x'], default='g4dn.4x') -parser.add_argument('--source-ref', - help='ref in GluonNLP main github. e.g. master, refs/pull/500/head', - type=str, default='master') -parser.add_argument('--work-dir', - help='working directory inside the repo. e.g. scripts/preprocess', - type=str, default='scripts/preprocess') -parser.add_argument('--saved-output', - help='output to be saved, relative to working directory. ' - 'it can be either a single file or a directory', - type=str, default='.') -parser.add_argument('--save-path', - help='s3 path where files are saved.', - type=str, default='batch/temp/{}'.format(datetime.now().isoformat())) -parser.add_argument('--command', help='command to run', type=str, - default='git rev-parse HEAD | tee stdout.log') -parser.add_argument('--remote', - help='git repo address. https://github.com/dmlc/gluon-nlp', - type=str, default="https://github.com/dmlc/gluon-nlp") -parser.add_argument('--wait', help='block wait until the job completes. ' - 'Non-zero exit code if job fails.', action='store_true') -parser.add_argument('--timeout', help='job timeout in seconds', default=None, type=int) - - -args = parser.parse_args() - -session = boto3.Session(profile_name=args.profile, region_name=args.region) -batch, cloudwatch = [session.client(service_name=sn) for sn in ['batch', 'logs']] - - -def printLogs(logGroupName, logStreamName, startTime): - kwargs = {'logGroupName': logGroupName, - 'logStreamName': logStreamName, - 'startTime': startTime, - 'startFromHead': True} - - lastTimestamp = 0 - printMessage = False - while True: - logEvents = cloudwatch.get_log_events(**kwargs) - - for event in logEvents['events']: - lastTimestamp = event['timestamp'] - timestamp = datetime.utcfromtimestamp(lastTimestamp / 1000.0).isoformat() - if printMessage: - print(event['message']) - else: - if event['message']=="/gluon-nlp/tools/batch": - printMessage = True - print(event['message']) - # print('[{}] {}'.format((timestamp + '.000')[:23] + 'Z', event['message'])) - - nextToken = logEvents['nextForwardToken'] - if nextToken and kwargs.get('nextToken') != nextToken: - kwargs['nextToken'] = nextToken - else: - break - return lastTimestamp - - -def nowInMillis(): - endTime = long(total_seconds(datetime.utcnow() - datetime(1970, 1, 1))) * 1000 - return endTime - - -job_definitions = { - 'g4dn.4x': 'gluon-nlp-1-jobs:5', - 'g4dn.8x': 'gluon-nlp-1-jobs:4', - 'g4dn.12x': 'gluon-nlp-1-4gpu-jobs:1', - 'g4dn.16x': 'gluon-nlp-1-jobs:3', - 'p3.2x': 'gluon-nlp-1-jobs:11', - 'p3.8x': 'gluon-nlp-1-4gpu-jobs:2', - 'p3.16x': 'gluon-nlp-1-8gpu-jobs:1', - 'p3dn.24x': 'gluon-nlp-1-8gpu-jobs:2', - 'c5n.18x': 'gluon-nlp-1-cpu-jobs:2', -} - -job_queues = { - 'g4dn.4x': 'g4dn', - 'g4dn.8x': 'g4dn', - 'g4dn.12x': 'g4dn-multi-gpu', - 'g4dn.16x': 'g4dn', - 'p3.2x': 'p3', - 'p3.8x': 'p3-4gpu', - 'p3.16x': 'p3-8gpu', - 'p3dn.24x': 'p3dn-8gpu', - 'c5n.18x': 'c5n', -} - - -def main(): - spin = ['-', '/', '|', '\\', '-', '/', '|', 
'\\'] - logGroupName = '/aws/batch/job' - - jobName = re.sub('[^A-Za-z0-9_\-]', '', args.name)[:128] # Enforce AWS Batch jobName rules - jobType = args.job_type - jobQueue = job_queues[jobType] - jobDefinition = job_definitions[jobType] - command = args.command.split() - wait = args.wait - - parameters = { - 'SOURCE_REF': args.source_ref, - 'WORK_DIR': args.work_dir, - 'SAVED_OUTPUT': args.saved_output, - 'SAVE_PATH': args.save_path, - 'COMMAND': args.command, - 'REMOTE': args.remote - } - kwargs = dict( - jobName=jobName, - jobQueue=jobQueue, - jobDefinition=jobDefinition, - parameters=parameters, - ) - if args.timeout is not None: - kwargs['timeout'] = {'attemptDurationSeconds': args.timeout} - submitJobResponse = batch.submit_job(**kwargs) - - jobId = submitJobResponse['jobId'] - print('Submitted job [{} - {}] to the job queue [{}]'.format(jobName, jobId, jobQueue)) - - spinner = 0 - running = False - status_set = set() - startTime = 0 - while wait: - time.sleep(random.randint(5, 10)) - describeJobsResponse = batch.describe_jobs(jobs=[jobId]) - status = describeJobsResponse['jobs'][0]['status'] - if status == 'SUCCEEDED' or status == 'FAILED': - print('=' * 80) - print('Job [{} - {}] {}'.format(jobName, jobId, status)) - - sys.exit(status == 'FAILED') - - elif status == 'RUNNING': - logStreamName = describeJobsResponse['jobs'][0]['container']['logStreamName'] - if not running: - running = True - print('\rJob [{}, {}] is RUNNING.'.format(jobName, jobId)) - if logStreamName: - print('Output [{}]:\n {}'.format(logStreamName, '=' * 80)) - if logStreamName: - startTime = printLogs(logGroupName, logStreamName, startTime) + 1 - elif status not in status_set: - status_set.add(status) - print('\rJob [%s - %s] is %-9s... %s' % (jobName, jobId, status, spin[spinner % len(spin)]),) - sys.stdout.flush() - spinner += 1 - - -if __name__ == '__main__': - main() From 67f2e385c66ee984b69eac5cf6da10c65764de4d Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 26 Aug 2020 12:23:01 -0700 Subject: [PATCH 30/32] [CI] Update test working directory --- .github/workflows/unittests-gpu.yml | 2 +- test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 5b328c8da7..7f91a7bdf5 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,6 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "../../test.sh" --wait | tee > script.log + python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/gluon-nlp/test.sh" --wait | tee > script.log diff --git a/test.sh b/test.sh index be6b9b6d3b..0da2fefd51 100755 --- a/test.sh +++ b/test.sh @@ -11,4 +11,4 @@ python3 -m pip install --user --quiet setuptools pytest pytest-cov contextvars python3 -m pip install --upgrade --quiet cython python3 -m pip install --pre --user --quiet "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python python3 -m pip install --user --quiet -e .[extras] -python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" ../../tests/ +python3 -m pytest --cov=./ --cov-report=xml --durations=50 --device="gpu" /gluon-nlp/tests/ From 0792deaeea8bcd516c39f449944efbb2672925c9 Mon Sep 17 00:00:00 2001 From: 
barry-jin Date: Thu, 27 Aug 2020 09:22:38 -0700 Subject: [PATCH 31/32] [CI] Update AWS Batch job type --- .github/workflows/unittests-gpu.yml | 2 +- test.sh | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index 7f91a7bdf5..fb6fb28674 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -38,6 +38,6 @@ jobs: - name: Test project on AWS Batch run: | - python ./tools/batch/submit-job.py --region us-east-1 --job-type p3.2x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/gluon-nlp/test.sh" --wait | tee > script.log + python ./tools/batch/submit-job.py --region us-east-1 --job-type g4dn.4x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/gluon-nlp/test.sh" --wait | tee > script.log diff --git a/test.sh b/test.sh index 0da2fefd51..2b6c2d36ba 100755 --- a/test.sh +++ b/test.sh @@ -1,8 +1,6 @@ #!/bin/bash # Shell script for installing dependencies and running test on AWS Batch -# alias python3='/usr/bin/python3' - echo $PWD sudo apt-get install libopenblas-dev From 9d4c45981213a1b54196f8ea191e0054fc4754e4 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 27 Aug 2020 16:18:16 -0700 Subject: [PATCH 32/32] [CI] Allow test logs downloading --- .github/workflows/unittests-gpu.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml index fb6fb28674..2d1f06d9ec 100644 --- a/.github/workflows/unittests-gpu.yml +++ b/.github/workflows/unittests-gpu.yml @@ -39,5 +39,15 @@ jobs: - name: Test project on AWS Batch run: | python ./tools/batch/submit-job.py --region us-east-1 --job-type g4dn.4x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/gluon-nlp/test.sh" --wait | tee > script.log - + + - name: Upload log file for AWS Batch test results + uses: actions/upload-artifact@v2 + with: + name: GPU_Test_Results + path: script.log + + - name: Download log file for AWS Batch test results + uses: actions/download-artifact@v2 + with: + name: GPU_Test_Results
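
For completeness, the ctx fixture used by the updated tests works as follows: test.sh invokes pytest with --device="gpu", and the conftest.py hunk in PATCH 25/32 turns each requested device name into an MXNet context that gets injected into any test declaring a ctx argument. The registration of the --device option itself is not shown in these patches, so the sketch below only illustrates one plausible wiring; the option's action, default value, and cpu fallback are assumptions rather than the project's exact code.

# conftest.py -- minimal sketch of the device selection assumed by the patches above
import mxnet as mx


def pytest_addoption(parser):
    # Assumed registration: test.sh passes --device="gpu", and the hunk in
    # PATCH 25/32 iterates over config.option.device, so the option is treated
    # as a repeatable list of device names here.
    parser.addoption('--device', action='append', default=[],
                     help="run the test suite on the given device(s), e.g. 'cpu' or 'gpu'")


def pytest_generate_tests(metafunc):
    # Mirrors the conftest.py hunk in PATCH 25/32: every test that declares a
    # ctx fixture is parametrized with real MXNet device objects, so its body
    # can simply open `with ctx:` as the updated bert/electra/attention tests do.
    if 'ctx' in metafunc.fixturenames:
        devices = metafunc.config.option.device or ['cpu']  # cpu fallback is an assumption
        metafunc.parametrize('ctx', [getattr(mx, device)() for device in devices])

With a conftest along these lines, the invocation built up in test.sh (python3 -m pytest --device="gpu" ... tests/) runs each ctx-aware test once per requested device.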