Merge pull request #5584 from Masao-Someki/feature/multi_dataset_support

Support external dataset library for ESPnetEasy
espnet · Mar 18, 2024 · c3064f9 · c3064f9
2 parents d004740 + e880d6b
commit c3064f9
Show file tree

Hide file tree

Showing 35 changed files with 1,476 additions and 1,181 deletions.
diff --git a/.github/workflows/ci_on_centos7.yml b/.github/workflows/ci_on_centos7.yml
@@ -58,3 +58,4 @@ jobs:
           PATH="/opt/rh/devtoolset-7/root/usr/bin:${PATH:-}"
           ./ci/test_python_espnet1.sh
           ./ci/test_python_espnet2.sh
+          ./ci/test_python_espnetez.sh
diff --git a/.github/workflows/ci_on_debian11.yml b/.github/workflows/ci_on_debian11.yml
@@ -46,3 +46,4 @@ jobs:
         run: |
           ./ci/test_python_espnet1.sh
           ./ci/test_python_espnet2.sh
+          ./ci/test_python_espnetez.sh
diff --git a/.github/workflows/ci_on_ubuntu.yml b/.github/workflows/ci_on_ubuntu.yml
@@ -235,6 +235,91 @@ jobs:
         with:
           flags: test_configuration_espnet2
 
+  unit_test_espnetez_and_integration_test_espnetez:  # ESPnetEZ unit tests, utils tests, then integration tests
+    runs-on: ${{ matrix.os }}
+    needs: process_labels  # the condition below reads this job's is_docker output
+    if: |
+      github.event.pull_request.draft == false &&
+      needs.process_labels.outputs.is_docker == 'false'
+    strategy:
+      max-parallel: 20
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8", "3.9", "3.10"]  # all quoted; an unquoted 3.10 would be read as 3.1
+        pytorch-version: [2.0.1, 2.1.0]
+        chainer-version: [6.0.0]
+        use-conda: [false]
+        include:  # extra Python 3.7 combinations paired with older PyTorch releases
+          - os: ubuntu-latest
+            python-version: "3.7"
+            pytorch-version: 1.12.1
+            chainer-version: 6.0.0  # key must match matrix.chainer-version read by CHAINER_VERSION
+            use-conda: false
+          - os: ubuntu-latest
+            python-version: "3.7"
+            pytorch-version: 1.13.1
+            chainer-version: 6.0.0  # key must match matrix.chainer-version read by CHAINER_VERSION
+            use-conda: false
+    steps:
+      - uses: actions/checkout@master
+      - uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/Makefile') }}
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: 'x64'
+      - name: install dependencies
+        run: |
+          sudo apt-get update -qq
+          # NOTE(kamo): g++-7 doesn't exist in ubuntu-latest
+          sudo apt-get install -qq -y cmake libsndfile1-dev bc sox ffmpeg
+      - name: Get PR labels
+        id: pr-labels  # NOTE(review): no step in this job references steps.pr-labels
+        uses: joerick/pr-labels-action@v1.0.9
+      - name: install espnet  # ci/install.sh is configured through the env values below
+        env:
+          ESPNET_PYTHON_VERSION: ${{ matrix.python-version }}
+          TH_VERSION: ${{ matrix.pytorch-version }}
+          CHAINER_VERSION: ${{ matrix.chainer-version }}
+          USE_CONDA: ${{ matrix.use-conda }}
+        run: |
+          ./ci/install.sh
+
+      - name: test python
+        run: ./ci/test_python_espnetez.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_python_espnetez
+      - name: coverage erase  # reset coverage data before the next test stage
+        continue-on-error: true
+        run: |
+          source tools/activate_python.sh
+          coverage erase
+
+      - name: install kaldi
+        run: |
+          ./ci/install_kaldi.sh
+
+      - name: test utils
+        run: ./ci/test_utils.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_utils
+      - name: coverage erase  # reset coverage data before the integration stage
+        continue-on-error: true
+        run: |
+          source tools/activate_python.sh
+          coverage erase
+
+      - name: test espnetez integration
+        run: ./ci/test_integration_espnetez.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_integration_espnetez
+
+
   test_import:
     runs-on: ${{ matrix.os }}
     needs: process_labels

diff --git a/ci/test_integration_espnetez.sh b/ci/test_integration_espnetez.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+
+source tools/activate_python.sh
+# Abort on the first failing command so a broken test fails CI; enabled after sourcing, as in ci/test_python_espnetez.sh.
+set -euo pipefail
+PYTHONPATH="${PYTHONPATH:-}:$(pwd)/tools/s3prl"
+export PYTHONPATH
+python="python -m coverage run --append"
+cwd=$(pwd)
+
+gen_dummy_coverage(){
+    # Runs an empty script with ${python} in the current directory; workaround for parallel `coverage run`.
+    # Call it after cd ./egs2/foo/bar. NOTE(review): nothing in this script calls it.
+    touch empty.py; ${python} empty.py
+}
+
+#### Make sure chainer-independent ####
+python3 -m pip uninstall -y chainer
+
+# Download mini_an4 as test data and prepare flac data
+cd ./egs2/mini_an4/asr1 || exit
+./run.sh --stage 1 --stop-stage 1
+./run.sh --stage 2 --stop-stage 4 --feats-type "raw"
+
+# Now we have flac files under dump/org/train_*/data/format.*/
+# and wav.scp files under dump/train_*/
+
+rm -rf exp data/spm
+# [ESPnet Easy] test asr recipe with coverage
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez.py \
+    --task asr \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path conf/train_asr_transformer_debug.yaml \
+    --train_sentencepiece_model \
+    --run_collect_stats \
+    --run_train
+
+# finetuning
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez_ft.py \
+    --task asr \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path conf/train_asr_transformer_debug.yaml \
+    --run_finetune
+
+# Remove generated files in order to reduce the disk usage
+rm -rf exp data/spm
+
+# [ESPnet Easy] test asr transducer recipe with coverage
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez.py \
+    --task asr \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path conf/train_asr_transducer_debug.yaml \
+    --train_sentencepiece_model \
+    --run_collect_stats \
+    --run_train
+
+# finetuning
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez_ft.py \
+    --task asr \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path conf/train_asr_transducer_debug.yaml \
+    --run_finetune
+
+# Remove generated files in order to reduce the disk usage
+rm -rf exp data/spm
+
+# [ESPnet Easy] test lm recipe with coverage
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez.py \
+    --task lm \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path ../lm1/conf/train_transformer.yaml \
+    --train_sentencepiece_model \
+    --run_collect_stats \
+    --run_train
+
+# finetune
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez_ft.py \
+    --task lm \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path ../lm1/conf/train_transformer.yaml \
+    --run_finetune
+
+# Remove generated files in order to reduce the disk usage
+rm -rf exp data/spm
+
+
+# [ESPnet Easy] test slu recipe with coverage
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez.py \
+    --task slu \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path ../s2t1/conf/train_slu_transformer.yaml \
+    --train_sentencepiece_model \
+    --run_collect_stats \
+    --run_train
+
+# finetune
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez_ft.py \
+    --task slu \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path ../s2t1/conf/train_slu_transformer.yaml \
+    --run_finetune
+
+# Remove generated files in order to reduce the disk usage
+rm -rf exp data/spm
+
+
+# [ESPnet Easy] test tts recipe with coverage
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez.py \
+    --task tts \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path ../tts1/conf/train_tacotron2_debug.yaml \
+    --train_sentencepiece_model \
+    --run_collect_stats \
+    --run_train
+
+# finetune
+python -m coverage run --append ../../../test/espnetez/test_integration_espnetez_ft.py \
+    --task tts \
+    --data_path data \
+    --train_dump_path dump/raw/train_nodev \
+    --valid_dump_path dump/raw/train_dev \
+    --exp_path ./exp \
+    --config_path ../tts1/conf/train_tacotron2_debug.yaml \
+    --run_finetune
+
+# Remove generated files in order to reduce the disk usage
+rm -rf exp data/spm
+
+cd "${cwd}" || exit
+
+# Back in the starting directory: combine the .coverage files found under egs2/*/*/ and emit text and XML reports.
+echo "=== report ==="
+python -m coverage combine egs2/*/*/.coverage
+python -m coverage report
+python -m coverage xml
diff --git a/ci/test_python_espnet1.sh b/ci/test_python_espnet1.sh
@@ -12,7 +12,7 @@ exclude="egs2/TEMPLATE/asr1/utils,egs2/TEMPLATE/asr1/steps,egs2/TEMPLATE/tts1/si
 # pycodestyle
 pycodestyle --exclude "${exclude}" --show-source --show-pep8
 
-LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" pytest -q --ignore test/espnet2 test
+LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" pytest -q --ignore test/espnet2 --ignore test/espnetez test
 
 echo "=== report ==="
 coverage report

diff --git a/ci/test_python_espnetez.sh b/ci/test_python_espnetez.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# pycodestyle check, then the ESPnetEZ unit tests, then a coverage report.
+source tools/activate_python.sh
+source tools/extra_path.sh
+
+set -euo pipefail
+
+skip_paths="egs2/TEMPLATE/asr1/utils,egs2/TEMPLATE/asr1/steps,egs2/TEMPLATE/tts1/sid,doc,tools,test_utils/bats-core,test_utils/bats-support,test_utils/bats-assert"
+warpctc_build="$(pwd)/tools/chainer_ctc/ext/warp-ctc/build"
+# The flake8 check is left disabled in this script:
+# "$(dirname $0)"/test_flake8.sh
+# pycodestyle over the tree, minus the paths listed in skip_paths
+pycodestyle --exclude "${skip_paths}" --show-source --show-pep8
+
+LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:${warpctc_build}" pytest -q test/espnetez/test_ez.py
+
+printf '%s\n' "=== report ==="
+coverage report
+coverage xml
diff --git a/egs2/mini_an4/asr1/conf/finetune_with_lora.yaml b/egs2/mini_an4/asr1/conf/finetune_with_lora.yaml
@@ -0,0 +1 @@
+use_lora: true  # presumably turns on LoRA for fine-tuning (going by the file name); TODO confirm against the consumer
diff --git a/egs2/mini_an4/asr1/conf/train_asr_transducer_debug.yaml b/egs2/mini_an4/asr1/conf/train_asr_transducer_debug.yaml
@@ -0,0 +1,51 @@
+# Tiny transducer ASR configuration used only for CI debugging
+batch_type: unsorted
+batch_size: 2
+accum_grad: 1
+max_epoch: 1
+num_iters_per_epoch: 1
+patience: null
+# Initialization scheme applied to the model parameters
+init: xavier_uniform
+best_model_criterion:  # NOTE(review): confirm the transducer model reports valid/acc
+    - - valid
+      - acc
+      - max
+keep_nbest_models: 10
+
+encoder: transformer
+encoder_conf:
+    output_size: 2
+    attention_heads: 2
+    linear_units: 2
+    num_blocks: 2
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv1d2
+    normalize_before: true
+
+decoder: transducer
+decoder_conf:
+    rnn_type: lstm
+    num_layers: 1  # number of decoder layers
+    hidden_size: 4  # decoder dimension
+    dropout: 0.1
+    dropout_embed: 0.2
+
+# Joint network settings
+joint_net_conf:
+    joint_space_size: 4
+
+model: espnet
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+
+optim: adam
+optim_conf:
+    lr: 0.005
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 4
diff --git a/egs2/mini_an4/asr1/conf/train_asr_transformer_debug.yaml b/egs2/mini_an4/asr1/conf/train_asr_transformer_debug.yaml
@@ -0,0 +1,50 @@
+# Tiny transformer ASR configuration used only for CI debugging
+batch_type: unsorted
+batch_size: 2
+accum_grad: 1
+max_epoch: 1
+num_iters_per_epoch: 1
+patience: null
+# Initialization scheme applied to the model parameters
+init: xavier_uniform
+best_model_criterion:
+    - - valid
+      - acc
+      - max
+keep_nbest_models: 10
+
+# Encoder and decoder are both transformers
+encoder: transformer
+encoder_conf:
+    output_size: 2
+    attention_heads: 2
+    linear_units: 2
+    num_blocks: 2
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv1d2
+    normalize_before: true
+
+decoder: transformer
+decoder_conf:
+    attention_heads: 2
+    linear_units: 2
+    num_blocks: 2
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+model: espnet
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+
+optim: adam
+optim_conf:
+    lr: 0.005
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 4