Skip to content

Commit

Permalink
Merge pull request #5584 from Masao-Someki/feature/multi_dataset_support
Browse files Browse the repository at this point in the history
Support external dataset library for ESPnetEasy
  • Loading branch information
mergify[bot] committed Mar 18, 2024
2 parents d004740 + e880d6b commit c3064f9
Show file tree
Hide file tree
Showing 35 changed files with 1,476 additions and 1,181 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci_on_centos7.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ jobs:
PATH="/opt/rh/devtoolset-7/root/usr/bin:${PATH:-}"
./ci/test_python_espnet1.sh
./ci/test_python_espnet2.sh
./ci/test_python_espnetez.sh
1 change: 1 addition & 0 deletions .github/workflows/ci_on_debian11.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@ jobs:
run: |
./ci/test_python_espnet1.sh
./ci/test_python_espnet2.sh
./ci/test_python_espnetez.sh
85 changes: 85 additions & 0 deletions .github/workflows/ci_on_ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,91 @@ jobs:
with:
flags: test_configuration_espnet2

# ESPnet-EZ CI job: installs ESPnet, then runs (in this order) the ESPnet-EZ
# unit tests, the utility tests and the ESPnet-EZ integration tests, uploading
# coverage to codecov after each stage and erasing local coverage in between.
unit_test_espnetez_and_integration_test_espnetez:
  runs-on: ${{ matrix.os }}
  needs: process_labels
  # Skip draft pull requests and docker-only changes.
  if: |
    github.event.pull_request.draft == false &&
    needs.process_labels.outputs.is_docker == 'false'
  strategy:
    max-parallel: 20
    matrix:
      os: [ubuntu-latest]
      # Versions are quoted so that they are always read as strings
      # (an unquoted 3.10 would be parsed as the float 3.1).
      python-version: ["3.8", "3.9", "3.10"]
      pytorch-version: ["2.0.1", "2.1.0"]
      chainer-version: ["6.0.0"]
      use-conda: [false]
      # Extra combinations for Python 3.7 with older PyTorch releases.
      # The key must be spelled exactly `chainer-version`; otherwise
      # `matrix.chainer-version` (read by the "install espnet" step) is
      # empty for these entries.
      include:
        - os: ubuntu-latest
          python-version: "3.7"
          pytorch-version: "1.12.1"
          chainer-version: "6.0.0"
          use-conda: false
        - os: ubuntu-latest
          python-version: "3.7"
          pytorch-version: "1.13.1"
          chainer-version: "6.0.0"
          use-conda: false
  steps:
    - uses: actions/checkout@master
    # Cache pip downloads per Python / PyTorch version and per content of
    # setup.py and the Makefiles.
    - uses: actions/cache@v3
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/Makefile') }}
    - uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
        architecture: 'x64'
    - name: install dependencies
      run: |
        sudo apt-get update -qq
        # NOTE(kamo): g++-7 doesn't exist in ubuntu-latest
        sudo apt-get install -qq -y cmake libsndfile1-dev bc sox ffmpeg
    - name: Get PR labels
      id: pr-labels
      uses: joerick/pr-labels-action@v1.0.9
    - name: install espnet
      env:
        ESPNET_PYTHON_VERSION: ${{ matrix.python-version }}
        TH_VERSION: ${{ matrix.pytorch-version }}
        CHAINER_VERSION: ${{ matrix.chainer-version }}
        USE_CONDA: ${{ matrix.use-conda }}
      run: |
        ./ci/install.sh
    # Stage 1: ESPnet-EZ unit tests.
    - name: test python
      run: ./ci/test_python_espnetez.sh
    - uses: codecov/codecov-action@v2
      with:
        flags: test_python_espnetez
    # Start the next stage from empty coverage data; a failure to erase is
    # tolerated.
    - name: coverage erase
      continue-on-error: true
      run: |
        source tools/activate_python.sh
        coverage erase
    # Stage 2: utility tests (Kaldi is installed first).
    - name: install kaldi
      run: |
        ./ci/install_kaldi.sh
    - name: test utils
      run: ./ci/test_utils.sh
    - uses: codecov/codecov-action@v2
      with:
        flags: test_utils
    - name: coverage erase
      continue-on-error: true
      run: |
        source tools/activate_python.sh
        coverage erase
    # Stage 3: ESPnet-EZ integration tests.
    - name: test espnetez integration
      run: ./ci/test_integration_espnetez.sh
    - uses: codecov/codecov-action@v2
      with:
        flags: test_integration_espnetez


test_import:
runs-on: ${{ matrix.os }}
needs: process_labels
Expand Down
162 changes: 162 additions & 0 deletions ci/test_integration_espnetez.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#!/usr/bin/env bash

# Integration test for ESPnet-EZ.
#
# Prepares the mini_an4 data with the asr1 recipe (stages 1-4), then, for
# each task/config pair, runs
#   test/espnetez/test_integration_espnetez.py     (sentencepiece training,
#                                                   collect-stats, training)
#   test/espnetez/test_integration_espnetez_ft.py  (finetuning)
# under `coverage run --append`, and finally combines and reports the
# coverage data from the directory the script was started in.
#
# All paths are relative, so the script must be started from the repository
# root.

source tools/activate_python.sh

# Stop at the first failing command, unset variable or failing pipeline
# stage. Without this, a failing recipe stage or test is ignored and the exit
# status of the script is only that of the final `coverage xml`.
# Enabled after sourcing the activation script so that strict mode applies to
# this script's own commands.
set -euo pipefail

PYTHONPATH="${PYTHONPATH:-}:$(pwd)/tools/s3prl"
export PYTHONPATH
# Intentionally expanded unquoted so that it splits into command + arguments.
python="python -m coverage run --append"
cwd=$(pwd)

# Path of the test drivers, relative to egs2/mini_an4/asr1.
ez_test_dir=../../../test/espnetez

gen_dummy_coverage(){
    # To avoid a problem when parallel running for `coverage run`.
    # Please put this command after cd ./egs2/foo/bar
    # NOTE: currently not called anywhere in this script.
    touch empty.py; ${python} empty.py
}

# Train from scratch with ESPnet-EZ.
#   $1: task name passed to --task
#   $2: training config passed to --config_path
run_ez_train(){
    ${python} "${ez_test_dir}/test_integration_espnetez.py" \
        --task "$1" \
        --data_path data \
        --train_dump_path dump/raw/train_nodev \
        --valid_dump_path dump/raw/train_dev \
        --exp_path ./exp \
        --config_path "$2" \
        --train_sentencepiece_model \
        --run_collect_stats \
        --run_train
}

# Finetune with ESPnet-EZ, using the same ./exp directory as run_ez_train.
#   $1: task name passed to --task
#   $2: training config passed to --config_path
run_ez_finetune(){
    ${python} "${ez_test_dir}/test_integration_espnetez_ft.py" \
        --task "$1" \
        --data_path data \
        --train_dump_path dump/raw/train_nodev \
        --valid_dump_path dump/raw/train_dev \
        --exp_path ./exp \
        --config_path "$2" \
        --run_finetune
}

# Run training followed by finetuning for one task/config pair, then remove
# the generated files in order to reduce the disk usage.
#   $1: task name
#   $2: training config
run_ez_case(){
    run_ez_train "$1" "$2"
    run_ez_finetune "$1" "$2"
    rm -rf exp data/spm
}

#### Make sure chainer-independent ####
python3 -m pip uninstall -y chainer

# Download mini_an4 as test data and prepare flac data
cd ./egs2/mini_an4/asr1 || exit
./run.sh --stage 1 --stop-stage 1
./run.sh --stage 2 --stop-stage 4 --feats-type "raw"

# Now we have flac files under dump/org/train_*/data/format.*/
# and wav.scp files under dump/train_*/

rm -rf exp data/spm

# [ESPnet Easy] test asr recipe with coverage
run_ez_case asr conf/train_asr_transformer_debug.yaml

# [ESPnet Easy] test asr transducer recipe with coverage
run_ez_case asr conf/train_asr_transducer_debug.yaml

# [ESPnet Easy] test lm recipe with coverage
run_ez_case lm ../lm1/conf/train_transformer.yaml

# [ESPnet Easy] test slu recipe with coverage
run_ez_case slu ../s2t1/conf/train_slu_transformer.yaml

# [ESPnet Easy] test tts recipe with coverage
run_ez_case tts ../tts1/conf/train_tacotron2_debug.yaml

cd "${cwd}" || exit


echo "=== report ==="
python -m coverage combine egs2/*/*/.coverage
python -m coverage report
python -m coverage xml
2 changes: 1 addition & 1 deletion ci/test_python_espnet1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ exclude="egs2/TEMPLATE/asr1/utils,egs2/TEMPLATE/asr1/steps,egs2/TEMPLATE/tts1/si
# pycodestyle
pycodestyle --exclude "${exclude}" --show-source --show-pep8

LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" pytest -q --ignore test/espnet2 test
LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" pytest -q --ignore test/espnet2 --ignore test/espnetez test

echo "=== report ==="
coverage report
Expand Down
19 changes: 19 additions & 0 deletions ci/test_python_espnetez.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash

# Style check and unit tests for ESPnet-EZ: runs pycodestyle, then pytest on
# test/espnetez/test_ez.py, and finally prints and exports the coverage data.

source tools/activate_python.sh
source tools/extra_path.sh

# Strict mode is turned on only after the two environment scripts above
# have been sourced.
set -euo pipefail

# Comma-separated list of paths that pycodestyle must not inspect.
skip_paths="egs2/TEMPLATE/asr1/utils,egs2/TEMPLATE/asr1/steps,egs2/TEMPLATE/tts1/sid,doc,tools,test_utils/bats-core,test_utils/bats-support,test_utils/bats-assert"

# flake8 (disabled)
# "$(dirname $0)"/test_flake8.sh

# pycodestyle
pycodestyle --exclude "${skip_paths}" --show-source --show-pep8

# The warp-ctc build directory is appended to the library search path for
# the pytest process only.
warp_ctc_build="$(pwd)/tools/chainer_ctc/ext/warp-ctc/build"
LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:${warp_ctc_build}" pytest -q test/espnetez/test_ez.py

echo "=== report ==="
coverage report
coverage xml
1 change: 1 addition & 0 deletions egs2/mini_an4/asr1/conf/finetune_with_lora.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Sets the single option `use_lora` to true.
# NOTE(review): presumably layered on top of a training config when
# finetuning with LoRA - confirm against the code that loads this file.
use_lora: true
51 changes: 51 additions & 0 deletions egs2/mini_an4/asr1/conf/train_asr_transducer_debug.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# This is a debug config for CI
# (transformer encoder + transducer decoder, sized to run one tiny iteration).
batch_type: unsorted
batch_size: 2
accum_grad: 1
max_epoch: 1
num_iters_per_epoch: 1
# Explicit null instead of a bare empty value.
patience: null
# The initialization method for model parameters
init: xavier_uniform
# One criterion, written as a [phase, metric, mode] triple.
best_model_criterion:
  - - valid
    - acc
    - max
keep_nbest_models: 10

encoder: transformer
encoder_conf:
  output_size: 2
  attention_heads: 2
  linear_units: 2
  num_blocks: 2
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.0
  input_layer: conv1d2
  normalize_before: true

decoder: transducer
decoder_conf:
  rnn_type: lstm
  num_layers: 1  # Decoder Layers
  hidden_size: 4  # Decoder dim
  dropout: 0.1
  dropout_embed: 0.2


joint_net_conf:
  joint_space_size: 4

model: espnet
model_conf:
  ctc_weight: 0.3
  lsm_weight: 0.1
  length_normalized_loss: false

optim: adam
optim_conf:
  lr: 0.005
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 4
50 changes: 50 additions & 0 deletions egs2/mini_an4/asr1/conf/train_asr_transformer_debug.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# This is a debug config for CI
# (transformer encoder + transformer decoder, sized to run one tiny iteration).
batch_type: unsorted
batch_size: 2
accum_grad: 1
max_epoch: 1
num_iters_per_epoch: 1
# Explicit null instead of a bare empty value.
patience: null
# The initialization method for model parameters
init: xavier_uniform
# One criterion, written as a [phase, metric, mode] triple.
best_model_criterion:
  - - valid
    - acc
    - max
keep_nbest_models: 10


encoder: transformer
encoder_conf:
  output_size: 2
  attention_heads: 2
  linear_units: 2
  num_blocks: 2
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.0
  input_layer: conv1d2
  normalize_before: true

decoder: transformer
decoder_conf:
  attention_heads: 2
  linear_units: 2
  num_blocks: 2
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.0
  src_attention_dropout_rate: 0.0

model: espnet
model_conf:
  ctc_weight: 0.3
  lsm_weight: 0.1
  length_normalized_loss: false

optim: adam
optim_conf:
  lr: 0.005
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 4

0 comments on commit c3064f9

Please sign in to comment.