Merge branch 'master' into codec

espnet · Jun 6, 2024 · 3d7eff8 · 3d7eff8
2 parents 1c4199f + 1bbdf96
commit 3d7eff8
Show file tree

Hide file tree

Showing 240 changed files with 12,861 additions and 1,544 deletions.
diff --git a/.github/workflows/ci_on_macos.yml b/.github/workflows/ci_on_macos.yml
@@ -26,7 +26,6 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-          architecture: 'x64'
       - name: Get PR labels
         id: pr-labels
         uses: joerick/pr-labels-action@v1.0.9
@@ -43,4 +42,4 @@ jobs:
           # CC: /usr/local/bin/gcc-11
           # CXX: /usr/local/bin/g++-11
         run: |
-          ./ci/install.sh
+          ./ci/install_macos.sh
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
         exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|tools/installers/patch_mwerSegmenter)
 
 -   repo: https://github.com/psf/black
-    rev: 24.3.0
+    rev: 24.4.2
     hooks:
     -   id: black
         exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|doc)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -55,10 +55,6 @@ For ESPnet2, we do not recommend preparing the recipe's stages for each corpus b
 
 The common pipeline of ESPnet2 recipes will take care of the `RESULTS.md` generation, model packing, and uploading. ESPnet2 models are maintained at Hugging Face and Zenodo (Deprecated).
 You can also refer to the document at https://github.com/espnet/espnet_model_zoo
-To upload your model, you need first (This is currently deprecated, uploading to Huggingface Hub is preferred) :
-1. Sign up to Zenodo: https://zenodo.org/
-2. Create access token: https://zenodo.org/account/settings/applications/tokens/new/
-3. Set your environment: % export ACCESS_TOKEN="<your token>"
 
 To port models from zenodo using Hugging Face hub,
 1. Create a Hugging Face account - https://huggingface.co/

diff --git a/ci/install.sh b/ci/install.sh
@@ -21,7 +21,7 @@ ${CXX:-g++} -v
 
     . ./activate_python.sh
     # FIXME(kamo): Failed to compile pesq
-    make TH_VERSION="${TH_VERSION}" WITH_OMP="${WITH_OMP-ON}" all warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done whisper.done parallel-wavegan.done muskits.done lora.done
+    make TH_VERSION="${TH_VERSION}" WITH_OMP="${WITH_OMP-ON}" all warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done whisper.done parallel-wavegan.done muskits.done lora.done sph2pipe
     rm -rf kaldi
 )
 . tools/activate_python.sh

diff --git a/ci/install_macos.sh b/ci/install_macos.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+
+# NOTE: DO NOT WRITE DISTRIBUTION-SPECIFIC COMMANDS HERE (e.g., apt, dnf, etc)
+
+set -euo pipefail
+
+${CXX:-g++} -v
+
+(
+    set -euo pipefail
+    cd tools
+
+    # Create an empty stub of Kaldi's parse_options.sh to skip an error (the stub is removed by "rm -rf kaldi" below)
+    mkdir -p kaldi/egs/wsj/s5/utils && touch kaldi/egs/wsj/s5/utils/parse_options.sh
+    if ${USE_CONDA}; then
+        ./setup_anaconda.sh venv espnet ${ESPNET_PYTHON_VERSION}
+        # To install via pip instead of conda
+    else
+        ./setup_venv.sh "$(command -v python3)" venv
+    fi
+
+    . ./activate_python.sh
+    # FIXME(kamo): Failed to compile pesq
+    make TH_VERSION="${TH_VERSION}" WITH_OMP="${WITH_OMP-ON}" all warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done whisper.done parallel-wavegan.done muskits.done lora.done
+    rm -rf kaldi
+)
+. tools/activate_python.sh
+python3 --version
+
+python3 -m pip install https://github.com/kpu/kenlm/archive/master.zip
+# NOTE(kamo): tensorboardx is used for chainer mode only
+python3 -m pip install tensorboardx
+# NOTE(kamo): Create matplotlib.cache to reduce runtime for test phase
+python3 -c "import matplotlib.pyplot"
+# NOTE(wangyou): onnxruntime and onnx2torch are used for testing dnsmos functions
+cat >> constraints.txt << EOF
+torch==${TH_VERSION}
+EOF
+python3 -m pip install -c constraints.txt onnxruntime onnx2torch
+
+# NOTE(kan-bayashi): Fix the error in black installation.
+#   See: https://github.com/psf/black/issues/1707
+python3 -m pip uninstall -y typing
+
+# NOTE(kamo): Workaround for a pip dependency-resolution issue (I think this is a bug in pip)
+python3 -m pip install "hacking>=2.0.0" "flake8>=3.7.8"
+
+# install espnet
+python3 -m pip install -e ".[test]"
+python3 -m pip install -e ".[doc]"
+
+# Log the installed package versions
+python3 -m pip freeze
+
+
+# Check that the installed PyTorch version is at least TH_VERSION and below the next patch release
+python3 <<EOF
+import torch
+from packaging.version import parse as L
+version = '$TH_VERSION'.split(".")
+next_version = f"{version[0]}.{version[1]}.{int(version[2]) + 1}"
+
+if L(torch.__version__) < L('$TH_VERSION') or L(torch.__version__) >= L(next_version):
+    raise RuntimeError(f"Pytorch=$TH_VERSION is expected, but got pytorch={torch.__version__}. This is a bug in installation scripts")
+EOF
diff --git a/ci/test_integration_espnet2.sh b/ci/test_integration_espnet2.sh
@@ -37,27 +37,27 @@ use_lm=true
 for t in ${feats_types}; do
     for t2 in ${token_types}; do
         echo "==== feats_type=${t}, token_types=${t2} ==="
-        ./run.sh --use_lm ${use_lm} --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "${t}" --token-type "${t2}" --python "${python}" --asr-args "--num_workers 0"
+        ./run.sh --use_lm ${use_lm} --ngpu 0 --stage 6 --stop-stage 13 --skip-packing false --feats-type "${t}" --token-type "${t2}" --python "${python}" --asr-args "--num_workers 0"
     done
     use_lm=false
     echo "==== feats_type=raw_copy, token_types=bpe ==="
     cp -r dump/raw data/
-    ./run.sh --use_lm ${use_lm} --ngpu 0 --stage 4 --stop-stage 13 --skip-upload false --feats-type "raw_copy" --token-type "${t2}" \
+    ./run.sh --use_lm ${use_lm} --ngpu 0 --stage 4 --stop-stage 13 --skip-packing false --feats-type "raw_copy" --token-type "${t2}" \
         --train_set raw/train_nodev --valid_set raw/train_dev --test_sets raw/test --python "${python}" --asr-args "--num_workers 0"
 done
 echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
-./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
+./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
     --feats_normalize "utterance_mvn" --python "${python}" \
     --asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"
 
 echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn, with data augmentation ==="
-./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
+./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
     --asr_config "conf/train_asr_rnn_data_aug_debug.yaml" \
     --feats_normalize "utterance_mvn" --python "${python}" \
     --asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"
 
 echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
-./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
+./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
     --feats_normalize "utterance_mvn"  --python "${python}" \
     --asr_config "" --asr-tag "train_raw_bpe_streaming" \
     --asr-args "--model_conf extract_feats_in_collect_stats=false --encoder=contextual_block_transformer
@@ -67,14 +67,14 @@ echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_fe
 
 if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then
     echo "==== Transducer, feats_type=raw, token_types=bpe ==="
-    ./run.sh --asr-tag "espnet_model_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false \
+    ./run.sh --asr-tag "espnet_model_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false \
         --feats-type "raw" --token-type "bpe" --python "${python}" \
         --asr-args "--decoder transducer --decoder_conf hidden_size=2 --model_conf ctc_weight=0.0 --joint_net_conf joint_space_size=2 --num_workers 0 \
         --best_model_criterion '(valid, loss, min)'" --inference_asr_model "valid.loss.best.pth"
 
     if [ "$(python3 -c "import torch; print(torch.cuda.is_available())")" == "True" ]; then
         echo "==== Multi-Blank Transducer, feats_type=raw, token_types=bpe ==="
-        ./run.sh --asr-tag "espnet_model_multi_blank_transducer" --ngpu 1 --stage 10 --stop-stage 13 --skip-upload false \
+        ./run.sh --asr-tag "espnet_model_multi_blank_transducer" --ngpu 1 --stage 10 --stop-stage 13 --skip-packing false \
             --feats-type "raw" --token-type "bpe" --python "${python}" \
             --asr-tag "train_multi_black_transducer" \
             --asr_args "--decoder transducer --decoder_conf hidden_size=2 --model_conf ctc_weight=0.0 --joint_net_conf joint_space_size=2 \
@@ -86,11 +86,11 @@ fi
 
 if python3 -c "import k2" &> /dev/null; then
     echo "==== use_k2, num_paths > nll_batch_size, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
-    ./run.sh --num_paths 4 --nll_batch_size 2 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
+    ./run.sh --num_paths 4 --nll_batch_size 2 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
         --feats_normalize "utterance_mvn" --python "${python}" --asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"
 
     echo "==== use_k2, num_paths == nll_batch_size, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
-    ./run.sh --num_paths 2 --nll_batch_size 2 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
+    ./run.sh --num_paths 2 --nll_batch_size 2 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
        --feats_normalize "utterance_mvn" --python "${python}" --asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"
 fi
 
@@ -101,7 +101,7 @@ if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then
         asr_tag="transducer_${t}"
 
         echo "==== [Conformer-RNN-T] feats_type=raw, token_types=${t}, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
-        ./run.sh --asr_config "" --asr_task "asr_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type ${t} \
+        ./run.sh --asr_config "" --asr_task "asr_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type ${t} \
             --feats_normalize "utterance_mvn" --python "${python}" --inference_asr_model "valid.loss.best.pth" \
             --asr-tag "${asr_tag}_conformer" \
             --asr-args "--model_conf extract_feats_in_collect_stats=false \
@@ -110,7 +110,7 @@ if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then
                         --max_epoch 1 --num_iters_per_epoch 1 --batch_size 2 --batch_type folded --num_workers 0"
 
         echo "==== [Streaming Conformer-RNN-T] feats_type=raw, token_types=${t}, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
-        ./run.sh --asr_config "" --asr_task "asr_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type ${t} \
+        ./run.sh --asr_config "" --asr_task "asr_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type ${t} \
             --feats_normalize "utterance_mvn" --python "${python}" --inference_asr_model "valid.loss.best.pth" \
             --asr-tag "${asr_tag}_conformer_streaming" \
             --asr-args "--model_conf extract_feats_in_collect_stats=false \
@@ -131,7 +131,7 @@ for i in $(seq 2); do
     cp dump/raw/test/text dump/raw/test/text_spk${i}
     cp dump/raw/test_seg/text dump/raw/test_seg/text_spk${i}
 done
-./run_multispkr.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
+./run_multispkr.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
     --feats_normalize "utterance_mvn" --python "${python}" \
     --asr_config "" \
     --asr_tag "train_multispkr_raw_en_bpe30" \
@@ -152,7 +152,7 @@ cd "${cwd}"
 cd ./egs2/mini_an4/tts1
 gen_dummy_coverage
 echo "==== [ESPnet2] TTS ==="
-./run.sh --ngpu 0 --stage 1 --stop-stage 7 --skip-upload false --python "${python}" --train-args "--num_workers 0"
+./run.sh --ngpu 0 --stage 1 --stop-stage 7 --skip-packing false --python "${python}" --train-args "--num_workers 0"
 # Remove generated files in order to reduce the disk usage
 rm -rf exp dump data
 
@@ -162,7 +162,7 @@ rm -rf exp dump data
 #   See also: https://github.com/pytorch/pytorch/issues/42446
 if python3 -c 'import torch as t; from packaging.version import parse as L; assert L(t.__version__) > L("1.6")' &> /dev/null; then
     ./run.sh --fs 22050 --tts_task gan_tts --feats_extract linear_spectrogram --feats_normalize none --inference_model latest.pth \
-        --ngpu 0 --stop-stage 7 --skip-upload false --python "${python}" \
+        --ngpu 0 --stop-stage 7 --skip-packing false --python "${python}" \
         --train-config "" --train-args "--max_epoch 1 --num_iters_per_epoch 1 --batch_size 1 --batch_type folded --num_workers 0"
     rm -rf exp dump data
 fi
@@ -177,18 +177,18 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
     feats_types="raw"
     for t in ${feats_types}; do
         echo "==== feats_type=${t} with preprocessor ==="
-        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
+        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
             --extra_wav_list "rirs.scp noises.scp" --enh_config ./conf/train_with_preprocessor_debug.yaml --enh-args "--num_workers 0"
-        ./run.sh --ngpu 0 --stage 5 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
+        ./run.sh --ngpu 0 --stage 5 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
             --enh_config conf/train_with_data_aug_debug.yaml --enh-args "--num_workers 0"
-        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 2 --python "${python}" \
+        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 2 --python "${python}" \
             --enh_config conf/train_with_dynamic_mixing_debug.yaml --enh-args "--num_workers 0"
     done
     rm data/**/utt2category 2>/dev/null || true
     rm -r dump
     for t in ${feats_types}; do
         echo "==== feats_type=${t} without preprocessor ==="
-        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
+        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
     done
     # Remove generated files in order to reduce the disk usage
     rm -rf exp dump data
@@ -203,13 +203,13 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
     feats_types="raw"
     for t in ${feats_types}; do
         echo "==== feats_type=${t} ==="
-        ./run.sh --ngpu 0 --stage 1 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
-        ./run.sh --ngpu 0 --stage 3 --stop-stage 6 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
+        ./run.sh --ngpu 0 --stage 1 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
+        ./run.sh --ngpu 0 --stage 3 --stop-stage 6 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
             --train_set train_nodev_unk_nspk --valid_set test_unk_nspk --test_sets "train_dev_unk_nspk" \
             --enh_config ./conf/train_variable_nspk_debug.yaml --enh-args "--num_workers 0" --variable_num_refs true
-        ./run.sh --ngpu 0 --stage 1 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
+        ./run.sh --ngpu 0 --stage 1 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
             --local_data_opts "--random-enrollment true" --enh_config ./conf/train_random_enrollment_debug.yaml --enh-args "--num_workers 0"
-        ./run.sh --ngpu 0 --stage 3 --stop-stage 6 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
+        ./run.sh --ngpu 0 --stage 3 --stop-stage 6 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
             --train_set train_nodev_unk_nspk --valid_set test_unk_nspk --test_sets "train_dev_unk_nspk" \
             --enh_config ./conf/train_variable_nspk_random_enrollment_debug.yaml --enh-args "--num_workers 0" --variable_num_refs true
     done
@@ -234,7 +234,7 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
     cd ./egs2/mini_an4/enh_asr1
     gen_dummy_coverage
     echo "==== [ESPnet2] ENH_ASR ==="
-    ./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-upload_hf false --feats-type "raw" --spk-num 1 --enh_asr_args "--enh_separator_conf num_spk=1 --num_workers 0" --python "${python}"
+    ./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-packing false --skip-upload_hf false --feats-type "raw" --spk-num 1 --enh_asr_args "--enh_separator_conf num_spk=1 --num_workers 0" --python "${python}"
     # Remove generated files in order to reduce the disk usage
     rm -rf exp dump data
     cd "${cwd}"
@@ -256,16 +256,16 @@ use_lm=true
 for t in ${feats_types}; do
     for t2 in ${token_types}; do
         echo "==== feats_type=${t}, token_types=${t2} ==="
-        ./run.sh --use_lm ${use_lm} --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "${t}" --tgt_token_type "${t2}" --src_token_type "${t2}" --python "${python}" --st-args "--num_workers 0"
+        ./run.sh --use_lm ${use_lm} --ngpu 0 --stage 6 --stop-stage 13 --skip-packing false --feats-type "${t}" --tgt_token_type "${t2}" --src_token_type "${t2}" --python "${python}" --st-args "--num_workers 0"
     done
     use_lm=false
 done
 echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
-./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \
+./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \
     --feats_normalize "utterance_mvn" --python "${python}" --st-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"
 
 echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
-./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \
+./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \
     --feats_normalize "utterance_mvn" --python "${python}" \
     --st-config conf/train_st_streaming_debug.yaml --st-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"
 
@@ -277,7 +277,7 @@ cd "${cwd}"
 cd ./egs2/mini_an4/asr2
 gen_dummy_coverage
 echo "==== [ESPnet2] ASR2 ==="
-./run.sh --ngpu 0 --stage 1 --stop-stage 15 --skip-upload false --use-lm false --python "${python}" --asr-args "--num_workers 0"
+./run.sh --ngpu 0 --stage 1 --stop-stage 15 --skip-packing false --use-lm false --python "${python}" --asr-args "--num_workers 0"
 # Remove generated files in order to reduce the disk usage
 rm -rf exp dump data
 cd "${cwd}"