Skip to content

Commit

Permalink
Merge branch 'master' into codec
Browse files Browse the repository at this point in the history
  • Loading branch information
ftshijt committed Jun 6, 2024
2 parents 1c4199f + 1bbdf96 commit 3d7eff8
Show file tree
Hide file tree
Showing 240 changed files with 12,861 additions and 1,544 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/ci_on_macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: 'x64'
- name: Get PR labels
id: pr-labels
uses: joerick/pr-labels-action@v1.0.9
Expand All @@ -43,4 +42,4 @@ jobs:
# CC: /usr/local/bin/gcc-11
# CXX: /usr/local/bin/g++-11
run: |
./ci/install.sh
./ci/install_macos.sh
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ repos:
exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|tools/installers/patch_mwerSegmenter)

- repo: https://github.com/psf/black
rev: 24.3.0
rev: 24.4.2
hooks:
- id: black
exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|doc)
Expand Down
4 changes: 0 additions & 4 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,6 @@ For ESPnet2, we do not recommend preparing the recipe's stages for each corpus b

The common pipeline of ESPnet2 recipes will take care of the `RESULTS.md` generation, model packing, and uploading. ESPnet2 models are maintained at Hugging Face and Zenodo (Deprecated).
You can also refer to the document at https://github.com/espnet/espnet_model_zoo
To upload your model, you need first (This is currently deprecated, uploading to Huggingface Hub is preferred) :
1. Sign up to Zenodo: https://zenodo.org/
2. Create access token: https://zenodo.org/account/settings/applications/tokens/new/
3. Set your environment: % export ACCESS_TOKEN="<your token>"

To port models from Zenodo using the Hugging Face Hub,
1. Create a Hugging Face account - https://huggingface.co/
Expand Down
2 changes: 1 addition & 1 deletion ci/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ${CXX:-g++} -v

. ./activate_python.sh
# FIXME(kamo): Failed to compile pesq
make TH_VERSION="${TH_VERSION}" WITH_OMP="${WITH_OMP-ON}" all warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done whisper.done parallel-wavegan.done muskits.done lora.done
make TH_VERSION="${TH_VERSION}" WITH_OMP="${WITH_OMP-ON}" all warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done whisper.done parallel-wavegan.done muskits.done lora.done sph2pipe
rm -rf kaldi
)
. tools/activate_python.sh
Expand Down
65 changes: 65 additions & 0 deletions ci/install_macos.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env bash

# CI installation script: builds the tools/ dependencies, installs ESPnet in
# editable mode with its test and doc extras, and finally verifies that the
# installed PyTorch matches the requested version.
#
# Must be run from the repository root (it uses ./tools and installs ".").
#
# Environment variables (the script runs under `set -u`, so the required ones
# must be defined):
#   USE_CONDA              required; "true" selects setup_anaconda.sh, any
#                          other value selects setup_venv.sh
#   ESPNET_PYTHON_VERSION  required when USE_CONDA is "true"; forwarded to
#                          setup_anaconda.sh
#   TH_VERSION             required; forwarded to make, written to
#                          constraints.txt and checked at the end
#   CXX                    optional; compiler to report (default: g++)
#   WITH_OMP               optional; forwarded to make (default: ON)

# NOTE: DO NOT WRITE DISTRIBUTION-SPECIFIC COMMANDS HERE (e.g., apt, dnf, etc)

set -euo pipefail

# Print the compiler version into the CI log.
# Left unquoted on purpose so that a CXX value carrying extra words still works.
${CXX:-g++} -v

# Everything that needs tools/ as the working directory runs in a subshell so
# the `cd` does not leak into the rest of the script.
(
    set -euo pipefail
    cd tools

    # To skip error: create an empty parse_options.sh stub under a fake kaldi
    # tree. The stub tree is removed again once make has finished.
    mkdir -p kaldi/egs/wsj/s5/utils && touch kaldi/egs/wsj/s5/utils/parse_options.sh

    # Compare the flag as a string instead of executing its value as a command:
    # `if ${USE_CONDA}` would run arbitrary content and treats an empty value
    # as success.
    if [[ "${USE_CONDA}" == "true" ]]; then
        # Intentionally unquoted: an empty version yields no third argument,
        # exactly as before.
        # shellcheck disable=SC2086
        ./setup_anaconda.sh venv espnet ${ESPNET_PYTHON_VERSION}
    else
        # To install via pip instead of conda
        ./setup_venv.sh "$(command -v python3)" venv
    fi

    . ./activate_python.sh
    # FIXME(kamo): Failed to compile pesq
    make TH_VERSION="${TH_VERSION}" WITH_OMP="${WITH_OMP-ON}" all warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done whisper.done parallel-wavegan.done muskits.done lora.done
    rm -rf kaldi
)
. tools/activate_python.sh
python3 --version

python3 -m pip install https://github.com/kpu/kenlm/archive/master.zip
# NOTE(kamo): tensorboardx is used for chainer mode only
python3 -m pip install tensorboardx
# NOTE(kamo): Create matplotlib.cache to reduce runtime for test phase
python3 -c "import matplotlib.pyplot"
# NOTE(wangyou): onnxruntime and onnx2torch are used for testing dnsmos functions
# The torch pin is appended to constraints.txt and passed to pip with -c.
cat >> constraints.txt << EOF
torch==${TH_VERSION}
EOF
python3 -m pip install -c constraints.txt onnxruntime onnx2torch

# NOTE(kan-bayashi): Fix the error in black installation.
# See: https://github.com/psf/black/issues/1707
python3 -m pip uninstall -y typing

# NOTE(kamo): Workaround for pip resolve issue (I think this is a bug of pip)
python3 -m pip install "hacking>=2.0.0" "flake8>=3.7.8"

# install espnet
python3 -m pip install -e ".[test]"
python3 -m pip install -e ".[doc]"

# log
python3 -m pip freeze


# Check pytorch version
# The heredoc delimiter is unquoted so the shell substitutes TH_VERSION into
# the Python source. The installed torch must satisfy
# TH_VERSION <= torch.__version__ < (TH_VERSION with its third component + 1).
python3 <<EOF
import torch
from packaging.version import parse as L
version = '$TH_VERSION'.split(".")
next_version = f"{version[0]}.{version[1]}.{int(version[2]) + 1}"
if L(torch.__version__) < L('$TH_VERSION') or L(torch.__version__) >= L(next_version):
    raise RuntimeError(f"Pytorch=$TH_VERSION is expected, but got pytorch={torch.__version__}. This is a bug in installation scripts")
EOF
54 changes: 27 additions & 27 deletions ci/test_integration_espnet2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,27 +37,27 @@ use_lm=true
for t in ${feats_types}; do
for t2 in ${token_types}; do
echo "==== feats_type=${t}, token_types=${t2} ==="
./run.sh --use_lm ${use_lm} --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "${t}" --token-type "${t2}" --python "${python}" --asr-args "--num_workers 0"
./run.sh --use_lm ${use_lm} --ngpu 0 --stage 6 --stop-stage 13 --skip-packing false --feats-type "${t}" --token-type "${t2}" --python "${python}" --asr-args "--num_workers 0"
done
use_lm=false
echo "==== feats_type=raw_copy, token_types=bpe ==="
cp -r dump/raw data/
./run.sh --use_lm ${use_lm} --ngpu 0 --stage 4 --stop-stage 13 --skip-upload false --feats-type "raw_copy" --token-type "${t2}" \
./run.sh --use_lm ${use_lm} --ngpu 0 --stage 4 --stop-stage 13 --skip-packing false --feats-type "raw_copy" --token-type "${t2}" \
--train_set raw/train_nodev --valid_set raw/train_dev --test_sets raw/test --python "${python}" --asr-args "--num_workers 0"
done
echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
--feats_normalize "utterance_mvn" --python "${python}" \
--asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"

echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn, with data augmentation ==="
./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
--asr_config "conf/train_asr_rnn_data_aug_debug.yaml" \
--feats_normalize "utterance_mvn" --python "${python}" \
--asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"

echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
--feats_normalize "utterance_mvn" --python "${python}" \
--asr_config "" --asr-tag "train_raw_bpe_streaming" \
--asr-args "--model_conf extract_feats_in_collect_stats=false --encoder=contextual_block_transformer
Expand All @@ -67,14 +67,14 @@ echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_fe

if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then
echo "==== Transducer, feats_type=raw, token_types=bpe ==="
./run.sh --asr-tag "espnet_model_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false \
./run.sh --asr-tag "espnet_model_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false \
--feats-type "raw" --token-type "bpe" --python "${python}" \
--asr-args "--decoder transducer --decoder_conf hidden_size=2 --model_conf ctc_weight=0.0 --joint_net_conf joint_space_size=2 --num_workers 0 \
--best_model_criterion '(valid, loss, min)'" --inference_asr_model "valid.loss.best.pth"

if [ "$(python3 -c "import torch; print(torch.cuda.is_available())")" == "True" ]; then
echo "==== Multi-Blank Transducer, feats_type=raw, token_types=bpe ==="
./run.sh --asr-tag "espnet_model_multi_blank_transducer" --ngpu 1 --stage 10 --stop-stage 13 --skip-upload false \
./run.sh --asr-tag "espnet_model_multi_blank_transducer" --ngpu 1 --stage 10 --stop-stage 13 --skip-packing false \
--feats-type "raw" --token-type "bpe" --python "${python}" \
--asr-tag "train_multi_black_transducer" \
--asr_args "--decoder transducer --decoder_conf hidden_size=2 --model_conf ctc_weight=0.0 --joint_net_conf joint_space_size=2 \
Expand All @@ -86,11 +86,11 @@ fi

if python3 -c "import k2" &> /dev/null; then
echo "==== use_k2, num_paths > nll_batch_size, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --num_paths 4 --nll_batch_size 2 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
./run.sh --num_paths 4 --nll_batch_size 2 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
--feats_normalize "utterance_mvn" --python "${python}" --asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"

echo "==== use_k2, num_paths == nll_batch_size, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --num_paths 2 --nll_batch_size 2 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
./run.sh --num_paths 2 --nll_batch_size 2 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
--feats_normalize "utterance_mvn" --python "${python}" --asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"
fi

Expand All @@ -101,7 +101,7 @@ if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then
asr_tag="transducer_${t}"

echo "==== [Conformer-RNN-T] feats_type=raw, token_types=${t}, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --asr_config "" --asr_task "asr_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type ${t} \
./run.sh --asr_config "" --asr_task "asr_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type ${t} \
--feats_normalize "utterance_mvn" --python "${python}" --inference_asr_model "valid.loss.best.pth" \
--asr-tag "${asr_tag}_conformer" \
--asr-args "--model_conf extract_feats_in_collect_stats=false \
Expand All @@ -110,7 +110,7 @@ if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then
--max_epoch 1 --num_iters_per_epoch 1 --batch_size 2 --batch_type folded --num_workers 0"

echo "==== [Streaming Conformer-RNN-T] feats_type=raw, token_types=${t}, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --asr_config "" --asr_task "asr_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type ${t} \
./run.sh --asr_config "" --asr_task "asr_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type ${t} \
--feats_normalize "utterance_mvn" --python "${python}" --inference_asr_model "valid.loss.best.pth" \
--asr-tag "${asr_tag}_conformer_streaming" \
--asr-args "--model_conf extract_feats_in_collect_stats=false \
Expand All @@ -131,7 +131,7 @@ for i in $(seq 2); do
cp dump/raw/test/text dump/raw/test/text_spk${i}
cp dump/raw/test_seg/text dump/raw/test_seg/text_spk${i}
done
./run_multispkr.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
./run_multispkr.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --token-type "bpe" \
--feats_normalize "utterance_mvn" --python "${python}" \
--asr_config "" \
--asr_tag "train_multispkr_raw_en_bpe30" \
Expand All @@ -152,7 +152,7 @@ cd "${cwd}"
cd ./egs2/mini_an4/tts1
gen_dummy_coverage
echo "==== [ESPnet2] TTS ==="
./run.sh --ngpu 0 --stage 1 --stop-stage 7 --skip-upload false --python "${python}" --train-args "--num_workers 0"
./run.sh --ngpu 0 --stage 1 --stop-stage 7 --skip-packing false --python "${python}" --train-args "--num_workers 0"
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data

Expand All @@ -162,7 +162,7 @@ rm -rf exp dump data
# See also: https://github.com/pytorch/pytorch/issues/42446
if python3 -c 'import torch as t; from packaging.version import parse as L; assert L(t.__version__) > L("1.6")' &> /dev/null; then
./run.sh --fs 22050 --tts_task gan_tts --feats_extract linear_spectrogram --feats_normalize none --inference_model latest.pth \
--ngpu 0 --stop-stage 7 --skip-upload false --python "${python}" \
--ngpu 0 --stop-stage 7 --skip-packing false --python "${python}" \
--train-config "" --train-args "--max_epoch 1 --num_iters_per_epoch 1 --batch_size 1 --batch_type folded --num_workers 0"
rm -rf exp dump data
fi
Expand All @@ -177,18 +177,18 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
feats_types="raw"
for t in ${feats_types}; do
echo "==== feats_type=${t} with preprocessor ==="
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
--extra_wav_list "rirs.scp noises.scp" --enh_config ./conf/train_with_preprocessor_debug.yaml --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 5 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
./run.sh --ngpu 0 --stage 5 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
--enh_config conf/train_with_data_aug_debug.yaml --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 2 --python "${python}" \
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 2 --python "${python}" \
--enh_config conf/train_with_dynamic_mixing_debug.yaml --enh-args "--num_workers 0"
done
rm data/**/utt2category 2>/dev/null || true
rm -r dump
for t in ${feats_types}; do
echo "==== feats_type=${t} without preprocessor ==="
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
done
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
Expand All @@ -203,13 +203,13 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
feats_types="raw"
for t in ${feats_types}; do
echo "==== feats_type=${t} ==="
./run.sh --ngpu 0 --stage 1 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 3 --stop-stage 6 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
./run.sh --ngpu 0 --stage 1 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 3 --stop-stage 6 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
--train_set train_nodev_unk_nspk --valid_set test_unk_nspk --test_sets "train_dev_unk_nspk" \
--enh_config ./conf/train_variable_nspk_debug.yaml --enh-args "--num_workers 0" --variable_num_refs true
./run.sh --ngpu 0 --stage 1 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
./run.sh --ngpu 0 --stage 1 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
--local_data_opts "--random-enrollment true" --enh_config ./conf/train_random_enrollment_debug.yaml --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 3 --stop-stage 6 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
./run.sh --ngpu 0 --stage 3 --stop-stage 6 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
--train_set train_nodev_unk_nspk --valid_set test_unk_nspk --test_sets "train_dev_unk_nspk" \
--enh_config ./conf/train_variable_nspk_random_enrollment_debug.yaml --enh-args "--num_workers 0" --variable_num_refs true
done
Expand All @@ -234,7 +234,7 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
cd ./egs2/mini_an4/enh_asr1
gen_dummy_coverage
echo "==== [ESPnet2] ENH_ASR ==="
./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-upload_hf false --feats-type "raw" --spk-num 1 --enh_asr_args "--enh_separator_conf num_spk=1 --num_workers 0" --python "${python}"
./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-packing false --skip-upload_hf false --feats-type "raw" --spk-num 1 --enh_asr_args "--enh_separator_conf num_spk=1 --num_workers 0" --python "${python}"
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
cd "${cwd}"
Expand All @@ -256,16 +256,16 @@ use_lm=true
for t in ${feats_types}; do
for t2 in ${token_types}; do
echo "==== feats_type=${t}, token_types=${t2} ==="
./run.sh --use_lm ${use_lm} --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "${t}" --tgt_token_type "${t2}" --src_token_type "${t2}" --python "${python}" --st-args "--num_workers 0"
./run.sh --use_lm ${use_lm} --ngpu 0 --stage 6 --stop-stage 13 --skip-packing false --feats-type "${t}" --tgt_token_type "${t2}" --src_token_type "${t2}" --python "${python}" --st-args "--num_workers 0"
done
use_lm=false
done
echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \
./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \
--feats_normalize "utterance_mvn" --python "${python}" --st-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"

echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \
./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-packing false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \
--feats_normalize "utterance_mvn" --python "${python}" \
--st-config conf/train_st_streaming_debug.yaml --st-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"

Expand All @@ -277,7 +277,7 @@ cd "${cwd}"
cd ./egs2/mini_an4/asr2
gen_dummy_coverage
echo "==== [ESPnet2] ASR2 ==="
./run.sh --ngpu 0 --stage 1 --stop-stage 15 --skip-upload false --use-lm false --python "${python}" --asr-args "--num_workers 0"
./run.sh --ngpu 0 --stage 1 --stop-stage 15 --skip-packing false --use-lm false --python "${python}" --asr-args "--num_workers 0"
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
cd "${cwd}"
Expand Down
Loading

0 comments on commit 3d7eff8

Please sign in to comment.