Merge branch 'master' of https://github.com/ftshijt/espnet
ftshijt committed Apr 27, 2022
2 parents 72b6b21 + 4a12ab3 commit 04d0cd8
Showing 138 changed files with 4,329 additions and 127 deletions.
10 changes: 10 additions & 0 deletions ci/test_integration_espnet2.sh
@@ -100,6 +100,16 @@ if python3 -c "import fairseq" &> /dev/null; then
cd "${cwd}"
fi

# [ESPnet2] test enh_asr1 recipe
if python -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.2.0")' &> /dev/null; then
cd ./egs2/mini_an4/enh_asr1
echo "==== [ESPnet2] ENH_ASR ==="
./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-upload_hf false --feats-type "raw" --spk-num 1 --enh_asr_args "--max_epoch=1 --enh_separator_conf num_spk=1" --python "${python}"
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
cd "${cwd}"
fi

# [ESPnet2] Validate configuration files
echo "<blank>" > dummy_token_list
echo "==== [ESPnet2] Validation configuration files ==="
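
The new hunk gates the enh_asr1 integration test on the installed torch version before running the full recipe. A minimal standalone sketch of the same gating pattern follows; the version string 1.2.0 comes from the hunk, while the messages and the use of python3 are illustrative assumptions:

# Sketch: run a block only when torch >= 1.2.0, mirroring the gate in the CI script above.
if python3 -c 'import torch; from distutils.version import LooseVersion as V; assert V(torch.__version__) >= V("1.2.0")' &> /dev/null; then
    echo "torch is new enough; the enh_asr1 recipe test would run here"
else
    echo "torch missing or older than 1.2.0; skipping"
fi
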
2 changes: 1 addition & 1 deletion egs/commonvoice/asr1/local/download_and_untar.sh
@@ -16,7 +16,7 @@ fi

if [ $# -ne 3 ]; then
echo "Usage: $0 [--remove-archive] <data-base> <url> <filename>"
echo "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz cv_corpus_v1.tar.gz"
echo "e.g.: $0 /export/data/ https://us.openslr.org/resources/108/FR.tgz"
echo "With --remove-archive it will remove the archive after successfully un-tarring it."
exit 0;
fi
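
For reference, a hypothetical invocation of the updated example; the script still requires three arguments, so the archive filename (here FR.tgz, the basename of the URL above) is passed explicitly:

./local/download_and_untar.sh --remove-archive /export/data/ \
    https://us.openslr.org/resources/108/FR.tgz FR.tgz
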
2 changes: 2 additions & 0 deletions egs2/README.md
@@ -38,6 +38,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| fsc_challenge | Fluent Speech Commands Dataset MASE Eval Challenge splits | SLU | ENG | https://github.com/maseEval/mase | |
| gigaspeech | GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio | ASR | ENG | https://github.com/SpeechColab/GigaSpeech | |
| grabo | Grabo dataset | SLU | ENG + NLD | https://www.esat.kuleuven.be/psi/spraak/downloads/ | |
| harpervalley | HarperValleyBank: A Domain-Specific Spoken Dialog Corpus | SLU | ENG | https://github.com/cricketclub/gridspace-stanford-harper-valley | |
| hkust | HKUST/MTS: A very large scale Mandarin telephone speech corpus | ASR | CMN | https://catalog.ldc.upenn.edu/LDC2005S15 | |
| hui_acg | HUI-audio-corpus-german | TTS | DEU | https://opendata.iisys.de/datasets.html#hui-audio-corpus-german | |
| how2 | How2: A Large-scale Dataset for Multimodal Language Understanding | ASR/MT/ST | ENG->POR | https://github.com/srvk/how2-dataset | |
@@ -61,6 +62,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| ljspeech | The LJ Speech Dataset | TTS | ENG | https://keithito.com/LJ-Speech-Dataset/ | |
| lrs3 | The Oxford-BBC Lip Reading Sentences 3 (LRS3) Dataset | ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html | |
| lrs2 | The Oxford-BBC Lip Reading Sentences 2 (LRS2) Dataset | Lipreading/ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | |
| mediaspeech | MediaSpeech: Multilanguage ASR Benchmark and Dataset | ASR | FRA | https://www.openslr.org/108/ | |
| microsoft_speech | Microsoft Speech Corpus (Indian languages) | ASR | 3 languages | https://msropendata.com/datasets/7230b4b1-912d-400e-be58-f84e0512985e | |
| mini_an4 | Mini version of CMU AN4 database for the integration test | ASR/TTS/SE | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
| mini_librispeech | Mini version of Librispeech corpus | DIAR | ENG | https://openslr.org/31/ | |
3 changes: 3 additions & 0 deletions egs2/TEMPLATE/asr1/db.sh
@@ -46,6 +46,7 @@ LIBRILIGHT_LIMITED=
FSC=
SLURP=
VOXCELEB=
MEDIASPEECH=downloads
MINI_LIBRISPEECH=downloads
MISP2021=
LIBRIMIX=downloads
@@ -139,6 +140,7 @@ MALAYALAM=downloads
ST_CMDS=downloads
MS_INDIC_IS18=
MARATHI=downloads
HARPERVALLEY=downloads

# For only CMU TIR environment
if [[ "$(hostname)" == tir* ]]; then
@@ -216,6 +218,7 @@ if [[ "$(hostname -d)" == clsp.jhu.edu ]]; then
FSC=
SNIPS= # smart-light-en-closed-field data path
SLURP=
MEDIASPEECH=downloads
MINI_LIBRISPEECH=downloads
LIBRITTS=
LJSPEECH=downloads
26 changes: 19 additions & 7 deletions egs2/TEMPLATE/enh1/enh.sh
@@ -78,6 +78,8 @@ download_model=
# Evaluation related
scoring_protocol="STOI SDR SAR SIR SI_SNR"
ref_channel=0
inference_tag= # Prefix to the result dir for ENH inference.
inference_enh_config= # Config for enhancement.
score_with_asr=false
asr_exp="" # asr model for scoring WER
lm_exp="" # lm model for scoring WER
@@ -151,8 +153,9 @@ Options:
--init_param # pretrained model path and module name (default="${init_param}")
# Enhancement related
--inference_args # Arguments for enhancement in the inference stage (default="${inference_args}")
--inference_model # Enhancement model path for inference (default="${inference_model}").
--inference_args # Arguments for enhancement in the inference stage (default="${inference_args}")
--inference_model # Enhancement model path for inference (default="${inference_model}").
--inference_enh_config # Configuration file for overwriting some model attributes during SE inference. (default="${inference_enh_config}")
# Evaluation related
--scoring_protocol # Metrics to be used for scoring (default="${scoring_protocol}")
@@ -250,6 +253,14 @@ if [ -n "${speed_perturb_factors}" ]; then
enh_exp="${enh_exp}_sp"
fi

if [ -z "${inference_tag}" ]; then
if [ -n "${inference_enh_config}" ]; then
inference_tag="$(basename "${inference_enh_config}" .yaml)"
else
inference_tag=enhanced
fi
fi

# ========================== Main stages start from here. ==========================

if ! "${skip_data_prep}"; then
@@ -617,7 +628,7 @@ if ! "${skip_eval}"; then

for dset in "${valid_set}" ${test_sets}; do
_data="${data_feats}/${dset}"
_dir="${enh_exp}/enhanced_${dset}"
_dir="${enh_exp}/${inference_tag}_${dset}"
_logdir="${_dir}/logdir"
mkdir -p "${_logdir}"

@@ -649,6 +660,7 @@ if ! "${skip_eval}"; then
--data_path_and_name_and_type "${_data}/${_scp},speech_mix,${_type}" \
--key_file "${_logdir}"/keys.JOB.scp \
--train_config "${enh_exp}"/config.yaml \
${inference_enh_config:+--inference_config "$inference_enh_config"} \
--model_file "${enh_exp}"/"${inference_model}" \
--output_dir "${_logdir}"/output.JOB \
${_opts} ${inference_args}
@@ -689,7 +701,7 @@ if ! "${skip_eval}"; then
if "${score_obs}"; then
_dir="${data_feats}/${dset}/scoring"
else
_dir="${enh_exp}/enhanced_${dset}/scoring"
_dir="${enh_exp}/${inference_tag}_${dset}/scoring"
fi

_logdir="${_dir}/logdir"
@@ -716,7 +728,7 @@ if ! "${skip_eval}"; then
# To compute the score of observation, input original wav.scp
_inf_scp+="--inf_scp ${data_feats}/${dset}/wav.scp "
else
_inf_scp+="--inf_scp ${enh_exp}/enhanced_${dset}/spk${spk}.scp "
_inf_scp+="--inf_scp ${enh_exp}/${inference_tag}_${dset}/spk${spk}.scp "
fi
done

@@ -752,7 +764,7 @@ if ! "${skip_eval}"; then
./scripts/utils/show_enh_score.sh "${_dir}/../.." > "${_dir}/../../RESULTS.md"
done
log "Evaluation result for observation: ${data_feats}/RESULTS.md"
log "Evaluation result for enhancement: ${enh_exp}/enhanced/RESULTS.md"
log "Evaluation result for enhancement: ${enh_exp}/RESULTS.md"

fi
else
@@ -811,7 +823,7 @@ if "${score_with_asr}"; then
# Using same wav.scp for all speakers
cp "${_data}/wav.scp" "${_ddir}/wav.scp"
else
cp "${enh_exp}/enhanced_${dset}/scoring/wav_spk${spk}" "${_ddir}/wav.scp"
cp "${enh_exp}/${inference_tag}_${dset}/scoring/wav_spk${spk}" "${_ddir}/wav.scp"
fi
cp data/${dset}/text_spk${spk} ${_ddir}/text
cp ${_data}/{spk2utt,utt2spk,utt2num_samples,feats_type} ${_ddir}
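
The enh.sh changes replace the fixed "enhanced" result-directory prefix with an inference_tag: when inference_enh_config is set, the tag is derived from the config file's basename, otherwise it falls back to "enhanced". A small sketch of how the tag resolves, using a made-up config path that is not part of this commit:

# Illustration of the tag resolution added in enh.sh, with hypothetical values.
inference_enh_config=conf/tuning/enhance_beamformer.yaml
inference_tag=
if [ -z "${inference_tag}" ]; then
    if [ -n "${inference_enh_config}" ]; then
        inference_tag="$(basename "${inference_enh_config}" .yaml)"  # -> enhance_beamformer
    else
        inference_tag=enhanced
    fi
fi
echo "results would go to \${enh_exp}/${inference_tag}_\${dset}"  # ${enh_exp}/enhance_beamformer_${dset}

With the tag threaded through the result directories, runs with different enhancement configs can keep separate output directories instead of reusing the single enhanced_${dset} path.
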
2 changes: 1 addition & 1 deletion egs2/TEMPLATE/enh1/scripts/utils/show_enh_score.sh
@@ -15,7 +15,7 @@ fi
[ -f ./path.sh ] && . ./path.sh
set -euo pipefail
if [ $# -eq 1 ]; then
exp=$1
exp=$(realpath "$1")
else
exp=exp
fi
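
show_enh_score.sh now canonicalizes its experiment-directory argument. The callers in enh.sh and enh_asr.sh pass paths such as "${_dir}/../..", which realpath collapses into a clean absolute path. A minimal illustration with made-up paths:

# realpath resolves relative segments and symlinks into an absolute path.
mkdir -p /tmp/demo_exp/enhanced_dev/scoring
exp=$(realpath "/tmp/demo_exp/enhanced_dev/scoring/../..")
echo "${exp}"  # -> /tmp/demo_exp (or its symlink-resolved equivalent), not the raw ../.. string
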
39 changes: 27 additions & 12 deletions egs2/TEMPLATE/enh_asr1/enh_asr.sh
@@ -122,6 +122,8 @@ dereverb_ref_num=1
# Evaluation related
scoring_protocol="STOI SDR SAR SIR SI_SNR"
ref_channel=0
inference_enh_tag= # Prefix to the result dir for ENH inference.
inference_enh_config= # Config for enhancement.

# Enh Training data related
use_dereverb_ref=false
@@ -453,6 +455,14 @@ if [ -z "${inference_tag}" ]; then
fi
fi

if [ -z "${inference_enh_tag}" ]; then
if [ -n "${inference_enh_config}" ]; then
inference_enh_tag="$(basename "${inference_enh_config}" .yaml)"
else
inference_enh_tag=enhanced
fi
fi

# ========================== Main stages start from here. ==========================

if ! "${skip_data_prep}"; then
@@ -518,7 +528,10 @@ if ! "${skip_data_prep}"; then
expand_utt_extra_files=""
for extra_file in ${utt_extra_files}; do
# use globbing to support multiple references
for single_file in $(ls data/"${dset}"/${extra_file}*); do
for single_file in "data/${dset}/${extra_file}"*; do
if [ ! -f "${single_file}" ]; then
continue
fi
cp ${single_file} "${data_feats}${_suf}/${dset}"
expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
done
@@ -553,7 +566,7 @@ if ! "${skip_data_prep}"; then
# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--out-filename "${spk}.scp" \
--ref_channels "0" \
--ref_channels "${ref_channel}" \
--audio-format "${audio_format}" --fs "${fs}" ${_opts} \
"data/${dset}/${spk}.scp" "${data_feats}${_suf}/${dset}" \
"${data_feats}${_suf}/${dset}/logs/${spk}" "${data_feats}${_suf}/${dset}/data/${spk}"
@@ -1259,6 +1272,7 @@ if ! "${skip_eval}"; then
# shellcheck disable=SC2086
${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
${python} -m ${asr_inference_tool} \
--enh_s2t_task true \
--batch_size ${batch_size} \
--ngpu "${_ngpu}" \
--data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
@@ -1293,12 +1307,12 @@ if ! "${skip_eval}"; then
_opts=

# 2. Generate run.sh
log "Generate '${enh_asr_exp}/${inference_tag}/run.sh'. You can resume the process from stage 13 using this script"
mkdir -p "${enh_asr_exp}/${inference_tag}"; echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${enh_asr_exp}/${inference_tag}/run.sh"; chmod +x "${enh_asr_exp}/${inference_tag}/run.sh"
log "Generate '${enh_asr_exp}/run_enhance.sh'. You can resume the process from stage 13 using this script"
mkdir -p "${enh_asr_exp}"; echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${enh_asr_exp}/run_enhance.sh"; chmod +x "${enh_asr_exp}/run_enhance.sh"

for dset in ${test_sets}; do
_data="${data_feats}/${dset}"
_dir="${enh_asr_exp}/${inference_tag}/${dset}"
_dir="${enh_asr_exp}/${inference_enh_tag}_${dset}"
_logdir="${_dir}/logdir"
mkdir -p "${_logdir}"

@@ -1330,6 +1344,7 @@ if ! "${skip_eval}"; then
--data_path_and_name_and_type "${_data}/${_scp},speech_mix,${_type}" \
--key_file "${_logdir}"/keys.JOB.scp \
--train_config "${enh_asr_exp}"/config.yaml \
${inference_enh_config:+--inference_config "$inference_enh_config"} \
--model_file "${enh_asr_exp}"/"${inference_enh_asr_model}" \
--output_dir "${_logdir}"/output.JOB \
${_opts} ${enh_inference_args}
@@ -1472,17 +1487,17 @@ if ! "${skip_eval}"; then
# for score_obs in true false; do
for score_obs in true false; do
# Peform only at the first time for observation
if "${score_obs}" && [ -e "${data_feats}/RESULTS.md" ]; then
log "${data_feats}/RESULTS.md already exists. The scoring for observation will be skipped"
if "${score_obs}" && [ -e "${data_feats}/RESULTS_enh.md" ]; then
log "${data_feats}/RESULTS_enh.md already exists. The scoring for observation will be skipped"
continue
fi

for dset in ${test_sets}; do
_data="${data_feats}/${dset}"
if "${score_obs}"; then
_dir="${data_feats}/${dset}/scoring_enh"
_dir="${data_feats}/${dset}/scoring"
else
_dir="${enh_asr_exp}/${inference_tag}/${dset}/scoring_enh"
_dir="${enh_asr_exp}/${inference_enh_tag}_${dset}/scoring"
fi

_logdir="${_dir}/logdir"
@@ -1508,7 +1523,7 @@ if ! "${skip_eval}"; then
# To compute the score of observation, input original wav.scp
_inf_scp+="--inf_scp ${data_feats}/${dset}/wav.scp "
else
_inf_scp+="--inf_scp ${enh_asr_exp}/${inference_tag}/${dset}/spk${spk}.scp "
_inf_scp+="--inf_scp ${enh_asr_exp}/${inference_enh_tag}_${dset}/spk${spk}.scp "
fi
done

@@ -1544,7 +1559,7 @@ if ! "${skip_eval}"; then
./scripts/utils/show_enh_score.sh "${_dir}/../.." > "${_dir}/../../RESULTS_enh.md"
done
log "Evaluation result for observation: ${data_feats}/RESULTS_enh.md"
log "Evaluation result for enhancement: ${enh_asr_exp}/enhanced/RESULTS_enh.md"
log "Evaluation result for enhancement: ${enh_asr_exp}/RESULTS_enh.md"

fi
else
@@ -1620,7 +1635,7 @@ if ! "${skip_upload_hf}"; then
# shellcheck disable=SC2034
espnet_task=EnhS2T
# shellcheck disable=SC2034
task_exp=${enh_st_exp}
task_exp=${enh_asr_exp}
eval "echo \"$(cat scripts/utils/TEMPLATE_HF_Readme.md)\"" > "${dir_repo}"/README.md

this_folder=${PWD}
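
Both inference invocations above forward the new enhancement config through a bash alternate-value expansion, ${inference_enh_config:+--inference_config "$inference_enh_config"}, so the extra flag is emitted only when the variable is non-empty. A self-contained sketch of the pattern; the command name demo_tool and the config path are placeholders:

# ${var:+word} expands to word when var is set and non-empty, and to nothing otherwise.
maybe_config=""
echo demo_tool ${maybe_config:+--inference_config "$maybe_config"}  # -> demo_tool

maybe_config="conf/enhance.yaml"
echo demo_tool ${maybe_config:+--inference_config "$maybe_config"}  # -> demo_tool --inference_config conf/enhance.yaml
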