Merge branch 'master' into codec

espnet · Jun 14, 2024 · fab9aac · fab9aac
2 parents 1382977 + 19787b1
commit fab9aac
Show file tree

Hide file tree

Showing 44 changed files with 2,167 additions and 68 deletions.
diff --git a/ci/test_integration_espnet2.sh b/ci/test_integration_espnet2.sh
@@ -189,6 +189,8 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
     for t in ${feats_types}; do
         echo "==== feats_type=${t} without preprocessor ==="
         ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" --enh-args "--num_workers 0"
+        ./run.sh --ngpu 0 --stage 6 --stop-stage 10 --skip-packing false --feats-type "${t}" --ref-num 1 --python "${python}" \
+            --enh_config conf/train_with_chunk_iterator_debug.yaml --enh-args "--num_workers 0"
     done
     # Remove generated files in order to reduce the disk usage
     rm -rf exp dump data

diff --git a/egs2/README.md b/egs2/README.md
@@ -178,6 +178,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | totonac                 | Highland Totonac corpus (endangered language in central Mexico)                                                                  | ASR                     | TOS                  | http://www.openslr.org/107/                                                                                  |              |
 | tsukuyomi               | つくよみちゃんコーパス                                                                                                               | TTS                     | JPN                  | https://tyc.rei-yumesaki.net/material/corpus                                                                 |              |
 | universal_se_v1         | Combination of Multi-condition English Corpora (vctk_noisy, dns_ins20, chime4, reverb, whamr)                                    | SE                      | ENG                  |                                                                                                              |              |
+| urgent2024              | Multi-domain simulated speech enhancement data for the URGENT 2024 Challenge                                                     | SE                      | ENG                  | https://urgent-challenge.github.io/urgent2024/data/                                                          |              |
 | vctk                    | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit                                                                      | ASR/TTS                 | ENG                  | http://www.udialogue.org/download/cstr-vctk-corpus.html                                                      |              |
 | vctk_reverb             | Reverberant speech database (48kHz)                                                                                              | SE                      | ENG                  | https://datashare.ed.ac.uk/handle/10283/2826                                                                 |              |
 | vctk_noisyreverb        | Noisy reverberant speech database (48kHz)                                                                                        | SE                      | ENG                  | https://datashare.ed.ac.uk/handle/10283/2826                                                                 |              |

diff --git a/egs2/TEMPLATE/enh1/enh.sh b/egs2/TEMPLATE/enh1/enh.sh
@@ -752,7 +752,7 @@ if ! "${skip_train}"; then
             _valid_data_param+="--valid_data_path_and_name_and_type ${_enh_valid_dir}/utt2category,category,text "
         fi
 
-        # Add the fs information at the end of the data path list
+        # Add the sampling frequency information at the end of the data path list
         if [ -e "${_enh_train_dir}/utt2fs" ] && [ -e "${_enh_valid_dir}/utt2fs" ]; then
             log "[INFO] Adding the sampling frequency information (fs) for training"
 
@@ -837,6 +837,19 @@ if ! "${skip_eval}"; then
                     _data_param+="--data_path_and_name_and_type ${_data}/enroll_spk${spk}.scp,enroll_ref${spk},text "
                 done
             fi
+            # Add the category information at the end of the data path list
+            if [ -e "${_data}/utt2category" ]; then
+                log "[INFO] Adding the category information for inference"
+                log "[WARNING] Please make sure the category information is explicitly processed by the preprocessor defined in '${enh_config}' so that it is converted to an integer"
+
+                _data_param+="--data_path_and_name_and_type ${_data}/utt2category,category,text "
+            fi
+            # Add the sampling frequency information at the end of the data path list
+            if [ -e "${_data}/utt2fs" ]; then
+                log "[INFO] Adding the sampling frequency information for inference"
+
+                _data_param+="--data_path_and_name_and_type ${_data}/utt2fs,fs,text_int "
+            fi
             # 1. Split the key file
             key_file=${_data}/${_scp}
             split_scps=""
@@ -888,6 +901,14 @@ if ! "${skip_eval}"; then
         log "Stage 8: Scoring"
         _cmd=${decode_cmd}
 
+        if ${gpu_inference}; then
+            _cmd=${cuda_cmd}
+            _ngpu=1
+        else
+            _cmd=${decode_cmd}
+            _ngpu=0
+        fi
+
         # score_obs=true: Scoring for observation signal
         # score_obs=false: Scoring for enhanced signal
         for score_obs in true false; do
@@ -944,7 +965,7 @@ if ! "${skip_eval}"; then
                 # 2. Submit scoring jobs
                 log "Scoring started... log: '${_logdir}/enh_scoring.*.log'"
                 # shellcheck disable=SC2086
-                ${_cmd} JOB=1:"${_nj}" "${_logdir}"/enh_scoring.JOB.log \
+                ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/enh_scoring.JOB.log \
                     ${python} -m espnet2.bin.enh_scoring \
                         --key_file "${_logdir}"/keys.JOB.scp \
                         --output_dir "${_logdir}"/output.JOB \

diff --git a/egs2/mini_an4/enh1/conf/train_with_chunk_iterator_debug.yaml b/egs2/mini_an4/enh1/conf/train_with_chunk_iterator_debug.yaml
@@ -0,0 +1,41 @@
+# This is a debug config for CI
+encoder: conv
+encoder_conf:
+    channel: 32
+    kernel_size: 20
+    stride: 10
+decoder: conv
+decoder_conf:
+    channel: 32
+    kernel_size: 20
+    stride: 10
+separator: tcn
+separator_conf:
+    num_spk: 1
+    layer: 2
+    stack: 2
+    bottleneck_dim: 16
+    hidden_dim: 48
+    kernel: 3
+    causal: False
+    norm_type: "gLN"
+    nonlinear: relu
+
+criterions:
+  # The first criterion
+  - name: mse_td
+    conf: {}
+    # the wrapper for the current criterion
+    # for the single-talker case, we simply use the fixed_order wrapper
+    wrapper: fixed_order
+    wrapper_conf:
+      weight: 1.0
+
+max_epoch: 1
+batch_type: folded
+batch_size: 2
+iterator_type: chunk
+chunk_length: 25 # 0.5s
+chunk_default_fs: 50 # GCD among all possible sampling frequencies
+chunk_max_abs_length: 100000 # max number of samples per chunk for all sampling frequencies (reduce this value if OOM occurs)
+chunk_discard_short_samples: false