Add MixIT support. It is unsupervised only. Semi-supervised config is not available for now.
espnet · Sep 9, 2022 · f705a58 · f705a58
1 parent 6d52365
commit f705a58
Show file tree

Hide file tree

Showing 55 changed files with 810 additions and 156 deletions.
diff --git a/ci/test_integration_espnet2.sh b/ci/test_integration_espnet2.sh
@@ -107,9 +107,9 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
     feats_types="raw"
     for t in ${feats_types}; do
         echo "==== feats_type=${t} ==="
-        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}"
-        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}" --use_preprocessor true --extra_wav_list "rirs.scp noises.scp" --enh_config ./conf/train_with_preprocessor.yaml
-        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}" --enh_config conf/train_with_dynamic_mixing.yaml --dynamic_mixing true --spk-num 2
+        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --enh-args "--max_epoch=1" --python "${python}"
+        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --enh-args "--max_epoch=1" --python "${python}" --extra_wav_list "rirs.scp noises.scp" --enh_config ./conf/train_with_preprocessor.yaml
+        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --enh-args "--max_epoch=1" --python "${python}" --enh_config conf/train_with_dynamic_mixing.yaml --ref-num 2
     done
     # Remove generated files in order to reduce the disk usage
     rm -rf exp dump data

diff --git a/egs2/TEMPLATE/enh1/README.md b/egs2/TEMPLATE/enh1/README.md
@@ -70,13 +70,13 @@ The`EnhancementTask` defined in `espnet2/tasks/enh.py` is called in `espnet2/bin
 We have created `EnhancementTask` in `espnet2/tasks/enh.py`, which is used to train the `ESPnetEnhancementModel(AbsESPnetModel)` defined in `espnet2/enh/espnet_model.py`. 
 In `EnhancementTask`, the speech enhancement or separation models follow the `encoder-separator-decoder` style, and several encoders, decoders and separators are implemented. Although it is currently defined as an independent task, the models from `EnhancementTask` can be easily called by other tasks or even jointly trained with other tasks (see `egs2/TEMPLATE/enh_asr1/`, `egs2/TEMPLATE/enh_st1/`).
 
-> Now we support adding noise and reverberation on the fly by specifying `--use_preprocessor` and `--extra_wav_list` to use `EnhPreprocessor`. Check [PR #4321](https://github.com/espnet/espnet/pull/4321#issue-1216290237) for more details.
+> Now we support adding noise, reverberation, and interfering speech on the fly by specifying `preprocessor` in the configuration. For example, to use `EnhPreprocessor`, one can specify `preprocessor: "enh"` in the configuration and specify `--extra_wav_list` in `run.sh`. Check [PR #4321](https://github.com/espnet/espnet/pull/4321#issue-1216290237) for more details.
 >
 > We also support possible integration of other speech enhancement/separation toolkits (e.g. [Asteroid](https://github.com/asteroid-team/asteroid)), so that models trained with other speech enhancement/separation toolkits can be reused/evaluated on ESPnet for downstream tasks such as ASR.
 
 Related arguments in `enh.sh` include:
 
-  + --spk_num
+  + --ref_num
   + --enh_args
   + --enh_config
   + --enh_exp
@@ -85,7 +85,6 @@ Related arguments in `enh.sh` include:
   + --init_param
   + --use_dereverb_ref
   + --use_noise_ref
-  + --use_preprocessor
   + --extra_wav_list
 
 Related python files:
@@ -163,7 +162,7 @@ This stage generates the enhanced or separated speech with the trained model. Th
 
 Related arguments in `enh.sh` include:
 
-  + --spk_num
+  + --ref_num
   + --fs
   + --gpu_inference
   + --inference_args
@@ -283,7 +282,7 @@ Prepare training configuration files (e.g. [train.yaml](https://github.com/espne
 Write `run.sh` to provide a template entry script, so that users can easily run your recipe by `./run.sh`.
 Check [egs2/wsj0_2mix/enh1/run.sh](https://github.com/espnet/espnet/blob/master/egs2/wsj0_2mix/enh1/run.sh) for reference.
 
-> Please ensure that the argument `--spk_num` in `run.sh` is consistent with the `num_spk` (under `separator_conf`) in the training configuration files created in last step.
+> Please ensure that the argument `--ref_num` in `run.sh` is consistent with the `num_spk` (under `separator_conf`) in the training configuration files created in the last step, except in MixIT training. In MixIT, the argument `--inf_num` in `run.sh` should be consistent with the `num_spk` (under `separator_conf`).
 >
 > If your recipes provide references for noise and/or dereverberation, you can set the argument `--use_noise_ref true` and/or `--use_dereverb_ref true` in `run.sh`.
 

diff --git a/egs2/TEMPLATE/enh1/enh.sh b/egs2/TEMPLATE/enh1/enh.sh
@@ -58,15 +58,17 @@ enh_tag=    # Suffix to the result dir for enhancement model training.
 enh_config= # Config for enhancement model training.
 enh_args=   # Arguments for enhancement model training, e.g., "--max_epoch 10".
             # Note that it will overwrite args in enhancement config.
-spk_num=2   # Number of speakers
-dynamic_mixing=false # Flag for dynamic mixing in speech separation task. 
+ref_num=2   # Number of references for training.
+            # In supervised speech enhancement / separation, this equals the number of speakers.
+inf_num=    # Number of inference outputs produced by the model.
+            # If left empty, it defaults to ref_num; if specified, the given value is used as-is.
+            # In MixIT, the number of outputs is larger than the number of references.
 noise_type_num=1
 dereverb_ref_num=1
 
 # Training data related
 use_dereverb_ref=false
 use_noise_ref=false
-use_preprocessor=false
 extra_wav_list= # Extra list of scp files for wav formatting
 
 # Pretrained model related
@@ -142,8 +144,11 @@ Options:
     --enh_config # Config for enhancement model training (default="${enh_config}").
     --enh_args   # Arguments for enhancement model training, e.g., "--max_epoch 10" (default="${enh_args}").
                  # Note that it will overwrite args in enhancement config.
-    --spk_num    # Number of speakers in the input audio (default="${spk_num}")
-    --dynamic_mixing   # Flag for dynamic mixing in speech separation task (default="${dynamic_mixing}").
+    --ref_num    # Number of references for training (default="${ref_num}").
+                 # In supervised learning based speech enhancement / separation, it is equivalent to number of speakers. 
+    --inf_num    # Number of inference audio generated by the model (default="${ref_num}")
+                 # Note that if it is not specified, it will be the same as ref_num. Otherwise, it will be overwritten.
+                 # In MixIT, number of outputs is larger than that of references.
     --noise_type_num   # Number of noise types in the input audio (default="${noise_type_num}")
     --dereverb_ref_num # Number of references for dereverberation (default="${dereverb_ref_num}")
 
@@ -152,7 +157,6 @@ Options:
                          for training a dereverberation model (default="${use_dereverb_ref}")
     --use_noise_ref    # Whether or not to use noise signal as an additional reference
                          for training a denoising model (default="${use_noise_ref}")
-    --use_preprocessor # Whether or not to apply preprocessing (default="${use_preprocessor}")
     --extra_wav_list   # Extra list of scp files for wav formatting (default="${extra_wav_list}")
 
     # Pretrained model related
@@ -215,6 +219,7 @@ utt_extra_files="utt2category"
 
 data_feats=${dumpdir}/raw
 
+inf_num=${inf_num:=${ref_num}}
 
 # Set tag for naming of model directory
 if [ -z "${enh_tag}" ]; then
@@ -283,7 +288,7 @@ if ! "${skip_data_prep}"; then
            log "Stage 2: Speed perturbation: data/${train_set} -> data/${train_set}_sp"
 
             _scp_list="wav.scp "
-            for i in $(seq ${spk_num}); do
+            for i in $(seq ${ref_num}); do
                 _scp_list+="spk${i}.scp "
             done
 
@@ -338,7 +343,7 @@ if ! "${skip_data_prep}"; then
 
 
             _spk_list=" "
-            for i in $(seq ${spk_num}); do
+            for i in $(seq ${ref_num}); do
                 _spk_list+="spk${i} "
             done
             if $use_noise_ref && [ -n "${_suf}" ]; then
@@ -373,6 +378,10 @@ if ! "${skip_data_prep}"; then
 
             echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type"
 
+            for f in ${utt_extra_files}; do
+                [ -f data/${dset}/${f} ] && cp data/${dset}/${f} ${data_feats}${_suf}/${dset}/${f}
+            done
+
         done
     fi
 
@@ -385,7 +394,7 @@ if ! "${skip_data_prep}"; then
 
             _spk_list=" "
             _scp_list=" "
-            for i in $(seq ${spk_num}); do
+            for i in $(seq ${ref_num}); do
                 _spk_list+="spk${i} "
                 _scp_list+="spk${i}.scp "
             done
@@ -406,6 +415,11 @@ if ! "${skip_data_prep}"; then
             for spk in ${_spk_list};do
                 cp "${data_feats}/org/${dset}/${spk}.scp" "${data_feats}/${dset}/${spk}.scp"
             done
+            for f in ${utt_extra_files}; do
+                if [ -f "${data_feats}/org/${dset}/${f}" ]; then
+                    cp "${data_feats}/org/${dset}/${f}" "${data_feats}/${dset}/${f}"
+                fi
+            done
 
             _fs=$(python3 -c "import humanfriendly as h;print(h.parse_size('${fs}'))")
             _min_length=$(python3 -c "print(int(${min_wav_duration} * ${_fs}))")
@@ -423,7 +437,7 @@ if ! "${skip_data_prep}"; then
             done
 
             # fix_data_dir.sh leaves only utts which exist in all files
-            utils/fix_data_dir.sh --utt_extra_files "${_scp_list}" "${data_feats}/${dset}"
+            utils/fix_data_dir.sh --utt_extra_files "${_scp_list} ${utt_extra_files}" "${data_feats}/${dset}"
         done
     fi
 else
@@ -489,7 +503,7 @@ if ! "${skip_train}"; then
         # prepare train and valid data parameters
         _train_data_param="--train_data_path_and_name_and_type ${_enh_train_dir}/wav.scp,speech_mix,${_type} "
         _valid_data_param="--valid_data_path_and_name_and_type ${_enh_valid_dir}/wav.scp,speech_mix,${_type} "
-        for spk in $(seq "${spk_num}"); do
+        for spk in $(seq "${ref_num}"); do
             _train_data_param+="--train_data_path_and_name_and_type ${_enh_train_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
             _valid_data_param+="--valid_data_path_and_name_and_type ${_enh_valid_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
         done
@@ -518,7 +532,6 @@ if ! "${skip_train}"; then
         ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
             ${python} -m espnet2.bin.enh_train \
                 --collect_stats true \
-                ${use_preprocessor:+--use_preprocessor $use_preprocessor} \
                 ${_train_data_param} \
                 ${_valid_data_param} \
                 --train_shape_file "${_logdir}/train.JOB.scp" \
@@ -549,15 +562,7 @@ if ! "${skip_train}"; then
             _opts+="--config ${enh_config} "
         fi
 
-        if ${dynamic_mixing}; then
-            # In current version, if you want to enable dynamic mixing in speech separation,
-            # you need to prepare the training set manually. Here we assume all speech sources 
-            # are collected in "spk1.scp", and other scp files (wav.scp, spk{N}.scp) are not used. 
-            log "Dynamic mixing is enabled, use spk1.scp as the source file list."
-            _scp=spk1.scp
-        else
-            _scp=wav.scp
-        fi
+        _scp="wav.scp"
         # "sound" supports "wav", "flac", etc.
         if [[ "${audio_format}" == *ark* ]]; then
             _type=kaldi_ark
@@ -567,32 +572,19 @@ if ! "${skip_train}"; then
         fi
         _fold_length="$((enh_speech_fold_length * 100))"
 
-
-        if ! ${dynamic_mixing} ; then
-
-            # prepare train and valid data parameters
-            _train_data_param="--train_data_path_and_name_and_type ${_enh_train_dir}/${_scp},speech_mix,${_type} "
-            _train_shape_param="--train_shape_file ${enh_stats_dir}/train/speech_mix_shape "
-            _fold_length_param="--fold_length ${_fold_length} "
-            _valid_data_param="--valid_data_path_and_name_and_type ${_enh_valid_dir}/wav.scp,speech_mix,${_type} "
-            _valid_shape_param="--valid_shape_file ${enh_stats_dir}/valid/speech_mix_shape "
+        # prepare train and valid data parameters
+        _train_data_param="--train_data_path_and_name_and_type ${_enh_train_dir}/${_scp},speech_mix,${_type} "
+        _train_shape_param="--train_shape_file ${enh_stats_dir}/train/speech_mix_shape "
+        _fold_length_param="--fold_length ${_fold_length} "
+        _valid_data_param="--valid_data_path_and_name_and_type ${_enh_valid_dir}/wav.scp,speech_mix,${_type} "
+        _valid_shape_param="--valid_shape_file ${enh_stats_dir}/valid/speech_mix_shape "
 
-            for spk in $(seq "${spk_num}"); do
-                _train_data_param+="--train_data_path_and_name_and_type ${_enh_train_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
-                _train_shape_param+="--train_shape_file ${enh_stats_dir}/train/speech_ref${spk}_shape "
-            done
-
-        else 
-            # prepare train and valid data parameters
-            _train_data_param="--train_data_path_and_name_and_type ${_enh_train_dir}/${_scp},speech_ref1,${_type} "
-            _train_shape_param="--train_shape_file ${enh_stats_dir}/train/speech_ref1_shape "
-            _fold_length_param="--fold_length ${_fold_length} "
-            _valid_data_param="--valid_data_path_and_name_and_type ${_enh_valid_dir}/wav.scp,speech_mix,${_type} "
-            _valid_shape_param="--valid_shape_file ${enh_stats_dir}/valid/speech_mix_shape "
-            _opts+="--utt2spk ${_enh_train_dir}/utt2spk "
-        fi
+        for spk in $(seq "${ref_num}"); do
+            _train_data_param+="--train_data_path_and_name_and_type ${_enh_train_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
+            _train_shape_param+="--train_shape_file ${enh_stats_dir}/train/speech_ref${spk}_shape "
+        done
 
-        for spk in $(seq "${spk_num}"); do
+        for spk in $(seq "${ref_num}"); do
             _valid_data_param+="--valid_data_path_and_name_and_type ${_enh_valid_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
             _valid_shape_param+="--valid_shape_file ${enh_stats_dir}/valid/speech_ref${spk}_shape "
             _fold_length_param+="--fold_length ${_fold_length} "
@@ -639,7 +631,6 @@ if ! "${skip_train}"; then
             --init_file_prefix "${enh_exp}"/.dist_init_ \
             --multiprocessing_distributed true -- \
             ${python} -m espnet2.bin.enh_train \
-                ${use_preprocessor:+--use_preprocessor $use_preprocessor} \
                 ${_train_data_param} \
                 ${_valid_data_param} \
                 ${_train_shape_param} \
@@ -713,7 +704,7 @@ if ! "${skip_eval}"; then
 
 
             _spk_list=" "
-            for i in $(seq ${spk_num}); do
+            for i in $(seq ${inf_num}); do
                 _spk_list+="spk${i} "
             done
 
@@ -765,18 +756,26 @@ if ! "${skip_eval}"; then
 
 
                 _ref_scp=
-                for spk in $(seq "${spk_num}"); do
+                for spk in $(seq "${ref_num}"); do
                     _ref_scp+="--ref_scp ${_data}/spk${spk}.scp "
                 done
                 _inf_scp=
-                for spk in $(seq "${spk_num}"); do
-                    if "${score_obs}"; then
+                if "${score_obs}"; then
+                    for spk in $(seq "${ref_num}"); do
                         # To compute the score of observation, input original wav.scp
                         _inf_scp+="--inf_scp ${data_feats}/${dset}/wav.scp "
-                    else
+                    done
+                    flexible_numspk=false
+                else
+                    for spk in $(seq "${inf_num}"); do
                         _inf_scp+="--inf_scp ${enh_exp}/${inference_tag}_${dset}/spk${spk}.scp "
+                    done
+                    if [[ "${ref_num}" -ne "${inf_num}" ]]; then
+                        flexible_numspk=true
+                    else
+                        flexible_numspk=false
                     fi
-                done
+                fi
 
                 # 2. Submit scoring jobs
                 log "Scoring started... log: '${_logdir}/enh_scoring.*.log'"
@@ -787,9 +786,10 @@ if ! "${skip_eval}"; then
                         --output_dir "${_logdir}"/output.JOB \
                         ${_ref_scp} \
                         ${_inf_scp} \
-                        --ref_channel ${ref_channel}
+                        --ref_channel ${ref_channel} \
+                        --flexible_numspk ${flexible_numspk}
 
-                for spk in $(seq "${spk_num}"); do
+                for spk in $(seq "${ref_num}"); do
                     for protocol in ${scoring_protocol} wav; do
                         for i in $(seq "${_nj}"); do
                             cat "${_logdir}/output.${i}/${protocol}_spk${spk}"
@@ -800,7 +800,7 @@ if ! "${skip_eval}"; then
 
                 for protocol in ${scoring_protocol}; do
                     # shellcheck disable=SC2046
-                    paste $(for j in $(seq ${spk_num}); do echo "${_dir}"/"${protocol}"_spk"${j}" ; done)  |
+                    paste $(for j in $(seq ${ref_num}); do echo "${_dir}"/"${protocol}"_spk"${j}" ; done)  |
                     awk 'BEGIN{sum=0}
                         {n=0;score=0;for (i=2; i<=NF; i+=2){n+=1;score+=$i}; sum+=score/n}
                         END{printf ("%.2f\n",sum/NR)}' > "${_dir}/result_${protocol,,}.txt"
@@ -857,7 +857,7 @@ if "${score_with_asr}"; then
                     _dir="${enh_exp}/${inference_asr_tag}/${dset}"
                 fi
 
-                for spk in $(seq "${spk_num}"); do
+                for spk in $(seq "${ref_num}"); do
                     _ddir=${_dir}/spk_${spk}
                     _logdir="${_ddir}/logdir"
                     _decode_dir="${_ddir}/decode"
@@ -953,7 +953,7 @@ if "${score_with_asr}"; then
                     _dir="${enh_exp}/${inference_asr_tag}/${dset}/"
                 fi
 
-                for spk in $(seq "${spk_num}"); do
+                for spk in $(seq "${ref_num}"); do
                     _ddir=${_dir}/spk_${spk}
                     _logdir="${_ddir}/logdir"
                     _decode_dir="${_ddir}/decode"

diff --git a/egs2/TEMPLATE/enh_diar1/enh_diar.sh b/egs2/TEMPLATE/enh_diar1/enh_diar.sh
@@ -737,7 +737,7 @@ if ! "${skip_eval}"; then
                         ${_ref_scp} \
                         ${_inf_scp} \
                         --ref_channel ${ref_channel} \
-                        --flexible_numspk True
+                        --flexible_numspk true
 
                 for spk in $(seq "${spk_num}"); do
                     for protocol in ${scoring_protocol}; do

diff --git a/egs2/chime4/enh1/run.sh b/egs2/chime4/enh1/run.sh
@@ -24,7 +24,7 @@ test_sets="et05_simu_isolated_1ch_track"
     --test_sets "${test_sets}" \
     --fs ${sample_rate} \
     --ngpu 2 \
-    --spk_num 1 \
+    --ref_num 1 \
     --ref_channel 3 \
     --local_data_opts "--extra-annotations ${extra_annotations} --stage 1 --stop-stage 2" \
     --enh_config conf/tuning/train_enh_conv_tasnet.yaml \

diff --git a/egs2/clarity21/enh1/run.sh b/egs2/clarity21/enh1/run.sh
@@ -21,7 +21,7 @@ test_sets="dev"
     --test_sets "${test_sets}" \
     --fs ${sample_rate} \
     --ngpu 1 \
-    --spk_num 1 \
+    --ref_num 1 \
     --ref_channel 0 \
     --local_data_opts "--clarity_root ${clarity_root} --sample_rate ${sample_rate}" \
     --enh_config conf/tuning/train_enh_beamformer_mvdr.yaml \

diff --git a/egs2/conferencingspeech21/enh1/run.sh b/egs2/conferencingspeech21/enh1/run.sh
@@ -21,7 +21,7 @@ test_sets="test"
     --test_sets "${test_sets}" \
     --fs ${sample_rate} \
     --ngpu 1 \
-    --spk_num 1 \
+    --ref_num 1 \
     --local_data_opts "--official_data_dir ${official_data_dir}" \
     --enh_config conf/tuning/train_enh_beamformer_mvdr.yaml \
     --use_dereverb_ref false \

diff --git a/egs2/dns_icassp21/enh1/run.sh b/egs2/dns_icassp21/enh1/run.sh
@@ -19,7 +19,7 @@ test_sets="tt_synthetic_track_1"
     --test_sets "${test_sets}" \
     --fs ${sample_rate} \
     --ngpu 1 \
-    --spk_num 1 \
+    --ref_num 1 \
     --local_data_opts "" \
     --enh_config ./conf/train.yaml \
     --use_dereverb_ref false \