Skip to content

Commit

Permalink
Add MixIT support. It is unsupervised only. Semi-supervised config is…
Browse files Browse the repository at this point in the history
… not available for now.
  • Loading branch information
simpleoier committed Sep 5, 2022
1 parent 6d52365 commit 5448639
Show file tree
Hide file tree
Showing 36 changed files with 625 additions and 99 deletions.
6 changes: 3 additions & 3 deletions ci/test_integration_espnet2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,9 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
feats_types="raw"
for t in ${feats_types}; do
echo "==== feats_type=${t} ==="
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}"
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}" --use_preprocessor true --extra_wav_list "rirs.scp noises.scp" --enh_config ./conf/train_with_preprocessor.yaml
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}" --enh_config conf/train_with_dynamic_mixing.yaml --dynamic_mixing true --spk-num 2
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --enh-args "--max_epoch=1" --python "${python}"
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --enh-args "--max_epoch=1" --python "${python}" --use_preprocessor true --extra_wav_list "rirs.scp noises.scp" --enh_config ./conf/train_with_preprocessor.yaml
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --enh-args "--max_epoch=1" --python "${python}" --enh_config conf/train_with_dynamic_mixing.yaml --dynamic_mixing true --ref-num 2
done
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
Expand Down
62 changes: 38 additions & 24 deletions egs2/TEMPLATE/enh1/enh.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,16 @@ enh_tag= # Suffix to the result dir for enhancement model training.
enh_config= # Config for enhancement model training.
enh_args= # Arguments for enhancement model training, e.g., "--max_epoch 10".
# Note that it will overwrite args in enhancement config.
spk_num=2 # Number of speakers
ref_num=2 # Number of references (similar to speakers)
inf_num= # Number of inferences output by the model
# If not specified, it will be the same as ref_num. If specified, it will be overwritten.
dynamic_mixing=false # Flag for dynamic mixing in speech separation task.
noise_type_num=1
dereverb_ref_num=1

# Training data related
use_dereverb_ref=false
use_noise_ref=false
use_preprocessor=false
extra_wav_list= # Extra list of scp files for wav formatting

# Pretrained model related
Expand Down Expand Up @@ -142,7 +143,9 @@ Options:
--enh_config # Config for enhancement model training (default="${enh_config}").
--enh_args # Arguments for enhancement model training, e.g., "--max_epoch 10" (default="${enh_args}").
# Note that it will overwrite args in enhancement config.
--spk_num # Number of speakers in the input audio (default="${spk_num}")
--ref_num # Number of reference audios for each mixture (default="${ref_num}")
--inf_num # Number of inference audio generated by the model (default="${ref_num}")
# If not specified, it will be the same as ref_num. If specified, it will be overwritten.
--dynamic_mixing # Flag for dynamic mixing in speech separation task (default="${dynamic_mixing}").
--noise_type_num # Number of noise types in the input audio (default="${noise_type_num}")
--dereverb_ref_num # Number of references for dereverberation (default="${dereverb_ref_num}")
Expand All @@ -152,7 +155,6 @@ Options:
for training a dereverberation model (default="${use_dereverb_ref}")
--use_noise_ref # Whether or not to use noise signal as an additional reference
for training a denoising model (default="${use_noise_ref}")
--use_preprocessor # Whether or not to apply preprocessing (default="${use_preprocessor}")
--extra_wav_list # Extra list of scp files for wav formatting (default="${extra_wav_list}")
# Pretrained model related
Expand Down Expand Up @@ -215,6 +217,7 @@ utt_extra_files="utt2category"

data_feats=${dumpdir}/raw

inf_num=${inf_num:=${ref_num}}

# Set tag for naming of model directory
if [ -z "${enh_tag}" ]; then
Expand Down Expand Up @@ -283,7 +286,7 @@ if ! "${skip_data_prep}"; then
log "Stage 2: Speed perturbation: data/${train_set} -> data/${train_set}_sp"

_scp_list="wav.scp "
for i in $(seq ${spk_num}); do
for i in $(seq ${ref_num}); do
_scp_list+="spk${i}.scp "
done

Expand Down Expand Up @@ -338,7 +341,7 @@ if ! "${skip_data_prep}"; then


_spk_list=" "
for i in $(seq ${spk_num}); do
for i in $(seq ${ref_num}); do
_spk_list+="spk${i} "
done
if $use_noise_ref && [ -n "${_suf}" ]; then
Expand Down Expand Up @@ -373,6 +376,10 @@ if ! "${skip_data_prep}"; then

echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type"

for f in ${utt_extra_files}; do
[ -f data/${dset}/${f} ] && cp data/${dset}/${f} ${data_feats}/${dset}
done

done
fi

Expand All @@ -385,7 +392,7 @@ if ! "${skip_data_prep}"; then

_spk_list=" "
_scp_list=" "
for i in $(seq ${spk_num}); do
for i in $(seq ${ref_num}); do
_spk_list+="spk${i} "
_scp_list+="spk${i}.scp "
done
Expand Down Expand Up @@ -423,7 +430,7 @@ if ! "${skip_data_prep}"; then
done

# fix_data_dir.sh leaves only utts which exist in all files
utils/fix_data_dir.sh --utt_extra_files "${_scp_list}" "${data_feats}/${dset}"
utils/fix_data_dir.sh --utt_extra_files "${_scp_list} ${utt_extra_files}" "${data_feats}/${dset}"
done
fi
else
Expand Down Expand Up @@ -489,7 +496,7 @@ if ! "${skip_train}"; then
# prepare train and valid data parameters
_train_data_param="--train_data_path_and_name_and_type ${_enh_train_dir}/wav.scp,speech_mix,${_type} "
_valid_data_param="--valid_data_path_and_name_and_type ${_enh_valid_dir}/wav.scp,speech_mix,${_type} "
for spk in $(seq "${spk_num}"); do
for spk in $(seq "${ref_num}"); do
_train_data_param+="--train_data_path_and_name_and_type ${_enh_train_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
_valid_data_param+="--valid_data_path_and_name_and_type ${_enh_valid_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
done
Expand Down Expand Up @@ -518,7 +525,6 @@ if ! "${skip_train}"; then
${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
${python} -m espnet2.bin.enh_train \
--collect_stats true \
${use_preprocessor:+--use_preprocessor $use_preprocessor} \
${_train_data_param} \
${_valid_data_param} \
--train_shape_file "${_logdir}/train.JOB.scp" \
Expand Down Expand Up @@ -577,7 +583,7 @@ if ! "${skip_train}"; then
_valid_data_param="--valid_data_path_and_name_and_type ${_enh_valid_dir}/wav.scp,speech_mix,${_type} "
_valid_shape_param="--valid_shape_file ${enh_stats_dir}/valid/speech_mix_shape "

for spk in $(seq "${spk_num}"); do
for spk in $(seq "${ref_num}"); do
_train_data_param+="--train_data_path_and_name_and_type ${_enh_train_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
_train_shape_param+="--train_shape_file ${enh_stats_dir}/train/speech_ref${spk}_shape "
done
Expand All @@ -592,7 +598,7 @@ if ! "${skip_train}"; then
_opts+="--utt2spk ${_enh_train_dir}/utt2spk "
fi

for spk in $(seq "${spk_num}"); do
for spk in $(seq "${ref_num}"); do
_valid_data_param+="--valid_data_path_and_name_and_type ${_enh_valid_dir}/spk${spk}.scp,speech_ref${spk},${_type} "
_valid_shape_param+="--valid_shape_file ${enh_stats_dir}/valid/speech_ref${spk}_shape "
_fold_length_param+="--fold_length ${_fold_length} "
Expand Down Expand Up @@ -639,7 +645,6 @@ if ! "${skip_train}"; then
--init_file_prefix "${enh_exp}"/.dist_init_ \
--multiprocessing_distributed true -- \
${python} -m espnet2.bin.enh_train \
${use_preprocessor:+--use_preprocessor $use_preprocessor} \
${_train_data_param} \
${_valid_data_param} \
${_train_shape_param} \
Expand Down Expand Up @@ -713,7 +718,7 @@ if ! "${skip_eval}"; then


_spk_list=" "
for i in $(seq ${spk_num}); do
for i in $(seq ${inf_num}); do
_spk_list+="spk${i} "
done

Expand Down Expand Up @@ -765,18 +770,26 @@ if ! "${skip_eval}"; then


_ref_scp=
for spk in $(seq "${spk_num}"); do
for spk in $(seq "${ref_num}"); do
_ref_scp+="--ref_scp ${_data}/spk${spk}.scp "
done
_inf_scp=
for spk in $(seq "${spk_num}"); do
if "${score_obs}"; then
if "${score_obs}"; then
for spk in $(seq "${ref_num}"); do
# To compute the score of observation, input original wav.scp
_inf_scp+="--inf_scp ${data_feats}/${dset}/wav.scp "
else
done
flexible_numspk=false
else
for spk in $(seq "${inf_num}"); do
_inf_scp+="--inf_scp ${enh_exp}/${inference_tag}_${dset}/spk${spk}.scp "
done
if [[ "${ref_num}" -ne "${inf_num}" ]]; then
flexible_numspk=true
else
flexible_numspk=false
fi
done
fi

# 2. Submit scoring jobs
log "Scoring started... log: '${_logdir}/enh_scoring.*.log'"
Expand All @@ -787,9 +800,10 @@ if ! "${skip_eval}"; then
--output_dir "${_logdir}"/output.JOB \
${_ref_scp} \
${_inf_scp} \
--ref_channel ${ref_channel}
--ref_channel ${ref_channel} \
--flexible_numspk ${flexible_numspk}

for spk in $(seq "${spk_num}"); do
for spk in $(seq "${ref_num}"); do
for protocol in ${scoring_protocol} wav; do
for i in $(seq "${_nj}"); do
cat "${_logdir}/output.${i}/${protocol}_spk${spk}"
Expand All @@ -800,7 +814,7 @@ if ! "${skip_eval}"; then

for protocol in ${scoring_protocol}; do
# shellcheck disable=SC2046
paste $(for j in $(seq ${spk_num}); do echo "${_dir}"/"${protocol}"_spk"${j}" ; done) |
paste $(for j in $(seq ${ref_num}); do echo "${_dir}"/"${protocol}"_spk"${j}" ; done) |
awk 'BEGIN{sum=0}
{n=0;score=0;for (i=2; i<=NF; i+=2){n+=1;score+=$i}; sum+=score/n}
END{printf ("%.2f\n",sum/NR)}' > "${_dir}/result_${protocol,,}.txt"
Expand Down Expand Up @@ -857,7 +871,7 @@ if "${score_with_asr}"; then
_dir="${enh_exp}/${inference_asr_tag}/${dset}"
fi

for spk in $(seq "${spk_num}"); do
for spk in $(seq "${ref_num}"); do
_ddir=${_dir}/spk_${spk}
_logdir="${_ddir}/logdir"
_decode_dir="${_ddir}/decode"
Expand Down Expand Up @@ -953,7 +967,7 @@ if "${score_with_asr}"; then
_dir="${enh_exp}/${inference_asr_tag}/${dset}/"
fi

for spk in $(seq "${spk_num}"); do
for spk in $(seq "${ref_num}"); do
_ddir=${_dir}/spk_${spk}
_logdir="${_ddir}/logdir"
_decode_dir="${_ddir}/decode"
Expand Down
8 changes: 6 additions & 2 deletions egs2/mini_an4/enh1/conf/train_with_dynamic_mixing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@ separator_conf:
# The maximum random gain (in dB) for each source before the mixing.
# The gain (in dB) of each source is uniformly sampled in
# [-dynamic_mixing_gain_db, dynamic_mixing_gain_db]
dynamic_mixing: True
dynamic_mixing_gain_db: 2.0
preprocessor: dynamic_mixing
preprocessor_conf:
num_utts: 1
dynamic_mixing_gain_db: 2.0
source_scp_name: "spk1.scp"
mixture_source_name: "speech_mix"

criterions:
# The first criterion
Expand Down
2 changes: 1 addition & 1 deletion egs2/mini_an4/enh1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set -o pipefail
./enh.sh \
--fs 16k \
--lang en \
--spk-num 1 \
--ref-num 1 \
--train_set train_nodev \
--valid_set train_dev \
--test_sets "train_dev test" \
Expand Down
110 changes: 110 additions & 0 deletions egs2/wsj0_2mix/mixit_enh1/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#   --time <time>: Limit the maximum time to execute.
#   --mem <mem>: Limit the maximum memory usage.
#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
#   --num-threads <ngpu>: Specify the number of CPU cores.
#   --gpu <ngpu>: Specify the number of GPU devices.
#   --config: Change the configuration file from the default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N> (the N-th job) in the command
# and in the log file name, e.g. "echo JOB" is changed to "echo 3" for the 3rd job
# and to "echo 8" for the 8th job, respectively.
# Note that the numbers must start from a positive value, so "JOB=0:10" is invalid.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not
# depend on the backend. The options are mapped to backend-specific options,
# configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "stdout", "sge", "pbs", "slurm", or "ssh".
cmd_backend='local'

case "${cmd_backend}" in
    local)
        # Local machine, without any job scheduling system.

        # General usage
        export train_cmd="run.pl"
        # Used for "*_train.py": "--gpu" is appended optionally by run.sh
        export cuda_cmd="run.pl"
        # Used for "*_recog.py"
        export decode_cmd="run.pl"
        ;;

    stdout)
        # Local machine, logging to stdout and a log file, without any job scheduling system.

        # General usage
        export train_cmd="stdout.pl"
        # Used for "*_train.py": "--gpu" is appended optionally by run.sh
        export cuda_cmd="stdout.pl"
        # Used for "*_recog.py"
        export decode_cmd="stdout.pl"
        ;;

    sge)
        # "qsub" (Sun Grid Engine, or a derivative of it).
        # The default setting is written in conf/queue.conf.
        # You must change "-q g.q" to the "queue" of your environment.
        # To list the "queue" names, type "qhost -q".
        # Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.

        export train_cmd="queue.pl"
        export cuda_cmd="queue.pl"
        export decode_cmd="queue.pl"
        ;;

    pbs)
        # "qsub" (Torque/PBS).
        # The default setting is written in conf/pbs.conf.

        export train_cmd="pbs.pl"
        export cuda_cmd="pbs.pl"
        export decode_cmd="pbs.pl"
        ;;

    slurm)
        # "sbatch" (Slurm).
        # The default setting is written in conf/slurm.conf.
        # You must change "-p cpu" and "-p gpu" to the "partition" names of your environment.
        # To list the "partition" names, type "sinfo".
        # You can use "--gpu *" by default for slurm; it is interpreted as "--gres gpu:*".
        # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

        export train_cmd="slurm.pl"
        export cuda_cmd="slurm.pl"
        export decode_cmd="slurm.pl"
        ;;

    ssh)
        # You have to create ".queue/machines" to list the hosts on which to execute jobs,
        # e.g. .queue/machines
        #   host1
        #   host2
        #   host3
        # It is assumed you can log in to them without a password, i.e. you have to set up ssh keys.

        export train_cmd="ssh.pl"
        export cuda_cmd="ssh.pl"
        export decode_cmd="ssh.pl"
        ;;

    jhu)
        # This is an example of specifying several unique options in the JHU CLSP cluster setup.
        # Users can modify/add their own command options according to their cluster environments.

        export train_cmd="queue.pl --mem 2G"
        export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
        export decode_cmd="queue.pl --mem 4G"
        ;;

    *)
        echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
        return 1
        ;;
esac
11 changes: 11 additions & 0 deletions egs2/wsj0_2mix/mixit_enh1/conf/pbs.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Default configuration for Torque/PBS, read by pbs.pl.
# Syntax: "command <submit command>" gives the base qsub invocation;
# "option <name>=<pattern> <flags>" maps a generic option (e.g. --mem 4G)
# to backend-specific qsub flags, with $0 replaced by the matched value.
command qsub -V -v PATH -S /bin/bash
# Map the job name to qsub's -N flag.
option name=* -N $0
# Memory limit.
option mem=* -l mem=$0
option mem=0 # Do not add anything to qsub_opts
# Number of CPU cores per job.
option num_threads=* -l ncpus=$0
option num_threads=1 # Do not add anything to qsub_opts
# Multi-node jobs: one process per node.
option num_nodes=* -l nodes=$0:ppn=1
# GPU requests; no flag is added when gpu=0.
default gpu=0
option gpu=0
option gpu=* -l ngpus=$0
12 changes: 12 additions & 0 deletions egs2/wsj0_2mix/mixit_enh1/conf/queue.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Default configuration for Sun Grid Engine (or derivatives), read by queue.pl.
# Syntax: "command <submit command>" gives the base qsub invocation;
# "option <name>=<pattern> <flags>" maps a generic option (e.g. --mem 4G)
# to backend-specific qsub flags, with $0 replaced by the matched value.
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
# Map the job name to qsub's -N flag.
option name=* -N $0
# Memory limit.
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
# Number of CPU cores per job (parallel environment "smp").
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
# Cap on the number of concurrently running array tasks.
option max_jobs_run=* -tc $0
option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
# GPU requests; no flag is added when gpu=0.
default gpu=0
option gpu=0
# NOTE(review): "-q g.q" must be changed to the GPU queue name of your site.
option gpu=* -l gpu=$0 -q g.q

0 comments on commit 5448639

Please sign in to comment.