ESPnet-Spk part 3 - inference every epoch using EER #5314

Merged: 41 commits, Jul 22, 2023

Commits
e884009
add epoch wise inference using EER
Jungjee Jul 20, 2023
363b844
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 20, 2023
f8dc909
Merge branch 'master' into speaker2
Jungjee Jul 20, 2023
e5c4a6c
make dummy test trial for ci
Jungjee Jul 20, 2023
c1b9937
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 20, 2023
91a777f
make test_python work
Jungjee Jul 20, 2023
6316eed
Merge branch 'speaker2' of https://github.com/Jungjee/espnet into spe…
Jungjee Jul 20, 2023
d4238e4
Merge branch 'master' into speaker2
sw005320 Jul 20, 2023
447eb83
make test_integration2 work
Jungjee Jul 20, 2023
1833acb
Merge branch 'speaker2' of https://github.com/Jungjee/espnet into spe…
Jungjee Jul 20, 2023
04bb321
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 20, 2023
f0bacaa
Merge branch 'master' into speaker2
Jungjee Jul 21, 2023
e91c0e3
Add data augmentation for speaker taks
Emrys365 Jul 21, 2023
5586713
Merge branch 'speaker2' of github.com:Jungjee/espnet into speaker2
Emrys365 Jul 21, 2023
3708b6f
revoke undoed patch
Jungjee Jul 21, 2023
9cbf911
revoke undoed commit, make no da work
Jungjee Jul 21, 2023
0767537
undo revoked edits
Jungjee Jul 21, 2023
640f02d
Support mixing with variable number of noises
Emrys365 Jul 21, 2023
0cf57da
Merge branch 'speaker2' of https://github.com/Jungjee/espnet into spe…
Jungjee Jul 21, 2023
deebddf
fix minor bug in da
Jungjee Jul 21, 2023
323aceb
add arg
Jungjee Jul 21, 2023
a714aa4
add making scp files for DA
Jungjee Jul 21, 2023
1751ecd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 21, 2023
779cf76
reflects comments
Jungjee Jul 21, 2023
5672580
make symlink conf
Jungjee Jul 21, 2023
c51c773
add comment for speech2
Jungjee Jul 21, 2023
920b51c
Merge branch 'master' into speaker2
Jungjee Jul 21, 2023
dd51257
Add unit tests
Emrys365 Jul 21, 2023
b1088c1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 21, 2023
37f4ab4
Fix a bug in SPK preprocessor
Emrys365 Jul 21, 2023
19a09e0
Merge branch 'master' into speaker2
Jungjee Jul 21, 2023
ba66a76
fix minor bug, make ci model smaller
Jungjee Jul 22, 2023
9470be5
Update espnet2/spk/loss/aamsoftmax.py
Jungjee Jul 22, 2023
19d5d81
Update espnet2/train/spk_trainer.py
Jungjee Jul 22, 2023
267bfb4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 22, 2023
e5297a1
reflect @Emrys365 comments
Jungjee Jul 22, 2023
caa2a73
Merge branch 'master' into speaker2
Jungjee Jul 22, 2023
12fea27
reflect local changes
Jungjee Jul 22, 2023
9790e6d
Merge branch 'speaker2' of https://github.com/Jungjee/espnet into spe…
Jungjee Jul 22, 2023
9aaad7c
Merge branch 'master' into speaker2
Jungjee Jul 22, 2023
249bf6e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 22, 2023
169 changes: 89 additions & 80 deletions README.md

Large diffs are not rendered by default.

22 changes: 20 additions & 2 deletions doc/espnet2_tutorial.md
@@ -415,7 +415,7 @@ Latency: 52581.004 [ms/sentence]

## Transducer ASR

- > ***Important***: If you encounter any issue related to Transducer loss, please open an issue in [our fork of warp-transducer](https://github.com/b-flo/warp-transducer).
+ > ***Important***: If you encounter any issue related to `warp-transducer`, please open an issue in [our forked repo](https://github.com/b-flo/warp-transducer).

ESPnet2 supports models trained with the (RNN-)Transducer loss, a.k.a. Transducer models. Currently, two versions of these models exist within ESPnet2: one under `asr` and the other under `asr_transducer`. The first is designed as a supplement to the CTC-attention ASR models, while the second is designed independently and purely for the Transducer task. For that, we rely on `ESPnetASRTransducerModel` instead of `ESPnetASRModel`, and a new task called `ASRTransducerTask` is used in place of `ASRTask`.

@@ -431,13 +431,31 @@ To enable Transducer model training or decoding in your experiments, the followi
asr.sh --asr_task asr_transducer [...]
```

- For Transducer loss computation during training, we rely on a fork of `warp-transducer`. The installation procedure is described [here](https://espnet.github.io/espnet/installation.html#step-3-optional-custom-tool-installation).
+ For Transducer loss computation during training, we rely by default on a fork of `warp-transducer`. The installation procedure is described [here](https://espnet.github.io/espnet/installation.html#step-3-optional-custom-tool-installation).

**Note:** We make FastEmit regularization [[Yu et al., 2021]](https://arxiv.org/pdf/2010.11148) available during loss computation. To enable it, `fastemit_lambda` needs to be set in `model_conf`:

    model_conf:
        fastemit_lambda: Regularization parameter for FastEmit. (float, default = 0.0)
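
A minimal sketch of enabling it in a training config (the value is illustrative, not a tuned recommendation):

```yaml
model_conf:
    fastemit_lambda: 0.004   # illustrative; the default 0.0 disables FastEmit
```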

Optionally, we also support training with the pruned RNN-T loss [[Kuang et al., 2022]](https://arxiv.org/pdf/2206.13236.pdf) made available in the [k2](https://github.com/k2-fsa/k2) toolkit. To use it, set the parameter `use_k2_pruned_loss` to `True` in `model_conf`. The loss computation can then be controlled by setting the following parameters through `k2_pruned_loss_args` in `model_conf`:

    model_conf:
        use_k2_pruned_loss: True
        k2_pruned_loss_args:
            prune_range: How many tokens per frame are used to compute the pruned loss. (int, default = 5)
            simple_loss_scaling: The weight to scale the simple loss after warm-up. (float, default = 0.5)
            lm_scale: The scale factor to smooth the LM part. (float, default = 0.0)
            am_scale: The scale factor to smooth the AM part. (float, default = 0.0)
            loss_type: Define the type of path to take for loss computation, either 'regular', 'smoothed' or 'constrained'. (str, default = "regular")

**Note:** Because the number of tokens emitted per timestep can be restricted during training with this version, we also provide the parameter `validation_nstep`. It lets users apply a similar constraint during validation when reporting CER and/or WER:

    model_conf:
        validation_nstep: Maximum number of symbol expansions at each time step when reporting CER and/or WER using mAES.

For more information, see the Inference section and the "modified Adaptive Expansion Search" (mAES) algorithm.
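
Taken together, a sketch of a `model_conf` using the k2 pruned loss (parameter values are the documented defaults; `validation_nstep: 2` is an illustrative choice, not a recommendation):

```yaml
model_conf:
    use_k2_pruned_loss: True
    k2_pruned_loss_args:
        prune_range: 5
        simple_loss_scaling: 0.5
        lm_scale: 0.0
        am_scale: 0.0
        loss_type: regular
    validation_nstep: 2   # caps symbol expansions per time step when scoring with mAES
```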

### Architecture

The architecture is composed of three modules: encoder, decoder, and joint network. Each module has one (or three) configuration blocks with various parameters to configure its internal parts. The following sections describe the mandatory and optional parameters for each module.
4 changes: 2 additions & 2 deletions doc/installation.md
@@ -150,7 +150,7 @@ We also have [prebuilt Kaldi binaries](https://github.com/espnet/espnet/blob/mas

```sh
$ cd <espnet-root>/tools
- $ CONDA_ROOT=${CONDA_EXE}/../.. # CONDA_EXE is an environment variable set by ${CONDA_ROOT}/etc/profile.d/conda.sh
+ $ CONDA_ROOT=${CONDA_PREFIX}/../.. # CONDA_PREFIX is an environment variable set by ${CONDA_ROOT}/etc/profile.d/conda.sh
$ ./setup_anaconda.sh ${CONDA_ROOT} [conda-env-name] [python-version]
# e.g.
$ ./setup_anaconda.sh ${CONDA_ROOT} espnet 3.8
@@ -218,7 +218,7 @@ e.g.
```sh
cd <espnet-root>/tools
cuda_root=<cuda-root> # e.g. <cuda-root> = /usr/local/cuda
- bach -c ". activate_python.sh; . ./setup_cuda_env.sh $cuda_root; ./installers/install_warp-transducer.sh"
+ bash -c ". activate_python.sh; . ./setup_cuda_env.sh $cuda_root; ./installers/install_warp-transducer.sh"
```
- To install PyOpenJTalk
```sh
68 changes: 61 additions & 7 deletions egs2/TEMPLATE/spk1/spk.sh
@@ -181,21 +181,75 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
_dsets="${test_sets}"
fi
else
- _dsets="${train_set} ${valid_set} ${test_sets}"
+ _dsets="${valid_set} ${test_sets}"
fi

if [ "${feats_type}" = raw ]; then
log "Stage 2: Format wav.scp: data/ -> ${data_feats}"
if [ "${skip_train}" = false ]; then
utils/copy_data_dir.sh --validate_opts --non-print data/"${train_set}" "${data_feats}/${train_set}"

# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
"data/${train_set}/wav.scp" "${data_feats}/${train_set}"

echo "${feats_type}" > "${data_feats}/${train_set}/feats_type"
if "${multi_columns_output_wav_scp}"; then
echo "multi_${audio_format}" > "${data_feats}/${train_set}/audio_format"
else
echo "${audio_format}" > "${data_feats}/${train_set}/audio_format"
fi
fi

# Calculate EER for valid/test since speaker verification is an open set problem
# Train can be either multi-column data or not, but valid/test always require multi-column trial
for dset in ${_dsets}; do
utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}/${dset}"
echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
cp data/${dset}/trial.scp "${data_feats}/${dset}"
cp data/${dset}/trial2.scp "${data_feats}/${dset}"
cp data/${dset}/trial_label "${data_feats}/${dset}"

# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
"data/${dset}/wav.scp" "${data_feats}/${dset}"
--out_filename trial.scp \
"data/${dset}/trial.scp" "${data_feats}/${dset}"
# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
--out_filename trial2.scp \
"data/${dset}/trial2.scp" "${data_feats}/${dset}"

echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
echo "multi_${audio_format}" > "${data_feats}/${dset}/audio_format"

done
elif [ "${feats_type}" = raw_copy ]; then
if [ "${skip_train}" = false ]; then
utils/copy_data_dir.sh --validate_opts --non-print data/"${train_set}" "${data_feats}/${train_set}"

echo "${feats_type}" > "${data_feats}/${train_set}/feats_type"
if "${multi_columns_output_wav_scp}"; then
echo "multi_${audio_format}" > "${data_feats}/${train_set}/audio_format"
else
echo "${audio_format}" > "${data_feats}/${train_set}/audio_format"
fi
fi

# Calculate EER for valid/test since speaker verification is an open set problem
# Train can be either multi-column data or not, but valid/test always require multi-column trial
for dset in ${_dsets}; do
utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}/${dset}"
cp data/${dset}/trial_label "${data_feats}/${dset}"

echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
echo "multi_${audio_format}" > "${data_feats}/${dset}/audio_format"

done
else
@@ -252,14 +306,13 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# shellcheck disable=SC2046,SC2086
${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
${python} -m espnet2.bin.spk_train \
- --use_preprocessor true \
+ --use_preprocessor false \
--collect_stats true \
--train_data_path_and_name_and_type ${_spk_train_dir}/wav.scp,speech,${_type} \
--train_data_path_and_name_and_type ${_spk_train_dir}/utt2spk,spk_labels,text \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/trial.scp,speech,${_type} \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/trial_label,spk_labels,text \
--train_shape_file "${_logdir}/train.JOB.scp" \
--valid_shape_file "${_logdir}/valid.JOB.scp" \
- --spk2utt ${_spk_train_dir}/spk2utt \
--output_dir "${_logdir}/stats.JOB" \
${_opts} ${spk_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; }

@@ -271,6 +324,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# shellcheck disable=SC2086
${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --skip_sum_stats --output_dir "${spk_stats_dir}"

# the valid set supplies two utterances per trial (trial.scp and trial2.scp), so
# the aggregated shape file is duplicated for the second speech input
cp ${spk_stats_dir}/valid/speech_shape ${spk_stats_dir}/valid/speech_shape2
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
7 changes: 4 additions & 3 deletions egs2/mini_an4/spk1/conf/train_mini_RawNet3.yaml
@@ -15,15 +15,15 @@ projector_conf:
    input_size: 192
    output_size: 16

preprocessor: spk
preprocessor_conf:
    utt2spk: dump/raw/train_nodev/utt2spk
    spk2utt: dump/raw/train_nodev/spk2utt
    target_duration: 3.0
    sr: 16000
    num_eval: 1

model_conf:
    extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
    nOut: 16
@@ -32,3 +32,4 @@ loss_conf:
    scale: 15

optim: adam
num_att_plot: 0
7 changes: 6 additions & 1 deletion egs2/mini_an4/spk1/local/data.sh
@@ -58,7 +58,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
done

# make a dev set
- utils/subset_data_dir.sh --first data/train 1 data/${train_dev}
+ utils/subset_data_dir.sh --first data/train 2 data/${train_dev}
n=$(($(wc -l < data/train/text) - 1))
utils/subset_data_dir.sh --last data/train ${n} data/${train_set}

@@ -78,6 +78,11 @@ EOF
awk '{print $1 " 1ch_16k"}' data/${x}/wav.scp > data/${x}/utt2category
done

# for spk task validation
for x in test test_seg ${train_set} ${train_dev}; do
python local/make_trial.py data/${x}/wav.scp data/${x}
done
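
# note: make_trial.py pairs the first two utterances of each wav.scp, which is
# why the dev subset above was grown from 1 to 2 utterances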

find downloads/noise/ -iname "*.wav" | awk '{print "noise" NR " " $1}' > data/${train_set}/noises.scp
find downloads/rirs/ -iname "*.wav" | awk '{print "rir" NR " " $1}' > data/${train_set}/rirs.scp
fi
14 changes: 14 additions & 0 deletions egs2/mini_an4/spk1/local/make_trial.py
@@ -0,0 +1,14 @@
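# Build a single dummy trial for CI from the first two utterances of a wav.scp.
# Usage: python make_trial.py <wav.scp> <output-dir>
# Writes trial.scp and trial2.scp (the two sides of the trial pair) and
# trial_label (the pair's label, fixed to 0 here).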
import os
import sys

if __name__ == "__main__":
    with open(sys.argv[1]) as f:
        lines = f.readlines()

    joint_key = lines[0].strip().split(" ")[0] + "*" + lines[1].strip().split(" ")[0]
    with open(os.path.join(sys.argv[2], "trial.scp"), "w") as f:
        f.write(joint_key + " " + " ".join(lines[0].strip().split(" ")[1:]) + "\n")
    with open(os.path.join(sys.argv[2], "trial2.scp"), "w") as f:
        f.write(joint_key + " " + " ".join(lines[1].strip().split(" ")[1:]) + "\n")
    with open(os.path.join(sys.argv[2], "trial_label"), "w") as f:
        f.write(joint_key + " 0\n")

Review comment from a collaborator: Maybe it's better to rename trial.scp to trial1.scp to make them more consistent.
8 changes: 4 additions & 4 deletions egs2/ml_superb/asr1/run_multi.sh
@@ -21,7 +21,7 @@ lid=false # whether to add joint LID task in multilingual ASR
inference_config=conf/decode_asr.yaml
asr_config=conf/tuning/train_asr_fbank_${duration}.yaml

- ./utils/parse_options.sh || exit 1
+ . utils/parse_options.sh || exit 1

# Common configs for ML-SUPERB
token_type=char
@@ -41,7 +41,7 @@ train_dev=dev_${duration}${suffix}
test_set="${train_dev} test_${duration}${suffix}"

nlsyms_txt=data/local/nlsyms.txt
asr_tag="$(basename "${asr_config}" .yaml)_${lang}_${duration}"
asr_tag="$(basename "${asr_config}" .yaml)_multilingual_${duration}"

local_data_opts="--duration ${duration} --lid ${lid} --only_lid ${only_lid}"
local_data_opts+=" --multilingual true --nlsyms_txt ${nlsyms_txt}"
@@ -53,7 +53,7 @@ local_data_opts+=" --multilingual true --nlsyms_txt ${nlsyms_txt}"
--nj ${nj} \
--inference_nj ${inference_nj} \
--gpu_inference ${gpu_inference} \
--lang "multilingual" \
--lang "multilingual_${duration}_${suffix}" \
--inference_asr_model valid.loss.ave.pth \
--local_data_opts "${local_data_opts}" \
--nlsyms_txt ${nlsyms_txt} \
@@ -67,5 +67,5 @@ local_data_opts+=" --multilingual true --nlsyms_txt ${nlsyms_txt}"
--valid_set "${train_dev}" \
--test_sets "${test_set}" \
--asr_tag "${asr_tag}" \
- --asr_stats_dir exp/asr_stats_${lang}_${duration} \
+ --asr_stats_dir exp/asr_stats_multilingual_${duration} \
--local_score_opts "${lid} ${only_lid} normal"
62 changes: 62 additions & 0 deletions egs2/voxceleb/spk1/conf/tuning/train_RawNet3_sgdr.yaml
@@ -0,0 +1,62 @@
# RawNet3 reproduce recipe configuration.

frontend: raw

encoder: rawnet3
encoder_conf:
    model_scale: 8
    ndim: 1024
    sinc_stride: 16

pooling: chn_attn_stat
pooling_conf:
    input_size: 1536 # 1.5 * ndim of RawNet3 encoder

projector: rawnet3
projector_conf:
    input_size: 3072 # 2 * input_size of pooling
    output_size: 256

preprocessor: spk
preprocessor_conf:
    utt2spk: dump/raw_copy/voxceleb12_devs/utt2spk
    spk2utt: dump/raw_copy/voxceleb12_devs/spk2utt
    target_duration: 3.0
    sr: 16000
    num_eval: 5

model_conf:
    extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
    nOut: 256
    nClasses: 7205
    margin: 0.2
    scale: 15

max_epoch: 40
#num_iters_per_epoch: 1
num_att_plot: 0
num_workers: 6
cudnn_deterministic: False
cudnn_benchmark: True
batch_size: 32
#batch_type: unsorted
iterator_type: sequence
shuffle_within_batch: True
log_interval: 50
optim: adamw
optim_conf:
    lr: 0.001
    weight_decay: 0.00005
    amsgrad: False

scheduler: CosineAnnealingWarmupRestarts
scheduler_conf:
    first_cycle_steps: 310160
    cycle_mult: 1.0
    max_lr: 0.001
    min_lr: 0.000005
    warmup_steps: 5000
    gamma: 0.8
60 changes: 60 additions & 0 deletions egs2/voxceleb/spk1/conf/tuning/train_RawNet3_sgdr_bs.yaml
@@ -0,0 +1,60 @@
# RawNet3 reproduce recipe configuration.
# Differs from train_RawNet3_sgdr.yaml mainly in the larger batch size (128 vs 32),
# the correspondingly smaller scheduler first_cycle_steps, and the loss margin/scale.

frontend: raw

encoder: rawnet3
encoder_conf:
    model_scale: 8
    ndim: 1024
    sinc_stride: 16

pooling: chn_attn_stat
pooling_conf:
    input_size: 1536 # 1.5 * ndim of RawNet3 encoder

projector: rawnet3
projector_conf:
    input_size: 3072 # 2 * input_size of pooling
    output_size: 256

preprocessor: spk
preprocessor_conf:
    target_duration: 3.0
    sr: 16000
    num_eval: 5

model_conf:
    extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
    nOut: 256
    nClasses: 7205
    margin: 0.3
    scale: 30

max_epoch: 40
#num_iters_per_epoch: 1
num_att_plot: 0
num_workers: 8
cudnn_deterministic: False
cudnn_benchmark: True
batch_size: 128
#batch_type: unsorted
iterator_type: sequence
shuffle_within_batch: True
log_interval: 50
optim: adamw
optim_conf:
    lr: 0.001
    weight_decay: 0.00005
    amsgrad: False

scheduler: CosineAnnealingWarmupRestarts
scheduler_conf:
    first_cycle_steps: 77544
    cycle_mult: 1.0
    max_lr: 0.001
    min_lr: 0.000005
    warmup_steps: 1000
    gamma: 0.75