Commit: Merge branch 'master' into update_vivos_url

ftshijt committed Sep 23, 2022
2 parents 91c743b + ef6c15f commit 4c1ad85
Showing 64 changed files with 1,143 additions and 410 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -176,8 +176,10 @@ Demonstration
- En / Jp / Zn / Nl / And more...
- Supports using context from previous utterances
- Supports using other tasks like SE in pipeline manner
- Supports Two Pass SLU that combines audio and ASR transcript
Demonstration
- Performing noisy spoken language understanding with a speech enhancement model followed by a spoken language understanding model (see the pipeline sketch after this list). [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14nCrJ05vJcQX0cJuXjbMVFWUHJ3Wfb6N?usp=sharing)
- Performing two-pass spoken language understanding, where the second-pass model attends to both acoustic and semantic information. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1p2cbGIPpIIcynuDl4ZVHDpmNPl8Nh_ci?usp=sharing)
- Integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See SLU demo on multiple languages: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Siddhant/ESPnet2-SLU)
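
A rough sketch of the enhancement-then-SLU pipeline from the first demo above, using the usual `espnet_model_zoo` download-and-unpack pattern — the model IDs are placeholders, and the linked Colab notebooks remain the authoritative walk-through:

```python
# Sketch of the SE -> SLU pipeline (model IDs are placeholders, not the
# checkpoints used in the Colab demos).
import soundfile
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.enh_inference import SeparateSpeech
from espnet2.bin.asr_inference import Speech2Text

d = ModelDownloader()
enh = SeparateSpeech(**d.download_and_unpack("espnet/<enh-model-id>"))
slu = Speech2Text(**d.download_and_unpack("espnet/<slu-model-id>"))

speech, rate = soundfile.read("noisy_utterance.wav")
enhanced = enh(speech[None, :], fs=rate)[0].squeeze()  # enhance first ...
text, *_ = slu(enhanced)[0]                            # ... then decode
print(text)  # in ESPnet SLU recipes the intent label is part of the decoded text
```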


2 changes: 1 addition & 1 deletion ci/install.sh
@@ -21,7 +21,7 @@ ${CXX:-g++} -v
. ./activate_python.sh
make TH_VERSION="${TH_VERSION}"

make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done
make warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done
rm -rf kaldi
)
. tools/activate_python.sh
5 changes: 2 additions & 3 deletions doc/faq.md
@@ -5,7 +5,7 @@ Our documentation, [Installation](./installation.md), assumes that some basic tools are already installed,
so you need to install them as well if you don't have them. They are undocumented, but the configuration of our CI may help, because it also builds the environment from scratch: https://github.com/espnet/espnet/blob/master/.circleci/config.yml


## ModuleNotFoundError: No module named 'espnet', 'warpctc_pytorch', etc.
## ModuleNotFoundError: No module named 'espnet', etc.

First, you have most likely missed some installation steps. Please read [Installation](./installation.md) again before posting an issue. If you still have a problem, then please try manual installation.

Expand All @@ -15,7 +15,7 @@ pip install <some-tools>
conda install <some-tools>
```

If you need to install some packages not distributed on PyPI, e.g. `warp_transducer`, try the installer scripts in espnet.
If you need to install some packages not distributed on PyPI, e.g. `k2`, try the installer scripts in espnet.

```
cd tools
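make k2.done  # one example target (an assumption); <tool>.done names follow tools/Makefile, cf. ci/install.sh above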
@@ -33,5 +33,4 @@ cd tools
```bash
$ python
>>> import espnet
>>> import warpctc_pytorch # If you'll use warpctc
```
@@ -71,7 +71,7 @@ scheduler_conf:

# minibatch related
batch_type: folded
batch_bins: 64
batch_size: 64

best_model_criterion:
- - valid
@@ -62,7 +62,7 @@ scheduler_conf:

# minibatch related
batch_type: folded
batch_bins: 64
batch_size: 64

best_model_criterion:
- - valid
2 changes: 1 addition & 1 deletion egs2/fsc_challenge/asr1/run.sh
@@ -7,7 +7,7 @@ set -o pipefail

train_set="train"
valid_set="valid"
test_sets="test valid"
test_sets="utt_test spk_test valid"

if python3 -c 'import torch as t; from packaging.version import parse as L; assert L(t.__version__) >= L("1.7.0")' &> /dev/null; then
asr_config=conf/train_asr.yaml
49 changes: 49 additions & 0 deletions egs2/fsc_challenge/slu1/README.md
@@ -0,0 +1,49 @@
<!-- Generated by ./scripts/utils/show_asr_result.sh -->
# RESULTS

## Environments
- date: `Sun Oct 3 22:25:25 EDT 2021`
- python version: `3.8.11 (default, Aug 3 2021, 15:09:35) [GCC 7.5.0]`
- espnet version: `espnet 0.10.3a3`
- pytorch version: `pytorch 1.9.0+cu102`
- Git hash: `a1a55e1eef2a74d2b8580d8071ce5229e7fa654c`
- Commit date: `Mon Nov 8 23:56:06 2021 -0500`

## Using Transformer-based deliberation encoder with ground-truth transcript
- ASR config: [conf/tuning/train_asr_hubert_transformer_adam_specaug_deliberation_transformer_gt.yaml](conf/tuning/train_asr_hubert_transformer_adam_specaug_deliberation_transformer_gt.yaml)
- token_type: word
- local_data_opts: "--gt true"
- Pretrained Model
- Hugging Face : https://huggingface.co/espnet/fsc_challenge_slu_2pass_transformer_gt

|dataset|Snt|Intent Classification (%)|
|---|---|---|
|inference_asr_model_valid.acc.ave_5best/spk_test|3366|100.0|
|inference_asr_model_valid.acc.ave_5best/utt_test|3970|87.6|
|inference_asr_model_valid.acc.ave_5best/valid|2624|100.0|

## Using Transformer-based deliberation encoder with ASR transcript
- ASR config: [conf/train_asr.yaml](conf/train_asr.yaml)
- token_type: word
- local_data_opts: "--gt false"
- Pretrained Model
- Hugging Face : https://huggingface.co/espnet/fsc_challenge_slu_2pass_transformer

|dataset|Snt|Intent Classification (%)|
|---|---|---|
|inference_asr_model_valid.acc.ave_5best/spk_test|3366|98.1|
|inference_asr_model_valid.acc.ave_5best/utt_test|3970|82.3|
|inference_asr_model_valid.acc.ave_5best/valid|2624|98.6|

## Using Conformer-based deliberation encoder with ASR transcript
- ASR config: [conf/tuning/train_asr_hubert_transformer_adam_specaug_deliberation.yaml](conf/tuning/train_asr_hubert_transformer_adam_specaug_deliberation.yaml)
- token_type: word
- local_data_opts: "--gt false"
- Pretrained Model
- Hugging Face : https://huggingface.co/espnet/fsc_challenge_slu_2pass_conformer

|dataset|Snt|Intent Classification (%)|
|---|---|---|
|inference_asr_model_valid.acc.ave_5best/spk_test|3366|97.5|
|inference_asr_model_valid.acc.ave_5best/utt_test|3970|81.9|
|inference_asr_model_valid.acc.ave_5best/valid|2624|98.2|
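
A minimal inference sketch for the pretrained models listed above — untested, assuming the standard `espnet_model_zoo` pattern and 16 kHz mono input; any extra decode-time inputs the two-pass models may need are not covered here:

```python
import soundfile
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text

d = ModelDownloader()
# Model ID taken from the "Transformer ... with ASR transcript" section above.
slu = Speech2Text(**d.download_and_unpack("espnet/fsc_challenge_slu_2pass_transformer"))

speech, rate = soundfile.read("utt.wav")  # 16 kHz mono, cf. conf/fbank.conf
text, *_ = slu(speech)[0]
print(text)  # token_type is word; the intent label appears in the decoded text
```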
110 changes: 110 additions & 0 deletions egs2/fsc_challenge/slu1/cmd.sh
@@ -0,0 +1,110 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The string to the left of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and in the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and to "echo 8" for the 8th job, respectively.
# Note that the index must start from a positive number, so you can't use "JOB=0:10", for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
# The options above are mapped to backend-specific options; the mapping is
# configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
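#
# For example, with cmd_backend=slurm and the conf/slurm.conf added in this
# commit, a call like (the log path here is hypothetical)
#   slurm.pl --mem 4G --gpu 1 JOB=1:4 exp/log/decode.JOB.log <command>
# is submitted roughly as "sbatch --mem-per-cpu 4G -p gpu --gres=gpu:1 -c 1 ..."
# following the option mappings in that file.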


# Select the backend used by run.sh from "local", "stdout", "sge", "pbs", "slurm", "ssh", or "jhu"
cmd_backend='local'

# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then

# Used for any other jobs
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"

# Local machine logging to stdout and log file, without any Job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

# Used for any other jobs
export train_cmd="stdout.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="stdout.pl"
# Used for "*_recog.py"
export decode_cmd="stdout.pl"


# "qsub" (Sun Grid Engine, or derivation of it)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" for the "queue" for your environment.
# To know the "queue" names, type "qhost -q"
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.

export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"


# "qsub" (Torque/PBS.)
elif [ "${cmd_backend}" = pbs ]; then
# The default setting is written in conf/pbs.conf.

export train_cmd="pbs.pl"
export cuda_cmd="pbs.pl"
export decode_cmd="pbs.pl"


# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the hosts on which to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# Assuming you can log in to them without a password, i.e. you have to set up ssh keys.

export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"

# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then

export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
export decode_cmd="queue.pl --mem 4G"

else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
2 changes: 2 additions & 0 deletions egs2/fsc_challenge/slu1/conf/fbank.conf
@@ -0,0 +1,2 @@
--sample-frequency=16000
--num-mel-bins=80
11 changes: 11 additions & 0 deletions egs2/fsc_challenge/slu1/conf/pbs.conf
@@ -0,0 +1,11 @@
# Default configuration
command qsub -V -v PATH -S /bin/bash
option name=* -N $0
option mem=* -l mem=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l ncpus=$0
option num_threads=1 # Do not add anything to qsub_opts
option num_nodes=* -l nodes=$0:ppn=1
default gpu=0
option gpu=0
option gpu=* -l ngpus=$0
1 change: 1 addition & 0 deletions egs2/fsc_challenge/slu1/conf/pitch.conf
@@ -0,0 +1 @@
--sample-frequency=16000
12 changes: 12 additions & 0 deletions egs2/fsc_challenge/slu1/conf/queue.conf
@@ -0,0 +1,12 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option name=* -N $0
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q
14 changes: 14 additions & 0 deletions egs2/fsc_challenge/slu1/conf/slurm.conf
@@ -0,0 +1,14 @@
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0 -c $0 # Recommended: allocate at least as many CPUs as GPUs
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
97 changes: 97 additions & 0 deletions egs2/fsc_challenge/slu1/conf/train_asr.yaml
@@ -0,0 +1,97 @@
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder architecture type
normalize_before: true

deliberationencoder: conformer
deliberationencoder_conf:
output_size: 256
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: linear
normalize_before: true
macaron_style: true
pos_enc_layer_type: "rel_pos"
selfattention_layer_type: "rel_selfattn"
activation_type: "swish"
use_cnn_module: true
cnn_module_kernel: 31
# decoder related

postdecoder: hugging_face_transformers
postdecoder_conf:
# pick up a model from https://huggingface.co/models?filter=transformers
# most models should work, but some may not
# known to work: bert, gpt2, xlnet, roberta, mpnet, t5, bart
# xlnet currently works for single gpu only
model_name_or_path: "bert-base-cased"
output_size: 256

decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
optim: adam
optim_conf:
lr: 0.0002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
max_epoch: 80
keep_nbest_models: 5

frontend_conf:
n_fft: 512
hop_length: 256

frontend: s3prl
frontend_conf:
frontend_conf:
upstream: hubert_large_ll60k # Note: If the upstream is changed, please change the input_size in the preencoder.
download_dir: ./hub
multilayer_feature: True

preencoder: linear
preencoder_conf:
input_size: 1024 # Note: If the upstream is changed, please change this value accordingly.
output_size: 80

freeze_param: [
"encoder",
"postdecoder.model",
"frontend.upstream"
]

specaug: specaug
specaug_conf:
apply_time_warp: true
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
num_freq_mask: 2
apply_time_mask: true
time_mask_width_range:
- 0
- 40
num_time_mask: 2
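
The preencoder's `input_size` has to track the S3PRL upstream's feature width, as the comments in this config note. A small sanity-check sketch — the 768/1024 hidden sizes are assumptions about the S3PRL upstreams, not something this config pins down:

```python
# Check that preencoder input_size matches the S3PRL upstream's hidden size.
import yaml

with open("conf/train_asr.yaml") as f:  # run from egs2/fsc_challenge/slu1
    cfg = yaml.safe_load(f)  # PyYAML keeps the last duplicate key, so the
                             # second frontend_conf (the s3prl one) wins here

upstream = cfg["frontend_conf"]["frontend_conf"]["upstream"]
hidden = {"hubert_base": 768, "hubert_large_ll60k": 1024}  # assumed S3PRL sizes
assert cfg["preencoder_conf"]["input_size"] == hidden[upstream]
```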
