From a623908c6463436e876c4d6aab90025e71463d14 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Thu, 6 Oct 2022 14:55:09 +0000 Subject: [PATCH 01/15] add zero pad for convolution (ref: icefall) --- espnet2/asr_transducer/encoder/blocks/conformer.py | 2 +- espnet2/asr_transducer/encoder/modules/convolution.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/espnet2/asr_transducer/encoder/blocks/conformer.py b/espnet2/asr_transducer/encoder/blocks/conformer.py index b95a426e6b5..1d687505d3b 100644 --- a/espnet2/asr_transducer/encoder/blocks/conformer.py +++ b/espnet2/asr_transducer/encoder/blocks/conformer.py @@ -123,7 +123,7 @@ def forward( residual = x x = self.norm_conv(x) - x, _ = self.conv_mod(x) + x, _ = self.conv_mod(x, mask=mask) x = residual + self.dropout(x) residual = x diff --git a/espnet2/asr_transducer/encoder/modules/convolution.py b/espnet2/asr_transducer/encoder/modules/convolution.py index 012538a7db9..0e50f33292c 100644 --- a/espnet2/asr_transducer/encoder/modules/convolution.py +++ b/espnet2/asr_transducer/encoder/modules/convolution.py @@ -70,6 +70,7 @@ def forward( self, x: torch.Tensor, cache: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, right_context: int = 0, ) -> Tuple[torch.Tensor, torch.Tensor]: """Compute convolution module. @@ -87,6 +88,9 @@ def forward( x = self.pointwise_conv1(x.transpose(1, 2)) x = torch.nn.functional.glu(x, dim=1) + if mask is not None: + x.masked_fill(mask.unsqueeze(1).expand_as(x), 0.0) + if self.lorder > 0: if cache is None: x = torch.nn.functional.pad(x, (self.lorder, 0), "constant", 0.0) From c4ef9cf7ccc3093bcdb1f6b5bfadd8ec19fdc596 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Thu, 6 Oct 2022 14:55:53 +0000 Subject: [PATCH 02/15] fix aux. 
lm loss, reporter variable and doc --- .../asr_transducer/espnet_transducer_model.py | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/espnet2/asr_transducer/espnet_transducer_model.py b/espnet2/asr_transducer/espnet_transducer_model.py index 3e66605dfe3..345e5a38a32 100644 --- a/espnet2/asr_transducer/espnet_transducer_model.py +++ b/espnet2/asr_transducer/espnet_transducer_model.py @@ -69,7 +69,7 @@ def __init__( auxiliary_ctc_weight: float = 0.0, auxiliary_ctc_dropout_rate: float = 0.0, auxiliary_lm_loss_weight: float = 0.0, - auxiliary_lm_loss_smoothing: float = 0.0, + auxiliary_lm_loss_smoothing: float = 0.05, ignore_id: int = -1, sym_space: str = "", sym_blank: str = "", @@ -82,7 +82,10 @@ def __init__( assert check_argument_types() - # The following labels ID are reserved: 0 (blank) and vocab_size - 1 (sos/eos) + # The following labels ID are reserved: + # - 0: Blank symbol. + # - 1: Unknown symbol. + # - vocab_size - 1: SOS/EOS symbol. self.vocab_size = vocab_size self.ignore_id = ignore_id self.token_list = token_list.copy() @@ -110,7 +113,11 @@ def __init__( if self.use_auxiliary_lm_loss: self.lm_lin = torch.nn.Linear(decoder.output_size, vocab_size) - self.lm_loss_smoothing = auxiliary_lm_loss_smoothing + + eps = auxiliary_lm_loss_smoothing / (vocab_size - 1) + + self.lm_loss_smooth_neg = eps + self.lm_loss_smooth_pos = (1 - auxiliary_lm_loss_smoothing) + eps self.transducer_weight = transducer_weight self.fastemit_lambda = fastemit_lambda @@ -207,8 +214,8 @@ def forward( stats = dict( loss=loss.detach(), loss_transducer=loss_trans.detach(), - aux_ctc_loss=loss_ctc.detach() if loss_ctc > 0.0 else None, - aux_lm_loss=loss_lm.detach() if loss_lm > 0.0 else None, + loss_aux_ctc=loss_ctc.detach() if loss_ctc > 0.0 else None, + loss_aux_lm=loss_lm.detach() if loss_lm > 0.0 else None, cer_transducer=cer_trans, wer_transducer=wer_trans, ) @@ -432,7 +439,7 @@ def _calc_lm_loss( decoder_out: torch.Tensor, target: 
torch.Tensor, ) -> torch.Tensor: - """Compute LM loss. + """Compute LM loss (i.e.: Cross-entropy with smoothing). Args: decoder_out: Decoder output sequences. (B, U, D_dec) @@ -442,26 +449,21 @@ def _calc_lm_loss( loss_lm: LM loss value. """ - lm_loss_in = self.lm_lin(decoder_out[:, :-1, :]).view(-1, self.vocab_size) - lm_target = target.view(-1).type(torch.int64) + batch_size = decoder_out.size(0) - with torch.no_grad(): - true_dist = lm_loss_in.clone() - true_dist.fill_(self.lm_loss_smoothing / (self.vocab_size - 1)) + logp = torch.log_softmax( + self.lm_lin(decoder_out[:, :-1, :]).view(-1, self.vocab_size), + dim=1, + ) + target = target.view(-1).type(torch.int64) + ignore = (target == 0).unsqueeze(1) - # Ignore blank ID (0) - ignore = lm_target == 0 - lm_target = lm_target.masked_fill(ignore, 0) + with torch.no_grad(): + true_dist = logp.clone().fill_(self.lm_loss_smooth_neg) - true_dist.scatter_(1, lm_target.unsqueeze(1), (1 - self.lm_loss_smoothing)) + true_dist.scatter_(1, target.unsqueeze(1), self.lm_loss_smooth_pos) - loss_lm = torch.nn.functional.kl_div( - torch.log_softmax(lm_loss_in, dim=1), - true_dist, - reduction="none", - ) - loss_lm = loss_lm.masked_fill(ignore.unsqueeze(1), 0).sum() / decoder_out.size( - 0 - ) + loss_lm = torch.nn.functional.kl_div(logp, true_dist, reduction="none") + loss_lm = loss_lm.masked_fill(ignore, 0).sum() / batch_size return loss_lm From 4d045dfecc64f9ac14b4975838d8b11f4844d68a Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Thu, 6 Oct 2022 14:56:46 +0000 Subject: [PATCH 03/15] parser -> group for add_argument --- espnet2/tasks/asr_transducer.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/espnet2/tasks/asr_transducer.py b/espnet2/tasks/asr_transducer.py index aa3cbf54e88..8c7786277d2 100644 --- a/espnet2/tasks/asr_transducer.py +++ b/espnet2/tasks/asr_transducer.py @@ -135,7 +135,7 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): default={}, help="The keyword 
arguments for the joint network class.", ) - group = parser.add_argument_group(description="Preprocess related.") + group = group.add_argument_group(description="Preprocess related.") group.add_argument( "--use_preprocessor", type=str2bool, @@ -155,56 +155,56 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): default=None, help="The path of the sentencepiece model.", ) - parser.add_argument( + group.add_argument( "--non_linguistic_symbols", type=str_or_none, help="The 'non_linguistic_symbols' file path.", ) - parser.add_argument( + group.add_argument( "--cleaner", type=str_or_none, choices=[None, "tacotron", "jaconv", "vietnamese"], default=None, help="Text cleaner to use.", ) - parser.add_argument( + group.add_argument( "--g2p", type=str_or_none, choices=g2p_choices, default=None, help="g2p method to use if --token_type=phn.", ) - parser.add_argument( + group.add_argument( "--speech_volume_normalize", type=float_or_none, default=None, help="Normalization value for maximum amplitude scaling.", ) - parser.add_argument( + group.add_argument( "--rir_scp", type=str_or_none, default=None, help="The RIR SCP file path.", ) - parser.add_argument( + group.add_argument( "--rir_apply_prob", type=float, default=1.0, help="The probability of the applied RIR convolution.", ) - parser.add_argument( + group.add_argument( "--noise_scp", type=str_or_none, default=None, help="The path of noise SCP file.", ) - parser.add_argument( + group.add_argument( "--noise_apply_prob", type=float, default=1.0, help="The probability of the applied noise addition.", ) - parser.add_argument( + group.add_argument( "--noise_db_range", type=str, default="13_15", From df6a0ded3d46b25f1b9e6b62b841a5efa73858d5 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Thu, 6 Oct 2022 14:57:17 +0000 Subject: [PATCH 04/15] add libri-100 asr transducer task --- egs2/librispeech_100/asr_transducer1/asr.sh | 1 + egs2/librispeech_100/asr_transducer1/cmd.sh | 1 + .../asr_transducer1/conf/decode.yaml | 14 
++++ .../conf/decode_streaming.yaml | 9 ++ .../conf/train_conformer-rnn_transducer.yaml | 78 ++++++++++++++++++ ...in_conformer-rnn_transducer_streaming.yaml | 82 +++++++++++++++++++ egs2/librispeech_100/asr_transducer1/db.sh | 1 + egs2/librispeech_100/asr_transducer1/local | 1 + egs2/librispeech_100/asr_transducer1/path.sh | 1 + .../librispeech_100/asr_transducer1/pyscripts | 1 + egs2/librispeech_100/asr_transducer1/run.sh | 37 +++++++++ egs2/librispeech_100/asr_transducer1/scripts | 1 + egs2/librispeech_100/asr_transducer1/steps | 1 + egs2/librispeech_100/asr_transducer1/utils | 1 + 14 files changed, 229 insertions(+) create mode 120000 egs2/librispeech_100/asr_transducer1/asr.sh create mode 120000 egs2/librispeech_100/asr_transducer1/cmd.sh create mode 100644 egs2/librispeech_100/asr_transducer1/conf/decode.yaml create mode 100644 egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml create mode 100644 egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml create mode 100644 egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml create mode 120000 egs2/librispeech_100/asr_transducer1/db.sh create mode 120000 egs2/librispeech_100/asr_transducer1/local create mode 120000 egs2/librispeech_100/asr_transducer1/path.sh create mode 120000 egs2/librispeech_100/asr_transducer1/pyscripts create mode 100755 egs2/librispeech_100/asr_transducer1/run.sh create mode 120000 egs2/librispeech_100/asr_transducer1/scripts create mode 120000 egs2/librispeech_100/asr_transducer1/steps create mode 120000 egs2/librispeech_100/asr_transducer1/utils diff --git a/egs2/librispeech_100/asr_transducer1/asr.sh b/egs2/librispeech_100/asr_transducer1/asr.sh new file mode 120000 index 00000000000..60b05122cfd --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/asr.sh @@ -0,0 +1 @@ +../../TEMPLATE/asr1/asr.sh \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/cmd.sh 
b/egs2/librispeech_100/asr_transducer1/cmd.sh new file mode 120000 index 00000000000..297411a5ecc --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/cmd.sh @@ -0,0 +1 @@ +../asr1/cmd.sh \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/conf/decode.yaml b/egs2/librispeech_100/asr_transducer1/conf/decode.yaml new file mode 100644 index 00000000000..72bff4d70c8 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/decode.yaml @@ -0,0 +1,14 @@ +beam_size: 5 # 10 produces slightly better results. +beam_search_config: + search_type: default + + # ALSD (search-type: alsd) + u_max: 50 + + # TSD (search-type: tsd) + max_sym_exp: 2 + + # mAES (search-type: maes) + nstep: 1 + expansion_gamma: 1.5 + expansion_beta: 1 diff --git a/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml b/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml new file mode 100644 index 00000000000..a064a80d581 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml @@ -0,0 +1,9 @@ +beam_size: 5 # 10 produces slightly better results. +beam_search_config: + search_type: maes + nstep: 1 + expansion_gamma: 2.3 + expansion_beta: 2 +streaming: True +chunk_size: 64 +left_context: 256 \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml new file mode 100644 index 00000000000..f9e954c4762 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml @@ -0,0 +1,78 @@ +# general +batch_type: numel +batch_bins: 4000000 +accum_grad: 8 +max_epoch: 60 # 100 produces better results. 
+patience: none +init: none +num_att_plot: 0 + +# optimizer +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 + +# criterion +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 10 # 20 produces slightly better results. + +model_conf: + transducer_weight: 1.0 + auxiliary_ctc_weight: 0.3 + report_cer: True + report_wer: True + +# specaug conf +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. + - 0.05 + num_time_mask: 5 + +encoder_conf: + main_conf: + pos_wise_act_type: swish + conv_mod_act_type: swish + pos_enc_dropout_rate: 0.2 + input_conf: + vgg_like: True + body_conf: + - block_type: conformer + linear_size: 1024 + hidden_size: 256 + heads: 4 + dropout_rate: 0.1 + pos_wise_dropout_rate: 0.1 + att_dropout_rate: 0.1 + conv_mod_kernel_size: 31 + num_blocks: 18 +decoder: rnn +decoder_conf: + rnn_type: lstm + num_layers: 1 + embed_size: 256 + hidden_size: 256 + dropout_rate: 0.1 + embed_dropout_rate: 0.2 +joint_network_conf: + joint_space_size: 256 diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml new file mode 100644 index 00000000000..4036367a5d8 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml @@ -0,0 +1,82 @@ +# general +batch_type: numel +batch_bins: 4000000 +accum_grad: 8 +max_epoch: 60 # 100 produces better results. 
+patience: none +init: none +num_att_plot: 0 + +# optimizer +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 + +# criterion +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 10 # 20 produces slightly better results. + +model_conf: + transducer_weight: 1.0 + auxiliary_ctc_weight: 0.3 + report_cer: True + report_wer: True + +# specaug conf +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. + - 0.05 + num_time_mask: 5 + +encoder_conf: + main_conf: + pos_wise_act_type: swish + conv_mod_act_type: swish + pos_enc_dropout_rate: 0.2 + dynamic_chunk_training: True + short_chunk_size: 25 + short_chunk_threshold: 0.75 + left_chunk_size: 4 + input_conf: + vgg_like: True + body_conf: + - block_type: conformer + linear_size: 1024 + hidden_size: 256 + heads: 4 + dropout_rate: 0.1 + pos_wise_dropout_rate: 0.1 + att_dropout_rate: 0.1 + conv_mod_kernel_size: 31 + num_blocks: 18 +decoder: rnn +decoder_conf: + rnn_type: lstm + num_layers: 1 + embed_size: 256 + hidden_size: 256 + dropout_rate: 0.1 + embed_dropout_rate: 0.2 +joint_network_conf: + joint_space_size: 256 diff --git a/egs2/librispeech_100/asr_transducer1/db.sh b/egs2/librispeech_100/asr_transducer1/db.sh new file mode 120000 index 00000000000..50d86130898 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/db.sh @@ -0,0 +1 @@ +../../TEMPLATE/asr1/db.sh \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/local b/egs2/librispeech_100/asr_transducer1/local new file mode 120000 index 00000000000..23830fb51b8 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/local @@ -0,0 +1 @@ +../asr1/local \ No newline at end of file diff --git 
a/egs2/librispeech_100/asr_transducer1/path.sh b/egs2/librispeech_100/asr_transducer1/path.sh new file mode 120000 index 00000000000..c9ac0a75bc6 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/path.sh @@ -0,0 +1 @@ +../../TEMPLATE/asr1/path.sh \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/pyscripts b/egs2/librispeech_100/asr_transducer1/pyscripts new file mode 120000 index 00000000000..ac68ad75b60 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/pyscripts @@ -0,0 +1 @@ +../../TEMPLATE/asr1/pyscripts \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/run.sh b/egs2/librispeech_100/asr_transducer1/run.sh new file mode 100755 index 00000000000..1821c0746f9 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/run.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -e +set -u +set -o pipefail + +train_set="train_clean_100" +valid_set="dev" +test_sets="test_clean test_other dev_clean dev_other" + +asr_config=conf/train_conformer-rnn_transducer.yaml +inference_config=conf/decode.yaml +inference_model=valid.loss.ave_10best.pth + +./asr.sh \ + --asr_task asr_transducer \ + --skip_data_prep false \ + --skip_train false \ + --skip_eval false \ + --lang en \ + --ngpu 1 \ + --nj 32 \ + --inference_nj 32 \ + --nbpe 500 \ + --max_wav_duration 30 \ + --speed_perturb_factors "0.9 1.0 1.1" \ + --audio_format "flac.ark" \ + --feats_type raw \ + --use_lm false \ + --asr_config "${asr_config}" \ + --inference_config "${inference_config}" \ + --inference_asr_model "${inference_model}" \ + --train_set "${train_set}" \ + --valid_set "${valid_set}" \ + --test_sets "${test_sets}" \ + --lm_train_text "data/${train_set}/text" \ + --bpe_train_text "data/${train_set}/text" "$@" diff --git a/egs2/librispeech_100/asr_transducer1/scripts b/egs2/librispeech_100/asr_transducer1/scripts new file mode 120000 index 00000000000..b25829705dc --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/scripts @@ -0,0 +1 @@ 
+../../TEMPLATE/asr1/scripts \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/steps b/egs2/librispeech_100/asr_transducer1/steps new file mode 120000 index 00000000000..91f2d234e20 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/steps @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/steps \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/utils b/egs2/librispeech_100/asr_transducer1/utils new file mode 120000 index 00000000000..f49247da827 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/utils @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/utils \ No newline at end of file From 2db74a9587a32b659cf4e1abb6b611d9f9551e09 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Thu, 6 Oct 2022 15:01:23 +0000 Subject: [PATCH 05/15] typo --- espnet2/tasks/asr_transducer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/espnet2/tasks/asr_transducer.py b/espnet2/tasks/asr_transducer.py index 8c7786277d2..c464f3020e9 100644 --- a/espnet2/tasks/asr_transducer.py +++ b/espnet2/tasks/asr_transducer.py @@ -135,7 +135,9 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): default={}, help="The keyword arguments for the joint network class.", ) - group = group.add_argument_group(description="Preprocess related.") + + group = parser.add_argument_group(description="Preprocess related.") + group.add_argument( "--use_preprocessor", type=str2bool, From 8a4c5ea41d6472204644a9b53e729d6b41b6bf23 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Fri, 7 Oct 2022 12:41:36 +0000 Subject: [PATCH 06/15] add streaming transducer recipe --- .../librispeech_100/asr_transducer1/README.md | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 egs2/librispeech_100/asr_transducer1/README.md diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md new file mode 100644 index 00000000000..51b9a42eb80 --- /dev/null +++ 
b/egs2/librispeech_100/asr_transducer1/README.md @@ -0,0 +1,46 @@ +# Streaming Conformer-RNN Transducer + +- General information + - Pretrained model: N.A + - Training config: conf/train_conformer-rnn_transducer.streaming.yaml + - Decoding config: conf/decode.yaml + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 36.7Gb + - Training time: ~ 26 hours + - Decoding time (32 jobs, 1 thread): ~9,1 minutes + +- Environments + - date: `Fri Oct 7 12:02:29 UTC 2022` + - python version: `3.8.10 (default, Jun 22 2022, 20:18:18) [GCC 9.4.0]` + - espnet version: `espnet 202209` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `2db74a9587a32b659cf4e1abb6b611d9f9551e09` + - Commit date: `Thu Oct 6 15:01:23 2022 +0000` + +## WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.3|5.2|0.5|0.7|6.4|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|83.4|14.8|1.8|1.9|18.5|82.1| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|93.8|5.6|0.7|0.8|7.0|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|82.9|15.0|2.0|1.8|18.9|83.5| + +## CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.2|1.0|0.8|0.6|2.4|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.1|4.1|2.9|1.9|8.9|82.1| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.0|1.1|0.9|0.6|2.6|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.0|4.0|3.0|1.8|8.9|83.5| + +## TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.0|3.6|1.4|0.6|5.5|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|84.7|11.6|3.6|2.2|17.4|82.1| 
+|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|94.7|3.7|1.6|0.6|6.0|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5| From 508f33986e3344919b690478798925721a24e72e Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Sat, 8 Oct 2022 18:12:42 +0000 Subject: [PATCH 07/15] add offline model results --- .../librispeech_100/asr_transducer1/README.md | 55 ++++++++++++++++++- .../conf/decode_streaming.yaml | 2 +- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md index 51b9a42eb80..19cad9158c0 100644 --- a/egs2/librispeech_100/asr_transducer1/README.md +++ b/egs2/librispeech_100/asr_transducer1/README.md @@ -1,14 +1,15 @@ # Streaming Conformer-RNN Transducer +# asr_train_conformer-rnn_transducer_streaming_raw_en_bpe500_sp - General information - Pretrained model: N.A - - Training config: conf/train_conformer-rnn_transducer.streaming.yaml - - Decoding config: conf/decode.yaml + - Training config: conf/train_conformer-rnn_transducer_streaming.yaml + - Decoding config: conf/decode.yaml (or conf/decode_streaming.yaml) - GPU: Nvidia A100 40Gb - CPU: AMD EPYC 7502P 32c - Peak VRAM usage during training: 36.7Gb - Training time: ~ 26 hours - - Decoding time (32 jobs, 1 thread): ~9,1 minutes + - Decoding time (32 jobs, 1 thread): ~9,1 minutes (full context) - Environments - date: `Fri Oct 7 12:02:29 UTC 2022` @@ -44,3 +45,51 @@ |decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|84.7|11.6|3.6|2.2|17.4|82.1| |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|94.7|3.7|1.6|0.6|6.0|58.9| |decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5| + +# Conformer-RNN Transducer +# asr_train_conformer-rnn_transducer_raw_en_bpe500_sp + +- General information + - Pretrained model: N.A + - Training config: conf/train_conformer-rnn_transducer.yaml + - Decoding config: 
conf/decode.yaml + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 36.4Gb + - Training time: ~ 26 hours + - Decoding time (32 jobs, 1 thread): ~9 minutes + +- Environments + - date: `Fri Oct 7 12:02:29 UTC 2022` + - python version: `3.8.10 (default, Jun 22 2022, 20:18:18) [GCC 9.4.0]` + - espnet version: `espnet 202209` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `2db74a9587a32b659cf4e1abb6b611d9f9551e09` + - Commit date: `Thu Oct 6 15:01:23 2022 +0000` + +## WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.7|4.8|0.4|0.6|5.9|55.1| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|84.2|14.1|1.7|1.8|17.6|80.2| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.3|5.2|0.6|0.7|6.4|56.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|83.9|14.2|1.9|1.8|17.9|81.5| + +## CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.4|0.9|0.7|0.6|2.2|55.1| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.4|3.9|2.7|1.9|8.5|80.2| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.2|1.0|0.8|0.6|2.3|56.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.5|3.8|2.7|1.8|8.3|81.5| + +## TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.4|3.4|1.2|0.6|5.2|55.1| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|85.5|11.0|3.5|2.1|16.6|80.2| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.1|3.4|1.4|0.6|5.5|56.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|85.3|10.9|3.9|2.0|16.7|81.5| diff --git a/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml 
b/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml index a064a80d581..f7a3da0a1b5 100644 --- a/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml +++ b/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml @@ -6,4 +6,4 @@ beam_search_config: expansion_beta: 2 streaming: True chunk_size: 64 -left_context: 256 \ No newline at end of file +left_context: 256 From 9f4f12da4610cae21d0db0283eb28ff610edf838 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Fri, 21 Oct 2022 11:18:43 +0000 Subject: [PATCH 08/15] fix conflict --- espnet2/asr_transducer/encoder/modules/convolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espnet2/asr_transducer/encoder/modules/convolution.py b/espnet2/asr_transducer/encoder/modules/convolution.py index 0e50f33292c..df3e7dcc558 100644 --- a/espnet2/asr_transducer/encoder/modules/convolution.py +++ b/espnet2/asr_transducer/encoder/modules/convolution.py @@ -89,7 +89,7 @@ def forward( x = torch.nn.functional.glu(x, dim=1) if mask is not None: - x.masked_fill(mask.unsqueeze(1).expand_as(x), 0.0) + x.masked_fill_(mask.unsqueeze(1).expand_as(x), 0.0) if self.lorder > 0: if cache is None: From 359ac2de185dd6b89be31f6343f842c65e3e8795 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Fri, 21 Oct 2022 11:22:14 +0000 Subject: [PATCH 09/15] add sub factor param for new version --- .../asr_transducer1/conf/train_conformer-rnn_transducer.yaml | 1 + .../conf/train_conformer-rnn_transducer_streaming.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml index f9e954c4762..f265019c581 100644 --- a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml +++ b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml @@ -56,6 +56,7 @@ encoder_conf: pos_enc_dropout_rate: 0.2 input_conf: vgg_like: 
True + subsampling_factor: 6 body_conf: - block_type: conformer linear_size: 1024 diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml index 4036367a5d8..5e93a7308f7 100644 --- a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml +++ b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml @@ -60,6 +60,7 @@ encoder_conf: left_chunk_size: 4 input_conf: vgg_like: True + subsampling_factor: 6 body_conf: - block_type: conformer linear_size: 1024 From 9e5005d1b85ab8561dabd007fc2b6e605f713061 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Fri, 10 Feb 2023 14:24:27 +0000 Subject: [PATCH 10/15] update offline model and run script --- .../librispeech_100/asr_transducer1/README.md | 40 +++++++++---------- .../asr_transducer1/conf/decode.yaml | 8 ++-- .../conf/train_conformer-rnn_transducer.yaml | 6 +-- egs2/librispeech_100/asr_transducer1/run.sh | 3 -- 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md index 19cad9158c0..b1c429e41bc 100644 --- a/egs2/librispeech_100/asr_transducer1/README.md +++ b/egs2/librispeech_100/asr_transducer1/README.md @@ -55,41 +55,41 @@ - Decoding config: conf/decode.yaml - GPU: Nvidia A100 40Gb - CPU: AMD EPYC 7502P 32c - - Peak VRAM usage during training: 36.4Gb - - Training time: ~ 26 hours - - Decoding time (32 jobs, 1 thread): ~9 minutes + - Peak VRAM usage during training: 37.09 Gb + - Training time: ~ 35 hours + - Decoding time (32 jobs, 1 thread): ~15,6 minutes w/ default beam search. 
- Environments - - date: `Fri Oct 7 12:02:29 UTC 2022` - - python version: `3.8.10 (default, Jun 22 2022, 20:18:18) [GCC 9.4.0]` - - espnet version: `espnet 202209` + - date: `Fri Feb 10 09:27:45 UTC 2023` + - python version: `3.8.10 (default, Nov 14 2022, 12:59:47) [GCC 9.4.0]` + - espnet version: `espnet 202301` - pytorch version: `pytorch 1.8.1+cu111` - - Git hash: `2db74a9587a32b659cf4e1abb6b611d9f9551e09` - - Commit date: `Thu Oct 6 15:01:23 2022 +0000` + - Git hash: `01893f855ca1a3a3645547ee4d3eaf461f7601bf` + - Commit date: `Thu Feb 9 10:04:57 2023 +0000` ## WER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.7|4.8|0.4|0.6|5.9|55.1| -|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|84.2|14.1|1.7|1.8|17.6|80.2| -|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.3|5.2|0.6|0.7|6.4|56.8| -|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|83.9|14.2|1.9|1.8|17.9|81.5| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.9|4.7|0.5|0.6|5.8|53.6| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|84.9|13.4|1.6|1.8|16.9|78.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.6|4.8|0.6|0.6|6.0|54.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|84.7|13.6|1.8|1.6|17.0|80.1| ## CER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.4|0.9|0.7|0.6|2.2|55.1| -|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.4|3.9|2.7|1.9|8.5|80.2| -|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.2|1.0|0.8|0.6|2.3|56.8| -|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.5|3.8|2.7|1.8|8.3|81.5| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.4|0.9|0.6|0.6|2.1|53.6| 
+|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.8|3.7|2.4|1.8|8.0|78.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.4|0.9|0.7|0.5|2.2|54.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.9|3.6|2.5|1.7|7.9|80.1| ## TER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.4|3.4|1.2|0.6|5.2|55.1| -|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|85.5|11.0|3.5|2.1|16.6|80.2| -|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.1|3.4|1.4|0.6|5.5|56.8| -|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|85.3|10.9|3.9|2.0|16.7|81.5| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.5|3.4|1.1|0.6|5.0|53.6| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|86.2|10.6|3.2|2.0|15.9|78.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.5|3.2|1.3|0.6|5.1|54.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|86.0|10.4|3.5|1.9|15.9|80.1| diff --git a/egs2/librispeech_100/asr_transducer1/conf/decode.yaml b/egs2/librispeech_100/asr_transducer1/conf/decode.yaml index 72bff4d70c8..dd2bc435aa8 100644 --- a/egs2/librispeech_100/asr_transducer1/conf/decode.yaml +++ b/egs2/librispeech_100/asr_transducer1/conf/decode.yaml @@ -1,14 +1,14 @@ -beam_size: 5 # 10 produces slightly better results. +beam_size: 10 # 5 is almost equivalent. 
beam_search_config: search_type: default # ALSD (search-type: alsd) - u_max: 50 + u_max: 150 # TSD (search-type: tsd) - max_sym_exp: 2 + max_sym_exp: 3 # mAES (search-type: maes) - nstep: 1 + nstep: 2 expansion_gamma: 1.5 expansion_beta: 1 diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml index f265019c581..3eb4f52aa63 100644 --- a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml +++ b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml @@ -2,7 +2,7 @@ batch_type: numel batch_bins: 4000000 accum_grad: 8 -max_epoch: 60 # 100 produces better results. +max_epoch: 60 # 100 produces slightly better results. patience: none init: none num_att_plot: 0 @@ -24,7 +24,7 @@ best_model_criterion: - - valid - loss - min -keep_nbest_models: 10 # 20 produces slightly better results. +keep_nbest_models: 10 model_conf: transducer_weight: 1.0 @@ -56,7 +56,7 @@ encoder_conf: pos_enc_dropout_rate: 0.2 input_conf: vgg_like: True - subsampling_factor: 6 + subsampling_factor: 4 body_conf: - block_type: conformer linear_size: 1024 diff --git a/egs2/librispeech_100/asr_transducer1/run.sh b/egs2/librispeech_100/asr_transducer1/run.sh index 1821c0746f9..ade3a73e1b0 100755 --- a/egs2/librispeech_100/asr_transducer1/run.sh +++ b/egs2/librispeech_100/asr_transducer1/run.sh @@ -14,9 +14,6 @@ inference_model=valid.loss.ave_10best.pth ./asr.sh \ --asr_task asr_transducer \ - --skip_data_prep false \ - --skip_train false \ - --skip_eval false \ --lang en \ --ngpu 1 \ --nj 32 \ From 17af213961e275a853e617ea0dd24f4ec8b3afa6 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Tue, 14 Feb 2023 09:46:51 +0000 Subject: [PATCH 11/15] add ebranchformer config and results --- .../librispeech_100/asr_transducer1/README.md | 48 +++++++++++ .../train_ebranchformer-rnn_transducer.yaml | 79 +++++++++++++++++++ 2 files changed, 127 
insertions(+) create mode 100644 egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-rnn_transducer.yaml diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md index b1c429e41bc..2ff98780397 100644 --- a/egs2/librispeech_100/asr_transducer1/README.md +++ b/egs2/librispeech_100/asr_transducer1/README.md @@ -93,3 +93,51 @@ |decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|86.2|10.6|3.2|2.0|15.9|78.9| |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.5|3.2|1.3|0.6|5.1|54.8| |decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|86.0|10.4|3.5|1.9|15.9|80.1| + +# E-Branchformer-RNN Transducer +# asr_train_ebranchformer-rnn_transducer_raw_en_bpe500_sp + +- General information + - Pretrained model: N.A + - Training config: conf/train_ebranchformer-rnn_transducer.yaml + - Decoding config: conf/decode.yaml + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 37.39 Gb + - Training time: ~ 33,8 hours + - Decoding time (32 jobs, 1 thread): ~15,7 minutes w/ default beam search. 
+ +- Environments + - date: `Tue Feb 14 07:41:14 UTC 2023` + - python version: `3.8.10 (default, Nov 14 2022, 12:59:47) [GCC 9.4.0]` + - espnet version: `espnet 202301` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `01893f855ca1a3a3645547ee4d3eaf461f7601bf` + - Commit date: `Thu Feb 9 10:04:57 2023 +0000` + +## WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.9|4.7|0.4|0.6|5.7|53.0| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|85.0|13.4|1.6|1.8|16.8|77.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.6|4.9|0.5|0.6|6.0|55.5| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|84.7|13.6|1.8|1.7|17.1|80.6| + +## CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.5|0.9|0.6|0.6|2.1|53.0| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.9|3.7|2.4|1.8|7.9|77.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.4|0.9|0.7|0.5|2.1|55.5| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.9|3.6|2.5|1.8|7.9|80.6| + +## TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.6|3.3|1.1|0.6|5.0|53.0| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|86.2|10.6|3.2|2.0|15.8|77.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.4|3.2|1.3|0.6|5.1|55.5| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|85.9|10.5|3.6|2.0|16.0|80.6| diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-rnn_transducer.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-rnn_transducer.yaml new file mode 100644 index 00000000000..fd2fb1804f1 --- /dev/null +++ 
b/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-rnn_transducer.yaml @@ -0,0 +1,79 @@ +# general +batch_type: numel +batch_bins: 4000000 +accum_grad: 8 +max_epoch: 60 # 100 produces slightly better results. +patience: none +init: none +num_att_plot: 0 + +# optimizer +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 + +# criterion +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 10 + +model_conf: + transducer_weight: 1.0 + auxiliary_ctc_weight: 0.3 + report_cer: True + report_wer: True + +# specaug conf +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. + - 0.05 + num_time_mask: 5 + +encoder_conf: + main_conf: + pos_wise_act_type: swish + conv_mod_act_type: swish + pos_enc_dropout_rate: 0.2 + input_conf: + vgg_like: True + subsampling_factor: 4 + body_conf: + - block_type: ebranchformer + linear_size: 1024 + hidden_size: 256 + heads: 4 + dropout_rate: 0.1 + pos_wise_dropout_rate: 0.1 + att_dropout_rate: 0.1 + conv_mod_kernel_size: 31 + num_blocks: 14 +decoder: rnn +decoder_conf: + rnn_type: lstm + num_layers: 1 + embed_size: 256 + hidden_size: 256 + dropout_rate: 0.1 + embed_dropout_rate: 0.2 +joint_network_conf: + joint_space_size: 256 From aa033d1bb3040bfd03a5c73d66df30cc467d6020 Mon Sep 17 00:00:00 2001 From: Florian Boyer Date: Tue, 14 Feb 2023 09:53:15 +0000 Subject: [PATCH 12/15] rework README --- .../librispeech_100/asr_transducer1/README.md | 118 +++++++++--------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md index 2ff98780397..60238f147b3 100644 --- a/egs2/librispeech_100/asr_transducer1/README.md 
+++ b/egs2/librispeech_100/asr_transducer1/README.md @@ -1,53 +1,7 @@ -# Streaming Conformer-RNN Transducer -# asr_train_conformer-rnn_transducer_streaming_raw_en_bpe500_sp +# OFFLINE SYSTEMS -- General information - - Pretrained model: N.A - - Training config: conf/train_conformer-rnn_transducer_streaming.yaml - - Decoding config: conf/decode.yaml (or conf/decode_streaming.yaml) - - GPU: Nvidia A100 40Gb - - CPU: AMD EPYC 7502P 32c - - Peak VRAM usage during training: 36.7Gb - - Training time: ~ 26 hours - - Decoding time (32 jobs, 1 thread): ~9,1 minutes (full context) - -- Environments - - date: `Fri Oct 7 12:02:29 UTC 2022` - - python version: `3.8.10 (default, Jun 22 2022, 20:18:18) [GCC 9.4.0]` - - espnet version: `espnet 202209` - - pytorch version: `pytorch 1.8.1+cu111` - - Git hash: `2db74a9587a32b659cf4e1abb6b611d9f9551e09` - - Commit date: `Thu Oct 6 15:01:23 2022 +0000` - -## WER - -|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| -|---|---|---|---|---|---|---|---|---| -|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.3|5.2|0.5|0.7|6.4|56.9| -|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|83.4|14.8|1.8|1.9|18.5|82.1| -|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|93.8|5.6|0.7|0.8|7.0|58.9| -|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|82.9|15.0|2.0|1.8|18.9|83.5| - -## CER - -|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| -|---|---|---|---|---|---|---|---|---| -|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.2|1.0|0.8|0.6|2.4|56.9| -|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.1|4.1|2.9|1.9|8.9|82.1| -|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.0|1.1|0.9|0.6|2.6|58.9| -|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.0|4.0|3.0|1.8|8.9|83.5| - -## TER - -|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| -|---|---|---|---|---|---|---|---|---| -|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.0|3.6|1.4|0.6|5.5|56.9| 
-|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|84.7|11.6|3.6|2.2|17.4|82.1| -|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|94.7|3.7|1.6|0.6|6.0|58.9| -|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5| - -# Conformer-RNN Transducer -# asr_train_conformer-rnn_transducer_raw_en_bpe500_sp +## Conformer/RNN Transducer +## asr_train_conformer-rnn_transducer_raw_en_bpe500_sp - General information - Pretrained model: N.A @@ -67,7 +21,7 @@ - Git hash: `01893f855ca1a3a3645547ee4d3eaf461f7601bf` - Commit date: `Thu Feb 9 10:04:57 2023 +0000` -## WER +### WER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| @@ -76,7 +30,7 @@ |decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.6|4.8|0.6|0.6|6.0|54.8| |decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|84.7|13.6|1.8|1.6|17.0|80.1| -## CER +### CER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| @@ -85,7 +39,7 @@ |decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.4|0.9|0.7|0.5|2.2|54.8| |decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.9|3.6|2.5|1.7|7.9|80.1| -## TER +### TER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| @@ -94,8 +48,8 @@ |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.5|3.2|1.3|0.6|5.1|54.8| |decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|86.0|10.4|3.5|1.9|15.9|80.1| -# E-Branchformer-RNN Transducer -# asr_train_ebranchformer-rnn_transducer_raw_en_bpe500_sp +## E-Branchformer/RNN Transducer +## asr_train_ebranchformer-rnn_transducer_raw_en_bpe500_sp - General information - Pretrained model: N.A @@ -115,7 +69,7 @@ - Git hash: `01893f855ca1a3a3645547ee4d3eaf461f7601bf` - Commit date: `Thu Feb 9 10:04:57 2023 +0000` -## WER +### WER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| @@ -124,7 +78,7 @@ 
|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.6|4.9|0.5|0.6|6.0|55.5| |decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|84.7|13.6|1.8|1.7|17.1|80.6| -## CER +### CER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| @@ -133,7 +87,7 @@ |decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.4|0.9|0.7|0.5|2.1|55.5| |decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.9|3.6|2.5|1.8|7.9|80.6| -## TER +### TER |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| @@ -141,3 +95,53 @@ |decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|86.2|10.6|3.2|2.0|15.8|77.9| |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.4|3.2|1.3|0.6|5.1|55.5| |decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|85.9|10.5|3.6|2.0|16.0|80.6| + +# STREAMING SYSTEMS + +## Conformer/RNN Transducer +## asr_train_conformer-rnn_transducer_streaming_raw_en_bpe500_sp + +- General information + - Pretrained model: N.A + - Training config: conf/train_conformer-rnn_transducer_streaming.yaml + - Decoding config: conf/decode.yaml (or conf/decode_streaming.yaml) + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 36.7Gb + - Training time: ~ 26 hours + - Decoding time (32 jobs, 1 thread): ~9,1 minutes (full context) + +- Environments + - date: `Fri Oct 7 12:02:29 UTC 2022` + - python version: `3.8.10 (default, Jun 22 2022, 20:18:18) [GCC 9.4.0]` + - espnet version: `espnet 202209` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `2db74a9587a32b659cf4e1abb6b611d9f9551e09` + - Commit date: `Thu Oct 6 15:01:23 2022 +0000` + +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.3|5.2|0.5|0.7|6.4|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|83.4|14.8|1.8|1.9|18.5|82.1| 
+|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|93.8|5.6|0.7|0.8|7.0|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|82.9|15.0|2.0|1.8|18.9|83.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.2|1.0|0.8|0.6|2.4|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.1|4.1|2.9|1.9|8.9|82.1| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.0|1.1|0.9|0.6|2.6|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.0|4.0|3.0|1.8|8.9|83.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.0|3.6|1.4|0.6|5.5|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|84.7|11.6|3.6|2.2|17.4|82.1| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|94.7|3.7|1.6|0.6|6.0|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5| \ No newline at end of file From 29c54874631ea3a2f60ac1b45a04cf8106483ae4 Mon Sep 17 00:00:00 2001 From: b-flo Date: Thu, 22 Jun 2023 08:24:24 +0000 Subject: [PATCH 13/15] add MEGA config and results --- .../librispeech_100/asr_transducer1/README.md | 47 ++++++++++ .../train_ebranchformer-mega_transducer.yaml | 85 +++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-mega_transducer.yaml diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md index 60238f147b3..a17a686ac06 100644 --- a/egs2/librispeech_100/asr_transducer1/README.md +++ b/egs2/librispeech_100/asr_transducer1/README.md @@ -96,6 +96,53 @@ |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.4|3.2|1.3|0.6|5.1|55.5| 
|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|85.9|10.5|3.6|2.0|16.0|80.6| +## E-Branchformer/MEGA Transducer +## asr_train_ebranchformer-mega_transducer_raw_en_bpe500_sp + +- General information + - Pretrained model: N.A + - Training config: conf/train_ebranchformer-mega_transducer.yaml + - Decoding config: conf/decode.yaml + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 37.39 Gb + - Training time: ~ 48,9 hours + - Decoding time (32 jobs, 1 thread): N.A + +- Environments + - date: `Tue Jun 06 05:30:22 UTC 2023` + - python version: `3.8.10 (default, Nov 14 2022, 12:59:47) [GCC 9.4.0]` + - espnet version: `espnet 202301` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `6048cbb8c93019c3931070c7ab0298a2f626945d` + - Commit date: `Thu Feb 9 10:04:57 2023 +0000` + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.9|4.6|0.4|0.6|5.6|53.0| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|85.2|13.2|1.6|1.7|16.5|77.3| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.6|4.8|0.6|0.7|6.1|55.2| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|84.9|13.3|1.7|1.7|16.7|79.4| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.4|0.9|0.7|0.6|2.2|53.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.9|3.7|2.4|1.9|8.0|77.3| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.4|0.9|0.7|0.6|2.2|55.2| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.9|3.6|2.5|1.8|7.9|79.4| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.4|3.4|1.2|0.6|5.2|53.9| 
+|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|86.2|10.6|3.2|2.1|15.9|77.3| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.4|3.3|1.3|0.6|5.3|55.2| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|86.0|10.4|3.6|2.0|16.0|79.4| + + # STREAMING SYSTEMS ## Conformer/RNN Transducer diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-mega_transducer.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-mega_transducer.yaml new file mode 100644 index 00000000000..9ff93a77897 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-mega_transducer.yaml @@ -0,0 +1,85 @@ +# general +batch_type: numel +batch_bins: 4000000 +accum_grad: 8 +max_epoch: 60 +patience: none +init: none +num_att_plot: 0 + +# optimizer +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 + +# criterion +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 10 + +model_conf: + transducer_weight: 1.0 + auxiliary_ctc_weight: 0.3 + report_cer: True + report_wer: True + +# specaug conf +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. 
+ - 0.05 + num_time_mask: 5 + +encoder_conf: + main_conf: + pos_wise_act_type: swish + conv_mod_act_type: swish + pos_enc_dropout_rate: 0.2 + input_conf: + vgg_like: True + subsampling_factor: 4 + body_conf: + - block_type: ebranchformer + linear_size: 1024 + hidden_size: 256 + heads: 4 + dropout_rate: 0.1 + pos_wise_dropout_rate: 0.1 + att_dropout_rate: 0.1 + conv_mod_kernel_size: 31 + num_blocks: 14 +decoder: mega +decoder_conf: + block_size: 256 + linear_size: 2048 + qk_size: 128 + v_size: 1024 + max_positions: 1024 + num_heads: 4 + rel_pos_bias_type: "rotary" + num_blocks: 6 + dropout_rate: 0.1 + ffn_dropout_rate: 0.1 + att_dropout_rate: 0.1 + embed_dropout_rate: 0.1 +joint_network_conf: + joint_space_size: 256 From 9890efe0172a92be53d66ba4081816e0c40cadff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Jun 2023 08:25:35 +0000 Subject: [PATCH 14/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- egs2/librispeech_100/asr_transducer1/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md index a17a686ac06..21a8ada695a 100644 --- a/egs2/librispeech_100/asr_transducer1/README.md +++ b/egs2/librispeech_100/asr_transducer1/README.md @@ -191,4 +191,4 @@ |decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.0|3.6|1.4|0.6|5.5|56.9| |decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|84.7|11.6|3.6|2.2|17.4|82.1| |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|94.7|3.7|1.6|0.6|6.0|58.9| -|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5| \ No newline at end of file +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5| From b541bd1c7251e1eaf961631eedf09b09c63fec08 Mon Sep 17 00:00:00 2001 From: b-flo Date: Thu, 22 
Jun 2023 08:28:32 +0000 Subject: [PATCH 15/15] merge some lines for readability --- egs2/librispeech_100/asr_transducer1/README.md | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md index a17a686ac06..b3144229f14 100644 --- a/egs2/librispeech_100/asr_transducer1/README.md +++ b/egs2/librispeech_100/asr_transducer1/README.md @@ -1,7 +1,6 @@ # OFFLINE SYSTEMS -## Conformer/RNN Transducer -## asr_train_conformer-rnn_transducer_raw_en_bpe500_sp +## Conformer/RNN Transducer (asr_train_conformer-rnn_transducer_raw_en_bpe500_sp) - General information - Pretrained model: N.A @@ -48,8 +47,7 @@ |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.5|3.2|1.3|0.6|5.1|54.8| |decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|86.0|10.4|3.5|1.9|15.9|80.1| -## E-Branchformer/RNN Transducer -## asr_train_ebranchformer-rnn_transducer_raw_en_bpe500_sp +## E-Branchformer/RNN Transducer (asr_train_ebranchformer-rnn_transducer_raw_en_bpe500_sp) - General information - Pretrained model: N.A @@ -96,8 +94,7 @@ |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.4|3.2|1.3|0.6|5.1|55.5| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|85.9|10.5|3.6|2.0|16.0|80.6| -## E-Branchformer/MEGA Transducer -## asr_train_ebranchformer-mega_transducer_raw_en_bpe500_sp +## E-Branchformer/MEGA Transducer (asr_train_ebranchformer-mega_transducer_raw_en_bpe500_sp) - General information - Pretrained model: N.A @@ -145,8 +142,7 @@ # STREAMING SYSTEMS -## Conformer/RNN Transducer -## asr_train_conformer-rnn_transducer_streaming_raw_en_bpe500_sp +## Conformer/RNN Transducer (asr_train_conformer-rnn_transducer_streaming_raw_en_bpe500_sp) - General information - Pretrained model: N.A @@ -191,4 +187,4 @@ |decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.0|3.6|1.4|0.6|5.5|56.9| 
|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|84.7|11.6|3.6|2.2|17.4|82.1| |decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|94.7|3.7|1.6|0.6|6.0|58.9| -|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5| \ No newline at end of file +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5|