Merge branch 'master' into chime7task1

espnet · Feb 14, 2023 · 5179f7a
2 parents 00308e1 + 34d6117
commit 5179f7a
Show file tree

Hide file tree

Showing 4 changed files with 236 additions and 0 deletions.
diff --git a/egs2/tedlium2/asr1/README.md b/egs2/tedlium2/asr1/README.md
@@ -116,6 +116,78 @@
 |decode_asr_asr_model_valid.acc.ave/test|1155|52113|95.0|2.6|2.5|0.9|5.9|64.2|
 
 
+# E-Branchformer with Transducer, 12 layers
+## Environments
+- date: `Thu Feb  9 01:29:33 CST 2023`
+- python version: `3.9.15 (main, Nov 24 2022, 14:31:59)  [GCC 11.2.0]`
+- espnet version: `espnet 202301`
+- pytorch version: `pytorch 1.13.1`
+- Git hash: `478ba004e114e7862b05fb01112de7f7e1da3996`
+  - Commit date: `Tue Feb 7 00:50:49 2023 +0000`
+
+## asr_train_asr_transducer_e_branchformer_e12_raw_en_bpe500_sp
+- ASR config: [conf/tuning/train_asr_transducer_e_branchformer_e12.yaml](conf/tuning/train_asr_transducer_e_branchformer_e12.yaml)
+- Params: 26.26M
+- Model link: [https://huggingface.co/pyf98/tedlium2_transducer_e_branchformer](https://huggingface.co/pyf98/tedlium2_transducer_e_branchformer)
+
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_transducer_asr_model_valid.loss.ave/dev|466|14671|93.4|4.3|2.3|1.0|7.6|71.7|
+|decode_asr_transducer_asr_model_valid.loss.ave/test|1155|27500|93.6|4.0|2.4|1.0|7.4|63.5|
+
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_transducer_asr_model_valid.loss.ave/dev|466|78259|97.1|0.9|2.0|0.9|3.8|71.7|
+|decode_asr_transducer_asr_model_valid.loss.ave/test|1155|145066|97.1|0.9|2.1|0.9|3.9|63.5|
+
+### TER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_transducer_asr_model_valid.loss.ave/dev|466|28296|94.7|3.1|2.3|0.8|6.2|71.7|
+|decode_asr_transducer_asr_model_valid.loss.ave/test|1155|52113|95.1|2.6|2.2|0.9|5.8|63.5|
+
+
+# Conformer with Transducer, 12 layers, 2048 linear units
+## Environments
+- date: `Wed Feb  8 22:07:40 CST 2023`
+- python version: `3.9.15 (main, Nov 24 2022, 14:31:59)  [GCC 11.2.0]`
+- espnet version: `espnet 202301`
+- pytorch version: `pytorch 1.13.1`
+- Git hash: `478ba004e114e7862b05fb01112de7f7e1da3996`
+  - Commit date: `Tue Feb 7 00:50:49 2023 +0000`
+
+## asr_train_asr_transducer_conformer_e12_linear2048_raw_en_bpe500_sp
+- ASR config: [conf/tuning/train_asr_transducer_conformer_e12_linear2048.yaml](conf/tuning/train_asr_transducer_conformer_e12_linear2048.yaml)
+- Params: 34.62M
+- Model link: [https://huggingface.co/pyf98/tedlium2_transducer_conformer_e12_linear2048](https://huggingface.co/pyf98/tedlium2_transducer_conformer_e12_linear2048)
+
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_transducer_asr_model_valid.loss.ave/dev|466|14671|93.3|4.5|2.3|1.1|7.8|71.2|
+|decode_asr_transducer_asr_model_valid.loss.ave/test|1155|27500|93.2|4.2|2.6|1.0|7.8|65.6|
+
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_transducer_asr_model_valid.loss.ave/dev|466|78259|97.0|0.9|2.1|1.0|3.9|71.2|
+|decode_asr_transducer_asr_model_valid.loss.ave/test|1155|145066|96.9|0.9|2.2|0.9|4.0|65.6|
+
+### TER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+|decode_asr_transducer_asr_model_valid.loss.ave/dev|466|28296|94.6|3.0|2.4|0.9|6.3|71.2|
+|decode_asr_transducer_asr_model_valid.loss.ave/test|1155|52113|94.8|2.7|2.5|0.9|6.0|65.6|
+
+
 
 # E-Branchformer with CTC, 12 layers
 ## Environments

diff --git a/egs2/tedlium2/asr1/conf/decode_asr_transducer.yaml b/egs2/tedlium2/asr1/conf/decode_asr_transducer.yaml
@@ -0,0 +1,4 @@
+beam_size: 10
+transducer_conf:
+    search_type: default
+    score_norm: true
diff --git a/egs2/tedlium2/asr1/conf/tuning/train_asr_transducer_conformer_e12_linear2048.yaml b/egs2/tedlium2/asr1/conf/tuning/train_asr_transducer_conformer_e12_linear2048.yaml
@@ -0,0 +1,78 @@
+# Trained on 2 x NVIDIA A40 GPUs (48GB each)
+encoder: conformer
+encoder_conf:
+    output_size: 256
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    normalize_before: true
+    macaron_style: true
+    rel_pos_type: latest
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    activation_type: swish
+    use_cnn_module: true
+    cnn_module_kernel: 31
+
+decoder: transducer
+decoder_conf:
+    rnn_type: lstm
+    num_layers: 1
+    hidden_size: 256
+    dropout: 0.1
+    dropout_embed: 0.2
+
+joint_net_conf:
+    joint_space_size: 320
+
+model_conf:
+    ctc_weight: 0.3
+    report_cer: False
+    report_wer: False
+
+frontend_conf:
+    n_fft: 512
+    win_length: 400
+    hop_length: 160
+
+seed: 2022
+use_amp: false
+num_workers: 6
+batch_type: numel
+batch_bins: 10000000
+accum_grad: 5
+max_epoch: 50
+init: none
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+    lr: 0.002
+    weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 15000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 27
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0.
+    - 0.05
+    num_time_mask: 5
diff --git a/egs2/tedlium2/asr1/conf/tuning/train_asr_transducer_e_branchformer_e12.yaml b/egs2/tedlium2/asr1/conf/tuning/train_asr_transducer_e_branchformer_e12.yaml
@@ -0,0 +1,82 @@
+# Trained on 2 x NVIDIA A40 GPUs (48GB each)
+encoder: e_branchformer
+encoder_conf:
+    output_size: 256
+    attention_heads: 4
+    attention_layer_type: rel_selfattn
+    pos_enc_layer_type: rel_pos
+    rel_pos_type: latest
+    cgmlp_linear_units: 1024
+    cgmlp_conv_kernel: 31
+    use_linear_after_conv: false
+    gate_activation: identity
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    layer_drop_rate: 0.0
+    linear_units: 1024
+    positionwise_layer_type: linear
+    use_ffn: true
+    macaron_ffn: true
+    merge_conv_kernel: 31
+
+decoder: transducer
+decoder_conf:
+    rnn_type: lstm
+    num_layers: 1
+    hidden_size: 256
+    dropout: 0.1
+    dropout_embed: 0.2
+
+joint_net_conf:
+    joint_space_size: 320
+
+model_conf:
+    ctc_weight: 0.3
+    report_cer: False
+    report_wer: False
+
+frontend_conf:
+    n_fft: 512
+    win_length: 400
+    hop_length: 160
+
+seed: 2022
+use_amp: false
+num_workers: 6
+batch_type: numel
+batch_bins: 10000000
+accum_grad: 5
+max_epoch: 50
+init: none
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+    lr: 0.002
+    weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 15000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 27
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0.
+    - 0.05
+    num_time_mask: 5