Merge pull request #5130 from Some-random/tedlium3
add e-branchformer result for tedlium3 and add checker for text output length
mergify[bot] committed May 2, 2023
2 parents 2219358 + cac7f59 commit 33aa097
Showing 5 changed files with 160 additions and 4 deletions.
73 changes: 70 additions & 3 deletions egs2/tedlium3/asr1/README.md
@@ -1,5 +1,66 @@
# E-Branchformer, 12 encoder layers, with external language model


## Environments
- date: `Tue Apr 11 01:15:36 EDT 2023`
- python version: `3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0]`
- espnet version: `espnet 202301`
- pytorch version: `pytorch 1.8.1`
- Git hash: `b0cceeac2ecd330e8270789cef945e49058858fa`
- Commit date: `Thu Mar 30 08:26:54 2023 -0400`


## Model info
- Model link: https://huggingface.co/espnet/dongwei_tedlium3_asr_e-branchformer_external_lm
- ASR config: conf/tuning/train_asr_e_branchformer_size256_mlp1024_e12_mactrue.yaml
- Decode config: conf/tuning/decode_asr.yaml
- LM config: conf/tuning/train_lm_transformer.yaml


## exp/asr_train_asr_e_branchformer_size256_mlp1024_e12_mactrue_raw_en_bpe500_sp
### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|1155|27500|94.2|2.5|3.3|0.6|6.4|59.2|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|1155|145066|96.8|0.5|2.7|0.6|3.8|59.2|

### TER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|1155|54206|95.8|1.6|2.6|0.5|4.7|59.2|
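The Err column in these tables is the sum of the substitution, deletion, and insertion rates over the reference length. As a rough illustration of how such counts arise, here is a minimal dynamic-programming alignment sketch (a hypothetical helper, not ESPnet's actual sclite-based scoring pipeline):

```python
# Minimal Levenshtein-style error counting (hypothetical helper; ESPnet's
# real scoring produces sclite-format reports like the tables above).
def error_counts(ref, hyp):
    """Return (sub, del, ins) counts aligning hyp against ref (word lists)."""
    m, n = len(ref), len(hyp)
    # dp[i][j] = (total_errors, subs, dels, inss) for ref[:i] vs hyp[:j]
    dp = [[(0, 0, 0, 0)] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        dp[i][0] = (i, 0, i, 0)  # all deletions
    for j in range(1, n + 1):
        dp[0][j] = (j, 0, 0, j)  # all insertions
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if ref[i - 1] == hyp[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                a = dp[i - 1][j - 1]  # substitution
                b = dp[i - 1][j]      # deletion
                c = dp[i][j - 1]      # insertion
                dp[i][j] = min(
                    (a[0] + 1, a[1] + 1, a[2], a[3]),
                    (b[0] + 1, b[1], b[2] + 1, b[3]),
                    (c[0] + 1, c[1], c[2], c[3] + 1),
                )
    return dp[m][n][1:]

def wer(ref, hyp):
    """Word error rate in percent: (Sub + Del + Ins) / reference words."""
    s, d, i = error_counts(ref, hyp)
    return 100.0 * (s + d + i) / max(len(ref), 1)
```

The same computation yields CER when applied to character lists and TER when applied to BPE-token lists, which is why the three tables share one format.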

## exp/asr_train_asr_e_branchformer_size256_mlp1024_e12_mactrue_raw_en_bpe500_sp/decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave
### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|org/dev|507|17783|93.6|3.1|3.3|0.9|7.3|69.0|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|org/dev|507|95429|96.5|0.7|2.8|0.8|4.4|69.0|

### TER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|org/dev|507|36002|95.4|2.0|2.6|0.8|5.5|69.0|




# Conformer, 12 encoder layers, with external language model


## Environments
- date: `Mon Mar 27 04:02:03 EDT 2023`
- python version: `3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0]`
@@ -8,24 +69,30 @@
- Git hash: `ff841366229d539eb74d23ac999cae7c0cc62cad`
- Commit date: `Mon Feb 20 12:23:15 2023 -0500`

## Model info
- Model link: https://huggingface.co/espnet/dongwei_tedlium3_asr_conformer_external_lm
- ASR config: conf/train_asr_conformer.yaml
- Decode config: conf/tuning/decode_asr.yaml
- LM config: conf/tuning/train_lm_transformer.yaml

## exp/asr_train_raw_en_bpe500_sp
### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/dev|466|14671|94.0|2.7|3.3|0.7|6.6|65.9|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/org/dev|507|17783|93.2|3.2|3.5|1.0|7.8|68.6|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|1155|27500|93.9|2.7|3.4|0.7|6.8|61.1|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/dev|466|78259|96.6|0.6|2.8|0.6|4.0|65.9|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/org/dev|507|95429|96.2|0.7|3.1|0.9|4.7|68.6|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|1155|145066|96.6|0.6|2.8|0.6|4.1|61.1|

### TER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/dev|466|29364|95.5|1.9|2.7|0.5|5.1|65.9|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/org/dev|507|36002|95.0|2.2|2.8|0.9|5.8|68.6|
|decode_lm_lm_train_lm_en_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|1155|54206|95.5|1.7|2.7|0.6|5.1|61.1|
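Compared with the E-Branchformer results earlier in this README, the Conformer baseline is consistently worse; the relative WER reduction can be computed directly from the tables:

```python
# Relative WER reduction of E-Branchformer over the Conformer baseline,
# using the WER numbers from the tables in this README.
def relative_reduction(baseline, improved):
    """Percent relative reduction of `improved` with respect to `baseline`."""
    return 100.0 * (baseline - improved) / baseline

test_rel = relative_reduction(6.8, 6.4)  # test set: 6.8 -> 6.4 WER
dev_rel = relative_reduction(7.8, 7.3)   # org/dev: 7.8 -> 7.3 WER
```

This works out to roughly a 6% relative improvement on both sets.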
2 changes: 1 addition & 1 deletion egs2/tedlium3/asr1/conf/train.yaml
@@ -0,0 +1,81 @@
# Trained with NVIDIA V100 GPU (32GB) x 4
encoder: e_branchformer
encoder_conf:
    output_size: 256
    attention_heads: 4
    attention_layer_type: rel_selfattn
    pos_enc_layer_type: rel_pos
    rel_pos_type: latest
    cgmlp_linear_units: 1024
    cgmlp_conv_kernel: 31
    use_linear_after_conv: false
    gate_activation: identity
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    layer_drop_rate: 0.0
    linear_units: 1024
    positionwise_layer_type: linear
    use_ffn: true
    macaron_ffn: true
    merge_conv_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

frontend_conf:
    n_fft: 512
    win_length: 400
    hop_length: 160

seed: 2022
use_amp: true
num_workers: 6
batch_type: numel
batch_bins: 50000000
accum_grad: 1
max_epoch: 50
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 15000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 5
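The `specaug_conf` section above applies time warping plus frequency and time masking. A minimal numpy sketch of the two masking steps (a simplified, hypothetical illustration, not ESPnet's actual `SpecAug` implementation):

```python
import numpy as np

def apply_specaug_masks(spec, num_freq_mask=2, freq_width=(0, 27),
                        num_time_mask=5, time_ratio=(0.0, 0.05), rng=None):
    """Apply SpecAugment-style masks to a (time, freq) spectrogram copy,
    using the mask counts and width ranges from the config above."""
    rng = rng or np.random.default_rng(2022)  # seed matches the config
    t, f = spec.shape
    masked = spec.copy()
    # Frequency masking: zero out `num_freq_mask` random frequency bands.
    for _ in range(num_freq_mask):
        w = int(rng.integers(freq_width[0], freq_width[1] + 1))
        f0 = int(rng.integers(0, max(f - w, 1)))
        masked[:, f0:f0 + w] = 0.0
    # Time masking: widths are a ratio of the utterance length
    # (time_mask_width_ratio_range in the config).
    for _ in range(num_time_mask):
        w = int(t * rng.uniform(time_ratio[0], time_ratio[1]))
        t0 = int(rng.integers(0, max(t - w, 1)))
        masked[t0:t0 + w, :] = 0.0
    return masked
```

Time warping is omitted here since it requires interpolation; the masking steps are the part this config tunes most directly (band widths of up to 27 frequency bins and up to 5% of the frames per mask).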
2 changes: 2 additions & 0 deletions egs2/tedlium3/asr1/conf/tuning/train_lm_transformer.yaml
@@ -1,3 +1,5 @@
# Trained with NVIDIA V100 GPU (32GB) x 4

lm: transformer
lm_conf:
    pos_enc: null
6 changes: 6 additions & 0 deletions espnet2/train/preprocessor.py
@@ -332,6 +332,12 @@ def _text_process(
            text = self.text_cleaner(text)
            tokens = self.tokenizer.text2tokens(text)
            text_ints = self.token_id_converter.tokens2ids(tokens)
            if len(text_ints) > 100:
                logging.warning(
                    "The length of the text output exceeds 100, "
                    "which may cause OOM on the GPU. "
                    "Please ensure that the data processing is correct."
                )
            data[self.text_name] = np.array(text_ints, dtype=np.int64)
        if self.aux_task_names is not None and self.tokenizer is not None:
            for name in self.aux_task_names:
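The new check in `_text_process` only warns and never rejects a sample. A standalone sketch of the same guard (hypothetical helper names, not the ESPnet API):

```python
import logging

MAX_TEXT_LEN = 100  # threshold used by the check in this commit

def check_token_length(token_ids, max_len=MAX_TEXT_LEN):
    """Warn (but do not fail) when a tokenized text is suspiciously long.

    Returns True if the length is within bounds, False if a warning fired.
    """
    if len(token_ids) > max_len:
        logging.warning(
            "The length of the text output exceeds %d, "
            "which may cause OOM on the GPU. "
            "Please ensure that the data processing is correct.",
            max_len,
        )
        return False
    return True
```

Keeping the check non-fatal means unusually long transcripts (e.g. from unsegmented TEDLIUM3 talks) still flow through the pipeline, with the warning pointing at a likely data-preparation problem.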
