Merge pull request #4882 from pyf98/ebf
Add E-Branchformer for GigaSpeech
mergify[bot] committed Jan 24, 2023
2 parents 3970558 + 7b17228 commit 0f1ac0e
Showing 7 changed files with 125 additions and 10 deletions.
30 changes: 29 additions & 1 deletion egs2/gigaspeech/asr1/README.md
@@ -1,4 +1,32 @@
-# RESULTS
+# Notes
In the data preparation stage, we clone the [official repo](https://github.com/SpeechColab/GigaSpeech) of GigaSpeech, which provides standard scripts for data preparation, post-processing, and scoring. For a fair comparison across toolkits, we report the results generated by the official scoring script.


# E-Branchformer

## Environments
- date: `Sat Jan 21 17:54:14 EST 2023`
- python version: `3.9.15 (main, Nov 24 2022, 14:31:59) [GCC 11.2.0]`
- espnet version: `espnet 202211`
- pytorch version: `pytorch 1.12.1`
- Git hash: `197dc412eab82e9bab008f00fbcb922c824d8cf2`
- Commit date: `Sat Jan 21 13:59:20 2023 -0500`

## asr_train_asr_e_branchformer_e17_size512_mlp3072_linear1024_layerdrop_raw_en_bpe5000

- ASR config: [conf/tuning/train_asr_e_branchformer_e17_size512_mlp3072_linear1024_layerdrop.yaml](conf/tuning/train_asr_e_branchformer_e17_size512_mlp3072_linear1024_layerdrop.yaml)
- Model link: [https://huggingface.co/pyf98/gigaspeech_e_branchformer](https://huggingface.co/pyf98/gigaspeech_e_branchformer)

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_asr_model_valid.acc.ave/dev|5715|127790|92.2|5.7|2.0|2.8|10.6|69.9|
|decode_asr_asr_model_valid.acc.ave/test|19930|390744|91.5|6.4|2.1|2.0|10.5|63.3|
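
For reference, the released checkpoint can be loaded straight from Hugging Face with ESPnet's Python inference API. A minimal sketch (the model tag is the link above; the wav path is a placeholder, and the decoding options shown mirror `conf/decode_asr.yaml`):

```python
# Minimal inference sketch for the released model (illustrative, not part of the recipe).
import soundfile as sf
from espnet2.bin.asr_inference import Speech2Text

# Downloads and caches the checkpoint from Hugging Face on first use.
speech2text = Speech2Text.from_pretrained(
    "pyf98/gigaspeech_e_branchformer",
    beam_size=20,
    ctc_weight=0.3,
    lm_weight=0.0,
)

speech, rate = sf.read("example.wav")  # placeholder path: 16 kHz mono audio
nbests = speech2text(speech)
text, tokens, token_ids, hyp = nbests[0]  # best hypothesis
print(text)
```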


# Conformer

## Environments
- date: `Tue Mar 23 10:03:49 EDT 2021`
- python version: `3.8.5 (default, Sep 4 2020, 07:30:14) [GCC 7.3.0]`
4 changes: 2 additions & 2 deletions egs2/gigaspeech/asr1/conf/decode_asr.yaml
@@ -1,6 +1,6 @@
-lm_weight: 0.3
 beam_size: 20
-ctc_weight: 0.2
+lm_weight: 0.0
 penalty: 0.0
 maxlenratio: 0.0
 minlenratio: 0.0
+ctc_weight: 0.3
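
In this decoding config, `ctc_weight: 0.3` interpolates the attention-decoder and CTC scores during joint beam search, while `lm_weight: 0.0` turns off LM shallow fusion (the recipe also sets `--use_lm false`). A sketch of the per-hypothesis score, for intuition only (not ESPnet's actual implementation):

```python
def combined_score(logp_att: float, logp_ctc: float, logp_lm: float,
                   ctc_weight: float = 0.3, lm_weight: float = 0.0) -> float:
    """Conceptual joint CTC/attention score with optional LM shallow fusion."""
    # Attention and CTC log-probabilities are interpolated; the LM term is
    # added on top with its own weight (0.0 here, i.e. no shallow fusion).
    return ((1.0 - ctc_weight) * logp_att
            + ctc_weight * logp_ctc
            + lm_weight * logp_lm)
```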
1 change: 0 additions & 1 deletion egs2/gigaspeech/asr1/conf/train_asr.yaml

This file was deleted.

1 change: 1 addition & 0 deletions egs2/gigaspeech/asr1/conf/train_asr_e_branchformer.yaml

86 changes: 86 additions & 0 deletions egs2/gigaspeech/asr1/conf/tuning/train_asr_e_branchformer_e17_size512_mlp3072_linear1024_layerdrop.yaml
@@ -0,0 +1,86 @@
# Trained with A5000 (24GB) x 5. It took about 8 days.
# The e_branchformer encoder_conf is based on:
# egs2/librispeech/asr1/conf/tuning/train_asr_e_branchformer.yaml

batch_type: numel
batch_bins: 35000000
accum_grad: 4
max_epoch: 30
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
num_workers: 4
unused_parameters: true

encoder: e_branchformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    attention_layer_type: rel_selfattn
    pos_enc_layer_type: rel_pos
    rel_pos_type: latest
    cgmlp_linear_units: 3072
    cgmlp_conv_kernel: 31
    use_linear_after_conv: false
    gate_activation: identity
    num_blocks: 17
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    layer_drop_rate: 0.1
    linear_units: 1024
    positionwise_layer_type: linear
    macaron_ffn: true
    use_ffn: true
    merge_conv_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
    layer_drop_rate: 0.2

ctc_conf:
    ignore_nan_grad: true

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

optim: adam
optim_conf:
    lr: 0.0015
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000

frontend_conf:
    n_fft: 512
    hop_length: 256

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2
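
Since espnet2 passes `encoder_conf` verbatim as keyword arguments to the encoder class, the encoder above can be instantiated on its own, e.g. to count parameters. A sketch, assuming the default 80-dimensional log-mel frontend output:

```python
# Sketch: build the E-Branchformer encoder described by the config above.
from espnet2.asr.encoder.e_branchformer_encoder import EBranchformerEncoder

encoder = EBranchformerEncoder(
    input_size=80,  # assumption: 80-dim log-mel features from the default frontend
    output_size=512,
    attention_heads=8,
    attention_layer_type="rel_selfattn",
    pos_enc_layer_type="rel_pos",
    rel_pos_type="latest",
    cgmlp_linear_units=3072,
    cgmlp_conv_kernel=31,
    use_linear_after_conv=False,
    gate_activation="identity",
    num_blocks=17,
    input_layer="conv2d",
    layer_drop_rate=0.1,
    linear_units=1024,
    positionwise_layer_type="linear",
    macaron_ffn=True,
    use_ffn=True,
    merge_conv_kernel=31,
)
print(f"{sum(p.numel() for p in encoder.parameters()) / 1e6:.1f}M encoder parameters")
```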
egs2/gigaspeech/asr1/local/score.sh
File mode changed: 100644 → 100755 (made executable); no content changes.
13 changes: 7 additions & 6 deletions egs2/gigaspeech/asr1/run.sh
@@ -9,7 +9,7 @@ train_set="train"
 valid_set="dev"
 test_sets="dev test"

-asr_config=conf/train_asr.yaml
+asr_config=conf/train_asr_e_branchformer.yaml
 lm_config=conf/train_lm.yaml
 inference_config=conf/decode_asr.yaml
@@ -20,9 +20,10 @@ speed_perturb_factors=""

 ./asr.sh \
     --audio_format flac.ark \
     --lang en \
-    --ngpu 4 \
-    --nj 128 \
-    --inference_nj 256 \
+    --ngpu 5 \
+    --nj 64 \
+    --gpu_inference true \
+    --inference_nj 5 \
     --use_lm false \
     --nbpe 5000 \
     --max_wav_duration 30 \
@@ -33,5 +34,5 @@ speed_perturb_factors=""
     --train_set "${train_set}" \
     --valid_set "${valid_set}" \
     --test_sets "${test_sets}" \
-    --bpe_train_text "data/${train_set}/text" "$@" \
-    --local_score_opts "--inference_config ${inference_config} --use_lm false"
+    --bpe_train_text "data/${train_set}/text" \
+    --local_score_opts "--inference_config ${inference_config} --use_lm false" "$@"
