diff --git a/egs2/librispeech_100/asr_transducer1/README.md b/egs2/librispeech_100/asr_transducer1/README.md new file mode 100644 index 00000000000..b3144229f14 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/README.md @@ -0,0 +1,190 @@ +# OFFLINE SYSTEMS + +## Conformer/RNN Transducer (asr_train_conformer-rnn_transducer_raw_en_bpe500_sp) + +- General information + - Pretrained model: N.A + - Training config: conf/train_conformer-rnn_transducer.yaml + - Decoding config: conf/decode.yaml + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 37.09 Gb + - Training time: ~ 35 hours + - Decoding time (32 jobs, 1 thread): ~15,6 minutes w/ default beam search. + +- Environments + - date: `Fri Feb 10 09:27:45 UTC 2023` + - python version: `3.8.10 (default, Nov 14 2022, 12:59:47) [GCC 9.4.0]` + - espnet version: `espnet 202301` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `01893f855ca1a3a3645547ee4d3eaf461f7601bf` + - Commit date: `Thu Feb 9 10:04:57 2023 +0000` + +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.9|4.7|0.5|0.6|5.8|53.6| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|84.9|13.4|1.6|1.8|16.9|78.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.6|4.8|0.6|0.6|6.0|54.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|84.7|13.6|1.8|1.6|17.0|80.1| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.4|0.9|0.6|0.6|2.1|53.6| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.8|3.7|2.4|1.8|8.0|78.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.4|0.9|0.7|0.5|2.2|54.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.9|3.6|2.5|1.7|7.9|80.1| + +### TER + 
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.5|3.4|1.1|0.6|5.0|53.6| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|86.2|10.6|3.2|2.0|15.9|78.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.5|3.2|1.3|0.6|5.1|54.8| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|86.0|10.4|3.5|1.9|15.9|80.1| + +## E-Branchformer/RNN Transducer (asr_train_ebranchformer-rnn_transducer_raw_en_bpe500_sp) + +- General information + - Pretrained model: N.A + - Training config: conf/train_ebranchformer-rnn_transducer.yaml + - Decoding config: conf/decode.yaml + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 37.39 Gb + - Training time: ~ 33,8 hours + - Decoding time (32 jobs, 1 thread): ~15,7 minutes w/ default beam search. + +- Environments + - date: `Tue Feb 14 07:41:14 UTC 2023` + - python version: `3.8.10 (default, Nov 14 2022, 12:59:47) [GCC 9.4.0]` + - espnet version: `espnet 202301` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `01893f855ca1a3a3645547ee4d3eaf461f7601bf` + - Commit date: `Thu Feb 9 10:04:57 2023 +0000` + +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.9|4.7|0.4|0.6|5.7|53.0| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|85.0|13.4|1.6|1.8|16.8|77.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.6|4.9|0.5|0.6|6.0|55.5| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|84.7|13.6|1.8|1.7|17.1|80.6| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.5|0.9|0.6|0.6|2.1|53.0| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.9|3.7|2.4|1.8|7.9|77.9| 
+|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.4|0.9|0.7|0.5|2.1|55.5| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.9|3.6|2.5|1.8|7.9|80.6| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.6|3.3|1.1|0.6|5.0|53.0| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|86.2|10.6|3.2|2.0|15.8|77.9| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.4|3.2|1.3|0.6|5.1|55.5| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|85.9|10.5|3.6|2.0|16.0|80.6| + +## E-Branchformer/MEGA Transducer (asr_train_ebranchformer-mega_transducer_raw_en_bpe500_sp) + +- General information + - Pretrained model: N.A + - Training config: conf/train_ebranchformer-mega_transducer.yaml + - Decoding config: conf/decode.yaml + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 37.39 Gb + - Training time: ~ 48,9 hours + - Decoding time (32 jobs, 1 thread): N.A + +- Environments + - date: `Tue Jun 06 05:30:22 UTC 2023` + - python version: `3.8.10 (default, Nov 14 2022, 12:59:47) [GCC 9.4.0]` + - espnet version: `espnet 202301` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `6048cbb8c93019c3931070c7ab0298a2f626945d` + - Commit date: `Thu Feb 9 10:04:57 2023 +0000` + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.9|4.6|0.4|0.6|5.6|53.0| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|85.2|13.2|1.6|1.7|16.5|77.3| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|94.6|4.8|0.6|0.7|6.1|55.2| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|84.9|13.3|1.7|1.7|16.7|79.4| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| 
+|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.4|0.9|0.7|0.6|2.2|53.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.9|3.7|2.4|1.9|8.0|77.3| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.4|0.9|0.7|0.6|2.2|55.2| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.9|3.6|2.5|1.8|7.9|79.4| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.4|3.4|1.2|0.6|5.2|53.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|86.2|10.6|3.2|2.1|15.9|77.3| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|95.4|3.3|1.3|0.6|5.3|55.2| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|86.0|10.4|3.6|2.0|16.0|79.4| + + +# STREAMING SYSTEMS + +## Conformer/RNN Transducer (asr_train_conformer-rnn_transducer_streaming_raw_en_bpe500_sp) + +- General information + - Pretrained model: N.A + - Training config: conf/train_conformer-rnn_transducer_streaming.yaml + - Decoding config: conf/decode.yaml (or conf/decode_streaming.yaml) + - GPU: Nvidia A100 40Gb + - CPU: AMD EPYC 7502P 32c + - Peak VRAM usage during training: 36.7Gb + - Training time: ~ 26 hours + - Decoding time (32 jobs, 1 thread): ~9,1 minutes (full context) + +- Environments + - date: `Fri Oct 7 12:02:29 UTC 2022` + - python version: `3.8.10 (default, Jun 22 2022, 20:18:18) [GCC 9.4.0]` + - espnet version: `espnet 202209` + - pytorch version: `pytorch 1.8.1+cu111` + - Git hash: `2db74a9587a32b659cf4e1abb6b611d9f9551e09` + - Commit date: `Thu Oct 6 15:01:23 2022 +0000` + +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|54402|94.3|5.2|0.5|0.7|6.4|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|50948|83.4|14.8|1.8|1.9|18.5|82.1| 
+|decode_asr_model_valid.loss.ave_10best/test_clean|2620|52576|93.8|5.6|0.7|0.8|7.0|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|52343|82.9|15.0|2.0|1.8|18.9|83.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|288456|98.2|1.0|0.8|0.6|2.4|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|265951|93.1|4.1|2.9|1.9|8.9|82.1| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|281530|98.0|1.1|0.9|0.6|2.6|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|272758|93.0|4.0|3.0|1.8|8.9|83.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_model_valid.loss.ave_10best/dev_clean|2703|107929|95.0|3.6|1.4|0.6|5.5|56.9| +|decode_asr_model_valid.loss.ave_10best/dev_other|2864|98610|84.7|11.6|3.6|2.2|17.4|82.1| +|decode_asr_model_valid.loss.ave_10best/test_clean|2620|105724|94.7|3.7|1.6|0.6|6.0|58.9| +|decode_asr_model_valid.loss.ave_10best/test_other|2939|101026|84.3|11.6|4.1|2.0|17.7|83.5| diff --git a/egs2/librispeech_100/asr_transducer1/asr.sh b/egs2/librispeech_100/asr_transducer1/asr.sh new file mode 120000 index 00000000000..60b05122cfd --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/asr.sh @@ -0,0 +1 @@ +../../TEMPLATE/asr1/asr.sh \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/cmd.sh b/egs2/librispeech_100/asr_transducer1/cmd.sh new file mode 120000 index 00000000000..297411a5ecc --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/cmd.sh @@ -0,0 +1 @@ +../asr1/cmd.sh \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/conf/decode.yaml b/egs2/librispeech_100/asr_transducer1/conf/decode.yaml new file mode 100644 index 00000000000..dd2bc435aa8 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/decode.yaml @@ -0,0 +1,14 @@ +beam_size: 10 # 5 is almost equivalent. 
+beam_search_config: + search_type: default + + # ALSD (search-type: alsd) + u_max: 150 + + # TSD (search-type: tsd) + max_sym_exp: 3 + + # mAES (search-type: maes) + nstep: 2 + expansion_gamma: 1.5 + expansion_beta: 1 diff --git a/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml b/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml new file mode 100644 index 00000000000..f7a3da0a1b5 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/decode_streaming.yaml @@ -0,0 +1,9 @@ +beam_size: 5 # 10 produces slightly better results. +beam_search_config: + search_type: maes + nstep: 1 + expansion_gamma: 2.3 + expansion_beta: 2 +streaming: True +chunk_size: 64 +left_context: 256 diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml new file mode 100644 index 00000000000..3eb4f52aa63 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer.yaml @@ -0,0 +1,79 @@ +# general +batch_type: numel +batch_bins: 4000000 +accum_grad: 8 +max_epoch: 60 # 100 produces slightly better results. +patience: none +init: none +num_att_plot: 0 + +# optimizer +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 + +# criterion +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 10 + +model_conf: + transducer_weight: 1.0 + auxiliary_ctc_weight: 0.3 + report_cer: True + report_wer: True + +# specaug conf +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. 
+ - 0.05 + num_time_mask: 5 + +encoder_conf: + main_conf: + pos_wise_act_type: swish + conv_mod_act_type: swish + pos_enc_dropout_rate: 0.2 + input_conf: + vgg_like: True + subsampling_factor: 4 + body_conf: + - block_type: conformer + linear_size: 1024 + hidden_size: 256 + heads: 4 + dropout_rate: 0.1 + pos_wise_dropout_rate: 0.1 + att_dropout_rate: 0.1 + conv_mod_kernel_size: 31 + num_blocks: 18 +decoder: rnn +decoder_conf: + rnn_type: lstm + num_layers: 1 + embed_size: 256 + hidden_size: 256 + dropout_rate: 0.1 + embed_dropout_rate: 0.2 +joint_network_conf: + joint_space_size: 256 diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml new file mode 100644 index 00000000000..5e93a7308f7 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/train_conformer-rnn_transducer_streaming.yaml @@ -0,0 +1,83 @@ +# general +batch_type: numel +batch_bins: 4000000 +accum_grad: 8 +max_epoch: 60 # 100 produces better results. +patience: none +init: none +num_att_plot: 0 + +# optimizer +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 + +# criterion +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 10 # 20 produces slightly better results. + +model_conf: + transducer_weight: 1.0 + auxiliary_ctc_weight: 0.3 + report_cer: True + report_wer: True + +# specaug conf +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. 
+ - 0.05 + num_time_mask: 5 + +encoder_conf: + main_conf: + pos_wise_act_type: swish + conv_mod_act_type: swish + pos_enc_dropout_rate: 0.2 + dynamic_chunk_training: True + short_chunk_size: 25 + short_chunk_threshold: 0.75 + left_chunk_size: 4 + input_conf: + vgg_like: True + subsampling_factor: 6 + body_conf: + - block_type: conformer + linear_size: 1024 + hidden_size: 256 + heads: 4 + dropout_rate: 0.1 + pos_wise_dropout_rate: 0.1 + att_dropout_rate: 0.1 + conv_mod_kernel_size: 31 + num_blocks: 18 +decoder: rnn +decoder_conf: + rnn_type: lstm + num_layers: 1 + embed_size: 256 + hidden_size: 256 + dropout_rate: 0.1 + embed_dropout_rate: 0.2 +joint_network_conf: + joint_space_size: 256 diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-mega_transducer.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-mega_transducer.yaml new file mode 100644 index 00000000000..9ff93a77897 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-mega_transducer.yaml @@ -0,0 +1,85 @@ +# general +batch_type: numel +batch_bins: 4000000 +accum_grad: 8 +max_epoch: 60 +patience: none +init: none +num_att_plot: 0 + +# optimizer +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 + +# criterion +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 10 + +model_conf: + transducer_weight: 1.0 + auxiliary_ctc_weight: 0.3 + report_cer: True + report_wer: True + +# specaug conf +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. 
+ - 0.05 + num_time_mask: 5 + +encoder_conf: + main_conf: + pos_wise_act_type: swish + conv_mod_act_type: swish + pos_enc_dropout_rate: 0.2 + input_conf: + vgg_like: True + subsampling_factor: 4 + body_conf: + - block_type: ebranchformer + linear_size: 1024 + hidden_size: 256 + heads: 4 + dropout_rate: 0.1 + pos_wise_dropout_rate: 0.1 + att_dropout_rate: 0.1 + conv_mod_kernel_size: 31 + num_blocks: 14 +decoder: mega +decoder_conf: + block_size: 256 + linear_size: 2048 + qk_size: 128 + v_size: 1024 + max_positions: 1024 + num_heads: 4 + rel_pos_bias_type: "rotary" + num_blocks: 6 + dropout_rate: 0.1 + ffn_dropout_rate: 0.1 + att_dropout_rate: 0.1 + embed_dropout_rate: 0.1 +joint_network_conf: + joint_space_size: 256 diff --git a/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-rnn_transducer.yaml b/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-rnn_transducer.yaml new file mode 100644 index 00000000000..fd2fb1804f1 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/conf/train_ebranchformer-rnn_transducer.yaml @@ -0,0 +1,79 @@ +# general +batch_type: numel +batch_bins: 4000000 +accum_grad: 8 +max_epoch: 60 # 100 produces slightly better results. +patience: none +init: none +num_att_plot: 0 + +# optimizer +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 15000 + +# criterion +val_scheduler_criterion: + - valid + - loss +best_model_criterion: +- - valid + - loss + - min +keep_nbest_models: 10 + +model_conf: + transducer_weight: 1.0 + auxiliary_ctc_weight: 0.3 + report_cer: True + report_wer: True + +# specaug conf +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. 
+ - 0.05 + num_time_mask: 5 + +encoder_conf: + main_conf: + pos_wise_act_type: swish + conv_mod_act_type: swish + pos_enc_dropout_rate: 0.2 + input_conf: + vgg_like: True + subsampling_factor: 4 + body_conf: + - block_type: ebranchformer + linear_size: 1024 + hidden_size: 256 + heads: 4 + dropout_rate: 0.1 + pos_wise_dropout_rate: 0.1 + att_dropout_rate: 0.1 + conv_mod_kernel_size: 31 + num_blocks: 14 +decoder: rnn +decoder_conf: + rnn_type: lstm + num_layers: 1 + embed_size: 256 + hidden_size: 256 + dropout_rate: 0.1 + embed_dropout_rate: 0.2 +joint_network_conf: + joint_space_size: 256 diff --git a/egs2/librispeech_100/asr_transducer1/db.sh b/egs2/librispeech_100/asr_transducer1/db.sh new file mode 120000 index 00000000000..50d86130898 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/db.sh @@ -0,0 +1 @@ +../../TEMPLATE/asr1/db.sh \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/local b/egs2/librispeech_100/asr_transducer1/local new file mode 120000 index 00000000000..23830fb51b8 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/local @@ -0,0 +1 @@ +../asr1/local \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/path.sh b/egs2/librispeech_100/asr_transducer1/path.sh new file mode 120000 index 00000000000..c9ac0a75bc6 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/path.sh @@ -0,0 +1 @@ +../../TEMPLATE/asr1/path.sh \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/pyscripts b/egs2/librispeech_100/asr_transducer1/pyscripts new file mode 120000 index 00000000000..ac68ad75b60 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/pyscripts @@ -0,0 +1 @@ +../../TEMPLATE/asr1/pyscripts \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/run.sh b/egs2/librispeech_100/asr_transducer1/run.sh new file mode 100755 index 00000000000..ade3a73e1b0 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/run.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env 
bash + +set -e +set -u +set -o pipefail + +train_set="train_clean_100" +valid_set="dev" +test_sets="test_clean test_other dev_clean dev_other" + +asr_config=conf/train_conformer-rnn_transducer.yaml +inference_config=conf/decode.yaml +inference_model=valid.loss.ave_10best.pth + +./asr.sh \ + --asr_task asr_transducer \ + --lang en \ + --ngpu 1 \ + --nj 32 \ + --inference_nj 32 \ + --nbpe 500 \ + --max_wav_duration 30 \ + --speed_perturb_factors "0.9 1.0 1.1" \ + --audio_format "flac.ark" \ + --feats_type raw \ + --use_lm false \ + --asr_config "${asr_config}" \ + --inference_config "${inference_config}" \ + --inference_asr_model "${inference_model}" \ + --train_set "${train_set}" \ + --valid_set "${valid_set}" \ + --test_sets "${test_sets}" \ + --lm_train_text "data/${train_set}/text" \ + --bpe_train_text "data/${train_set}/text" "$@" diff --git a/egs2/librispeech_100/asr_transducer1/scripts b/egs2/librispeech_100/asr_transducer1/scripts new file mode 120000 index 00000000000..b25829705dc --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/scripts @@ -0,0 +1 @@ +../../TEMPLATE/asr1/scripts \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/steps b/egs2/librispeech_100/asr_transducer1/steps new file mode 120000 index 00000000000..91f2d234e20 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/steps @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/steps \ No newline at end of file diff --git a/egs2/librispeech_100/asr_transducer1/utils b/egs2/librispeech_100/asr_transducer1/utils new file mode 120000 index 00000000000..f49247da827 --- /dev/null +++ b/egs2/librispeech_100/asr_transducer1/utils @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/utils \ No newline at end of file