-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Whisper SOT recipe for Librimix #5371
Changes from 20 commits
54c2f9e
48fc9f4
e30e2df
2ba9507
08392ea
50364f8
6761137
728634c
bfa5a6c
e53e8f4
2987dca
c41bfb4
dbfad9c
d5bd029
e775128
dc10c37
7b461a7
c74eb9c
d7a46cf
50b9dba
6b2d1df
7fdefe9
77657df
ebe3ac7
a6c897e
d47b387
7dc5134
8975b9c
0768756
7848af6
736d132
71bddcc
e54bbd4
1bb8a96
18903d4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,3 +4,4 @@ lm_weight: 0.0 | |
maxlenratio: 3.0 | ||
minlenratio: 0.0 | ||
penalty: 0.0 | ||
sot_asr: true |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
normalize: null | ||
|
||
freeze_param: [ | ||
"decoder.decoders.token_embedding.ori_emb" | ||
] | ||
|
||
encoder: whisper | ||
encoder_conf: | ||
whisper_model: small | ||
dropout_rate: 0.0 | ||
use_specaug: false | ||
|
||
decoder: whisper | ||
decoder_conf: | ||
whisper_model: small | ||
dropout_rate: 0.0 | ||
load_origin_token_embedding: true | ||
|
||
model_conf: | ||
ctc_weight: 0.0 | ||
lsm_weight: 0.1 | ||
length_normalized_loss: false | ||
sym_sos: "<|startoftranscript|>" | ||
sym_eos: "<|endoftext|>" | ||
# do_pad_trim: true # should be set when doing zero-shot inference | ||
|
||
|
||
frontend: null | ||
input_size: 1 # to prevent build_model() from complaining | ||
|
||
|
||
# preprocessor related | ||
preprocessor: multi | ||
preprocessor_conf: | ||
speaker_change_symbol: | ||
- "<sc>" | ||
|
||
# minibatch related | ||
use_amp: true | ||
num_workers: 2 | ||
batch_type: numel | ||
batch_bins: 8000000 | ||
accum_grad: 4 | ||
max_epoch: 13 | ||
patience: none | ||
init: none | ||
best_model_criterion: | ||
- - valid | ||
- acc | ||
- max | ||
keep_nbest_models: 5 | ||
|
||
optim: adam | ||
optim_conf: | ||
lr: 0.0005 | ||
weight_decay: 0.000001 | ||
scheduler: warmuplr | ||
scheduler_conf: | ||
warmup_steps: 20000 | ||
|
||
specaug: null |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
normalize: null | ||
|
||
freeze_param: [ | ||
"decoder.decoders.token_embedding.ori_emb" | ||
] | ||
|
||
encoder: whisper | ||
encoder_conf: | ||
whisper_model: medium | ||
dropout_rate: 0.0 | ||
use_specaug: false | ||
|
||
decoder: whisper | ||
decoder_conf: | ||
whisper_model: medium | ||
dropout_rate: 0.0 | ||
load_origin_token_embedding: true | ||
|
||
model_conf: | ||
ctc_weight: 0.0 | ||
lsm_weight: 0.1 | ||
length_normalized_loss: false | ||
sym_sos: "<|startoftranscript|>" | ||
sym_eos: "<|endoftext|>" | ||
# do_pad_trim: true # should be set when doing zero-shot inference | ||
|
||
|
||
frontend: null | ||
input_size: 1 # to prevent build_model() from complaining | ||
|
||
|
||
# preprocessor related | ||
preprocessor: multi | ||
preprocessor_conf: | ||
speaker_change_symbol: | ||
- "<sc>" | ||
|
||
# minibatch related | ||
use_amp: true | ||
num_workers: 2 | ||
batch_type: numel | ||
batch_bins: 8000000 | ||
accum_grad: 4 | ||
max_epoch: 3 | ||
patience: none | ||
init: none | ||
best_model_criterion: | ||
- - valid | ||
- acc | ||
- max | ||
keep_nbest_models: 1 | ||
|
||
optim: adam | ||
optim_conf: | ||
lr: 0.000001 | ||
weight_decay: 0.000001 | ||
scheduler: warmuplr | ||
scheduler_conf: | ||
warmup_steps: 20000 | ||
|
||
specaug: null |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
normalize: null | ||
|
||
freeze_param: [ | ||
"decoder.decoders.token_embedding.ori_emb" | ||
] | ||
|
||
encoder: whisper | ||
encoder_conf: | ||
whisper_model: small | ||
dropout_rate: 0.0 | ||
use_specaug: false | ||
|
||
decoder: whisper | ||
decoder_conf: | ||
whisper_model: small | ||
dropout_rate: 0.0 | ||
load_origin_token_embedding: true | ||
|
||
model_conf: | ||
ctc_weight: 0.0 | ||
lsm_weight: 0.1 | ||
length_normalized_loss: false | ||
sym_sos: "<|startoftranscript|>" | ||
sym_eos: "<|endoftext|>" | ||
# do_pad_trim: true # should be set when doing zero-shot inference | ||
|
||
|
||
frontend: null | ||
input_size: 1 # to prevent build_model() from complaining | ||
|
||
|
||
# preprocessor related | ||
preprocessor: multi | ||
preprocessor_conf: | ||
speaker_change_symbol: | ||
- "<sc>" | ||
|
||
# minibatch related | ||
use_amp: true | ||
num_workers: 2 | ||
batch_type: numel | ||
batch_bins: 2000000 | ||
accum_grad: 4 | ||
max_epoch: 20 | ||
patience: none | ||
init: none | ||
best_model_criterion: | ||
- - valid | ||
- acc | ||
- max | ||
keep_nbest_models: 1 | ||
|
||
optim: adam | ||
optim_conf: | ||
lr: 0.000001 | ||
weight_decay: 0.000001 | ||
scheduler: warmuplr | ||
scheduler_conf: | ||
warmup_steps: 20000 | ||
|
||
specaug: null |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/usr/bin/env bash | ||
# Set bash to 'debug' mode, it will exit on : | ||
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', | ||
set -e | ||
set -u | ||
set -o pipefail | ||
|
||
train_set="train" | ||
valid_set="dev" | ||
test_sets="dev test" | ||
|
||
asr_config=conf/tuning/train_sot_asr_whisper.yaml | ||
|
||
lm_config=conf/tuning/train_lm_transformer.yaml | ||
inference_config=conf/tuning/decode_sot.yaml | ||
|
||
./asr.sh \ | ||
--lang en \ | ||
--audio_format "flac.ark" \ | ||
--feats_type raw \ | ||
--token_type whisper_multilingual \ | ||
--sot_asr true \ | ||
--max_wav_duration 30 \ | ||
--feats_normalize utterance_mvn \ | ||
--use_lm false \ | ||
--asr_config "${asr_config}" \ | ||
--lm_config "${lm_config}" \ | ||
--inference_config "${inference_config}" \ | ||
--train_set "${train_set}" \ | ||
--valid_set "${valid_set}" \ | ||
--test_sets "${test_sets}" \ | ||
--lm_train_text "data/${train_set}/text_spk1 data/${train_set}/text_spk2 data/local/other_text/text" \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we simply use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. LM is not used in this recipe. I'll remove it. |
||
--bpe_train_text "data/${train_set}/text_spk1 data/${train_set}/text_spk2" "$@" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto. (not sure about LM, but bpe should be fine to use the |
||
# --speed_perturb_factors "0.9 1.0 1.1" \ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Xuankai and I have considered whether a "space" should be added after "<sc>". We don't know whether it matters or how the original paper handles it. Do you have any comments?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think there should be no essential difference, but the second makes the text look more natural. My uploaded pre-trained model was trained with a "space" after "<sc>". Do you have any technical concerns about that "space"?