-
Notifications
You must be signed in to change notification settings - Fork 116
/
run.sh
executable file
·296 lines (271 loc) · 12.9 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#!/bin/bash
# Copyright (c) Yiming Wang
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# End-to-end ASR recipe for LibriSpeech (Espresso/fairseq): download the
# corpus, extract fbank+pitch features, train a sentencepiece subword LM and
# an E2E model (LSTM or Transformer), then decode the test sets, optionally
# with shallow LM fusion and Kaldi rescoring.
#
# Every variable below can be overridden from the command line via
# utils/parse_options.sh, e.g.: ./run.sh --stage 5 --ngpus 2
# Exit on the first failing command, including failures inside pipelines.
set -e -o pipefail
# start from this stage (0-9); stages below this number are skipped
stage=0
ngpus=1 # num GPUs for multiple GPUs training within a single node; should match those in $free_gpu
free_gpu= # comma-separated available GPU ids, eg., "0" or "0,1"; will be automatically assigned if not specified
# E2E model related
# optional suffix for the experiment directory name (see $dir below)
affix=
train_set=train_960
valid_set=dev
test_set="test_clean test_other dev_clean dev_other"
# which saved model to decode with in stage 9
checkpoint=checkpoint_best.pt
use_transformer=false
# LM related
lm_affix=
lm_checkpoint=checkpoint_best.pt
lm_shallow_fusion=true # no LM fusion if false
sentencepiece_vocabsize=5000
sentencepiece_type=unigram
# data related
dumpdir=data/dump # directory to dump full features
data= # path to where you want to put the downloaded data; need to be specified if not on CLSP grid
# default corpus location on the JHU CLSP grid
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
data=/export/corpora5
fi
data_url=www.openslr.org/resources/12
kaldi_scoring=true
# feature configuration
do_delta=false
apply_specaug=false
# path.sh sets up the environment; cmd.sh defines $train_cmd/$decode_cmd job
# submitters. parse_options.sh must be sourced after the defaults above so
# command-line --foo bar flags can override them.
. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh
# Experiment directories: the subword LM lives under exp/lm_lstm*, the E2E
# model under exp/transformer* or exp/lstm* depending on --use-transformer;
# the optional --lm-affix/--affix suffixes keep multiple runs apart.
lmdir=exp/lm_lstm${lm_affix:+_${lm_affix}}
dir=exp/lstm${affix:+_$affix}
if $use_transformer; then
  dir=exp/transformer${affix:+_$affix}
fi
# Stage 0: fetch and untar the 7 LibriSpeech parts from OpenSLR into $data.
if [ ${stage} -le 0 ]; then
echo "Stage 0: Data Downloading"
[ -z "$data" ] && echo "Specify '--data' as the path to which the corpus has been/will be downloaded." && exit 1;
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
local/download_and_untar.sh $data $data_url $part
done
fi
# Stage 1: build a Kaldi-style data directory for each downloaded part;
# dashes in part names become underscores (dev-clean -> data/dev_clean).
if [ ${stage} -le 1 ]; then
echo "Stage 1: Data Preparation"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
# use underscore-separated names in data directories.
local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g)
done
fi
# Dump locations for the combined train/valid features; the deltaTrue/False
# suffix records whether delta features were appended at dump time.
train_feat_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${train_feat_dir}
valid_feat_dir=${dumpdir}/${valid_set}/delta${do_delta}; mkdir -p ${valid_feat_dir}
# Stage 2: extract fbank+pitch features per subset, combine the three train
# subsets into $train_set and the two dev subsets into $valid_set, compute
# global CMVN on the training set, then dump normalized features for all sets.
if [ ${stage} -le 2 ]; then
echo "Stage 2: Feature Generation"
fbankdir=fbank
# Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
for dataset in dev_clean test_clean dev_other test_other train_clean_100 train_clean_360 train_other_500; do
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
data/$dataset exp/make_fbank/$dataset ${fbankdir}
utils/fix_data_dir.sh data/$dataset
done
utils/combine_data.sh --extra-files utt2num_frames data/${train_set} data/train_clean_100 data/train_clean_360 data/train_other_500
utils/combine_data.sh --extra-files utt2num_frames data/${valid_set} data/dev_clean data/dev_other
# compute global CMVN
compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark
# dump features for training
# On the CLSP grid, spread the dumped features across several storage disks
# (create_split_dir.pl leaves symlinks under .../storage).
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${train_feat_dir}/storage ]; then
utils/create_split_dir.pl \
/export/b1{4,5,6,7}/${USER}/fairseq-data/egs/asr_librispeech/dump/${train_set}/delta${do_delta}/storage \
${train_feat_dir}/storage
fi
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${valid_feat_dir}/storage ]; then
utils/create_split_dir.pl \
/export/b1{4,5,6,7}/${USER}/fairseq-data/egs/asr_librispeech/dump/${valid_set}/delta${do_delta}/storage \
${valid_feat_dir}/storage
fi
dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
data/${train_set}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/train ${train_feat_dir}
# NOTE: valid/test features are deliberately normalized with the *training*
# set's CMVN stats (data/${train_set}/cmvn.ark), not their own.
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
data/${valid_set}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/valid ${valid_feat_dir}
for dataset in $test_set; do
test_feat_dir=${dumpdir}/$dataset/delta${do_delta}; mkdir -p ${test_feat_dir}
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
data/$dataset/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/$dataset ${test_feat_dir}
done
fi
# Subword dictionary path and sentencepiece model prefix, both derived from
# the training set name and the sentencepiece settings; LM text goes under
# $lmdatadir.
dict=data/lang/${train_set}_${sentencepiece_type}${sentencepiece_vocabsize}_units.txt
sentencepiece_model=data/lang/${train_set}_${sentencepiece_type}${sentencepiece_vocabsize}
lmdatadir=data/lm_text
# Stage 3: train a sentencepiece model on the training transcripts, build the
# subword dictionary from the tokenized training text, tokenize all sets, and
# assemble the LM training corpus (transcripts + official LibriSpeech LM text).
if [ ${stage} -le 3 ]; then
echo "Stage 3: Dictionary Preparation and Text Tokenization"
mkdir -p data/lang
# strip the leading utterance id from each transcript line
cut -f 2- -d" " data/${train_set}/text > data/lang/input
echo "$0: training sentencepiece model..."
# vocab_size adds 3 for the special ids set here (pad=0, eos=1, unk=2; bos disabled)
python3 ../../scripts/spm_train.py --bos_id=-1 --pad_id=0 --eos_id=1 --unk_id=2 --input=data/lang/input \
--vocab_size=$((sentencepiece_vocabsize+3)) --character_coverage=1.0 \
--model_type=$sentencepiece_type --model_prefix=$sentencepiece_model \
--input_sentence_size=10000000
echo "$0: making a dictionary and tokenizing text for train/valid/test set..."
for dataset in $train_set $valid_set $test_set; do
text=data/$dataset/text
token_text=data/$dataset/token_text
# encode transcripts into subword pieces, re-attaching each utterance id
cut -f 2- -d" " $text | \
python3 ../../scripts/spm_encode.py --model=${sentencepiece_model}.model --output_format=piece | \
paste -d" " <(cut -f 1 -d" " $text) - > $token_text
# the dictionary ("<token> <count>" per line) is built from train tokens only
if [ "$dataset" == "$train_set" ]; then
cut -f 2- -d" " $token_text | tr ' ' '\n' | sort | uniq -c | \
awk '{print $2,$1}' | sort > $dict
wc -l $dict
fi
done
echo "$0: preparing text for subword LM..."
mkdir -p $lmdatadir
for dataset in $train_set $valid_set $test_set; do
token_text=data/$dataset/token_text
cut -f 2- -d" " $token_text > $lmdatadir/$dataset.tokens
done
# download the official LibriSpeech LM corpus once
if [ ! -e $lmdatadir/librispeech-lm-norm.txt.gz ]; then
wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P $lmdatadir
fi
echo "$0: preparing extra corpus for subword LM training..."
# LM train text = acoustic-train transcripts followed by the encoded extra
# corpus (`cat file -` reads the pipeline's stdin after the file)
zcat $lmdatadir/librispeech-lm-norm.txt.gz | \
python3 ../../scripts/spm_encode.py --model=${sentencepiece_model}.model --output_format=piece | \
cat $lmdatadir/$train_set.tokens - > $lmdatadir/train.tokens
fi
# The subword LM shares the E2E model's dictionary.
lmdict=$dict
# Stage 4: binarize the LM text with fairseq preprocess; --testpref expects a
# comma-separated list with one entry per test set.
if [ ${stage} -le 4 ]; then
echo "Stage 4: Text Binarization for subword LM Training"
mkdir -p $lmdatadir/log
for dataset in $test_set; do test_paths="$test_paths $lmdatadir/$dataset.tokens"; done
# trim/squeeze whitespace (awk '{$1=$1;print}') and join the paths with commas
test_paths=$(echo $test_paths | awk '{$1=$1;print}' | tr ' ' ',')
${decode_cmd} $lmdatadir/log/preprocess.log \
python3 ../../fairseq_cli/preprocess.py --task language_modeling_for_asr \
--workers 50 --srcdict $lmdict --only-source \
--trainpref $lmdatadir/train.tokens \
--validpref $lmdatadir/$valid_set.tokens \
--testpref $test_paths \
--destdir $lmdatadir
fi
# GPU selection: honor a user-supplied --free-gpu list; otherwise assign GPUs
# automatically (via the free-gpu tool on the CLSP grid, or simply ids
# 0..ngpus-1 elsewhere), then verify the list length matches --ngpus.
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
if [ -z "$free_gpu" ]; then
# query only when the user did not pass --free-gpu; the previous
# `[ -z ... ] && free_gpu=$(...) || echo` form printed a spurious
# "Unable to get N GPUs" warning whenever --free-gpu was already set
free_gpu=$(free-gpu -n $ngpus) || echo "Unable to get $ngpus GPUs" >&2
fi
else
[ -z "$free_gpu" ] && free_gpu=$(echo $(seq 0 $(($ngpus-1))) | sed 's/ /,/g')
fi
[ -z "$free_gpu" ] && echo "$0: please specify --free-gpu" && exit 1;
[ $(echo $free_gpu | sed 's/,/ /g' | awk '{print NF}') -ne "$ngpus" ] && \
echo "number of GPU ids in --free-gpu=$free_gpu does not match --ngpus=$ngpus" && exit 1;
# Stage 5: train the LSTM subword LM on the binarized text.
if [ ${stage} -le 5 ]; then
echo "Stage 5: subword LM Training"
valid_subset=valid
mkdir -p $lmdir/log
log_file=$lmdir/log/train.log
# when resuming from checkpoint_last.pt, append to the existing log instead
# of truncating it: "-a" becomes tee's append flag, so $log_file must stay
# unquoted in the `tee $log_file` below
[ -f $lmdir/checkpoint_last.pt ] && log_file="-a $log_file"
# keep the effective batch size constant: update_freq = ceil(2 / ngpus);
# log/save intervals are likewise divided by ngpus*update_freq
update_freq=$(((2+ngpus-1)/ngpus))
CUDA_VISIBLE_DEVICES=$free_gpu python3 ../../fairseq_cli/train.py $lmdatadir --seed 1 \
--task language_modeling_for_asr --dict $lmdict \
--log-interval $((16000/ngpus/update_freq)) --log-format simple \
--num-workers 6 --max-tokens 32000 --batch-size 1024 --curriculum 1 --empty-cache-freq 30 \
--valid-subset $valid_subset --batch-size-valid 1536 --update-freq $update_freq \
--distributed-world-size $ngpus --required-batch-size-multiple 8 \
--max-epoch 30 --optimizer adam --lr 0.001 --clip-norm 1.0 \
--lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
--save-dir $lmdir --restore-file checkpoint_last.pt --save-interval-updates $((16000/ngpus/update_freq)) \
--keep-interval-updates 3 --keep-last-epochs 5 --validate-interval 1 \
--arch lstm_lm_librispeech --criterion cross_entropy --sample-break-mode eos 2>&1 | tee $log_file
fi
# Stage 6: evaluate the trained LM (perplexity) on each test set, on CPU.
# fairseq preprocess named the binarized test subsets test, test1, test2, ...
# in the order of $test_set; map them back to the readable names for logging.
if [ ${stage} -le 6 ]; then
echo "Stage 6: subword LM Evaluation"
gen_set_array=(test)
# num = index of the last test set (count - 1)
num=$(echo $test_set | awk '{print NF-1}')
for i in $(seq $num); do gen_set_array[$i]="test$i"; done
test_set_array=($test_set)
for i in $(seq 0 $num); do
log_file=$lmdir/log/evaluation_${test_set_array[$i]}.log
python3 ../../fairseq_cli/eval_lm.py $lmdatadir --cpu \
--task language_modeling_for_asr --dict $lmdict --gen-subset ${gen_set_array[$i]} \
--max-tokens 40960 --batch-size 1536 --required-batch-size-multiple 8 --sample-break-mode eos \
--path $lmdir/$lm_checkpoint 2>&1 | tee $log_file
done
fi
# Stage 7: gather dumped features, raw transcripts, and per-utterance frame
# counts into one json per set for the speech_recognition_espresso task; the
# train/valid sets map onto the fixed subset names "train"/"valid", test sets
# keep their own names.
if [ ${stage} -le 7 ]; then
echo "Stage 7: Dump Json Files"
for dataset in $train_set $valid_set $test_set; do
feat=${dumpdir}/$dataset/delta${do_delta}/feats.scp
text=data/$dataset/text
utt2num_frames=data/$dataset/utt2num_frames
if [[ $dataset == "$train_set" ]]; then
name=train
elif [[ $dataset == "$valid_set" ]]; then
name=valid
else
name=$dataset
fi
asr_prep_json.py --feat-files $feat --text-files $text --utt2num-frames-files $utt2num_frames --output data/$name.json
done
fi
# Stage 8: train the E2E ASR model. Architecture-specific flags accumulate in
# $opts; LR-schedule step counts are divided by ngpus*update_freq so they are
# expressed in optimizer updates regardless of the degree of parallelism.
if [ ${stage} -le 8 ]; then
echo "Stage 8: Model Training"
valid_subset=valid
mkdir -p $dir/log
log_file=$dir/log/train.log
# when resuming, append to the log ("-a" becomes tee's append flag below;
# $log_file must stay unquoted in `tee $log_file`)
[ -f $dir/checkpoint_last.pt ] && log_file="-a $log_file"
opts=""
if $use_transformer; then
# update_freq = ceil(8 / ngpus) keeps the effective batch size constant
update_freq=$(((8+ngpus-1)/ngpus))
opts="$opts --arch speech_transformer_librispeech --max-tokens 22000 --max-epoch 100 --lr-scheduler tri_stage --encoder-relative-positional-embeddings"
opts="$opts --warmup-steps $((22000/ngpus/update_freq)) --hold-steps $((810000/ngpus/update_freq)) --decay-steps $((1400000/ngpus/update_freq))"
if $apply_specaug; then
specaug_config="{'time_warp_W': 0, 'freq_mask_F': 27, 'freq_mask_N': 2, 'time_mask_pm': 0.04, 'time_mask_ps': 0.04}"
fi
else
# update_freq = ceil(2 / ngpus) keeps the effective batch size constant
update_freq=$(((2+ngpus-1)/ngpus))
opts="$opts --arch speech_conv_lstm_librispeech"
if $apply_specaug; then
# with SpecAugment: longer training (95 epochs), tri-stage LR schedule,
# and a 5-layer encoder RNN
opts="$opts --max-epoch 95 --lr-scheduler tri_stage"
opts="$opts --warmup-steps $((1800/ngpus/update_freq)) --hold-steps $((540000/ngpus/update_freq)) --decay-steps $((940000/ngpus/update_freq))"
opts="$opts --encoder-rnn-layers 5"
specaug_config="{'time_warp_W': 0, 'freq_mask_F': 27, 'freq_mask_N': 2, 'time_mask_pm': 0.04, 'time_mask_ps': 0.04}"
else
opts="$opts --max-epoch 30 --lr-scheduler reduce_lr_on_plateau_v2 --lr-shrink 0.5 --start-reduce-lr-epoch 10"
fi
fi
# NOTE: $specaug_config is unset unless --apply-specaug true, so
# --specaugment-config receives an empty string in that case
CUDA_VISIBLE_DEVICES=$free_gpu python3 ../../fairseq_cli/train.py data --task speech_recognition_espresso --seed 1 \
--log-interval $((8000/ngpus/update_freq)) --log-format simple --print-training-sample-interval $((4000/ngpus/update_freq)) \
--num-workers 6 --data-buffer-size 0 --max-tokens 26000 --batch-size 24 --curriculum 1 --empty-cache-freq 10 \
--valid-subset $valid_subset --batch-size-valid 48 --ddp-backend legacy_ddp --update-freq $update_freq \
--distributed-world-size $ngpus --required-batch-size-multiple 1 \
--optimizer adam --lr 0.001 --weight-decay 0.0 --clip-norm 2.0 \
--save-dir $dir --restore-file checkpoint_last.pt --save-interval-updates $((6000/ngpus/update_freq)) \
--keep-interval-updates 3 --keep-last-epochs 5 --validate-interval 1 --best-checkpoint-metric wer \
--criterion label_smoothed_cross_entropy_v2 --label-smoothing 0.1 --smoothing-type uniform \
--dict $dict --bpe sentencepiece --sentencepiece-model ${sentencepiece_model}.model \
--max-source-positions 3600 --max-target-positions 200 \
$opts --specaugment-config "$specaug_config" 2>&1 | tee $log_file
fi
# Stage 9: beam-search decoding of each test set, optionally with subword-LM
# shallow fusion, then (optionally) verify WER by Kaldi scoring.
if [ ${stage} -le 9 ]; then
echo "Stage 9: Decoding"
opts=""
path=$dir/$checkpoint
decode_affix=
if $lm_shallow_fusion; then
opts="$opts --lm-path $lmdir/$lm_checkpoint"
opts="$opts --lm-weight 0.47 --eos-factor 1.5"
if $apply_specaug; then
# overwrite the existing opts: the later --lm-weight 0.4 is intended to
# take precedence over the 0.47 appended above
opts="$opts --lm-weight 0.4"
fi
decode_affix=shallow_fusion
fi
for dataset in $test_set; do
decode_dir=$dir/decode_$dataset${decode_affix:+_${decode_affix}}
# decode on the first GPU listed in $free_gpu only
CUDA_VISIBLE_DEVICES=$(echo $free_gpu | sed 's/,/ /g' | awk '{print $1}') speech_recognize.py data \
--task speech_recognition_espresso --max-tokens 15000 --batch-size 24 --required-batch-size-multiple 1 \
--num-shards 1 --shard-id 0 --dict $dict --bpe sentencepiece --sentencepiece-model ${sentencepiece_model}.model \
--gen-subset $dataset --max-source-positions 9999 --max-target-positions 999 \
--path $path --beam 60 --max-len-a 0.08 --max-len-b 0 --lenpen 1.0 \
--results-path $decode_dir $opts
echo "log saved in ${decode_dir}/decode.log"
if $kaldi_scoring; then
echo "verify WER by scoring with Kaldi..."
local/score_e2e.sh data/$dataset $decode_dir
cat ${decode_dir}/scoring_kaldi/wer
fi
done
fi