From 492bbad04bad3ffd82e741a85b751bbc7b9a855b Mon Sep 17 00:00:00 2001
From: Yiming Wang
Date: Thu, 13 Feb 2020 21:26:32 -0500
Subject: [PATCH] add options to accept utt2num_frames files to speed up the
 data loading (#22)

* add options to accept utt2num_frames files to speed up the data loading
---
 espresso/data/scp_text_dataset.py    | 27 ++++++++++++-----
 espresso/tasks/speech_recognition.py | 43 ++++++++++++++++++++++++----
 examples/asr_librispeech/run.sh      |  9 ++++--
 examples/asr_swbd/run.sh             |  9 ++++--
 examples/asr_wsj/run.sh              | 10 +++++--
 5 files changed, 76 insertions(+), 22 deletions(-)

diff --git a/espresso/data/scp_text_dataset.py b/espresso/data/scp_text_dataset.py
index 436995d6c6..898480348a 100644
--- a/espresso/data/scp_text_dataset.py
+++ b/espresso/data/scp_text_dataset.py
@@ -22,25 +22,38 @@ class ScpDataset(torch.utils.data.Dataset):
     every time each entry is inquired, thus incurs the most intensive I/O.
     """
 
-    def __init__(self, path):
+    def __init__(self, path, utt2num_frames_path=None):
         super().__init__()
         self.dtype = np.float
-        self.read_scp(path)
+        self.read_scp(path, utt2num_frames_path)
 
-    def read_scp(self, path):
+    def read_scp(self, path, utt2num_frames_path=None):
         with open(path, 'r', encoding='utf-8') as f:
             scp_entries = [line.strip().split(None, 1) for line in f]
         self.utt_ids = [entry[0] for entry in scp_entries]
         self.extended_filenames = [entry[1] for entry in scp_entries]
         self.size = len(scp_entries)  # number of utterances
         self.sizes = []  # length of each utterance
+        if utt2num_frames_path is not None:
+            with open(utt2num_frames_path, 'r', encoding='utf-8') as f:
+                i = 0
+                for line in f:
+                    utt_id, num_frames = line.strip().split(None, 1)
+                    assert utt_id == self.utt_ids[i], \
+                        'utterance ids mismatch: ' + utt_id + ' vs. ' + self.utt_ids[i]
+                    self.sizes.append(int(num_frames))
+                    i += 1
+
         for filename in self.extended_filenames:
             try:
                 feat = kaldi_io.read_mat(filename)
             except Exception:
                 raise Exception('failed to read feature matrix {}.'.format(filename))
             assert feat is not None and isinstance(feat, np.ndarray)
+            if len(self.sizes) == self.size:
+                break
             self.sizes.append(feat.shape[0])
+
         self.sizes = np.array(self.sizes, dtype=np.int32)
 
         self.feat_dim = feat.shape[1]  # feature dimension
@@ -84,8 +97,8 @@ class ScpCachedDataset(ScpDataset):
     It balances the I/O efficiency and memory usage.
     """
 
-    def __init__(self, path, ordered_prefetch=False, cache_size=4096):
-        super().__init__(path)
+    def __init__(self, path, utt2num_frames_path=None, ordered_prefetch=False, cache_size=4096):
+        super().__init__(path, utt2num_frames_path)
         self.cache = None
         self.cache_index = {}
         self.cache_size = cache_size  # in terms of number of examples
@@ -156,8 +169,8 @@ class ScpInMemoryDataset(ScpDataset):
     It has the maximum memory usage and least I/O.
     """
 
-    def __init__(self, path):
-        super().__init__(path)
+    def __init__(self, path, utt2num_frames_path=None):
+        super().__init__(path, utt2num_frames_path)
         self.read_data()
 
     def read_data(self):
diff --git a/espresso/tasks/speech_recognition.py b/espresso/tasks/speech_recognition.py
index 21c6c9cf87..14c0961cfe 100644
--- a/espresso/tasks/speech_recognition.py
+++ b/espresso/tasks/speech_recognition.py
@@ -59,21 +59,34 @@ def add_args(parser):
                             help='path(s) to text file(s) for training, where '
                             'each should matches with one in --train-feat-files, '
                             'will be iterated upon during epochs in round-robin manner')
+        parser.add_argument('--train-utt2num-frames-files', nargs='+', default=None,
+                            help='path(s) to utt2num_frames file(s) for training. if not None, '
+                            'each should match one in --train-feat-files, '
+                            'will be iterated upon during epochs in round-robin manner')
         parser.add_argument('--valid-feat-files', nargs='+',
                             help='path(s) to scp feature file(s) for validation')
         parser.add_argument('--valid-text-files', nargs='+',
                             help='path(s) to text file(s) for validation, where '
                             'each should matches with one in --valid-feat-files')
+        parser.add_argument('--valid-utt2num-frames-files', nargs='+', default=None,
+                            help='path(s) to utt2num_frames file(s) for validation. if not None, '
+                            'each should match one in --valid-feat-files')
         parser.add_argument('--test-feat-files', nargs='+',
                             help='path(s) to scp feature file(s) for test')
-        parser.add_argument('--test-text-files', nargs='*', default=None,
+        parser.add_argument('--test-text-files', nargs='+', default=None,
                             help='path(s) to text file(s) for test. if not None, '
                             'each one should matches with one in --test-feat-files')
+        parser.add_argument('--test-utt2num-frames-files', nargs='+', default=None,
+                            help='path(s) to utt2num_frames file(s) for test. if not None, '
+                            'each should match one in --test-feat-files')
         parser.add_argument('--train-subset-feat-files', nargs='+',
                             help='path(s) to scp feature file(s) for validation')
         parser.add_argument('--train-subset-text-files', nargs='+',
                             help='path(s) to text file(s) for validation, where '
                             'each should matches with one in --train-subset-feat-files')
+        parser.add_argument('--train-subset-utt2num-frames-files', nargs='+', default=None,
+                            help='path(s) to utt2num_frames file(s) for validation. if not None, '
+                            'each should match one in --train-subset-feat-files')
         parser.add_argument('--dict', default=None, type=str,
                             help='path to the dictionary')
         parser.add_argument('--non-lang-syms', default=None, type=str,
@@ -159,29 +172,47 @@ def load_dataset(self, split, epoch=0, combine=False, **kwargs):
         if split == 'train':
             feat_files = self.args.train_feat_files
             text_files = self.args.train_text_files
+            utt2num_frames_files = self.args.train_utt2num_frames_files  # can be None
             assert len(feat_files) > 0 and len(feat_files) == len(text_files)
+            assert utt2num_frames_files is None or len(feat_files) == len(utt2num_frames_files)
             feat_files = [feat_files[epoch % len(feat_files)]]
             text_files = [text_files[epoch % len(text_files)]]
+            if utt2num_frames_files is not None:
+                utt2num_frames_files = [utt2num_frames_files[epoch % len(utt2num_frames_files)]]
+            else:
+                utt2num_frames_files = [None]
         elif split == 'valid':
             feat_files = self.args.valid_feat_files
             text_files = self.args.valid_text_files
+            utt2num_frames_files = self.args.valid_utt2num_frames_files  # can be None
+            if utt2num_frames_files is None:
+                utt2num_frames_files = [None] * len(feat_files)
         elif split == 'test':
             feat_files = self.args.test_feat_files
-            text_files = self.args.test_text_files  # can be empty
+            text_files = self.args.test_text_files  # can be None
+            utt2num_frames_files = self.args.test_utt2num_frames_files  # can be None
             if text_files is None:
                 text_files = [None] * len(feat_files)
+            if utt2num_frames_files is None:
+                utt2num_frames_files = [None] * len(feat_files)
         elif split == 'train_subset':
             feat_files = self.args.train_subset_feat_files
             text_files = self.args.train_subset_text_files
+            utt2num_frames_files = self.args.train_subset_utt2num_frames_files  # can be None
+            if utt2num_frames_files is None:
+                utt2num_frames_files = [None] * len(feat_files)
         else:
             raise ValueError('split should be one of "train", "valid", "test", "train_subset"')
 
-        assert len(feat_files) > 0 and len(feat_files) == len(text_files)
-        file_pairs = zip(feat_files, text_files)
-        for feat, text in file_pairs:
+        assert len(feat_files) > 0 and len(feat_files) == len(text_files) and \
+            len(feat_files) == len(utt2num_frames_files)
+        file_tuples = zip(feat_files, text_files, utt2num_frames_files)
+        for feat, text, utt2num_frames in file_tuples:
             assert ScpCachedDataset.exists(feat), feat + ' does not exists'
             assert text is None or AsrTextDataset.exists(text), text + ' does not exists'
-            src_datasets.append(ScpCachedDataset(feat, ordered_prefetch=True))
+            assert utt2num_frames is None or ScpCachedDataset.exists(utt2num_frames), \
+                utt2num_frames + ' does not exist'
+            src_datasets.append(ScpCachedDataset(feat, utt2num_frames, ordered_prefetch=True))
             logger.info('{} {} examples'.format(feat, len(src_datasets[-1])))
             if text is not None:
                 tgt_datasets.append(AsrTextDataset(text, self.dictionary))
diff --git a/examples/asr_librispeech/run.sh b/examples/asr_librispeech/run.sh
index 59a3fc1319..a09335a0b1 100755
--- a/examples/asr_librispeech/run.sh
+++ b/examples/asr_librispeech/run.sh
@@ -197,8 +197,10 @@ fi
 
 train_feat=$train_feat_dir/feats.scp
 train_token_text=data/$train_set/token_text
+train_utt2num_frames=data/$train_set/utt2num_frames
 valid_feat=$valid_feat_dir/feats.scp
 valid_token_text=data/$valid_set/token_text
+valid_utt2num_frames=data/$valid_set/utt2num_frames
 if [ ${stage} -le 7 ]; then
   echo "Stage 7: Model Training"
   valid_subset=valid
@@ -217,8 +219,8 @@ if [ ${stage} -le 7 ]; then
     --arch speech_conv_lstm_librispeech --criterion label_smoothed_cross_entropy_v2 \
     --label-smoothing 0.1 --smoothing-type uniform \
     --scheduled-sampling-probs 1.0 --start-scheduled-sampling-epoch 1 \
-    --train-feat-files $train_feat --train-text-files $train_token_text \
-    --valid-feat-files $valid_feat --valid-text-files $valid_token_text \
+    --train-feat-files $train_feat --train-text-files $train_token_text --train-utt2num-frames-files $train_utt2num_frames \
+    --valid-feat-files $valid_feat --valid-text-files $valid_token_text --valid-utt2num-frames-files $valid_utt2num_frames \
     --dict $dict --remove-bpe sentencepiece \
     --max-source-positions 9999 --max-target-positions 999 2>&1 | tee $log_file
 fi
@@ -236,10 +238,11 @@ if [ ${stage} -le 8 ]; then
   for dataset in $test_set; do
     feat=${dumpdir}/$dataset/delta${do_delta}/feats.scp
     text=data/$dataset/token_text
+    utt2num_frames=data/$dataset/utt2num_frames
     decode_dir=$dir/decode_$dataset${decode_affix:+_${decode_affix}}
     CUDA_VISIBLE_DEVICES=$(echo $free_gpu | sed 's/,/ /g' | awk '{print $1}') speech_recognize.py \
       --task speech_recognition_espresso --user-dir espresso --max-tokens 15000 --max-sentences 24 \
-      --num-shards 1 --shard-id 0 --test-feat-files $feat --test-text-files $text \
+      --num-shards 1 --shard-id 0 --test-feat-files $feat --test-text-files $text --test-utt2num-frames-files $utt2num_frames \
       --dict $dict --remove-bpe sentencepiece \
       --max-source-positions 9999 --max-target-positions 999 \
       --path $path --beam 60 --max-len-a 0.08 --max-len-b 0 --lenpen 1.0 \
diff --git a/examples/asr_swbd/run.sh b/examples/asr_swbd/run.sh
index 3a645e2229..db3bc8ee50 100755
--- a/examples/asr_swbd/run.sh
+++ b/examples/asr_swbd/run.sh
@@ -235,8 +235,10 @@ fi
 
 train_feat=$train_feat_dir/feats.scp
 train_token_text=data/$train_set/token_text
+train_utt2num_frames=data/$train_set/utt2num_frames
 valid_feat=$valid_feat_dir/feats.scp
 valid_token_text=data/$valid_set/token_text
+valid_utt2num_frames=data/$valid_set/utt2num_frames
 if [ $stage -le 6 ]; then
   echo "Stage 6: Model Training"
   valid_subset=valid
@@ -257,8 +259,8 @@ if [ $stage -le 6 ]; then
    --arch speech_conv_lstm_swbd --criterion label_smoothed_cross_entropy_v2 \
    --label-smoothing 0.1 --smoothing-type uniform \
    --scheduled-sampling-probs 0.9,0.8,0.7,0.6 --start-scheduled-sampling-epoch 6 \
-    --train-feat-files $train_feat --train-text-files $train_token_text \
-    --valid-feat-files $valid_feat --valid-text-files $valid_token_text \
+    --train-feat-files $train_feat --train-text-files $train_token_text --train-utt2num-frames-files $train_utt2num_frames \
+    --valid-feat-files $valid_feat --valid-text-files $valid_token_text --valid-utt2num-frames-files $valid_utt2num_frames \
    --dict $dict --remove-bpe sentencepiece --non-lang-syms $nlsyms \
    --max-source-positions 9999 --max-target-positions 999 $opts 2>&1 | tee $log_file
 fi
@@ -277,12 +279,13 @@ if [ $stage -le 7 ]; then
   [ -f local/wer_output_filter ] && opts="$opts --wer-output-filter local/wer_output_filter"
   for dataset in $test_set; do
     feat=${dumpdir}/$dataset/delta${do_delta}/feats.scp
+    utt2num_frames=data/$dataset/utt2num_frames
     decode_dir=$dir/decode_${dataset}${decode_affix:+_${decode_affix}}
     # only score train_dev with built-in scorer
     text_opt= && [ "$dataset" == "train_dev" ] && text_opt="--test-text-files data/$dataset/token_text"
     CUDA_VISIBLE_DEVICES=$(echo $free_gpu | sed 's/,/ /g' | awk '{print $1}') speech_recognize.py \
       --task speech_recognition_espresso --user-dir espresso --max-tokens 24000 --max-sentences 48 \
-      --num-shards 1 --shard-id 0 --test-feat-files $feat $text_opt \
+      --num-shards 1 --shard-id 0 --test-feat-files $feat $text_opt --test-utt2num-frames-files $utt2num_frames \
       --dict $dict --remove-bpe sentencepiece --non-lang-syms $nlsyms \
       --max-source-positions 9999 --max-target-positions 999 \
       --path $path --beam 35 --max-len-a 0.1 --max-len-b 0 --lenpen 1.0 \
diff --git a/examples/asr_wsj/run.sh b/examples/asr_wsj/run.sh
index a50f81a8be..20b2923ffd 100755
--- a/examples/asr_wsj/run.sh
+++ b/examples/asr_wsj/run.sh
@@ -249,8 +249,10 @@ fi
 
 train_feat=$train_feat_dir/feats.scp
 train_token_text=data/$train_set/token_text
+train_utt2num_frames=data/$train_set/utt2num_frames
 valid_feat=$valid_feat_dir/feats.scp
 valid_token_text=data/$valid_set/token_text
+valid_utt2num_frames=data/$valid_set/utt2num_frames
 if [ ${stage} -le 8 ]; then
   echo "Stage 8: Model Training"
   opts=""
@@ -259,6 +261,7 @@ if [ ${stage} -le 8 ]; then
     valid_subset="$valid_subset,train_subset"
     opts="$opts --train-subset-feat-files $train_subset_feat_dir/feats.scp"
     opts="$opts --train-subset-text-files data/${train_set}_${train_subset_size}/token_text"
+    opts="$opts --train-subset-utt2num-frames-files data/${train_set}_${train_subset_size}/utt2num_frames"
   fi
   [ -f local/wer_output_filter ] && opts="$opts --wer-output-filter local/wer_output_filter"
   mkdir -p $dir/logs
@@ -276,8 +279,8 @@ if [ ${stage} -le 8 ]; then
    --arch speech_conv_lstm_wsj --criterion label_smoothed_cross_entropy_v2 \
    --label-smoothing 0.05 --smoothing-type temporal \
    --scheduled-sampling-probs 0.5 --start-scheduled-sampling-epoch 6 \
-    --train-feat-files $train_feat --train-text-files $train_token_text \
-    --valid-feat-files $valid_feat --valid-text-files $valid_token_text \
+    --train-feat-files $train_feat --train-text-files $train_token_text --train-utt2num-frames-files $train_utt2num_frames \
+    --valid-feat-files $valid_feat --valid-text-files $valid_token_text --valid-utt2num-frames-files $valid_utt2num_frames \
    --dict $dict --non-lang-syms $nlsyms \
    --max-source-positions 9999 --max-target-positions 999 $opts 2>&1 | tee $log_file
 fi
@@ -306,10 +309,11 @@ if [ ${stage} -le 9 ]; then
       feat=$test_feat_dir/feats.scp
     fi
     text=data/$dataset/token_text
+    utt2num_frames=data/$dataset/utt2num_frames
     decode_dir=$dir/decode_$dataset${decode_affix:+_${decode_affix}}
     CUDA_VISIBLE_DEVICES=$(echo $free_gpu | sed 's/,/ /g' | awk '{print $1}') speech_recognize.py \
       --task speech_recognition_espresso --user-dir espresso --max-tokens 20000 --max-sentences 32 \
-      --num-shards 1 --shard-id 0 --test-feat-files $feat --test-text-files $text \
+      --num-shards 1 --shard-id 0 --test-feat-files $feat --test-text-files $text --test-utt2num-frames-files $utt2num_frames \
       --dict $dict --non-lang-syms $nlsyms \
       --max-source-positions 9999 --max-target-positions 999 \
       --path $path --beam 50 --max-len-a 0.2 --max-len-b 0 --lenpen 1.0 \
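Usage note (outside the diff above): the new --{train,valid,test,train-subset}-utt2num-frames-files options expect Kaldi-style utt2num_frames files, one "<utt_id> <num_frames>" pair per line in the same utterance order as the matching feats.scp (such files can be generated, e.g., with Kaldi's feat-to-len). A minimal sketch of the parsing this patch adds to ScpDataset.read_scp is shown below; the helper name read_utt2num_frames and the example utterance ids are hypothetical, for illustration only:

    # utt2num_frames (hypothetical content):
    #   utt_0001 438
    #   utt_0002 517
    # Knowing every utterance length up front avoids calling kaldi_io.read_mat()
    # on each feature matrix just to take feat.shape[0].
    def read_utt2num_frames(path):
        """Parse a Kaldi utt2num_frames file into (utt_ids, sizes)."""
        utt_ids, sizes = [], []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                utt_id, num_frames = line.strip().split(None, 1)
                utt_ids.append(utt_id)
                sizes.append(int(num_frames))
        return utt_ids, sizes

When no utt2num_frames file is passed, the datasets keep the existing behavior of reading every feature matrix to obtain its length.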