add options to accept utt2num_frames files to speed up the data loading (#22)

* add options to accept utt2num_frames files to speed up the data loading
freewym committed Oct 14, 2022
1 parent 2437bdd commit 0f3779e
Showing 5 changed files with 76 additions and 22 deletions.
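
A utt2num_frames file is the standard Kaldi plain-text mapping from utterance id to number of feature frames, typically produced by Kaldi's feat-to-len binary or utils/data/get_utt2num_frames.sh. Supplying it lets the dataset learn every utterance length from one small text file instead of opening every feature matrix at initialization. A minimal sketch of parsing such a file (illustrative only; the helper name and paths are hypothetical, not taken from this commit):

def read_utt2num_frames(path):
    # Each line is "<utt_id> <num_frames>", e.g. "103-1240-0000 1407".
    entries = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            utt_id, num_frames = line.strip().split(None, 1)
            entries.append((utt_id, int(num_frames)))
    return entries

# e.g. read_utt2num_frames('data/train_960/utt2num_frames')
# -> [('103-1240-0000', 1407), ('103-1240-0001', 1593), ...]  (values illustrative)
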
27 changes: 20 additions & 7 deletions espresso/data/scp_text_dataset.py
@@ -22,25 +22,38 @@ class ScpDataset(torch.utils.data.Dataset):
every time each entry is inquired, thus incurs the most intensive I/O.
"""

def __init__(self, path):
def __init__(self, path, utt2num_frames_path=None):
super().__init__()
self.dtype = np.float
self.read_scp(path)
self.read_scp(path, utt2num_frames_path)

def read_scp(self, path):
def read_scp(self, path, utt2num_frames_path=None):
with open(path, 'r', encoding='utf-8') as f:
scp_entries = [line.strip().split(None, 1) for line in f]
self.utt_ids = [entry[0] for entry in scp_entries]
self.extended_filenames = [entry[1] for entry in scp_entries]
self.size = len(scp_entries) # number of utterances
self.sizes = [] # length of each utterance
if utt2num_frames_path is not None:
with open(utt2num_frames_path, 'r', encoding='utf-8') as f:
i = 0
for line in f:
utt_id, num_frames = line.strip().split(None, 1)
assert utt_id == self.utt_ids[i], \
'utterance ids mismatch: ' + utt_id + ' vs. ' + self.utt_ids[i]
self.sizes.append(int(num_frames))
i += 1

for filename in self.extended_filenames:
try:
feat = kaldi_io.read_mat(filename)
except Exception:
raise Exception('failed to read feature matrix {}.'.format(filename))
assert feat is not None and isinstance(feat, np.ndarray)
if len(self.sizes) == self.size:
break
self.sizes.append(feat.shape[0])

self.sizes = np.array(self.sizes, dtype=np.int32)
self.feat_dim = feat.shape[1] # feature dimension

@@ -84,8 +97,8 @@ class ScpCachedDataset(ScpDataset):
It balances the I/O efficiency and memory usage.
"""

def __init__(self, path, ordered_prefetch=False, cache_size=4096):
super().__init__(path)
def __init__(self, path, utt2num_frames_path=None, ordered_prefetch=False, cache_size=4096):
super().__init__(path, utt2num_frames_path)
self.cache = None
self.cache_index = {}
self.cache_size = cache_size # in terms of number of examples
@@ -156,8 +169,8 @@ class ScpInMemoryDataset(ScpDataset):
It has the maximum memory usage and least I/O.
"""

def __init__(self, path):
super().__init__(path)
def __init__(self, path, utt2num_frames_path=None):
super().__init__(path, utt2num_frames_path)
self.read_data()

def read_data(self):
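
With the changes above, all three dataset classes accept an optional utt2num_frames_path. When it is supplied, self.sizes is filled from the text file and the loop over feature files stops after reading a single matrix (one read is still needed to obtain feat_dim); when it is omitted, every feature matrix is read at initialization exactly as before. A hedged usage sketch (the import path follows the file location above; the data paths are illustrative):

from espresso.data.scp_text_dataset import ScpCachedDataset

# Without utt2num_frames: utterance lengths are obtained by reading every matrix.
slow = ScpCachedDataset('dump/train/feats.scp', ordered_prefetch=True)

# With utt2num_frames: lengths come from the text file; only one matrix is read
# up front to determine the feature dimension.
fast = ScpCachedDataset(
    'dump/train/feats.scp',
    utt2num_frames_path='data/train/utt2num_frames',
    ordered_prefetch=True,
)
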
43 changes: 37 additions & 6 deletions espresso/tasks/speech_recognition.py
@@ -59,21 +59,34 @@ def add_args(parser):
help='path(s) to text file(s) for training, where '
'each should match with one in --train-feat-files, '
'will be iterated upon during epochs in round-robin manner')
parser.add_argument('--train-utt2num-frames-files', nargs='+', default=None,
help='path(s) to utt2num_frames file(s) for training. if not None, '
'each should match with one in --train-feat-files, '
'will be iterated upon during epochs in round-robin manner')
parser.add_argument('--valid-feat-files', nargs='+',
help='path(s) to scp feature file(s) for validation')
parser.add_argument('--valid-text-files', nargs='+',
help='path(s) to text file(s) for validation, where '
'each should match with one in --valid-feat-files')
parser.add_argument('--valid-utt2num-frames-files', nargs='+', default=None,
help='path(s) to utt2num_frames file(s) for validation. if not None, '
'each should match with one in --valid-feat-files')
parser.add_argument('--test-feat-files', nargs='+',
help='path(s) to scp feature file(s) for test')
parser.add_argument('--test-text-files', nargs='*', default=None,
parser.add_argument('--test-text-files', nargs='+', default=None,
help='path(s) to text file(s) for test. if not None, '
'each one should match with one in --test-feat-files')
parser.add_argument('--test-utt2num-frames-files', nargs='+', default=None,
help='path(s) to utt2num_frames file(s) for test. if not None, '
'each should match with one in --test-feat-files')
parser.add_argument('--train-subset-feat-files', nargs='+',
help='path(s) to scp feature file(s) for validation')
parser.add_argument('--train-subset-text-files', nargs='+',
help='path(s) to text file(s) for validation, where '
'each should match with one in --train-subset-feat-files')
parser.add_argument('--train-subset-utt2num-frames-files', nargs='+', default=None,
help='path(s) to utt2num_frames file(s) for validation. if not None, '
'each should match with one in --train-subset-feat-files')
parser.add_argument('--dict', default=None, type=str,
help='path to the dictionary')
parser.add_argument('--non-lang-syms', default=None, type=str,
@@ -159,29 +172,47 @@ def load_dataset(self, split, epoch=0, combine=False, **kwargs):
if split == 'train':
feat_files = self.args.train_feat_files
text_files = self.args.train_text_files
utt2num_frames_files = self.args.train_utt2num_frames_files # can be None
assert len(feat_files) > 0 and len(feat_files) == len(text_files)
assert utt2num_frames_files is None or len(feat_files) == len(utt2num_frames_files)
feat_files = [feat_files[epoch % len(feat_files)]]
text_files = [text_files[epoch % len(text_files)]]
if utt2num_frames_files is not None:
utt2num_frames_files = [utt2num_frames_files[epoch % len(utt2num_frames_files)]]
else:
utt2num_frames_files = [None]
elif split == 'valid':
feat_files = self.args.valid_feat_files
text_files = self.args.valid_text_files
utt2num_frames_files = self.args.valid_utt2num_frames_files # can be None
if utt2num_frames_files is None:
utt2num_frames_files = [None] * len(feat_files)
elif split == 'test':
feat_files = self.args.test_feat_files
text_files = self.args.test_text_files # can be empty
text_files = self.args.test_text_files # can be None
utt2num_frames_files = self.args.test_utt2num_frames_files # can be None
if text_files is None:
text_files = [None] * len(feat_files)
if utt2num_frames_files is None:
utt2num_frames_files = [None] * len(feat_files)
elif split == 'train_subset':
feat_files = self.args.train_subset_feat_files
text_files = self.args.train_subset_text_files
utt2num_frames_files = self.args.train_subset_utt2num_frames_files # can be None
if utt2num_frames_files is None:
utt2num_frames_files = [None] * len(feat_files)
else:
raise ValueError('split should be one of "train", "valid", "test", "train_subset"')

assert len(feat_files) > 0 and len(feat_files) == len(text_files)
file_pairs = zip(feat_files, text_files)
for feat, text in file_pairs:
assert len(feat_files) > 0 and len(feat_files) == len(text_files) and \
len(feat_files) == len(utt2num_frames_files)
file_tuples = zip(feat_files, text_files, utt2num_frames_files)
for feat, text, utt2num_frames in file_tuples:
assert ScpCachedDataset.exists(feat), feat + ' does not exist'
assert text is None or AsrTextDataset.exists(text), text + ' does not exist'
src_datasets.append(ScpCachedDataset(feat, ordered_prefetch=True))
assert utt2num_frames is None or ScpCachedDataset.exists(utt2num_frames), \
utt2num_frames + ' does not exist'
src_datasets.append(ScpCachedDataset(feat, utt2num_frames, ordered_prefetch=True))
logger.info('{} {} examples'.format(feat, len(src_datasets[-1])))
if text is not None:
tgt_datasets.append(AsrTextDataset(text, self.dictionary))
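
In load_dataset above, the per-split file lists must line up index by index, and for the train split one entry of each list is selected per epoch in a round-robin manner (utt2num_frames_files falls back to a list of None when the option is not given). A small worked example of the selection logic, with made-up shard names:

# Illustrative only: round-robin selection for split == 'train' with three shards.
feat_files = ['shard0.scp', 'shard1.scp', 'shard2.scp']
text_files = ['shard0.txt', 'shard1.txt', 'shard2.txt']
utt2num_frames_files = ['shard0.frames', 'shard1.frames', 'shard2.frames']

for epoch in range(5):
    idx = epoch % len(feat_files)
    # epoch 0 -> shard0, 1 -> shard1, 2 -> shard2, 3 -> shard0, 4 -> shard1
    print(epoch, feat_files[idx], text_files[idx], utt2num_frames_files[idx])
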
9 changes: 6 additions & 3 deletions examples/asr_librispeech/run.sh
@@ -197,8 +197,10 @@ fi

train_feat=$train_feat_dir/feats.scp
train_token_text=data/$train_set/token_text
train_utt2num_frames=data/$train_set/utt2num_frames
valid_feat=$valid_feat_dir/feats.scp
valid_token_text=data/$valid_set/token_text
valid_utt2num_frames=data/$valid_set/utt2num_frames
if [ ${stage} -le 7 ]; then
echo "Stage 7: Model Training"
valid_subset=valid
@@ -217,8 +219,8 @@ if [ ${stage} -le 7 ]; then
--arch speech_conv_lstm_librispeech --criterion label_smoothed_cross_entropy_v2 \
--label-smoothing 0.1 --smoothing-type uniform \
--scheduled-sampling-probs 1.0 --start-scheduled-sampling-epoch 1 \
--train-feat-files $train_feat --train-text-files $train_token_text \
--valid-feat-files $valid_feat --valid-text-files $valid_token_text \
--train-feat-files $train_feat --train-text-files $train_token_text --train-utt2num-frames-files $train_utt2num_frames \
--valid-feat-files $valid_feat --valid-text-files $valid_token_text --valid-utt2num-frames-files $valid_utt2num_frames \
--dict $dict --remove-bpe sentencepiece \
--max-source-positions 9999 --max-target-positions 999 2>&1 | tee $log_file
fi
@@ -236,10 +238,11 @@ if [ ${stage} -le 8 ]; then
for dataset in $test_set; do
feat=${dumpdir}/$dataset/delta${do_delta}/feats.scp
text=data/$dataset/token_text
utt2num_frames=data/$dataset/utt2num_frames
decode_dir=$dir/decode_$dataset${decode_affix:+_${decode_affix}}
CUDA_VISIBLE_DEVICES=$(echo $free_gpu | sed 's/,/ /g' | awk '{print $1}') speech_recognize.py \
--task speech_recognition_espresso --user-dir espresso --max-tokens 15000 --max-sentences 24 \
--num-shards 1 --shard-id 0 --test-feat-files $feat --test-text-files $text \
--num-shards 1 --shard-id 0 --test-feat-files $feat --test-text-files $text --test-utt2num-frames-files $utt2num_frames \
--dict $dict --remove-bpe sentencepiece \
--max-source-positions 9999 --max-target-positions 999 \
--path $path --beam 60 --max-len-a 0.08 --max-len-b 0 --lenpen 1.0 \
9 changes: 6 additions & 3 deletions examples/asr_swbd/run.sh
@@ -235,8 +235,10 @@ fi

train_feat=$train_feat_dir/feats.scp
train_token_text=data/$train_set/token_text
train_utt2num_frames=data/$train_set/utt2num_frames
valid_feat=$valid_feat_dir/feats.scp
valid_token_text=data/$valid_set/token_text
valid_utt2num_frames=data/$valid_set/utt2num_frames
if [ $stage -le 6 ]; then
echo "Stage 6: Model Training"
valid_subset=valid
@@ -257,8 +259,8 @@ if [ $stage -le 6 ]; then
--arch speech_conv_lstm_swbd --criterion label_smoothed_cross_entropy_v2 \
--label-smoothing 0.1 --smoothing-type uniform \
--scheduled-sampling-probs 0.9,0.8,0.7,0.6 --start-scheduled-sampling-epoch 6 \
--train-feat-files $train_feat --train-text-files $train_token_text \
--valid-feat-files $valid_feat --valid-text-files $valid_token_text \
--train-feat-files $train_feat --train-text-files $train_token_text --train-utt2num-frames-files $train_utt2num_frames \
--valid-feat-files $valid_feat --valid-text-files $valid_token_text --valid-utt2num-frames-files $valid_utt2num_frames \
--dict $dict --remove-bpe sentencepiece --non-lang-syms $nlsyms \
--max-source-positions 9999 --max-target-positions 999 $opts 2>&1 | tee $log_file
fi
@@ -277,12 +279,13 @@ if [ $stage -le 7 ]; then
[ -f local/wer_output_filter ] && opts="$opts --wer-output-filter local/wer_output_filter"
for dataset in $test_set; do
feat=${dumpdir}/$dataset/delta${do_delta}/feats.scp
utt2num_frames=data/$dataset/utt2num_frames
decode_dir=$dir/decode_${dataset}${decode_affix:+_${decode_affix}}
# only score train_dev with built-in scorer
text_opt= && [ "$dataset" == "train_dev" ] && text_opt="--test-text-files data/$dataset/token_text"
CUDA_VISIBLE_DEVICES=$(echo $free_gpu | sed 's/,/ /g' | awk '{print $1}') speech_recognize.py \
--task speech_recognition_espresso --user-dir espresso --max-tokens 24000 --max-sentences 48 \
--num-shards 1 --shard-id 0 --test-feat-files $feat $text_opt \
--num-shards 1 --shard-id 0 --test-feat-files $feat $text_opt --test-utt2num-frames-files $utt2num_frames \
--dict $dict --remove-bpe sentencepiece --non-lang-syms $nlsyms \
--max-source-positions 9999 --max-target-positions 999 \
--path $path --beam 35 --max-len-a 0.1 --max-len-b 0 --lenpen 1.0 \
10 changes: 7 additions & 3 deletions examples/asr_wsj/run.sh
@@ -249,8 +249,10 @@ fi

train_feat=$train_feat_dir/feats.scp
train_token_text=data/$train_set/token_text
train_utt2num_frames=data/$train_set/utt2num_frames
valid_feat=$valid_feat_dir/feats.scp
valid_token_text=data/$valid_set/token_text
valid_utt2num_frames=data/$valid_set/utt2num_frames
if [ ${stage} -le 8 ]; then
echo "Stage 8: Model Training"
opts=""
@@ -259,6 +261,7 @@ if [ ${stage} -le 8 ]; then
valid_subset="$valid_subset,train_subset"
opts="$opts --train-subset-feat-files $train_subset_feat_dir/feats.scp"
opts="$opts --train-subset-text-files data/${train_set}_${train_subset_size}/token_text"
opts="$opts --train-subset-utt2num-frames-files data/${train_set}_${train_subset_size}/utt2num_frames"
fi
[ -f local/wer_output_filter ] && opts="$opts --wer-output-filter local/wer_output_filter"
mkdir -p $dir/logs
@@ -276,8 +279,8 @@ if [ ${stage} -le 8 ]; then
--arch speech_conv_lstm_wsj --criterion label_smoothed_cross_entropy_v2 \
--label-smoothing 0.05 --smoothing-type temporal \
--scheduled-sampling-probs 0.5 --start-scheduled-sampling-epoch 6 \
--train-feat-files $train_feat --train-text-files $train_token_text \
--valid-feat-files $valid_feat --valid-text-files $valid_token_text \
--train-feat-files $train_feat --train-text-files $train_token_text --train-utt2num-frames-files $train_utt2num_frames \
--valid-feat-files $valid_feat --valid-text-files $valid_token_text --valid-utt2num-frames-files $valid_utt2num_frames \
--dict $dict --non-lang-syms $nlsyms \
--max-source-positions 9999 --max-target-positions 999 $opts 2>&1 | tee $log_file
fi
@@ -306,10 +309,11 @@ if [ ${stage} -le 9 ]; then
feat=$test_feat_dir/feats.scp
fi
text=data/$dataset/token_text
utt2num_frames=data/$dataset/utt2num_frames
decode_dir=$dir/decode_$dataset${decode_affix:+_${decode_affix}}
CUDA_VISIBLE_DEVICES=$(echo $free_gpu | sed 's/,/ /g' | awk '{print $1}') speech_recognize.py \
--task speech_recognition_espresso --user-dir espresso --max-tokens 20000 --max-sentences 32 \
--num-shards 1 --shard-id 0 --test-feat-files $feat --test-text-files $text \
--num-shards 1 --shard-id 0 --test-feat-files $feat --test-text-files $text --test-utt2num-frames-files $utt2num_frames \
--dict $dict --non-lang-syms $nlsyms \
--max-source-positions 9999 --max-target-positions 999 \
--path $path --beam 50 --max-len-a 0.2 --max-len-b 0 --lenpen 1.0 \
