Skip to content

Commit

Permalink
Merge pull request #4196 from chintu619/msr_indic_is18
Browse files Browse the repository at this point in the history
bug fixes and efficient train/dev split in data prep of Microsoft Indian Languages recipe
  • Loading branch information
sw005320 committed Mar 24, 2022
2 parents 47f68b6 + c4afcc3 commit 14c6350
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 24 deletions.
15 changes: 6 additions & 9 deletions egs2/ms_indic_18/asr1/local/data.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,14 @@ stop_stage=100
SECONDS=0
lang=te # te ta gu

. utils/parse_options.sh || exit 1;
. utils/parse_options.sh || exit 1;


# Print a timestamped log line tagged with the location log() was called from.
# Output format: <YYYY-MM-DDTHH:MM:SS> (<file>:<line>:<function>) <message>
# Arguments: $* - the message text (echo -e interprets backslash escapes in it).
log() {
# BASH_SOURCE[1] is the file of the caller; ##*/ strips everything up to the last slash.
local fname=${BASH_SOURCE[1]##*/}
# BASH_LINENO[0] is the line number of the log() call; FUNCNAME[1] is the calling function.
echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

mkdir -p ${MS_INDIC_IS18}
if [ -z "${MS_INDIC_IS18}" ]; then
log "Fill the value of 'MS_INDIC_IS18' of db.sh"
exit 1
Expand All @@ -33,19 +32,16 @@ set -e
set -u
set -o pipefail

train_set=train_"$(echo "${lang}" | tr - _)"
train_dev=dev_"$(echo "${lang}" | tr - _)"
test_set=test_"$(echo "${lang}" | tr - _)"

log "data preparation started"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -d "${MS_INDIC_IS18}/${lang}-in-Train" ]]; then
log "stage0: Download data to ${MS_INDIC_IS18}. ${lang}-in-Train} directory is missing"
log "stage0: Download training data to ${MS_INDIC_IS18}. ${lang}-in-Train directory is missing"
exit 1
elif [[ ! -d "${MS_INDIC_IS18}/${lang}-in-Test" ]]; then
log "stage0: Download data to ${MS_INDIC_IS18}. ${lang}-in-Test} directory is missing"
log "stage0: Download test data to ${MS_INDIC_IS18}. ${lang}-in-Test directory is missing"
exit 1
fi
exit 1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
Expand All @@ -54,4 +50,5 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
local/prepare_data.py ${MS_INDIC_IS18} ${lang}
fi


log "Successfully finished. [elapsed=${SECONDS}s]"
21 changes: 9 additions & 12 deletions egs2/ms_indic_18/asr1/local/prepare_data.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@


import os
import soundfile as sf
import random
import sys
import librosa


if len(sys.argv) != 3:
print("Usage: python prepare_data.py [data-directory] [language-ID]")
Expand All @@ -21,7 +22,7 @@
testdir = f"{datadir}/{lang}-in-Test/"

train_datadir = f"data/train_{lang}/"
valid_datadir = f"data/valid_{lang}/"
valid_datadir = f"data/dev_{lang}/"
test_datadir = f"data/test_{lang}/"

os.popen(f"mkdir -p {train_datadir}").read()
Expand All @@ -30,26 +31,22 @@


# prepare data for training and validation splits
with open(traindir+'transcriptions.txt') as f:
with open(traindir+'transcription.txt') as f:
train_lines = [line.rstrip() for line in f.readlines()]
train_id2text = {}
train_id2dur = {}
train_id2filepath = {}
for line in train_lines:
wav_id = line.split()[0]
filepath = f"{traindir}/Audios/{wav_id}.wav"
train_id2text[wav_id] = ' '.join(line.split()[1:])
train_id2filepath[wav_id] = filepath

def get_duration(filepath):
x,f = sf.read(filepath)
return len(x)/f

wav_ids = list(train_id2text.keys())
random.shuffle(wav_ids)
valid_id2text = {}
valid_totaldur = 2*60*60 # (in seconds) 2 hours taken for validation split
for wav_id in wav_ids:
dur = get_duration(train_id2filepath[wav_id])
dur = librosa.get_duration(filename=train_id2filepath[wav_id])
valid_id2text[wav_id] = train_id2text.pop(wav_id)
valid_totaldur -= dur
if valid_totaldur < 0:
Expand All @@ -58,7 +55,7 @@ def get_duration(filepath):

with open(train_datadir+'text', 'w') as f:
for wav_id in sorted(train_id2text):
f.write(f"{lang}_{wav_id} {test_id2text[wav_id]}\n")
f.write(f"{lang}_{wav_id} {train_id2text[wav_id]}\n")
with open(train_datadir+'wav.scp', 'w') as f:
for wav_id in sorted(train_id2text):
f.write(f"{lang}_{wav_id} {train_id2filepath[wav_id]}\n")
Expand All @@ -71,7 +68,7 @@ def get_duration(filepath):

with open(valid_datadir+'text', 'w') as f:
for wav_id in sorted(valid_id2text):
f.write(f"{lang}_{wav_id} {test_id2text[wav_id]}\n")
f.write(f"{lang}_{wav_id} {valid_id2text[wav_id]}\n")
with open(valid_datadir+'wav.scp', 'w') as f:
for wav_id in sorted(valid_id2text):
f.write(f"{lang}_{wav_id} {train_id2filepath[wav_id]}\n")
Expand All @@ -84,7 +81,7 @@ def get_duration(filepath):


# prepare test data
with open(testdir+'transcriptions.txt') as f:
with open(testdir+'transcription.txt') as f:
test_lines = [line.rstrip() for line in f.readlines()]
test_id2text = {}
test_id2filepath = {}
Expand Down
6 changes: 3 additions & 3 deletions egs2/ms_indic_18/asr1/run.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ asr_config=conf/train_asr.yaml
lm_config=conf/train_lm.yaml
inference_config=conf/decoder_asr.yaml

if [[ "zh" == *"${lang}"* ]]; then
if [[ "zh" == *"${lang}"* ]]; then # placeholder for optimal bpe when lang=te
nbpe=2500
elif [[ "fr" == *"${lang}"* ]]; then
elif [[ "fr" == *"${lang}"* ]]; then # placeholder for optimal bpe when lang=ta
nbpe=350
elif [[ "es" == *"${lang}"* ]]; then
elif [[ "es" == *"${lang}"* ]]; then # placeholder for optimal bpe when lang=gu
nbpe=235
else
nbpe=150
Expand Down

0 comments on commit 14c6350

Please sign in to comment.