Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

single language track setups #4895

Merged
merged 5 commits into from
Jan 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
132 changes: 87 additions & 45 deletions egs2/msuperb/asr1/local/single_lang_data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,36 @@
"voxpopuli",
]

SINGLE_LANG = ["eng", "deu", "rus", "pol", "swe", "jpn", "cmn", "sat", "nob", "xty"]
# Identifiers of the supported single-language tracks. A trailing digit
# distinguishes several tracks of the same language (e.g. "eng1", "eng2");
# the script uses the first three characters (args.lang[:3]) as the
# language directory name when walking the datasets.
SINGLE_LANG = [
    "eng1",
    "eng2",
    "eng3",
    "fra1",
    "fra2",
    "deu1",
    "deu2",
    "rus",
    "swa",
    "swe",
    "jpn",
    "cmn",
    "xty",
]

# Dataset selected for each single-language track. The script indexes this
# mapping with args.lang after checking that args.lang is in SINGLE_LANG, so
# every entry of SINGLE_LANG must appear here as a key (the former "deu" key
# made "--lang deu1" fail with a KeyError).
LANG_TO_SELECTED_DATASET = {
    "eng1": "mls",
    "eng2": "nchlt",
    "eng3": "voxpopuli",
    "fra1": "voxforge",
    "fra2": "voxpopuli",
    "deu1": "swc",
    "deu2": "voxpopuli",
    "rus": "M-AILABS",
    "swa": "ALFFA",
    "swe": "NST",
    "jpn": "commonvoice",
    "cmn": "fleurs",
    "xty": "mexico-el",
}


def process_text(text):
Expand All @@ -36,7 +65,9 @@ def process_text(text):

args = parser.parse_args()
assert args.duration in ["10min", "1h"], "we only support 10min or 1h setting"
assert args.lang in SINGLE_LANG, "the language {} is not in our recommend set"
assert (
args.lang in SINGLE_LANG
), "the language {} is not in our recommend set".format(args.lang)

langs_info = {}

Expand Down Expand Up @@ -91,7 +122,7 @@ def process_text(text):

# iterate through dataset
for dataset in DATA:
langs = [args.lang]
langs = [args.lang[:3]]
for lang in langs:
if not os.path.exists(os.path.join(args.source, dataset, lang)):
continue
Expand All @@ -100,51 +131,62 @@ def process_text(text):
langs_info[lang].append(dataset)

# process train
train_transcript = open(
os.path.join(
args.source,
dataset,
lang,
"transcript_{}_train.txt".format(args.duration),
),
"r",
encoding="utf-8",
)
for line in train_transcript.readlines():
line = line.strip().split(maxsplit=2)
utt_id, _, text = line
train_wavscp.write(
"{} sox {} -c 1 -t wavpcm -|\n".format(
utt_id,
os.path.join(
args.source, dataset, lang, "wav", "{}.wav".format(utt_id)
),
)
if dataset == LANG_TO_SELECTED_DATASET[args.lang]:
train_transcript = open(
os.path.join(
args.source,
dataset,
lang,
"transcript_{}_train.txt".format(args.duration),
),
"r",
encoding="utf-8",
)
train_text.write("{} {}\n".format(utt_id, process_text(text)))
train_utt2spk.write("{} {}\n".format(utt_id, utt_id))
train_transcript.close()

# process dev
dev_transcript = open(
os.path.join(args.source, dataset, lang, "transcript_10min_dev.txt"),
"r",
encoding="utf-8",
)
for line in dev_transcript.readlines():
line = line.strip().split(maxsplit=2)
utt_id, _, text = line
dev_wavscp.write(
"{} sox {} -c 1 -t wavpcm -|\n".format(
utt_id,
os.path.join(
args.source, dataset, lang, "wav", "{}.wav".format(utt_id)
),
for line in train_transcript.readlines():
line = line.strip().split(maxsplit=2)
utt_id, _, text = line
train_wavscp.write(
"{} sox {} -c 1 -t wavpcm -|\n".format(
utt_id,
os.path.join(
args.source,
dataset,
lang,
"wav",
"{}.wav".format(utt_id),
),
)
)
train_text.write("{} {}\n".format(utt_id, process_text(text)))
train_utt2spk.write("{} {}\n".format(utt_id, utt_id))
train_transcript.close()

# process dev
dev_transcript = open(
os.path.join(
args.source, dataset, lang, "transcript_10min_dev.txt"
),
"r",
encoding="utf-8",
)
dev_text.write("{} {}\n".format(utt_id, process_text(text)))
dev_utt2spk.write("{} {}\n".format(utt_id, utt_id))
dev_transcript.close()
for line in dev_transcript.readlines():
line = line.strip().split(maxsplit=2)
utt_id, _, text = line
dev_wavscp.write(
"{} sox {} -c 1 -t wavpcm -|\n".format(
utt_id,
os.path.join(
args.source,
dataset,
lang,
"wav",
"{}.wav".format(utt_id),
),
)
)
dev_text.write("{} {}\n".format(utt_id, process_text(text)))
dev_utt2spk.write("{} {}\n".format(utt_id, utt_id))
dev_transcript.close()

# process test
test_transcript = open(
Expand Down
49 changes: 49 additions & 0 deletions egs2/msuperb/asr1/run_single_lang.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Run the ASR recipe once per (duration, single-language track) pair by
# calling ./asr.sh with track-specific data-set names and experiment tag.
# Any extra command-line arguments are forwarded to every asr.sh call.

# Strict mode: abort on a failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Settings shared by all runs.
multilingual=false
lid=false
nlsyms_txt=data/local/nlsyms.txt
asr_config=conf/tuning/train_asr_fbank_single.yaml
lm_config=conf/train_lm.yaml
inference_config=conf/decode_asr.yaml

# Loop domains: training-data durations and single-language track ids.
durations=(10min 1h)
single_langs=(eng1 eng2 eng3 fra1 fra2 deu1 deu2 rus swa swe jpn cmn xty)

for duration in "${durations[@]}"; do
    echo "${duration}"
    for single_lang in "${single_langs[@]}"; do
        echo "${single_lang}"

        # The training set depends on the duration; dev and test always use
        # the 10min split names.
        train_set="train_${duration}_${single_lang}"
        train_dev="dev_10min_${single_lang}"
        test_set="${train_dev} test_10min_${single_lang}"
        asr_tag="$(basename "${asr_config}" .yaml)_${single_lang}_${duration}"

        # Options handed through to the local data preparation step.
        data_opts="--duration ${duration} --lid ${lid} --multilingual ${multilingual}"
        data_opts+=" --single_lang ${single_lang} --nlsyms_txt ${nlsyms_txt}"

        ./asr.sh \
            --ngpu 1 \
            --stage 1 \
            --stop_stage 12 \
            --lang "${single_lang}" \
            --nj 4 \
            --inference_nj 4 \
            --local_data_opts "${data_opts}" \
            --use_lm false \
            --lm_config "${lm_config}" \
            --token_type char \
            --feats_type raw \
            --asr_config "${asr_config}" \
            --inference_config "${inference_config}" \
            --train_set "${train_set}" \
            --valid_set "${train_dev}" \
            --test_sets "${test_set}" \
            --bpe_train_text "data/${train_set}/text" \
            --asr_tag "${asr_tag}" \
            --lm_train_text "data/${train_set}/text" "$@"
    done
done