resolve conflicts and fix lm_train filenames
chintu619 committed May 13, 2022
1 parent ea44663 commit 3cac7bb
Showing 1 changed file with 33 additions and 32 deletions.
egs2/TEMPLATE/st1/st.sh (65 changed lines: 33 additions, 32 deletions)
@@ -483,7 +483,7 @@ if ! "${skip_data_prep}"; then
done
utils/combine_data.sh --extra_files "${utt_extra_files}" "data/${train_set}_sp" ${_dirs}
for extra_file in ${utt_extra_files}; do
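            # Strip duplicated keys that utils/combine_data.sh may leave in the extra text files.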
-                python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp
+            python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp
mv data/"${train_set}_sp"/${extra_file}.tmp data/"${train_set}_sp"/${extra_file}
done
else
@@ -522,7 +522,7 @@ if ! "${skip_data_prep}"; then
for single_file in $(ls data/"${dset}"/${extra_file}*); do
cp ${single_file} "${data_feats}${_suf}/${dset}"
expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
-                        done
+                    done
done
echo "${expand_utt_extra_files}"
utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}" "${data_feats}${_suf}/${dset}"
@@ -567,7 +567,7 @@ if ! "${skip_data_prep}"; then
for single_file in $(ls data/"${dset}"/${extra_file}*); do
cp ${single_file} "${data_feats}${_suf}/${dset}"
expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
-                        done
+                    done
done
for extra_file in ${expand_utt_extra_files}; do
LC_ALL=C sort -u -k1,1 "${data_feats}${_suf}/${dset}/${extra_file}" -o "${data_feats}${_suf}/${dset}/${extra_file}"
@@ -616,7 +616,7 @@ if ! "${skip_data_prep}"; then
for single_file in $(ls data/"${dset}"/${extra_file}*); do
cp ${single_file} "${data_feats}${_suf}/${dset}"
expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
-                        done
+                    done
done
utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}*" "${data_feats}${_suf}/${dset}"
for extra_file in ${expand_utt_extra_files}; do
@@ -706,11 +706,12 @@ if ! "${skip_data_prep}"; then
python pyscripts/utils/remove_duplicate_keys.py ${data_feats}/${dset}/${utt_extra_file} \
> ${data_feats}/${dset}/${utt_extra_file}.tmp
mv ${data_feats}/${dset}/${utt_extra_file}.tmp ${data_feats}/${dset}/${utt_extra_file}
done
done
done

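        # Drop utterances whose text is empty, i.e. lines where the utterance ID is the only field (NF == 1).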
# shellcheck disable=SC2002
-        cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
+        cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' \
+            > "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
fi

if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -783,10 +784,10 @@ if ! "${skip_data_prep}"; then

# Create word-list for word-LM training
if ${use_word_lm} && [ "${tgt_token_type}" != word ]; then
log "Generate word level token_list from ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
log "Generate word level token_list from ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
${python} -m espnet2.bin.tokenize_text \
--token_type word \
-                --input "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \
+                --input "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \
--field 2- \
--cleaner "${cleaner}" \
--g2p "${g2p}" \
@@ -872,7 +873,7 @@ fi
if ! "${skip_train}"; then
if "${use_lm}"; then
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"
log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"

_opts=
if [ -n "${lm_config}" ]; then
@@ -885,9 +886,9 @@ if ! "${skip_train}"; then
_logdir="${lm_stats_dir}/logdir"
mkdir -p "${_logdir}"
# Get the minimum number among ${nj} and the number lines of input files
-            _nj=$(min "${nj}" "$(<${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)")
+            _nj=$(min "${nj}" "$(<${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)")

-            key_file="${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
+            key_file="${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
split_scps=""
for n in $(seq ${_nj}); do
split_scps+=" ${_logdir}/train.${n}.scp"
@@ -911,7 +912,7 @@ if ! "${skip_train}"; then
log "LM collect-stats started... log: '${_logdir}/stats.*.log'"
# NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted,
# but it's used only for deciding the sample ids.
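            # If a job fails, print only the log files that contain an error (grep -l lists matching filenames).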
-            # shellcheck disable=SC2086
+            # shellcheck disable=SC2046,SC2086
${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
${python} -m espnet2.bin.lm_train \
--collect_stats true \
@@ -922,12 +923,12 @@ if ! "${skip_train}"; then
--non_linguistic_symbols "${nlsyms_txt}" \
--cleaner "${cleaner}" \
--g2p "${g2p}" \
-                --train_data_path_and_name_and_type "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text" \
+                --train_data_path_and_name_and_type "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text" \
--valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
--train_shape_file "${_logdir}/train.JOB.scp" \
--valid_shape_file "${_logdir}/dev.JOB.scp" \
--output_dir "${_logdir}/stats.JOB" \
-                ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; }
+                ${_opts} ${lm_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; }

# 4. Aggregate shape files
_opts=
@@ -949,7 +950,7 @@ if ! "${skip_train}"; then


if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"
log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"

_opts=
if [ -n "${lm_config}" ]; then
@@ -967,20 +968,20 @@ if ! "${skip_train}"; then
if [ ! -f "${_split_dir}/.done" ]; then
rm -f "${_split_dir}/.done"
${python} -m espnet2.bin.split_scps \
-                    --scps "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
+                    --scps "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
--num_splits "${num_splits_lm}" \
--output_dir "${_split_dir}"
touch "${_split_dir}/.done"
else
log "${_split_dir}/.done exists. Spliting is skipped"
fi

_opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text "
_opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text "
_opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} "
_opts+="--multiple_iterator true "

else
_opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text "
_opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text "
_opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} "
fi

@@ -1053,9 +1054,9 @@ if ! "${skip_train}"; then
fi
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
if "${use_ngram}"; then
log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
cut -f 2 -d " " ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin
log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
cut -f 2 -d " " ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin
else
log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}"
fi
@@ -1407,7 +1408,7 @@ if ! "${skip_eval}"; then

# 2. Submit decoding jobs
log "Decoding started... log: '${_logdir}/st_inference.*.log'"
-            # shellcheck disable=SC2086
+            # shellcheck disable=SC2046,SC2086
${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/st_inference.JOB.log \
${python} -m ${st_inference_tool} \
--batch_size ${batch_size} \
@@ -1417,7 +1418,7 @@ if ! "${skip_eval}"; then
--st_train_config "${st_exp}"/config.yaml \
--st_model_file "${st_exp}"/"${inference_st_model}" \
--output_dir "${_logdir}"/output.JOB \
-                ${_opts} ${inference_args}
+                ${_opts} ${inference_args} || { cat $(grep -l -i error "${_logdir}"/st_inference.*.log) ; exit 1; }

# 3. Concatenates the output files from each jobs
for f in token token_int score text; do
@@ -1463,7 +1464,7 @@ if ! "${skip_eval}"; then
) \
<(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
>"${_scoredir}/hyp.trn.org"
-
+            # remove utterance id
perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn"
perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn"
@@ -1484,7 +1485,7 @@ if ! "${skip_eval}"; then
-i "${_scoredir}/hyp.trn.detok" \
-m bleu chrf ter \
>> ${_scoredir}/result.tc.txt
-
+                log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt"
fi

@@ -1516,8 +1517,8 @@ if ! "${skip_eval}"; then
) \
<(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
>"${_scoredir}/ref.trn.org.${ref_idx}"
-                #

+                # remove utterance id
perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}"
detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}"
remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}"
@@ -1653,11 +1654,11 @@ if ! "${skip_upload_hf}"; then
gitlfs=$(git lfs --version 2> /dev/null || true)
[ -z "${gitlfs}" ] && \
log "ERROR: You need to install git-lfs first" && \
-            exit 1
+        exit 1

dir_repo=${expdir}/hf_${hf_repo//"/"/"_"}
[ ! -d "${dir_repo}" ] && git clone https://huggingface.co/${hf_repo} ${dir_repo}

if command -v git &> /dev/null; then
_creator_name="$(git config user.name)"
_checkout="git checkout $(git show -s --format=%H)"
@@ -1670,13 +1671,13 @@ if ! "${skip_upload_hf}"; then
# foo/asr1 -> foo
_corpus="${_task%/*}"
_model_name="${_creator_name}/${_corpus}_$(basename ${packed_model} .zip)"

# copy files in ${dir_repo}
unzip -o ${packed_model} -d ${dir_repo}
# Generate description file
# shellcheck disable=SC2034
hf_task=speech-translation
-    # shellcheck disable=SC2034
+        # shellcheck disable=SC2034
espnet_task=ST
# shellcheck disable=SC2034
task_exp=${st_exp}
