Merge branch 'espnet:master' into master

espnet · Nov 22, 2022 · fbfe277 · fbfe277
2 parents 562f7dd + ca2193d
commit fbfe277
Show file tree

Hide file tree

Showing 25 changed files with 872 additions and 27 deletions.
diff --git a/egs2/TEMPLATE/asr1/asr.sh b/egs2/TEMPLATE/asr1/asr.sh
@@ -62,6 +62,7 @@ sos_eos="<sos/eos>" # sos and eos symbole
 bpe_input_sentence_size=100000000 # Size of input sentence for BPE.
 bpe_nlsyms=         # non-linguistic symbols list, separated by a comma or a file containing 1 symbol per line, for BPE
 bpe_char_cover=1.0  # character coverage when modeling BPE
+hugging_face_model_name_or_path="" # Hugging Face model or path for hugging_face tokenizer
 
 # Ngram model related
 use_ngram=false
@@ -306,6 +307,7 @@ bpeprefix="${bpedir}"/bpe
 bpemodel="${bpeprefix}".model
 bpetoken_list="${bpedir}"/tokens.txt
 chartoken_list="${token_listdir}"/char/tokens.txt
+hugging_face_token_list="${token_listdir}/hugging_face_"${hugging_face_model_name_or_path/\//-}/tokens.txt
 # NOTE: keep for future development.
 # shellcheck disable=SC2034
 wordtoken_list="${token_listdir}"/word/tokens.txt
@@ -318,6 +320,9 @@ elif [ "${token_type}" = char ]; then
 elif [ "${token_type}" = word ]; then
     token_list="${wordtoken_list}"
     bpemodel=none
+elif [ "${token_type}" = hugging_face ]; then
+    token_list="${hugging_face_token_list}"
+    bpemodel=${hugging_face_model_name_or_path}
 else
     log "Error: not supported --token_type '${token_type}'"
     exit 2
@@ -349,6 +354,9 @@ if [ -z "${asr_tag}" ]; then
     if [ "${token_type}" = bpe ]; then
         asr_tag+="${nbpe}"
     fi
+    if [ "${token_type}" = hugging_face ]; then
+        asr_tag+="_"${hugging_face_model_name_or_path/\//-}
+    fi
     # Add overwritten arg's info
     if [ -n "${asr_args}" ]; then
         asr_tag+="$(echo "${asr_args}" | sed -e "s/--/\_/g" -e "s/[ |=/]//g")"
@@ -387,6 +395,9 @@ if [ -z "${asr_stats_dir}" ]; then
     if [ "${token_type}" = bpe ]; then
         asr_stats_dir+="${nbpe}"
     fi
+    if [ "${token_type}" = hugging_face ]; then
+        asr_stats_dir+="_"${hugging_face_model_name_or_path/\//-}
+    fi
     if [ -n "${speed_perturb_factors}" ]; then
         asr_stats_dir+="_sp"
     fi
@@ -690,7 +701,14 @@ if ! "${skip_data_prep}"; then
                 --add_symbol "${blank}:0" \
                 --add_symbol "${oov}:1" \
                 --add_symbol "${sos_eos}:-1"
+        elif [ "${token_type}" = hugging_face ]; then
+            log "Stage 5: Generate hugging_face token_list from ${hugging_face_model_name_or_path}"
 
+            # The first symbol in token_list must be "<blank>" and the last must also be sos/eos:
+            # 0 is reserved for CTC-blank for ASR and is also used as ignore-index in other tasks
+            ${python} -m espnet2.bin.hugging_face_export_vocabulary  \
+                --model_name_or_path "${hugging_face_model_name_or_path}" \
+                --output "${token_list}"
         else
             log "Error: not supported --token_type '${token_type}'"
             exit 2

diff --git a/egs2/slurp_entity/asr1/README.md b/egs2/slurp_entity/asr1/README.md
@@ -48,6 +48,28 @@
 
 
 
+# Using XLS-R pretrained speech Encoder and mBART-50 Large pretrained text Encoder-Decoder
+
+- ASR config: [conf/tuning/train_asr_branchformer_xlsr_mbart.yaml](conf/tuning/train_asr_branchformer_xlsr_mbart.yaml)
+- #Params: 1.21 B
+
+## Environments
+- date: `Wed Sep  7 01:16:08 CEST 2022`
+- python version: `3.9.13 (main, Jun  9 2022, 00:00:00)  [GCC 11.3.1 20220421 (Red Hat 11.3.1-2)]`
+- espnet version: `espnet 202207`
+- pytorch version: `pytorch 1.12.1+cu116`
+- Git hash: `c9cb7c424c90e9d3a59ace324308793b91fedbe1`
+- Commit date: `Tue Aug 23 16:22:24 2022 +0200`
+
+## Intent Classification
+- Valid Intent Classification Result: 0.8933256616800921
+- Test Intent Classification Result: 0.8811744915124636
+
+## Entity
+|Slu f1|Precision|Recall|F-Measure|
+|:---:|:---:|:---:|:---:|
+|test|0.7949|0.7788|0.7868|
+
 # Initial Result
 
 ## Environments

diff --git a/egs2/slurp_entity/asr1/conf/decode_asr_hf.yaml b/egs2/slurp_entity/asr1/conf/decode_asr_hf.yaml
@@ -0,0 +1,3 @@
+beam_size: 5
+ctc_weight: 0.0
+hugging_face_decoder: True
diff --git a/egs2/slurp_entity/asr1/conf/tuning/train_asr_branchformer_xlsr_mbart.yaml b/egs2/slurp_entity/asr1/conf/tuning/train_asr_branchformer_xlsr_mbart.yaml
@@ -0,0 +1,82 @@
+# network architecture
+# encoder related
+encoder: branchformer
+encoder_conf:
+    output_size: 1024
+    use_attn: true
+    attention_heads: 8
+    attention_layer_type: rel_selfattn
+    pos_enc_layer_type: rel_pos
+    rel_pos_type: latest
+    use_cgmlp: true
+    cgmlp_linear_units: 4096
+    cgmlp_conv_kernel: 31
+    use_linear_after_conv: false
+    gate_activation: identity
+    merge_method: concat
+    cgmlp_weight: 0.5               # used only if merge_method is "fixed_ave"
+    attn_branch_drop_rate: 0.0      # used only if merge_method is "learned_ave"
+    num_blocks: 18
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    stochastic_depth_rate: 0.0
+
+postencoder: hugging_face_transformers
+postencoder_conf:
+    model_name_or_path: "akreal/mbart-large-50-finetuned-slurp"
+    length_adaptor_n_layers: 1
+    lang_token_id: 250004
+
+decoder: hugging_face_transformers
+decoder_conf:
+    model_name_or_path: "akreal/mbart-large-50-finetuned-slurp"
+
+use_amp: true
+num_workers: 2
+optim: adam
+batch_type: length
+batch_bins: 170000
+accum_grad: 4
+optim_conf:
+    lr: 0.00005
+    weight_decay: 0.000001
+scheduler: warmuplr     # pytorch v1.1.0+ required
+scheduler_conf:
+    warmup_steps: 25000
+max_epoch: 50
+
+freeze_param: [
+    "frontend.upstream"
+]
+
+frontend: s3prl
+frontend_conf:
+    frontend_conf:
+        upstream: xls_r_300m  # Note: If the upstream is changed, please change the input_size in the preencoder.
+    download_dir: ./hub
+    multilayer_feature: True
+
+preencoder: linear
+preencoder_conf:
+    input_size: 1024  # Note: If the upstream is changed, please change this value accordingly.
+    output_size: 80
+
+model_conf:
+    ctc_weight: 0.0
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    extract_feats_in_collect_stats: false   # Note: "false" means that during collect stats (stage 10), dummy stats files are generated instead of extracting features by forwarding through the frontend.
+    # mBART dictionary customizations
+    ignore_id: 1
+    sym_blank: "<pad>"
+    sym_sos: "<s>"
+    sym_eos: "</s>"
+    lang_token_id: 250004
+
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
diff --git a/egs2/slurp_entity/asr1/local/convert_to_entity_file.py b/egs2/slurp_entity/asr1/local/convert_to_entity_file.py
@@ -3,18 +3,24 @@
 import os
 import sys
 
+from espnet2.utils.types import str2bool
 
-def generate_entity_file(line_arr, output_file="result_test.json"):
+
+def generate_entity_file(line_arr, output_file="result_test.json", token_type_bpe=True):
     fp = open(output_file, "w")
     for line in line_arr:
         scenario = line.strip().split("\t")[0].split("_")[0]
         action = "_".join(line.strip().split("\t")[0].split()[0].split("_")[1:])
+        if not token_type_bpe:
+            line = line.replace(" ", "▁")
         entity_names_arr = line.strip().split("▁SEP")[1:-1]
         ent_final_arr = []
         for entity in entity_names_arr:
             if len(entity.split("▁FILL")) != 2:
                 continue
             ent_type = entity.split("▁FILL")[0].strip()
+            if not token_type_bpe:
+                ent_type = ent_type.replace("▁", " ").strip()
             ent_val = entity.split("▁FILL")[1].strip().replace(" ", "")
             ent_val = ent_val.replace("▁", " ").strip().replace("'", "'")
             dict1 = {}
@@ -45,6 +51,12 @@ def generate_entity_file(line_arr, output_file="result_test.json"):
     default="decode_asr_asr_model_valid.acc.ave_10best/test/",
     help="Directory inside exp_root containing inference on test set",
 )
+parser.add_argument(
+    "--token_type_bpe",
+    type=str2bool,
+    default=True,
+    help="Whether text is encoded in BPE units",
+)
 
 args = parser.parse_args()
 
@@ -54,4 +66,8 @@ def generate_entity_file(line_arr, output_file="result_test.json"):
 
 gen_file = open(os.path.join(exp_root, test_inference_folder + "score_wer/hyp.trn"))
 line_arr = [line for line in gen_file]
-generate_entity_file(line_arr, output_file=os.path.join(exp_root, "result_test.json"))
+generate_entity_file(
+    line_arr,
+    output_file=os.path.join(exp_root, "result_test.json"),
+    token_type_bpe=args.token_type_bpe,
+)
diff --git a/egs2/slurp_entity/asr1/local/data.sh b/egs2/slurp_entity/asr1/local/data.sh
@@ -14,6 +14,7 @@ SECONDS=0
 
 stage=1
 stop_stage=100000
+token_type_bpe=true
 log "$0 $*"
 . utils/parse_options.sh
 
@@ -33,7 +34,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     if [ ! -e "${SLURP}/LICENSE.txt" ]; then
-	echo "stage 1: Download data to ${SLURP}"
+        echo "stage 1: Download data to ${SLURP}"
     else
         log "stage 1: ${SLURP}/LICENSE.txt is already existing. Skip data downloading"
     fi
@@ -56,7 +57,10 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python local/prepare_entity_type.py
     for x in test devel train; do
         mv data/${x}/text data/${x}/text_old
-	mv data/${x}/text_new data/${x}/text
+        mv data/${x}/text_new data/${x}/text
+        if ! "${token_type_bpe}"; then
+            cp -a data_old/${x} data/${x}_char
+        fi
     done
 fi
 

diff --git a/egs2/slurp_entity/asr1/local/score.sh b/egs2/slurp_entity/asr1/local/score.sh
@@ -6,6 +6,7 @@
 # cmd=run.pl
 # stage=0
 # data=data/eval2000
+token_type_bpe=true
 # #end configuration section.
 
 [ -f ./path.sh ] && . ./path.sh
@@ -17,16 +18,25 @@ if [ $# -lt 1 ]; then
 fi
 . ./db.sh
 
+if [ -z "${SLURP}" ]; then
+    echo "Fill the value of 'SLURP' of db.sh"
+    exit 1
+fi
+
 asr_expdir=$1
 
 if [ $# -gt 1 ]; then
 	valid_inference_folder=$2
 	test_inference_folder=$3
-	python local/score.py --exp_root ${asr_expdir} --valid_folder ${valid_inference_folder} --test_folder ${test_inference_folder}
-	python local/convert_to_entity_file.py --exp_root ${asr_expdir} --valid_folder ${valid_inference_folder} --test_folder ${test_inference_folder}
 else
-	python local/score.py --exp_root ${asr_expdir}
-	python local/convert_to_entity_file.py --exp_root ${asr_expdir}
+	valid_inference_folder=$(ls ${asr_expdir}/*/devel*/score_wer/hyp.trn | head -n 1 | sed 's!//!/!g' | cut -d/ -f3,4)/
+	test_inference_folder=$(ls ${asr_expdir}/*/test*/score_wer/hyp.trn | head -n 1 | sed 's!//!/!g' | cut -d/ -f3,4)/
 fi
+python local/score.py --exp_root ${asr_expdir} --valid_folder ${valid_inference_folder} --test_folder ${test_inference_folder}
+python local/convert_to_entity_file.py \
+	--exp_root ${asr_expdir} \
+	--valid_folder ${valid_inference_folder} \
+	--test_folder ${test_inference_folder} \
+	--token_type_bpe ${token_type_bpe}
 python local/evaluation/evaluate.py -g ${SLURP}/dataset/slurp/test.jsonl -p ${asr_expdir}/result_test.json
 exit 0
diff --git a/egs2/slurp_entity/asr1/run-hugging-face.sh b/egs2/slurp_entity/asr1/run-hugging-face.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Set bash to strict mode; the script will exit on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
+set -e
+set -u
+set -o pipefail
+
+train_set="train_char"
+valid_set="devel_char"
+test_sets="test_char devel_char"
+
+asr_config=conf/tuning/train_asr_branchformer_xlsr_mbart.yaml
+inference_config=conf/decode_asr_hf.yaml
+
+./asr.sh \
+    --lang en \
+    --ngpu 1 \
+    --use_lm false \
+    --token_type hugging_face \
+    --hugging_face_model_name_or_path facebook/mbart-large-50-many-to-many-mmt \
+    --local_data_opts "--token_type_bpe false" \
+    --local_score_opts "--token_type_bpe false" \
+    --max_wav_duration 30 \
+    --speed_perturb_factors "0.9 1.0 1.1" \
+    --feats_normalize utterance_mvn \
+    --asr_config "${asr_config}" \
+    --inference_config "${inference_config}" \
+    --inference_nj 1 \
+    --gpu_inference true \
+    --train_set "${train_set}" \
+    --valid_set "${valid_set}" \
+    --lm_train_text "data/${train_set}/text" \
+    --test_sets "${test_sets}" "$@"