espnet · mergify · Dec 5, 2022 · Dec 5, 2022 · Dec 5, 2022 · Dec 5, 2022
diff --git a/egs2/TEMPLATE/asr1/db.sh b/egs2/TEMPLATE/asr1/db.sh
@@ -117,6 +117,7 @@ YESNO=downloads
 YOLOXOCHITL_MIXTEC=downloads
 HOW2_TEXT=downloads/how2-300h-v1
 HOW2_FEATS=downloads/fbank_pitch_181516
+HOW2_2kH=downloads/how2_release
 ZEROTH_KOREAN=downloads
 JAVA=downloads
 RU_OPEN_STT=downloads

diff --git a/egs2/how2_2000h/asr1/README.md b/egs2/how2_2000h/asr1/README.md
@@ -1,8 +1,21 @@
 ## End to End Speech Recognition with How2-2000h
 
 
+# Data Download and Preparation
 HowTo 2000h fbank-pitch features have been released to enable reproduction of this recipe. 
 
+You can request the use of this data using our [data request form](https://docs.google.com/forms/d/e/1FAIpQLSfW2i8UnjuoH2KKSU0BvcKRbhnk_vL3HcNlM0QLsJGb_UEDVQ/viewform).
+
+For ASR and Summarization, please request the data labeled "(audio_2000) fbank+pitch features in Kaldi scp/ark format for 2000 hours"
+
+You will receive a data download link shortly after you submit the form.
+Then you can prepare the data directory by providing your link as follows:
+
+
+```bash
+./run.sh --local_data_opts "--data_url <insert-link-here>"
+```
+
 
 # Results on ASR
 

diff --git a/egs2/how2_2000h/asr1/conf/fbank.conf b/egs2/how2_2000h/asr1/conf/fbank.conf
@@ -1,2 +1,2 @@
 --sample-frequency=16000 
---num-mel-bins=80
+--num-mel-bins=40
diff --git a/egs2/how2_2000h/asr1/local/data.sh b/egs2/how2_2000h/asr1/local/data.sh
@@ -17,8 +17,9 @@ stop_stage=1
 . ./path.sh
 . ./cmd.sh
 
-url_how2_2000="https://drive.google.com/file/d/1SHg7La_hflMTIm6gaCus46sn4zYqWJvb/view?usp=sharing"
-data_how2=how2_feats
+data_url=
+data_how2=${HOW2_2kH}
+
 
 log "$0 $*"
 . utils/parse_options.sh
@@ -38,16 +39,24 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     if [ -d ${data_how2} ]; then
         log "$0: HowTo directory or archive already exists in ${data_how2}. Skipping download."
     else
-        ../../../utils/download_from_google_drive.sh ${url_how2_2000} $PWD tar.gz
-        log "$0: Successfully downloaded and un-tarred how2_feats.tar.gz"
+        wget "${data_url:?pass the link via --local_data_opts \"--data_url <link>\"}" -O out.tar.bz2  # -O saves the payload; -o would only write wget's log there
+        mkdir -p "${data_how2}" && tar -xvf out.tar.bz2 -C "${data_how2}"  # this branch runs only when the dir is absent, and tar -C needs it to exist
+        log "$0: Successfully downloaded and un-tarred how2_feats"
     fi
 fi
 
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     log "stage 1: Data preparation and verification"
-    mv how2_feats/data .
-    mv how2_feats/fbank .
+
+    mkdir -p data
+    for dir in tr_2000h_utt cv05_utt dev5_test_utt; do  # one Kaldi-style data dir per split
+        [ -d "data/${dir}" ] || mv "${data_how2}/data/${dir}" "data/${dir}"  # -d: the target is a directory, so -f never matched and re-runs re-moved it
+        [ -f "data/${dir}/feats.scp" ] || awk -F ' ' -v x="$(realpath "${data_how2}")" '{print $1,x"/audio/fbank_pitch/"$2}' < "${data_how2}/audio/fbank_pitch/all_utts_asr.scp" > "data/${dir}/feats.scp"  # prefix each ark path with the absolute release dir
+        [ -f "data/${dir}/wav.scp" ] || cut -d ' ' -f2 "data/${dir}/segments" | sort | uniq | awk -F ' ' '{print $1,"<DUMMY>"}' > "data/${dir}/wav.scp"  # placeholder wav.scp: one <DUMMY> entry per recording id in segments
+        utils/fix_data_dir.sh "data/${dir}"
+    done
+
 fi 
 
 log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs2/how2_2000h/sum1/README.md b/egs2/how2_2000h/sum1/README.md
@@ -2,12 +2,28 @@
 
 This recipe can be used to build E2E Speech Summarization models using restricted self-attention on the HowTo corpus of instructional videos. 
 
+# Data Download and Preparation
 HowTo 2000h fbank-pitch features have been released to enable reproduction of this recipe. 
 
+You can request the use of this data using our [data request form](https://docs.google.com/forms/d/e/1FAIpQLSfW2i8UnjuoH2KKSU0BvcKRbhnk_vL3HcNlM0QLsJGb_UEDVQ/viewform).
+
+For ASR and Summarization, please request the data labeled "(audio_2000) fbank+pitch features in Kaldi scp/ark format for 2000 hours"
+
+You will receive a data download link shortly after you submit the form.
+Then you can prepare the data directory by providing your link as follows:
+
+
+```bash
+./run.sh --local_data_opts "--data_url <insert-link-here>"
+```
+
+# Two-stage Training 
+
 Training is done in two stages, (a) ASR Pretraining, and (b) Summarization fine-tuning
 
 First run ASR pretraining as follows:
 The recipe is based on asr1
+
 ```bash
 local/run_asr.sh --asr_tag asr_pretrain
 ``` 

diff --git a/egs2/how2_2000h/sum1/local/data.sh b/egs2/how2_2000h/sum1/local/data.sh
@@ -17,8 +17,9 @@ stop_stage=1
 . ./path.sh
 . ./cmd.sh
 
-url_how2_2000="https://drive.google.com/file/d/1SHg7La_hflMTIm6gaCus46sn4zYqWJvb/view?usp=sharing"
-data_how2=how2_feats
+data_url=
+data_how2=${HOW2_2kH}
+
 
 log "$0 $*"
 . utils/parse_options.sh
@@ -38,16 +39,24 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     if [ -d ${data_how2} ]; then
         log "$0: HowTo directory or archive already exists in ${data_how2}. Skipping download."
     else
-        ../../../utils/download_from_google_drive.sh ${url_how2_2000} $PWD tar.gz
-        log "$0: Successfully downloaded and un-tarred how2_feats.tar.gz"
+        wget "${data_url:?pass the link via --local_data_opts \"--data_url <link>\"}" -O out.tar.bz2  # -O saves the payload; -o would only write wget's log there
+        mkdir -p "${data_how2}" && tar -xvf out.tar.bz2 -C "${data_how2}"  # this branch runs only when the dir is absent, and tar -C needs it to exist
+        log "$0: Successfully downloaded and un-tarred how2_feats"
     fi
 fi
 
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     log "stage 1: Data preparation and verification"
-    mv how2_feats/data .
-    mv how2_feats/fbank .
+
+    mkdir -p data
+    for dir in tr_2000h_sum cv05_sum dev5_test_sum; do  # one Kaldi-style data dir per split
+        [ -d "data/${dir}" ] || mv "${data_how2}/data/${dir}" "data/${dir}"  # -d: the target is a directory, so -f never matched and re-runs re-moved it
+        [ -f "data/${dir}/feats.scp" ] || awk -F ' ' -v x="$(realpath "${data_how2}")" '{print $1,x"/audio/fbank_pitch/"$2}' < "${data_how2}/audio/fbank_pitch/${dir}.scp" > "data/${dir}/feats.scp"  # prefix each ark path with the absolute release dir
+        [ -f "data/${dir}/wav.scp" ] || cut -d ' ' -f2 "data/${dir}/segments" | sort | uniq | awk -F ' ' '{print $1,"<DUMMY>"}' > "data/${dir}/wav.scp"  # placeholder wav.scp: one <DUMMY> entry per recording id in segments
+        utils/fix_data_dir.sh "data/${dir}"
+    done
+
 fi 
 
 log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/espnet2/asr/espnet_model.py b/espnet2/asr/espnet_model.py
@@ -365,10 +365,11 @@ def encode(
             encoder_out.size(),
             speech.size(0),
         )
-        assert encoder_out.size(-2) <= encoder_out_lens.max(), (
-            encoder_out.size(),
-            encoder_out_lens.max(),
-        )
+        if getattr(self.encoder, "selfattention_layer_type", None) != "lf_selfattn":
+            assert encoder_out.size(-2) <= encoder_out_lens.max(), (
+                encoder_out.size(),
+                encoder_out_lens.max(),
+            )
 
         if intermediate_outs is not None:
             return (encoder_out, intermediate_outs), encoder_out_lens