CHiME-7 DASR: adding evaluation inference + adding support to use diarization baseline "pre-computed" JSONs (new PR) #5228

Merged (3 commits) on Jun 14, 2023. Changes shown from 1 commit.
6 changes: 3 additions & 3 deletions egs2/chime7_task1/asr1/README.md
@@ -129,12 +129,12 @@ and `asr_batch_size` because you may want to adjust these based on your hardware
--mixer6-root YOUR_PATH_TO_MIXER6 --stage 0 --ngpu YOUR_NUMBER_OF_GPUs
```
**We also provide a pre-trained model**; you can run inference only on the development set
using:
using (or evaluation set by using `--decode-only eval`):
```bash
./run.sh --chime6-root YOUR_PATH_TO_CHiME6 --dipco-root PATH_WHERE_DOWNLOAD_DIPCO \
--mixer6-root YOUR_PATH_TO_MIXER6 --stage 0 --ngpu YOUR_NUMBER_OF_GPUs \
--use-pretrained popcornell/chime7_task1_asr1_baseline \
--decode-only 1 --gss-max-batch-dur 30-360-DEPENDING_ON_GPU_MEM
--decode-only dev --gss-max-batch-dur 30-360-DEPENDING_ON_GPU_MEM
```
Note that `gss-max-batch-dur` strongly affects your inference time.
Also note that getting this warning `Discarded recording P56_dipco_S34_431-120171_120933-mdm from AudioSource(type='file', channels=[8]`
@@ -447,7 +447,7 @@ There are two possible approaches.
either the "style" of the baseline GSS ones or the ones belonging to close-talk mics.

To evaluate the new enhanced data, e.g. `kaldi/chime6/dev/my_enhanced`, you need to include it into `asr_tt_set` in `run.sh` or
from command line: `run.sh --stage 3 --asr-tt-set "kaldi/chime6/dev/gss" --decode-only 1 --use-pretrained popcornell/chime7_task1_asr1_baseline --asr-dprep-stage 4`.
from command line: `run.sh --stage 3 --asr-tt-set "kaldi/chime6/dev/gss" --decode-only dev --use-pretrained popcornell/chime7_task1_asr1_baseline --asr-dprep-stage 4`.


## Acknowledgements
12 changes: 6 additions & 6 deletions egs2/chime7_task1/asr1/local/da_wer_scoring.py
@@ -167,25 +167,25 @@ def compute_wer(df_or_dict):
return wer


def compute_diar_errors(hyp_segs, ref_segs, uem_boundaries=None, collar=0.5):
def compute_diar_errors(name, hyp_segs, ref_segs, uem_boundaries=None, collar=0.5):
# computing all diarization errors for each session here.
# find optimal mapping too, which will then be used to find the WER.
if uem_boundaries is not None:
uem = Timeline([Segment(start=uem_boundaries[0], end=uem_boundaries[-1])])
else:
uem = None

def to_annotation(segs):
out = Annotation()
def to_annotation(segs, name):
out = Annotation(uri=name)
for s in segs:
speaker = s["speaker"]
start = float(s["start_time"])
end = float(s["end_time"])
out[Segment(start, end)] = speaker
return out

hyp_annotation = to_annotation(hyp_segs)
ref_annotation = to_annotation(ref_segs)
hyp_annotation = to_annotation(hyp_segs, name)
ref_annotation = to_annotation(ref_segs, name)

der_computer = DERComputer(collar=collar, skip_overlap=False)
reference, hypothesis, uem = der_computer.get_uemified(
@@ -441,7 +441,7 @@ def get_sess2segs(segments):
reference,
hypothesis,
errors,
) = compute_diar_errors(hyp_segs, ref_segs, c_uem, collar=collar)
) = compute_diar_errors(session, hyp_segs, ref_segs, c_uem, collar=collar)

log_diarization(sess_dir, reference, hypothesis, errors)
# save ref hyps and errors in a folder
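The new `name` argument threads the session ID into each `Annotation` as its `uri`, so reference and hypothesis can be matched per session. The pattern can be sketched without pyannote (the `SimpleAnnotation` class below is a hypothetical stand-in for `pyannote.core.Annotation`; the segment dicts mimic the CHiME-7 JSON format):

```python
# Minimal sketch of the uri-threading pattern above, without pyannote.
# SimpleAnnotation is a hypothetical stand-in, not the real API.
class SimpleAnnotation:
    def __init__(self, uri=None):
        self.uri = uri          # session name, e.g. "S02"
        self.segments = []      # (start, end, speaker) tuples

    def add(self, start, end, speaker):
        self.segments.append((float(start), float(end), speaker))


def to_annotation(segs, name):
    # mirrors the helper in da_wer_scoring.py: one annotation per session,
    # tagged with the session name so scoring tools can pair files by uri
    out = SimpleAnnotation(uri=name)
    for s in segs:
        out.add(s["start_time"], s["end_time"], s["speaker"])
    return out


hyp_segs = [{"speaker": "P05", "start_time": "0.5", "end_time": "2.0"}]
ann = to_annotation(hyp_segs, "S02")
print(ann.uri, ann.segments)  # → S02 [(0.5, 2.0, 'P05')]
```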
2 changes: 1 addition & 1 deletion egs2/chime7_task1/asr1/local/data.sh
@@ -26,7 +26,7 @@ background_snrs="20:10:15:5:0"
gss_dsets=$(echo $gss_dsets | tr "," " ") # split by commas


if [ $decode_only == 1 ]; then
if [ -n "$decode_only" ]; then
**Collaborator:** we can use `${skip_train}` and `${test_sets}` instead of `decode_only`.

**Contributor (author):** as before, we cannot change the recipe extensively at this point; I added a check for the argument. Note that this would also require changing both README.md files and notifying participants.

# stop after gss
skip_stages=("1" "2")
fi
32 changes: 26 additions & 6 deletions egs2/chime7_task1/asr1/run.sh
@@ -67,7 +67,7 @@ nbpe=500
asr_max_epochs=8
# put popcornell/chime7_task1_asr1_baseline if you want to test with pretrained model
use_pretrained=
decode_only=0
decode_only=""
diar_score=0

. ./path.sh
@@ -80,9 +80,15 @@ asr_batch_size=$(calc_int 128*$ngpu) # reduce 128 bsz if you get OOMs errors
asr_max_lr=$(calc_float $ngpu/10000.0)
asr_warmup=$(calc_int 40000.0/$ngpu)

if [ $decode_only -eq 1 ]; then
if [ $decode_only == "dev" ]; then
**Collaborator:** Will you add a check if `decode_only` is neither dev nor eval?
I feel `decode_only` is a bit confusing. How about using `skip_train` (default: false) to control whether to process the training set, and using `test_sets` to let users define the gss and asr_tt sets? For example:

```bash
test_sets="chime6_dev dipco_dev mixer6_dev"
gss_dsets+="${test_sets}"

test_sets_list=(${test_sets// / })
asr_tt_sets=
for dset in "${test_sets_list[@]}"; do
    asr_tt_sets+=$(echo "kaldi/${dset}/gss " | tr "_" "/")
done
```

**Contributor (author):** This is nice but would require too much change in the codebase at this stage.

**Collaborator:** Yeah, indeed. We may not need to update this at the moment.

# apply gss only on dev
gss_dsets="chime6_dev,dipco_dev,mixer6_dev"
asr_tt_set="kaldi/chime6/dev/gss kaldi/dipco/dev/gss/ kaldi/mixer6/dev/gss/"
elif
[ $decode_only == "eval" ]; then
# apply gss only on eval
**Collaborator:** The comment should be updated to eval, right?

**Contributor (author):** updated

gss_dsets="chime6_eval,dipco_eval,mixer6_eval"
asr_tt_set="kaldi/chime6/eval/gss kaldi/dipco/eval/gss/ kaldi/mixer6/eval/gss/"
fi
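Addressing the reviewer's question about values that are neither dev nor eval, a validity check for the flag could look like this (a sketch, not part of the PR; the function name is hypothetical):

```shell
# Hypothetical validation for decode_only: accept only "", "dev" or "eval",
# mirroring the dispatch above, and fail loudly on anything else.
validate_decode_only() {
    case "$1" in
        ""|dev|eval) return 0 ;;
        *) echo "decode_only must be empty, 'dev' or 'eval' (got '$1')" >&2; return 1 ;;
    esac
}

validate_decode_only dev && echo "dev accepted"
```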

if [ ${stage} -le 0 ] && [ $stop_stage -ge 0 ]; then
@@ -100,16 +106,24 @@ fi
if [ ${stage} -le 1 ] && [ $stop_stage -ge 1 ]; then
# parse all datasets to lhotse
for dset in chime6 dipco mixer6; do
for dset_part in train dev; do
for dset_part in "train" "dev" "eval"; do


if [ $dset == dipco ] && [ $dset_part == train ]; then
continue # dipco has no train set
fi

if [ $decode_only == 1 ] && [ $dset_part == train ]; then
if [ -n "$decode_only" ] && [ $dset_part == train ]; then
continue
fi

if [ ! -d $chime7_root/$dset/audio/$dset_part ]; then
log "Skipping $dset $dset_part because it does not exist on disk. This is
fine if you don't have the evaluation set yet."
continue
fi

if [ $use_chime6_falign == 1 ] && [ $dset == chime6 ]; then
if [ $use_chime6_falign == 1 ] && [ $dset == chime6 ] && [ $dset_part != train ]; then
if ! [ -d ./CHiME7_DASR_falign ]; then
log "Getting forced alignment annotation for CHiME-6 Scenario"
git clone https://github.com/chimechallenge/CHiME7_DASR_falign
@@ -153,6 +167,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi

log "Running Guided Source Separation for ${dset_name}/${dset_part}, results will be in ${gss_dump_root}/${dset_name}/${dset_part}"
# shellcheck disable=SC2039
local/run_gss.sh --manifests-dir $manifests_root --dset-name $dset_name \
--dset-part $dset_part \
--exp-dir $gss_dump_root \
@@ -220,6 +235,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--lm_train_text "data/${asr_train_set}/text" ${pretrained_affix}
fi

if [ ${decode_only} == "eval" ]; then
log "Scoring is not available for the eval set until the end of the challenge."
exit
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# final scoring
log "Scoring ASR predictions for CHiME-7 DASR challenge."
@@ -256,7 +276,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# the content of this output folder is what you should send for evaluation to the
# organizers.
done
split=dev
split=dev # participants cannot evaluate eval
LOG_OUT=${asr_exp}/${inference_tag}/scoring/scoring.log
python local/da_wer_scoring.py -s ${asr_exp}/${inference_tag}/chime7dasr_hyp/$split \
-r $chime7_root -p $split -o ${asr_exp}/${inference_tag}/scoring -d $diar_score 2>&1 | tee $LOG_OUT
23 changes: 19 additions & 4 deletions egs2/chime7_task1/diar_asr1/README.md
@@ -132,12 +132,12 @@ ln -s ../asr1/chime7_task1 .

#### Main Track with Pyannote-based Diarization System
To reproduce our results which use our pre-trained ASR model [https://huggingface.co/popcornell/chime7_task1_asr1_baseline](https://huggingface.co/popcornell/chime7_task1_asr1_baseline) and pre-trained
[pyannote segmentation model](https://huggingface.co/popcornell/pyannote-segmentation-chime6-mixer6)
[pyannote segmentation model](https://huggingface.co/popcornell/pyannote-segmentation-chime6-mixer6), on the dev set,
you can run:
```bash
./run.sh --chime7-root YOUR_PATH_TO_CHiME7_ROOT --stage 2 --ngpu YOUR_NUMBER_OF_GPUs \
--use-pretrained popcornell/chime7_task1_asr1_baseline \
--decode-only 1 --gss-max-batch-dur 30-360-DEPENDING_ON_GPU_MEM \
--decode-only dev --gss-max-batch-dur 30-360-DEPENDING_ON_GPU_MEM \
    --pyan-use-pretrained popcornell/pyannote-segmentation-chime6-mixer6
```

**Collaborator:** `skip_train` and `test_sets` instead of `decode-only`.
You can also play with diarization hyperparameters such as:
@@ -147,16 +147,31 @@ You can also play with diarization hyperparameters such as:

As said, `merge-closer` can have quite an impact on the final WER.

**NOTE**
We found the diarization baseline to be highly sensitive to the `diar-merge-closer` parameter and
to the CUDA/CUDNN version used. <br>
For example, the best results on our side were obtained with `conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.6 -c pytorch -c nvidia`.
However, this was with Ampere devices (A100) on our side, and the results may
differ if your machine is different. <br>
See [this Pyannote issue](https://github.com/pyannote/pyannote-audio/issues/1370) on the replicability of the diarization baseline, where we have
reported the full specs of our system and the conda environment used. <br>

To enhance replicability, we provide our pre-computed outputs for the diarization baseline in this [repository](https://github.com/popcornell/CHiME7DASRDiarizationBaselineJSONs).
You can use them in this recipe by passing `--download-baseline-diarization 1`; this
will skip your "local" diarization baseline and instead directly download our predictions.
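For example, a possible invocation (a sketch; placeholder path, other flags as documented above):

```bash
./run.sh --chime7-root YOUR_PATH_TO_CHiME7_ROOT --stage 2 --ngpu 1 \
    --use-pretrained popcornell/chime7_task1_asr1_baseline \
    --decode-only dev --gss-max-batch-dur 90 \
    --download-baseline-diarization 1
```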

---
If you want to run this recipe from scratch, **including dataset generation** and pyannote segmentation
model fine-tuning you can run it from stage 0:
model fine-tuning you can run it from stage 0 (use `--decode-only eval` for evaluation set):
```bash
./run.sh --chime6-root YOUR_PATH_TO_CHiME6 --dipco-root PATH_WHERE_DOWNLOAD_DIPCO \
--mixer6-root YOUR_PATH_TO_MIXER6 --stage 0 --ngpu YOUR_NUMBER_OF_GPUs \
--use-pretrained popcornell/chime7_task1_asr1_baseline \
--decode-only 1 --gss-max-batch-dur 30-360-DEPENDING_ON_GPU_MEM \
--decode-only dev --gss-max-batch-dur 30-360-DEPENDING_ON_GPU_MEM \
--pyan-use-pretrained popcornell/pyannote-segmentation-chime6-mixer6
```

---
**If you want only to generate data you can run only stage 0.**
```bash
7 changes: 5 additions & 2 deletions egs2/chime7_task1/diar_asr1/local/pyannote_diarize.py
@@ -39,8 +39,9 @@ def split_maxlen(utt_group, min_len=10):


def merge_closer(annotation, delta=1.0, max_len=60, min_len=10):
name = annotation.uri
speakers = annotation.labels()
new_annotation = Annotation()
new_annotation = Annotation(uri=name)
for spk in speakers:
c_segments = sorted(annotation.label_timeline(spk), key=lambda x: x.start)
stack = []
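The per-speaker merging idea behind `merge_closer` — join consecutive segments whose gap is below `delta` — can be sketched standalone (an illustrative sketch, not the exact baseline code; the real function also re-splits merged segments longer than `max_len`):

```python
def merge_closer_segments(segments, delta=1.0):
    """Merge (start, end) segments whose gap to the previous one is < delta.

    Illustrative sketch of the per-speaker merging in merge_closer.
    """
    merged = []
    for start, end in sorted(segments, key=lambda x: x[0]):
        if merged and start - merged[-1][1] < delta:
            # gap to the previous segment is below delta: extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


print(merge_closer_segments([(0.0, 1.0), (1.5, 2.0), (4.0, 5.0)], delta=1.0))
# → [(0.0, 2.0), (4.0, 5.0)]
```

A larger `delta` (the recipe's `diar-merge-closer`) yields longer, fewer segments fed to GSS and ASR, which is why it affects the final WER.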
@@ -97,6 +98,7 @@ def rttm2json(rttm_file):


def diarize_session(
sess_name,
pipeline,
wav_files,
uem_boundaries=None,
@@ -231,7 +233,7 @@ def diarize_session(
)
result = to_annotation(discrete_diarization)
offset = uem_boundaries[0] / fs
new_annotation = Annotation() # new annotation
new_annotation = Annotation(uri=sess_name) # new annotation
speakers = result.labels()
for spk in speakers:
for seg in result.label_timeline(spk):
@@ -424,6 +426,7 @@ def read_uem(uem_file):
else:
c_uem = None
c_result = diarize_session(
sess,
diarization_pipeline,
sess2audio[sess],
c_uem,
23 changes: 20 additions & 3 deletions egs2/chime7_task1/diar_asr1/run.sh
@@ -38,7 +38,8 @@ diar_inf_dset="dev"
pyan_merge_closer=0.5
pyan_max_length_merged=20
pyan_inf_max_batch=32
pyan_use_pretrained= #popcornell/pyannote-segmentation-chime6-mixer6
pyan_use_pretrained=popcornell/pyannote-segmentation-chime6-mixer6
download_baseline_diarization=0
# fine-tune
pyan_finetune_dir=exp/pyannote_finetuned
pyan_batch_size=32
@@ -51,7 +52,7 @@ gss_max_batch_dur=90

# ASR config
use_pretrained=
decode_only=1
decode_only=""

gss_asr_stage=
gss_asr_stop_stage=10
@@ -65,6 +66,22 @@ if [ -z "$gss_asr_stage" ]; then
fi


if [ "${decode_only}" == "eval" ]; then
diar_inf_dset="eval"
fi


if [ $download_baseline_diarization == 1 ]; then
log "Using organizer-provided JSON manifests from the baseline diarization system."
if [ ! -d CHiME7DASRDiarizationBaselineJSONs ]; then
git clone https://github.com/popcornell/CHiME7DASRDiarizationBaselineJSONs
fi
mkdir -p exp/diarization
cp -r CHiME7DASRDiarizationBaselineJSONs/diarization exp/
stage=3
fi


if [ ${stage} -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Generating CHiME-7 DASR Challenge data."
# this script creates the task1 dataset
@@ -155,7 +172,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
fi


if [ ${stage} -le 4 ] && [ $stop_stage -ge 4 ]; then
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
log "Performing GSS+Channel Selection+ASR inference on diarized output"
# now that we have diarized the dataset, we can run the sub-track 1 baseline
# and use the diarization output in place of oracle diarization.