Skip to content

Commit

Permalink
Merge pull request kaldi-asr#4 from jsalt2020-asrdiar/libricss
Browse files Browse the repository at this point in the history
RNNLM rescoring and multichannel recipe
  • Loading branch information
desh2608 committed Jun 3, 2020
2 parents bba30e5 + 43ce69e commit 831c107
Show file tree
Hide file tree
Showing 27 changed files with 1,124 additions and 21 deletions.
14 changes: 14 additions & 0 deletions egs/libri_css/s5_7ch/cmd.sh
@@ -0,0 +1,14 @@
# Parallelization commands for this recipe.
#
# These variables choose how jobs are dispatched. On a cluster with
# GridEngine (qsub) use queue.pl; on a slurm cluster use slurm.pl; on a
# single machine with no queueing system, replace 'queue.pl' with run.pl —
# but then run the heavy stages one at a time, since most recipes will
# otherwise exhaust the machine's memory. Queues differ in their names and
# in how resources such as memory are requested; adapt conf/queue.conf to
# your site (see the 'default_config' string in utils/queue.pl or
# utils/slurm.pl, and http://kaldi-asr.org/doc/queue.html for details).

export train_cmd="retry.pl queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
50 changes: 50 additions & 0 deletions egs/libri_css/s5_7ch/conf/beamformit.cfg
@@ -0,0 +1,50 @@
#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)

# scrolling size to compute the delays
scroll_size = 250

# cross correlation computation window size
window_size = 500

#amount of maximum points for the xcorrelation taken into account
nbest_amount = 4

#flag whether to apply an automatic noise thresholding
do_noise_threshold = 1

#Percentage of frames with lower xcorr taken as noisy
noise_percent = 10

######## acoustic modelling parameters

#transition probabilities weight for multichannel decoding
trans_weight_multi = 25
trans_weight_nbest = 25

###

#flag whether to print the features after setting them, or not
print_features = 1

#flag whether to use the bad frames in the sum process
do_avoid_bad_frames = 1

#flag to use the best channel (SNR) as a reference
#defined from command line
do_compute_reference = 1

#flag whether to use a uem file or not (otherwise process the whole file)
do_use_uem_file = 0

#flag whether to use an adaptive weights scheme or fixed weights
do_adapt_weights = 1

#flag whether to output the sph files or just run the system to create the auxiliary files
do_write_sph_files = 1

####directories where to store/retrieve info####
#channels_file = ./cfg-files/channels

#show needs to be passed as argument normally, here a default one is given just in case
#show_id = Ttmp

2 changes: 2 additions & 0 deletions egs/libri_css/s5_7ch/conf/mfcc.conf
@@ -0,0 +1,2 @@
--use-energy=false
--sample-frequency=16000
10 changes: 10 additions & 0 deletions egs/libri_css/s5_7ch/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=16000
--num-mel-bins=40
--num-ceps=40
--low-freq=40
--high-freq=-400
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/diarization
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/local
9 changes: 9 additions & 0 deletions egs/libri_css/s5_7ch/path.sh
@@ -0,0 +1,9 @@
# Environment setup: puts Kaldi binaries, utils/, and dscore on PATH.
export KALDI_ROOT=$(pwd)/../../..
# Source tools/env.sh if present (site-specific tool paths, e.g. SRILM).
[ -f "$KALDI_ROOT/tools/env.sh" ] && . "$KALDI_ROOT/tools/env.sh"
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
export PATH=$PWD/dscore:$PATH
# Prepend the existing PYTHONPATH (plus ':') only when it is non-empty.
# The previous unconditional "${PYTHONPATH}:" left a leading empty entry
# when PYTHONPATH was unset, which Python interprets as the current
# directory on sys.path.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$PWD/dscore"
[ ! -f "$KALDI_ROOT/tools/config/common_path.sh" ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. "$KALDI_ROOT/tools/config/common_path.sh"
# Byte-wise, locale-independent sorting — required by Kaldi's data-dir tools.
export LC_ALL=C

1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/rnnlm
105 changes: 105 additions & 0 deletions egs/libri_css/s5_7ch/run.sh
@@ -0,0 +1,105 @@
#!/usr/bin/env bash
#
# LibriCSS multi-channel (7ch) baseline recipe:
# data prep -> ASR training -> diarizer training -> RNNLM training -> decoding.
#
# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
# Apache 2.0

# Begin configuration section.
nj=50
decode_nj=20
stage=0

# Per-stage start points (useful for resuming a partially-finished run).
data_prep_stage=0
asr_stage=1
diarizer_stage=0
decode_stage=0
rnnlm_stage=0      # BUGFIX: was referenced in the RNNLM stage below but never
                   # defined, so train.sh received "--stage" with no argument.
rnnlm_rescore=true # NOTE(review): not used anywhere in this script; presumably
                   # intended to be forwarded to local/decode.sh -- verify.

enhancement=beamformit
wpe=true

use_oracle_segments=true

# End configuration section
. ./utils/parse_options.sh

. ./cmd.sh
. ./path.sh

# Data dirs get a "_dereverb" suffix when WPE dereverberation is applied.
dereverb=
$wpe && dereverb=_dereverb

test_sets="dev${dereverb}_${enhancement} eval${dereverb}_${enhancement}"

set -e # exit on error

# please change the path accordingly
libricss_corpus=/export/corpora/LibriCSS
librispeech_corpus=/export/corpora/LibriSpeech/

##########################################################################
# We first prepare the LibriCSS data (7ch) in the Kaldi data
# format. We use session 0 for dev and others for eval. We also
# apply online multichannel WPE for dereverberation and then combine
# all channels using beamforming.
##########################################################################
if [ $stage -le 0 ]; then
  local/data_prep_7ch.sh --stage $data_prep_stage --wpe $wpe \
    --enhancement $enhancement $libricss_corpus
fi

#########################################################################
# ASR MODEL TRAINING
# In this stage, we prepare the Librispeech data and train our ASR model.
# This part is taken from the librispeech recipe, with parts related to
# decoding removed. We use the 100h clean subset to train most of the
# GMM models, except the SAT model, which is trained on the 460h clean
# subset. The nnet is trained on the full 960h (clean + other).
# To avoid training the whole ASR from scratch, you can download the
# chain model using:
# wget http://kaldi-asr.org/models/13/0013_librispeech_s5.tar.gz
# Once it is downloaded, extract using: tar -xvzf 0013_librispeech_s5.tar.gz
# and copy the contents of the exp/ directory to your exp/.
#########################################################################
if [ $stage -le 1 ]; then
  local/train_asr.sh --stage $asr_stage --nj $nj $librispeech_corpus
fi

##########################################################################
# DIARIZATION MODEL TRAINING
# You can also download a pretrained diarization model using:
# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz
# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz
# and copy the contents of the exp/ directory to your exp/
##########################################################################
if [ $stage -le 2 ]; then
  local/train_diarizer.sh --stage $diarizer_stage \
    --data-dir data/train_other_500 \
    --model-dir exp/xvector_nnet_1a
fi

##########################################################################
# RNNLM TRAINING
# We train a TDNN-LSTM based LM that will be used for rescoring the
# decoded lattices.
##########################################################################
if [ $stage -le 3 ]; then
  local/rnnlm/train.sh --stage $rnnlm_stage
fi

##########################################################################
# DECODING: We assume that we are just given the raw recordings (approx 10
# mins each), without segments or speaker information, so we have to decode
# the whole pipeline, i.e., SAD -> Diarization -> ASR. This is done in the
# local/decode.sh script.
##########################################################################
if [ $stage -le 4 ]; then
  local/decode.sh --stage $decode_stage \
    --test-sets "$test_sets" \
    --use-oracle-segments $use_oracle_segments
fi

exit 0;

1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/sid
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/steps
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/utils
130 changes: 130 additions & 0 deletions egs/libri_css/s5_mono/local/data_prep_7ch.sh
@@ -0,0 +1,130 @@
#!/usr/bin/env bash
#
# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
# Apache 2.0
#
# Prepares the 7-channel LibriCSS data in Kaldi format: downloads and
# segments the corpus, optionally applies WPE dereverberation, beamforms
# the channels, and creates dev/eval splits (session0 = dev, rest = eval).

# Begin configuration section.
# BUGFIX: these previously defaulted to empty strings, so e.g.
# "[ $stage -le 0 ]" was a syntax error unless every option was passed.
enhancement=beamformit # "beamformit" or "gss" (gss not implemented yet)
wpe=true               # whether to apply online multichannel WPE
stage=0
# End configuration section

. ./utils/parse_options.sh # accept options

. ./path.sh

echo >&2 "$0" "$@"
if [ $# -ne 1 ]; then
  echo >&2 "$0" "$@"
  echo >&2 "$0: Error: wrong number of arguments"
  echo -e >&2 "Usage:\n  $0 [opts] <corpus-dir>"
  echo -e >&2 "eg:\n  $0 /export/corpora/LibriCSS"
  exit 1
fi

corpus_dir=$1

# Dereverberated data dirs get a "_dereverb" suffix.
dereverb=
$wpe && dereverb=_dereverb

set -e -o pipefail

if [ "$stage" -le 0 ]; then
  # If data is not already present, then download and unzip
  if [ ! -d "$corpus_dir/for_release" ]; then
    echo "Downloading and unpacking LibriCSS data."
    CWD=$(pwd)
    mkdir -p "$corpus_dir"

    cd "$corpus_dir"

    # Download the data. If the data has already been downloaded, it
    # does nothing. (See wget -c)
    wget -c --load-cookies /tmp/cookies.txt \
      "https://docs.google.com/uc?export=download&confirm=$(wget --quiet \
      --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \
      'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' \
      -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" \
      -O for_release.zip && rm -rf /tmp/cookies.txt

    # unzip (skip if already extracted)
    unzip -n for_release.zip

    # segmentation
    cd for_release
    python3 segment_libricss.py -data_path .

    cd "$CWD"
  fi
fi

if [ "$stage" -le 1 ]; then
  # Process the downloaded data directory to get data in Kaldi format. Here we get all
  # channels.
  mkdir -p data/local/data/
  local/prepare_data.py --srcpath "$corpus_dir/for_release" --tgtpath data/local/data
fi

if [ "$stage" -le 2 ] && $wpe; then
  # Perform online multichannel WPE
  local/run_wpe.sh --cmd "$train_cmd --mem 60G" \
    data/local/data

  # Change the path of the wav files to point to dereverberated file
  mv data/local/data/wav.scp data/local/data/wav.scp.bak
  sed 's/wavs/wavs_dereverb/g' data/local/data/wav.scp.bak > data/local/data/wav.scp
fi

if [ "$stage" -le 3 ]; then
  # BUGFIX: was "[ $enhancement == gss ]" (unquoted bashism inside [ ]).
  if [ "$enhancement" = "gss" ]; then
    # BUGFIX: previously fell through to stage 4, which then failed on the
    # nonexistent data/local/data_gss directory; fail fast instead.
    echo "$0: GSS not implemented yet"
    exit 1
  elif [ "$enhancement" = "beamformit" ]; then
    local/run_beamformit.sh --cmd "$train_cmd" \
      data/local/data/wavs \
      data/local/data_beamformit/wavs

    # Also create other files. Note that we still name the beamformed file as CH0
    # only for consistency in naming (for scoring purposes)
    for file in wav.scp utt2spk text segments; do
      sed 's/_CH[0-9]/_CH0/g' data/local/data/$file | sort -u > data/local/data_beamformit/$file
    done
    sed -i 's/data\/local\/data\/wavs_dereverb/data\/local\/data_beamformit\/wavs/g' data/local/data_beamformit/wav.scp

  else
    echo "$0: Enhancement type $enhancement not found"
    exit 1
  fi
fi

if [ "$stage" -le 4 ]; then
  # Create dev and eval splits based on sessions. In total we have 10 sessions (session0 to
  # session9) of approximately 1 hour each. In the below strings, separate each session by
  # '\|' to perform grep at once.
  dev_sessions="session0"
  eval_sessions="session1\|session2\|session3\|session4\|session5\|session6\|session7\|session8\|session9"

  mkdir -p data/dev${dereverb}_${enhancement}
  for file in wav.scp utt2spk text segments; do
    grep "$dev_sessions" data/local/data_${enhancement}/"$file" | sort > data/dev${dereverb}_${enhancement}/"$file"
  done

  mkdir -p data/eval${dereverb}_${enhancement}
  for file in wav.scp utt2spk text segments; do
    grep "$eval_sessions" data/local/data_${enhancement}/"$file" | sort > data/eval${dereverb}_${enhancement}/"$file"
  done
fi

if [ "$stage" -le 5 ]; then
  # Move the utt2spk, segments, and text file to .bak so that they are only used
  # in the last scoring stage. We also prepare a dummy utt2spk and spk2utt for
  # these.
  for dataset in dev eval; do
    datadir=${dataset}${dereverb}_${enhancement}
    for file in text utt2spk segments; do
      mv data/$datadir/$file data/$datadir/$file.bak
    done

    # Dummy speaker map: each recording is its own "speaker".
    awk '{print $1, $1}' data/$datadir/wav.scp > data/$datadir/utt2spk
    utils/utt2spk_to_spk2utt.pl data/$datadir/utt2spk > data/$datadir/spk2utt
  done
fi

0 comments on commit 831c107

Please sign in to comment.