diff --git a/egs/libri_css/s5_7ch/cmd.sh b/egs/libri_css/s5_7ch/cmd.sh new file mode 100644 index 00000000000..86514d94d4d --- /dev/null +++ b/egs/libri_css/s5_7ch/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/libri_css/s5_7ch/conf/beamformit.cfg b/egs/libri_css/s5_7ch/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/libri_css/s5_7ch/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not 
+print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process all the file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/libri_css/s5_7ch/conf/mfcc.conf b/egs/libri_css/s5_7ch/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/libri_css/s5_7ch/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/libri_css/s5_7ch/conf/mfcc_hires.conf b/egs/libri_css/s5_7ch/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/libri_css/s5_7ch/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/libri_css/s5_7ch/conf/online_cmvn.conf b/egs/libri_css/s5_7ch/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/libri_css/s5_7ch/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/libri_css/s5_7ch/diarization b/egs/libri_css/s5_7ch/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/libri_css/s5_7ch/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/libri_css/s5_7ch/local b/egs/libri_css/s5_7ch/local new file mode 120000 index 00000000000..2757f389a5b --- /dev/null +++ b/egs/libri_css/s5_7ch/local @@ -0,0 +1 @@ +../s5_mono/local \ No newline at end of file diff --git a/egs/libri_css/s5_7ch/path.sh b/egs/libri_css/s5_7ch/path.sh new file mode 100644 index 00000000000..2f4e4e4fb21 --- /dev/null +++ b/egs/libri_css/s5_7ch/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +export PATH=$PWD/dscore:$PATH +export PYTHONPATH="${PYTHONPATH}:$PWD/dscore" +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/libri_css/s5_7ch/rnnlm b/egs/libri_css/s5_7ch/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/libri_css/s5_7ch/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/libri_css/s5_7ch/run.sh b/egs/libri_css/s5_7ch/run.sh new file mode 100755 index 00000000000..9bfae493ce5 --- /dev/null +++ b/egs/libri_css/s5_7ch/run.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +# +# LibriCSS multi-channel baseline recipe. +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# Begin configuration section. +nj=50 +decode_nj=20 +stage=0 + +# Different stages +data_prep_stage=0 +asr_stage=1 +diarizer_stage=0 +decode_stage=0 +rnnlm_rescore=true + +enhancement=beamformit +wpe=true + +use_oracle_segments=true + +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +dereverb= +$wpe && dereverb=_dereverb + +test_sets="dev${dereverb}_${enhancement} eval${dereverb}_${enhancement}" + +set -e # exit on error + +# please change the path accordingly +libricss_corpus=/export/corpora/LibriCSS +librispeech_corpus=/export/corpora/LibriSpeech/ + +########################################################################## +# We first prepare the LibriCSS data (7ch) in the Kaldi data +# format. We use session 0 for dev and others for eval. We also +# apply online multichannel WPE for dereverberation and then combine +# all channels using beamforming. +########################################################################## +if [ $stage -le 0 ]; then + local/data_prep_7ch.sh --stage $data_prep_stage --wpe $wpe \ + --enhancement $enhancement $libricss_corpus +fi + +######################################################################### +# ASR MODEL TRAINING +# In this stage, we prepare the Librispeech data and train our ASR model. 
+# This part is taken from the librispeech recipe, with parts related to +# decoding removed. We use the 100h clean subset to train most of the +# GMM models, except the SAT model, which is trained on the 460h clean +# subset. The nnet is trained on the full 960h (clean + other). +# To avoid training the whole ASR from scratch, you can download the +# chain model using: +# wget http://kaldi-asr.org/models/13/0013_librispeech_s5.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0013_librispeech_s5.tar.gz +# and copy the contents of the exp/ directory to your exp/. +######################################################################### +if [ $stage -le 1 ]; then + local/train_asr.sh --stage $asr_stage --nj $nj $librispeech_corpus +fi + +########################################################################## +# DIARIZATION MODEL TRAINING +# You can also download a pretrained diarization model using: +# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 2 ]; then + local/train_diarizer.sh --stage $diarizer_stage \ + --data-dir data/train_other_500 \ + --model-dir exp/xvector_nnet_1a +fi + +########################################################################## +# RNNLM TRAINING +# We train a TDNN-LSTM based LM that will be used for rescoring the +# decoded lattices. +########################################################################## +if [ $stage -le 3 ]; then + local/rnnlm/train.sh --stage $rnnlm_stage +fi + +########################################################################## +# DECODING: We assume that we are just given the raw recordings (approx 10 +# mins each), without segments or speaker information, so we have to decode +# the whole pipeline, i.e., SAD -> Diarization -> ASR. 
This is done in the +# local/decode.sh script. +########################################################################## +if [ $stage -le 4 ]; then + local/decode.sh --stage $decode_stage \ + --test-sets "$test_sets" \ + --use-oracle-segments $use_oracle_segments +fi + +exit 0; + diff --git a/egs/libri_css/s5_7ch/sid b/egs/libri_css/s5_7ch/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/libri_css/s5_7ch/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/libri_css/s5_7ch/steps b/egs/libri_css/s5_7ch/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/libri_css/s5_7ch/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/libri_css/s5_7ch/utils b/egs/libri_css/s5_7ch/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/libri_css/s5_7ch/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/data_prep_7ch.sh b/egs/libri_css/s5_mono/local/data_prep_7ch.sh new file mode 100755 index 00000000000..a7f765defbf --- /dev/null +++ b/egs/libri_css/s5_mono/local/data_prep_7ch.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# Begin configuration section. +# End configuration section +enhancement= +wpe= +stage= + +. ./utils/parse_options.sh # accept options + +. ./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 1 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /export/corpora/LibriCSS" + exit 1 +fi + +corpus_dir=$1 + +dereverb= +$wpe && dereverb=_dereverb + +set -e -o pipefail + +if [ $stage -le 0 ]; then + # If data is not already present, then download and unzip + if [ ! -d $corpus_dir/for_release ]; then + echo "Downloading and unpacking LibriCSS data." 
+ CWD=`pwd` + mkdir -p $corpus_dir + + cd $corpus_dir + + # Download the data. If the data has already been downloaded, it + # does nothing. (See wget -c) + wget -c --load-cookies /tmp/cookies.txt \ + "https://docs.google.com/uc?export=download&confirm=$(wget --quiet \ + --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \ + 'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' \ + -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" \ + -O for_release.zip && rm -rf /tmp/cookies.txt + + # unzip (skip if already extracted) + unzip -n for_release.zip + + # segmentation + cd for_release + python3 segment_libricss.py -data_path . + + cd $CWD + fi +fi + +if [ $stage -le 1 ]; then + # Process the downloaded data directory to get data in Kaldi format. Here we get all + # channels. + mkdir -p data/local/data/ + local/prepare_data.py --srcpath $corpus_dir/for_release --tgtpath data/local/data +fi + +if [ $stage -le 2 ] && $wpe; then + # Perform online multichannel WPE + local/run_wpe.sh --cmd "$train_cmd --mem 60G" \ + data/local/data + + # Change the path of the wav files to point to dereverberated file + mv data/local/data/wav.scp data/local/data/wav.scp.bak + cat data/local/data/wav.scp.bak | sed 's/wavs/wavs_dereverb/g' > data/local/data/wav.scp +fi + +if [ $stage -le 3 ]; then + if [ $enhancement == "gss" ]; then + echo "$0: GSS not implemented yet" + elif [ $enhancement == "beamformit" ]; then + local/run_beamformit.sh --cmd "$train_cmd" \ + data/local/data/wavs \ + data/local/data_beamformit/wavs + + # Also create other files. 
Note that we still name the beamformed file as CH0 + # only for consistency in naming (for scoring purposes) + for file in wav.scp utt2spk text segments; do + cat data/local/data/$file | sed 's/_CH[0-9]/_CH0/g' | sort -u > data/local/data_beamformit/$file + done + sed -i 's/data\/local\/data\/wavs_dereverb/data\/local\/data_beamformit\/wavs/g' data/local/data_beamformit/wav.scp + + else + echo "$0: Enhancement type $enhancement not found" + exit 1 + fi +fi + +if [ $stage -le 4 ]; then + # Create dev and eval splits based on sessions. In total we have 10 sessions (session0 to + # session9) of approximately 1 hour each. In the below strings, separate each session by + # '\|' to perform grep at once. + dev_sessions="session0" + eval_sessions="session1\|session2\|session3\|session4\|session5\|session6\|session7\|session8\|session9" + + mkdir -p data/dev${dereverb}_${enhancement} + for file in wav.scp utt2spk text segments; do + grep $dev_sessions data/local/data_${enhancement}/"$file" | sort > data/dev${dereverb}_${enhancement}/"$file" + done + + mkdir -p data/eval${dereverb}_${enhancement} + for file in wav.scp utt2spk text segments; do + grep $eval_sessions data/local/data_${enhancement}/"$file" | sort > data/eval${dereverb}_${enhancement}/"$file" + done +fi + +if [ $stage -le 5 ]; then + # Move the utt2spk, segments, and text file to .bak so that they are only used + # in the last scoring stage. We also prepare a dummy utt2spk and spk2utt for + # these. 
+ for dataset in dev eval; do + datadir=${dataset}${dereverb}_${enhancement} + for file in text utt2spk segments; do + mv data/$datadir/$file data/$datadir/$file.bak + done + + awk '{print $1, $1}' data/$datadir/wav.scp > data/$datadir/utt2spk + utils/utt2spk_to_spk2utt.pl data/$datadir/utt2spk > data/$datadir/spk2utt + done +fi diff --git a/egs/libri_css/s5_mono/local/decode.sh b/egs/libri_css/s5_mono/local/decode.sh index 2fdc1314860..d94bea7699d 100755 --- a/egs/libri_css/s5_mono/local/decode.sh +++ b/egs/libri_css/s5_mono/local/decode.sh @@ -13,13 +13,23 @@ stage=0 score_sad=true diarizer_stage=0 decode_diarize_stage=0 -decode_oracle_stage=1 +decode_oracle_stage=0 score_stage=0 -affix=1d # This should be the affix of the tdnn model you want to decode with +nnet3_affix=_cleaned # affix for the chain directory name +affix=1d # affix for the TDNN directory name # If the following is set to true, we use the oracle speaker and segment # information instead of performing SAD and diarization. use_oracle_segments= +rnnlm_rescore=true + +# RNNLM rescore options +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true +rnnlm_dir=exp/rnnlm_lstm_1a test_sets="dev eval" @@ -28,7 +38,11 @@ test_sets="dev eval" . ./cmd.sh . 
./path.sh -$use_oracle_segments && [ $stage -le 6 ] && stage=6 +# Get dev and eval set names from the test_sets +dev_set=$( echo $test_sets | cut -d " " -f1 ) +eval_set=$( echo $test_sets | cut -d " " -f2 ) + +$use_oracle_segments && [ $stage -le 8 ] && stage=8 ####################################################################### # Perform SAD on the dev/eval data using py-webrtcvad package @@ -112,7 +126,7 @@ if [ $stage -le 4 ]; then local/decode_diarized.sh --nj $asr_nj --cmd "$decode_cmd" --stage $decode_diarize_stage \ --lm-suffix "_tgsmall" \ exp/${datadir}_diarization data/$datadir data/lang_nosp_test_tgsmall \ - exp/chain_cleaned/tdnn_${affix}_sp exp/nnet3_cleaned \ + exp/chain${nnet3_affix}/tdnn_${affix}_sp exp/nnet3${nnet3_affix} \ data/${datadir}_diarized || exit 1 done fi @@ -125,10 +139,43 @@ if [ $stage -le 5 ]; then # please specify both dev and eval set directories so that the search parameters # (insertion penalty and language model weight) will be tuned using the dev set local/score_reco_diarized.sh --stage $score_stage \ - --dev_decodedir exp/chain_cleaned/tdnn_${affix}_sp/decode_dev_diarized_2stage \ - --dev_datadir dev_diarized_hires \ - --eval_decodedir exp/chain_cleaned/tdnn_${affix}_sp/decode_eval_diarized_2stage \ - --eval_datadir eval_diarized_hires + --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_${dev_set}_diarized_2stage \ + --dev_datadir ${dev_set}_diarized_hires \ + --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_${eval_set}_diarized_2stage \ + --eval_datadir ${eval_set}_diarized_hires +fi + +############################################################################ +# RNNLM rescoring +############################################################################ +if $rnnlm_rescore; then + if [ $stage -le 6 ]; then + echo "$0: Perform RNNLM lattice-rescoring" + pruned= + ac_model_dir=exp/chain${nnet3_affix}/tdnn_${affix}_sp + if $pruned_rescore; then + pruned=_pruned + fi + for decode_set in 
$test_sets; do + decode_dir=${ac_model_dir}/decode_${decode_set}_diarized_2stage + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd --mem 8G" \ + --weight 0.45 --max-ngram-order $ngram_order \ + data/lang_nosp_test_tgsmall $rnnlm_dir \ + data/${decode_set}_diarized_hires ${decode_dir} \ + ${ac_model_dir}/decode_${decode_set}_diarized_2stage_rescore + done + fi + + if [ $stage -le 7 ]; then + echo "$0: WERs after rescoring with $rnnlm_dir" + local/score_reco_diarized.sh --stage $score_stage \ + --dev_decodedir exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_${dev_set}_diarized_2stage_rescore \ + --dev_datadir ${dev_set}_diarized_hires \ + --eval_decodedir exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_${eval_set}_diarized_2stage_rescore \ + --eval_datadir ${eval_set}_diarized_hires + fi fi $use_oracle_segments || exit 0 @@ -136,7 +183,7 @@ $use_oracle_segments || exit 0 ###################################################################### # Here we decode using oracle speaker and segment information ###################################################################### -if [ $stage -le 6 ]; then +if [ $stage -le 8 ]; then # mfccdir should be some place with a largish disk where you # want to store MFCC features. 
mfccdir=mfcc @@ -156,11 +203,12 @@ if [ $stage -le 6 ]; then done fi -if [ $stage -le 7 ]; then +if [ $stage -le 9 ]; then local/decode_oracle.sh --stage $decode_oracle_stage \ --affix $affix \ --lang-dir data/lang_nosp_test_tgsmall \ --lm-suffix "_tgsmall" \ + --rnnlm-rescore $rnnlm_rescore \ --test_sets "$test_sets" fi diff --git a/egs/libri_css/s5_mono/local/decode_oracle.sh b/egs/libri_css/s5_mono/local/decode_oracle.sh index da39f46037b..a0e3abab049 100755 --- a/egs/libri_css/s5_mono/local/decode_oracle.sh +++ b/egs/libri_css/s5_mono/local/decode_oracle.sh @@ -13,7 +13,9 @@ stage=0 test_sets= lang_dir= lm_suffix= +nnet3_affix=_cleaned # affix for the chain directory name affix=1d # affix for the TDNN directory name +rnnlm_rescore=false # End configuration section . ./utils/parse_options.sh @@ -21,8 +23,20 @@ affix=1d # affix for the TDNN directory name . ./cmd.sh . ./path.sh +# RNNLM rescore options +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true +rnnlm_dir=exp/rnnlm_lstm_1a + dir=exp/chain${nnet3_affix}/tdnn_${affix}_sp +# Get dev and eval set names from the test_sets +dev_set=$( echo $test_sets | cut -d " " -f1 ) +eval_set=$( echo $test_sets | cut -d " " -f2 ) + set -e # exit on error @@ -30,8 +44,6 @@ set -e # exit on error # DECODING: we perform 2 stage decoding. ########################################################################## -nnet3_affix=_cleaned - if [ $stage -le 0 ]; then # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). 
@@ -69,6 +81,7 @@ if [ $stage -le 0 ]; then [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 fi + ########################################################################## # Scoring: here we obtain wer per condition and overall WER ########################################################################## @@ -77,6 +90,37 @@ if [ $stage -le 1 ]; then # please specify both dev and eval set directories so that the search parameters # (insertion penalty and language model weight) will be tuned using the dev set local/score_reco_oracle.sh \ - --dev exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_dev_oracle_2stage \ - --eval exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_eval_oracle_2stage + --dev exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_${dev_set}_oracle_2stage \ + --eval exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_${eval_set}_oracle_2stage fi + +############################################################################ +# RNNLM rescoring +############################################################################ +if $rnnlm_rescore; then + if [ $stage -le 2 ]; then + echo "$0: Perform RNNLM lattice-rescoring" + pruned= + ac_model_dir=exp/chain${nnet3_affix}/tdnn_${affix}_sp + if $pruned_rescore; then + pruned=_pruned + fi + for decode_set in $test_sets; do + decode_dir=${ac_model_dir}/decode_${decode_set}_oracle_2stage + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd --mem 8G" \ + --weight 0.45 --max-ngram-order $ngram_order \ + data/lang_nosp_test_tgsmall $rnnlm_dir \ + data/${decode_set}_oracle_hires ${decode_dir} \ + ${ac_model_dir}/decode_${decode_set}_oracle_2stage_rescore + done + fi + + if [ $stage -le 3 ]; then + echo "$0: WERs after rescoring with $rnnlm_dir" + local/score_reco_oracle.sh \ + --dev exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_${dev_set}_oracle_2stage_rescore \ + --eval exp/chain${nnet3_affix}/tdnn_${affix}_sp/decode_${eval_set}_oracle_2stage_rescore + fi +fi \ No newline at 
end of file diff --git a/egs/libri_css/s5_mono/local/make_voxceleb1.pl b/egs/libri_css/s5_mono/local/make_voxceleb1.pl new file mode 100755 index 00000000000..2268c20ab52 --- /dev/null +++ b/egs/libri_css/s5_mono/local/make_voxceleb1.pl @@ -0,0 +1,130 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (! 
-e "$data_base/vox1_meta.csv") { + system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); +} + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my %id2spkr = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + $id2spkr{$vox_id} = $spkr_id; +} + +my $test_spkrs = (); +while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $filename) = split('/', $path2); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
+ if (exists $id2spkr{$spkr_id}) { + $new_spkr_id = $id2spkr{$spkr_id}; + } + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + my $utt_id = "$new_spkr_id-$rec_id-$segment"; + if (exists $test_spkrs{$new_spkr_id}) { + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + } else { + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n"; + } + } +} + +close(SPKR_TEST) or die; +close(WAV_TEST) or die; +close(SPKR_TRAIN) or die; +close(WAV_TRAIN) or die; +close(TRIAL_OUT) or die; +close(TRIAL_IN) or die; +close(META_IN) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_test_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { + die "Error validating directory $out_test_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_train_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { + die "Error validating directory $out_train_dir"; +} diff --git a/egs/libri_css/s5_mono/local/make_voxceleb2.pl b/egs/libri_css/s5_mono/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/libri_css/s5_mono/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 
+# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/libri_css/s5_mono/local/rnnlm/train.sh b/egs/libri_css/s5_mono/local/rnnlm/train.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/libri_css/s5_mono/local/rnnlm/train.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh b/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..0fcf4c354b1 --- /dev/null +++ b/egs/libri_css/s5_mono/local/rnnlm/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2018 Ke Li + +# This script trains LMs on the librispeech-lm-norm.txt.gz. + +# rnnlm/train_rnnlm.sh: best iteration (out of 143) was 142, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 109.2 / 110.7. 
+# Train objf: -5.74 -5.54 -5.44 -5.37 -5.32 -5.28 -5.25 -5.23 -5.20 -5.18 -5.15 -5.14 -5.12 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.96 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.92 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.78 -4.79 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.70 -4.70 -4.70 -4.70 -4.70 -4.69 -4.69 -4.69 -4.69 -4.69 -4.69 -4.68 -4.68 +# Dev objf: -5.99 -5.65 -5.53 -5.44 -5.38 -5.34 -5.30 -5.27 -5.22 -5.20 -5.18 -5.16 -5.14 -5.12 -5.11 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.97 0.00 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.91 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.87 -4.84 -4.84 -4.84 -4.83 -4.91 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.79 -4.79 -4.78 -4.78 -4.79 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 + +# WER summary on dev and test sets +# System tdnn_1d_sp +lattice_rescore +nbest_rescore +# WER on dev(fglarge) 3.34 2.71 2.62 +# WER on dev(tglarge) 3.44 2.75 2.66 +# WER on dev_other(fglarge) 8.70 7.37 7.55 +# WER on dev_other(tglarge) 9.25 7.56 7.73 +# WER on 
test(fglarge) 3.77 3.12 3.06 +# WER on test(tglarge) 3.85 3.18 3.11 +# WER on test_other(fglarge) 8.91 7.63 7.68 +# WER on test_other(tglarge) 9.31 7.83 7.95 + +# command to get the WERs above: +# tdnn_1d_sp +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}/wer* | best_wer.sh; done; done +# tdnn_1d_sp with lattice rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_rescore/wer* | best_wer.sh; done; done +# tdnn_1d_sp with nbest rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_nbest_rescore/wer* | best_wer.sh; done; done + +# Begin configuration section. + +dir=exp/rnnlm_lstm_1a +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=-10 +train_stage=-10 +epochs=4 + +# variables for lattice rescoring +run_lat_rescore=true +run_nbest_rescore=true +run_backward_rnnlm=false +ac_model_dir=exp/chain_cleaned/tdnn_1d_sp +decode_dir_suffix=rnnlm_1a +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +. ./cmd.sh +. ./utils/parse_options.sh + +text=data/local/lm/librispeech-lm-norm.txt.gz +lexicon=data/lang_nosp/words.txt +text_dir=data/rnnlm/text +mkdir -p $dir/config +set -e + +for f in $lexicon; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for run.sh in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + if [ ! 
-f $text ]; then + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm + fi + echo -n >$text_dir/dev.txt + # hold out one in every 2000 lines as dev data. + gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/librispeech.txt +fi + +if [ $stage -le 1 ]; then + cp $lexicon $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --top-word-features=5000 \ + --use-constant-feature=true \ + --special-words=',,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig < " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 2 3 4\" # microphones used for beamforming" + exit 1; +fi + +sdir=$1 +odir=$2 +expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` + +if ! command -v BeamformIt &>/dev/null ; then + echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 +fi + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $expdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# We create a list of all wav files +basename -a $sdir/* > $expdir/channels_input +output_wavfiles=$expdir/wavfiles.list +cat $expdir/channels_input | sed 's/_CH[0-9]/_CH0/g' | sed 's/.wav//g' | sort -u > $output_wavfiles + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$expdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + wav_name=$( echo $x | sed 's/_CH0//g' | awk -F'_' -v CHANNEL="$ch" '{print $1"_CH"CHANNEL"_"$2".wav"}' ) + echo -n " $wav_name" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +# number of jobs are set by the number of WAV files +nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'` +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $expdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/beamformit.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/beamform.*.sh +$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \ + $expdir/log/beamform.JOB.sh + +echo "`basename $0` Done." 
diff --git a/egs/libri_css/s5_mono/local/run_cleanup_segmentation.sh b/egs/libri_css/s5_mono/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..b048758df7f --- /dev/null +++ b/egs/libri_css/s5_mono/local/run_cleanup_segmentation.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# Copyright 2016 Vimal Manohar +# 2016 Yiming Wang +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn_6z.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train_960 +cleanup_affix=cleaned +srcdir=exp/tri6b +nj=100 +decode_nj=16 +decode_num_threads=4 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. + steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + $data data/lang $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data data/lang $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 7000 150000 $cleaned_data data/lang ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +if [ $stage -le 4 ]; then + # Test with the models trained on cleaned-up data. 
#!/usr/bin/env python
# Copyright 2020 Johns Hopkins University (Author: Bar Ben-Yair)
# Apache 2.0
# Works with both python2 and python3.
#
# Dereverberates one multi-channel LibriCSS recording with online WPE
# (Weighted Prediction Error) from the nara_wpe package.
#
# This script assumes that WPE (nara_wpe) is installed locally using miniconda:
#   ../../../tools/extras/install_miniconda.sh
#   ../../../tools/extras/install_wpe.sh
# and that this script is launched with that version of python.
# See local/run_wpe.sh for an example invocation.
#
# Usage: run_wpe.py <in-wav-list> <out-dir>
#   in-wav-list: text file with one wav path per line, one line per channel
#                of a single recording (channels are stacked, so they are
#                expected to have equal length).
#   out-dir:     directory where dereverberated wavs are written, one output
#                file per input channel, keeping the input basenames.

import argparse
import errno
import os

import numpy as np
import soundfile as sf
from tqdm import tqdm

# To avoid huge memory consumption we use `OnlineWPE` instead of the offline
# variant, following advice from Christoph Boeddeker at Paderborn University.
from nara_wpe.wpe import OnlineWPE
from nara_wpe.utils import stft, istft


def parse_args():
    """Parse command-line arguments: channel list file and output directory."""
    parser = argparse.ArgumentParser(
        description='Online WPE dereverberation of one multi-channel recording.')
    parser.add_argument('in_wavs',
                        help='file listing one wav path per channel')
    parser.add_argument('out_dir',
                        help='output directory for dereverberated wavs')
    return parser.parse_args()


def makedirs_p(path):
    """mkdir -p equivalent that also works on python2 (no exist_ok there)."""
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def main():
    args = parse_args()
    makedirs_p(args.out_dir)

    # Read the list of per-channel wav files for this recording.
    with open(args.in_wavs, 'r') as f:
        in_wavs = [line.strip() for line in f if line.strip()]

    stft_options = dict(
        size=512,
        shift=128,
        window_length=None,
        fading=True,
        pad=True,
        symmetric_window=False,
    )

    channels = len(in_wavs)
    sampling_rate = 16000  # LibriCSS audio is 16 kHz
    delay = 3
    taps = 10
    alpha = 0.9999

    # Load and stack all channels: shape (channels, samples), dtype int16.
    signal_list = [sf.read(in_wav, dtype='int16')[0] for in_wav in in_wavs]
    y = np.stack(signal_list, axis=0)
    del signal_list

    # STFT, reordered to (frames, bins, channels); delete intermediates as we
    # go since a 10-minute 7-channel recording is large.
    Y = stft(y, **stft_options).transpose(1, 2, 0)
    del y
    T = Y.shape[0]

    def acquire_framebuffer():
        """Yield sliding windows of taps+delay+1 STFT frames for online WPE."""
        buffer = list(Y[:taps + delay + 1, :, :])
        for t in range(taps + delay + 1, T):
            yield np.array(buffer)
            buffer.append(Y[t, :, :])
            buffer.pop(0)

    online_wpe = OnlineWPE(
        taps=taps,
        delay=delay,
        alpha=alpha,
        frequency_bins=Y.shape[1],
        channel=channels,
    )

    Z_list = []
    for Y_step in tqdm(acquire_framebuffer()):
        if np.sum(Y_step.flatten()) != 0:
            Z_list.append(online_wpe.step_frame(Y_step))
        else:
            # All-zero window (silence): pass the reference frame through
            # unchanged instead of updating the filter statistics on it.
            Z_list.append(
                Y_step[0, :, :].reshape((Y_step.shape[1], Y_step.shape[2])))
    del Y

    # Back to the time domain: (channels, frames, bins) -> int16 samples.
    Z = np.stack(Z_list).transpose(2, 0, 1)
    del Z_list
    z = istft(Z, size=stft_options['size'],
              shift=stft_options['shift']).astype('int16')
    del Z

    # Write one dereverberated wav per input channel, keeping basenames.
    for d in range(channels):
        filename = os.path.basename(in_wavs[d])
        sf.write(os.path.join(args.out_dir, filename), z[d, :], sampling_rate)


if __name__ == '__main__':
    main()
#!/usr/bin/env bash
# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian)
#           2020 Bar Ben-Yair
#           2020 Desh Raj
# Apache 2.0
#
# Dereverberate every multi-channel recording under <data-dir>/wavs using
# online WPE (nara_wpe), running one parallel job per recording.  Output wavs
# (same basenames) are written to <data-dir>/wavs_dereverb/.
# Assumes wav files are named like <session>_CH<n>_<condition>.wav.

. ./cmd.sh
. ./path.sh

# Config:
cmd=run.pl

. utils/parse_options.sh || exit 1;

if [ $# != 1 ]; then
  echo "Wrong #arguments ($#, expected 1)"
  echo "Usage: local/run_wpe.sh [options] <data-dir>"
  echo "main options (for others, see top of script file)"
  echo "  --cmd <run.pl|queue.pl>   # Command to run in parallel with"
  exit 1;
fi

datadir=$1
expdir=exp/wpe/

# Set bash to 'debug' mode: exit on error (-e), on undefined variable (-u),
# and on error in a pipeline (-o pipefail).
set -e
set -u
set -o pipefail

miniconda_dir=$HOME/miniconda3/
if [ ! -d "$miniconda_dir" ]; then
  echo "$miniconda_dir does not exist. Please run '$KALDI_ROOT/tools/extras/install_miniconda.sh'."
  exit 1
fi

# Check that nara_wpe is importable from the miniconda python.
result=$("$miniconda_dir/bin/python" -c "
try:
    import nara_wpe
    print('1')
except ImportError:
    print('0')")

if [ "$result" == "1" ]; then
  echo "WPE is installed"
else
  echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh"
  exit 1
fi

mkdir -p "$datadir/wavs_dereverb/"
mkdir -p "$expdir/log"

# We create a list of all input wav channels, one path per line.
ls "$datadir"/wavs/* > "$expdir/channels_input"

# Group channels by recording: stripping the _CH<n> part yields one id per
# recording.  We create one parallel job for each recording.
reco_ids=$(sed 's/_CH[0-7]//g' "$expdir/channels_input" | sort -u)
nj=$(echo $reco_ids | wc -w)

for n in $(seq "$nj"); do
  # Find all channels of the n-th recording and write its per-job wav list.
  # NOTE(review): the awk pattern assumes exactly two '_'-separated fields
  # around the CH token (<session>_CH<n>_<condition>.wav) and that the
  # directory path contains no '_' — verify before reusing on other data.
  pattern=$(echo $reco_ids | tr ' ' '\n' | sed "${n}q;d" | awk -F'_' '{print $1"_CH._"$2}')
  grep "$pattern" "$expdir/channels_input" > "$expdir/split_wav.$n"
done

echo -e "Dereverberation using online WPE..\n"
# Make a small shell script for each job; $cmd runs them in parallel.
for n in $(seq "$nj"); do
  cat > "$expdir/log/wpe.$n.sh" <<EOF
$miniconda_dir/bin/python local/run_wpe.py $expdir/split_wav.$n $datadir/wavs_dereverb
EOF
done

chmod a+x "$expdir"/log/wpe.*.sh
$cmd JOB=1:"$nj" "$expdir/log/wpe.JOB.log" \
  "$expdir/log/wpe.JOB.sh"

# Remove the per-job work directory (including logs); the dereverberated
# wavs remain under $datadir/wavs_dereverb.
rm -r -- "$expdir"

echo "$(basename "$0") Done."
decode_stage=0 +rnnlm_rescore=true use_oracle_segments=false +wpe=false # End configuration section . ./utils/parse_options.sh @@ -69,16 +71,26 @@ if [ $stage -le 2 ]; then --model-dir exp/xvector_nnet_1a fi +########################################################################## +# RNNLM TRAINING +# We train a TDNN-LSTM based LM that will be used for rescoring the +# decoded lattices. +########################################################################## +if [ $stage -le 3 ]; then + local/rnnlm/train.sh --stage $rnnlm_stage +fi + ########################################################################## # DECODING: We assume that we are just given the raw recordings (approx 10 # mins each), without segments or speaker information, so we have to decode # the whole pipeline, i.e., SAD -> Diarization -> ASR. This is done in the # local/decode.sh script. ########################################################################## -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then local/decode.sh --stage $decode_stage \ --test-sets "$test_sets" \ - --use-oracle-segments $use_oracle_segments + --use-oracle-segments $use_oracle_segments \ + --rnnlm-rescore true fi exit 0;