Skip to content

Commit

Permalink
Merge pull request kaldi-asr#4 from jsalt2020-asrdiar/libricss
Browse files Browse the repository at this point in the history
RNNLM rescoring and multichannel recipe
  • Loading branch information
desh2608 committed Jun 3, 2020
2 parents bba30e5 + 43ce69e commit 831c107
Show file tree
Hide file tree
Showing 27 changed files with 1,124 additions and 21 deletions.
14 changes: 14 additions & 0 deletions egs/libri_css/s5_7ch/cmd.sh
@@ -0,0 +1,14 @@
# Parallelization commands for this recipe.
#
# These variables choose how jobs are dispatched. On a cluster with
# GridEngine (qsub) use queue.pl; on a slurm cluster use slurm.pl; on a
# single machine with no queueing system, replace 'queue.pl' with run.pl —
# but then run the heavy stages one at a time, since most recipes will
# otherwise exhaust the machine's memory. Queues differ in their names and
# in how resources such as memory are requested; adapt conf/queue.conf to
# your site (see the 'default_config' string in utils/queue.pl or
# utils/slurm.pl, and http://kaldi-asr.org/doc/queue.html for details).

export train_cmd="retry.pl queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
50 changes: 50 additions & 0 deletions egs/libri_css/s5_7ch/conf/beamformit.cfg
@@ -0,0 +1,50 @@
#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)

# scrolling size to compute the delays
scroll_size = 250

# cross correlation computation window size
window_size = 500

#amount of maximum points for the xcorrelation taken into account
nbest_amount = 4

#flag whether to apply an automatic noise thresholding
do_noise_threshold = 1

#Percentage of frames with lower xcorr taken as noisy
noise_percent = 10

######## acoustic modelling parameters

#transition probabilities weight for multichannel decoding
trans_weight_multi = 25
trans_weight_nbest = 25

###

#flag whether to print the features after setting them, or not
print_features = 1

#flag whether to use the bad frames in the sum process
do_avoid_bad_frames = 1

#flag to use the best channel (SNR) as a reference
#defined from command line
do_compute_reference = 1

#flag whether to use a uem file or not (otherwise process the whole file)
do_use_uem_file = 0

#flag whether to use an adaptive weights scheme or fixed weights
do_adapt_weights = 1

#flag whether to output the sph files or just run the system to create the auxiliary files
do_write_sph_files = 1

####directories where to store/retrieve info####
#channels_file = ./cfg-files/channels

#show needs to be passed as argument normally, here a default one is given just in case
#show_id = Ttmp

2 changes: 2 additions & 0 deletions egs/libri_css/s5_7ch/conf/mfcc.conf
@@ -0,0 +1,2 @@
--use-energy=false
--sample-frequency=16000
10 changes: 10 additions & 0 deletions egs/libri_css/s5_7ch/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=16000
--num-mel-bins=40
--num-ceps=40
--low-freq=40
--high-freq=-400
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/diarization
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/local
9 changes: 9 additions & 0 deletions egs/libri_css/s5_7ch/path.sh
@@ -0,0 +1,9 @@
# Environment setup: puts Kaldi binaries, utils/, and dscore on PATH.
export KALDI_ROOT=$(pwd)/../../..
# Source tools/env.sh if present (site-specific tool paths, e.g. SRILM).
[ -f "$KALDI_ROOT/tools/env.sh" ] && . "$KALDI_ROOT/tools/env.sh"
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
export PATH=$PWD/dscore:$PATH
# Prepend the existing PYTHONPATH (plus ':') only when it is non-empty.
# The previous unconditional "${PYTHONPATH}:" left a leading empty entry
# when PYTHONPATH was unset, which Python interprets as the current
# directory on sys.path.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$PWD/dscore"
[ ! -f "$KALDI_ROOT/tools/config/common_path.sh" ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. "$KALDI_ROOT/tools/config/common_path.sh"
# Byte-wise, locale-independent sorting — required by Kaldi's data-dir tools.
export LC_ALL=C

1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/rnnlm
105 changes: 105 additions & 0 deletions egs/libri_css/s5_7ch/run.sh
@@ -0,0 +1,105 @@
#!/usr/bin/env bash
#
# LibriCSS multi-channel (7ch) baseline recipe:
# data prep -> ASR training -> diarizer training -> RNNLM training -> decoding.
#
# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
# Apache 2.0

# Begin configuration section.
nj=50
decode_nj=20
stage=0

# Per-stage start points (useful for resuming a partially-finished run).
data_prep_stage=0
asr_stage=1
diarizer_stage=0
decode_stage=0
rnnlm_stage=0      # BUGFIX: was referenced in the RNNLM stage below but never
                   # defined, so train.sh received "--stage" with no argument.
rnnlm_rescore=true # NOTE(review): not used anywhere in this script; presumably
                   # intended to be forwarded to local/decode.sh -- verify.

enhancement=beamformit
wpe=true

use_oracle_segments=true

# End configuration section
. ./utils/parse_options.sh

. ./cmd.sh
. ./path.sh

# Data dirs get a "_dereverb" suffix when WPE dereverberation is applied.
dereverb=
$wpe && dereverb=_dereverb

test_sets="dev${dereverb}_${enhancement} eval${dereverb}_${enhancement}"

set -e # exit on error

# please change the path accordingly
libricss_corpus=/export/corpora/LibriCSS
librispeech_corpus=/export/corpora/LibriSpeech/

##########################################################################
# We first prepare the LibriCSS data (7ch) in the Kaldi data
# format. We use session 0 for dev and others for eval. We also
# apply online multichannel WPE for dereverberation and then combine
# all channels using beamforming.
##########################################################################
if [ $stage -le 0 ]; then
  local/data_prep_7ch.sh --stage $data_prep_stage --wpe $wpe \
    --enhancement $enhancement $libricss_corpus
fi

#########################################################################
# ASR MODEL TRAINING
# In this stage, we prepare the Librispeech data and train our ASR model.
# This part is taken from the librispeech recipe, with parts related to
# decoding removed. We use the 100h clean subset to train most of the
# GMM models, except the SAT model, which is trained on the 460h clean
# subset. The nnet is trained on the full 960h (clean + other).
# To avoid training the whole ASR from scratch, you can download the
# chain model using:
# wget http://kaldi-asr.org/models/13/0013_librispeech_s5.tar.gz
# Once it is downloaded, extract using: tar -xvzf 0013_librispeech_s5.tar.gz
# and copy the contents of the exp/ directory to your exp/.
#########################################################################
if [ $stage -le 1 ]; then
  local/train_asr.sh --stage $asr_stage --nj $nj $librispeech_corpus
fi

##########################################################################
# DIARIZATION MODEL TRAINING
# You can also download a pretrained diarization model using:
# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz
# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz
# and copy the contents of the exp/ directory to your exp/
##########################################################################
if [ $stage -le 2 ]; then
  local/train_diarizer.sh --stage $diarizer_stage \
    --data-dir data/train_other_500 \
    --model-dir exp/xvector_nnet_1a
fi

##########################################################################
# RNNLM TRAINING
# We train a TDNN-LSTM based LM that will be used for rescoring the
# decoded lattices.
##########################################################################
if [ $stage -le 3 ]; then
  local/rnnlm/train.sh --stage $rnnlm_stage
fi

##########################################################################
# DECODING: We assume that we are just given the raw recordings (approx 10
# mins each), without segments or speaker information, so we have to decode
# the whole pipeline, i.e., SAD -> Diarization -> ASR. This is done in the
# local/decode.sh script.
##########################################################################
if [ $stage -le 4 ]; then
  local/decode.sh --stage $decode_stage \
    --test-sets "$test_sets" \
    --use-oracle-segments $use_oracle_segments
fi

exit 0;

1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/sid
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/steps
1 change: 1 addition & 0 deletions egs/libri_css/s5_7ch/utils
130 changes: 130 additions & 0 deletions egs/libri_css/s5_mono/local/data_prep_7ch.sh
@@ -0,0 +1,130 @@
#!/usr/bin/env bash
#
# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
# Apache 2.0
#
# Prepares the 7-channel LibriCSS data in Kaldi format: downloads and
# segments the corpus, optionally applies WPE dereverberation, beamforms
# the channels, and creates dev/eval splits (session0 = dev, rest = eval).

# Begin configuration section.
# BUGFIX: these previously defaulted to empty strings, so e.g.
# "[ $stage -le 0 ]" was a syntax error unless every option was passed.
enhancement=beamformit # "beamformit" or "gss" (gss not implemented yet)
wpe=true               # whether to apply online multichannel WPE
stage=0
# End configuration section

. ./utils/parse_options.sh # accept options

. ./path.sh

echo >&2 "$0" "$@"
if [ $# -ne 1 ]; then
  echo >&2 "$0" "$@"
  echo >&2 "$0: Error: wrong number of arguments"
  echo -e >&2 "Usage:\n  $0 [opts] <corpus-dir>"
  echo -e >&2 "eg:\n  $0 /export/corpora/LibriCSS"
  exit 1
fi

corpus_dir=$1

# Dereverberated data dirs get a "_dereverb" suffix.
dereverb=
$wpe && dereverb=_dereverb

set -e -o pipefail

if [ "$stage" -le 0 ]; then
  # If data is not already present, then download and unzip
  if [ ! -d "$corpus_dir/for_release" ]; then
    echo "Downloading and unpacking LibriCSS data."
    CWD=$(pwd)
    mkdir -p "$corpus_dir"

    cd "$corpus_dir"

    # Download the data. If the data has already been downloaded, it
    # does nothing. (See wget -c)
    wget -c --load-cookies /tmp/cookies.txt \
      "https://docs.google.com/uc?export=download&confirm=$(wget --quiet \
      --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \
      'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' \
      -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" \
      -O for_release.zip && rm -rf /tmp/cookies.txt

    # unzip (skip if already extracted)
    unzip -n for_release.zip

    # segmentation
    cd for_release
    python3 segment_libricss.py -data_path .

    cd "$CWD"
  fi
fi

if [ "$stage" -le 1 ]; then
  # Process the downloaded data directory to get data in Kaldi format. Here we get all
  # channels.
  mkdir -p data/local/data/
  local/prepare_data.py --srcpath "$corpus_dir/for_release" --tgtpath data/local/data
fi

if [ "$stage" -le 2 ] && $wpe; then
  # Perform online multichannel WPE
  local/run_wpe.sh --cmd "$train_cmd --mem 60G" \
    data/local/data

  # Change the path of the wav files to point to dereverberated file
  mv data/local/data/wav.scp data/local/data/wav.scp.bak
  sed 's/wavs/wavs_dereverb/g' data/local/data/wav.scp.bak > data/local/data/wav.scp
fi

if [ "$stage" -le 3 ]; then
  # BUGFIX: was "[ $enhancement == gss ]" (unquoted bashism inside [ ]).
  if [ "$enhancement" = "gss" ]; then
    # BUGFIX: previously fell through to stage 4, which then failed on the
    # nonexistent data/local/data_gss directory; fail fast instead.
    echo "$0: GSS not implemented yet"
    exit 1
  elif [ "$enhancement" = "beamformit" ]; then
    local/run_beamformit.sh --cmd "$train_cmd" \
      data/local/data/wavs \
      data/local/data_beamformit/wavs

    # Also create other files. Note that we still name the beamformed file as CH0
    # only for consistency in naming (for scoring purposes)
    for file in wav.scp utt2spk text segments; do
      sed 's/_CH[0-9]/_CH0/g' data/local/data/$file | sort -u > data/local/data_beamformit/$file
    done
    sed -i 's/data\/local\/data\/wavs_dereverb/data\/local\/data_beamformit\/wavs/g' data/local/data_beamformit/wav.scp

  else
    echo "$0: Enhancement type $enhancement not found"
    exit 1
  fi
fi

if [ "$stage" -le 4 ]; then
  # Create dev and eval splits based on sessions. In total we have 10 sessions (session0 to
  # session9) of approximately 1 hour each. In the below strings, separate each session by
  # '\|' to perform grep at once.
  dev_sessions="session0"
  eval_sessions="session1\|session2\|session3\|session4\|session5\|session6\|session7\|session8\|session9"

  mkdir -p data/dev${dereverb}_${enhancement}
  for file in wav.scp utt2spk text segments; do
    grep "$dev_sessions" data/local/data_${enhancement}/"$file" | sort > data/dev${dereverb}_${enhancement}/"$file"
  done

  mkdir -p data/eval${dereverb}_${enhancement}
  for file in wav.scp utt2spk text segments; do
    grep "$eval_sessions" data/local/data_${enhancement}/"$file" | sort > data/eval${dereverb}_${enhancement}/"$file"
  done
fi

if [ "$stage" -le 5 ]; then
  # Move the utt2spk, segments, and text file to .bak so that they are only used
  # in the last scoring stage. We also prepare a dummy utt2spk and spk2utt for
  # these.
  for dataset in dev eval; do
    datadir=${dataset}${dereverb}_${enhancement}
    for file in text utt2spk segments; do
      mv data/$datadir/$file data/$datadir/$file.bak
    done

    # Dummy speaker map: each recording is its own "speaker".
    awk '{print $1, $1}' data/$datadir/wav.scp > data/$datadir/utt2spk
    utils/utt2spk_to_spk2utt.pl data/$datadir/utt2spk > data/$datadir/spk2utt
  done
fi

0 comments on commit 831c107

Please sign in to comment.