diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py
index 62676d64510..2b57bd12862 100755
--- a/egs/callhome_diarization/v1/diarization/VB_diarization.py
+++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py
@@ -1,5 +1,6 @@
-#!/usr/bin/env python3
-# Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz)
+#!/usr/bin/env python
+
+# Copyright 2013-2019 Lukas Burget, Mireia Diez (burget@fit.vutbr.cz, mireia@fit.vutbr.cz)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,63 +16,57 @@
 #
 # Revision History
-#   L. Burget   16/07/13 01:00AM - original version
-#   L. Burget   20/06/17 12:07AM - np.asarray replaced by .toarray()
-#                                - minor bug fix in initializing q
-#                                - minor bug fix in ELBO calculation
-#                                - few more optimizations
+#   16/07/13 01:00AM - original version
+#   20/06/17 12:07AM - np.asarray replaced by .toarray()
+#                    - minor bug fix in initializing q(Z)
+#                    - minor bug fix in ELBO calculation
+#                    - few more optimizations
+#   03/10/19 02:27PM - speaker regularization coefficient Fb added
+#
 
 import numpy as np
 from scipy.sparse import coo_matrix
 import scipy.linalg as spl
-#import numexpr as ne # the dependency on this modul can be avoided by replacing
-#                     # logsumexp_ne and exp_ne with logsumexp and np.exp
+import numexpr as ne  # the dependency on this module can be avoided by replacing
+                      # logsumexp_ne and exp_ne with logsumexp and np.exp
 
 
-#[q sp Li] =
-def VB_diarization(X, m, iE, w, V, sp=None, q=None,
+#[gamma pi Li] =
+def VB_diarization(X, m, invSigma, w, V, pi=None, gamma=None,
                    maxSpeakers = 10, maxIters = 10,
                    epsilon = 1e-4, loopProb = 0.99, statScale = 1.0,
-                   alphaQInit = 1.0, downsample = None, VtiEV = None, ref=None,
-                   plot=False, sparsityThr=0.001, llScale=1.0, minDur=1):
+                   alphaQInit = 1.0, downsample = None, VtinvSigmaV = None, ref=None,
+                   plot=False, sparsityThr=0.001, llScale=1.0, minDur=1, Fa=1.0, Fb=1.0):
 
   """
   This is a generalized version of speaker diarization described in:
 
-  Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors,
-  Montreal, CRIM, May 2008.
-
-  Kenny, P., Reynolds, D., and Castaldo, F. Diarization of Telephone
-  Conversations using Factor Analysis IEEE Journal of Selected Topics in Signal
-  Processing, December 2010.
+  Diez. M., Burget. L., Landini. F., Cernocky. J.
+  Analysis of Speaker Diarization based on Bayesian HMM with Eigenvoice Priors
 
-  The generalization introduced in this implementation lies in using an HMM
-  instead of the simple mixture model when modeling generation of segments
-  (or even frames) from speakers. HMM limits the probability of switching
-  between speakers when changing frames, which makes it possible to use
-  the model on frame-by-frame bases without any need to iterate between
-  1) clustering speech segments and 2) re-segmentation (i.e. as it was done in
-  the paper above).
+  Variable names and equation numbers refer to those used in the paper
 
   Inputs:
   X           - T x D array, where rows are D dimensional feature vectors for T frames
   m           - C x D array of GMM component means
-  iE          - C x D array of GMM component inverse covariance matrix diagonals
+  invSigma    - C x D array of GMM component inverse covariance matrix diagonals
   w           - C dimensional column vector of GMM component weights
   V           - R x C x D array of eigenvoices
   maxSpeakers - maximum number of speakers expected in the utterance
   maxIters    - maximum number of algorithm iterations
   epsilon     - stop iterating, if obj. fun. improvement is less than epsilon
   loopProb    - probability of not switching speakers between frames
-  statScale   - scale sufficient statiscits collected using UBM
+  statScale   - deprecated, use Fa instead
+  Fa          - scale sufficient statistics collected using UBM
+  Fb          - speaker regularization coefficient Fb (controls the final # of speakers)
   llScale     - scale UBM likelihood (i.e. llScale < 1.0 makes attribution of frames
                 to UBM components more uncertain)
   sparsityThr - set occupations smaller than this threshold to 0.0 (saves memory
                 as the posteriors are represented by sparse matrix)
-  alphaQInit  - Dirichlet concentraion parameter for initializing q
+  alphaQInit  - Dirichlet concentration parameter for initializing gamma
  downsample  - perform diarization on input downsampled by this factor
-  VtiEV       - C x (R**2+R)/2 matrix normally calculated by VB_diarization when
-                VtiEV is None. However, it can be pre-calculated using function
-                precalculate_VtiEV(V) and used across calls of VB_diarization.
+  VtinvSigmaV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when
+                VtinvSigmaV is None. However, it can be pre-calculated using function
+                precalculate_VtinvSigmaV(V, invSigma) and used across calls of VB_diarization.
   minDur      - minimum number of frames between speaker turns imposed by linear
                 chains of HMM states corresponding to each speaker. All the states
                 in a chain share the same output distribution
@@ -80,192 +75,190 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
   plot        - if set to True, plot per-frame speaker posteriors.
 
   Outputs:
-  q           - S x T matrix of posteriors attribution each frame to one of S possible
+  gamma       - T x S matrix of posteriors attributing each frame to one of S possible
                 speakers, where S is given by opts.maxSpeakers
-  sp          - S dimensional column vector of ML learned speaker priors. Ideally, these
+  pi          - S dimensional column vector of ML learned speaker priors. Ideally, these
                 should allow to estimate the # of speakers in the utterance as the
                 probabilities of the redundant speakers should converge to zero.
-  Li          - values of auxiliary function (and DER and frame cross-entropy between q
+  Li          - values of auxiliary function (and DER and frame cross-entropy between gamma
                 and reference if 'ref' is provided) over iterations.
   """
 
-  # The references to equations corresponds to the technical report:
-  # Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors,
-  # Montreal, CRIM, May 2008.
+  # The references to equations correspond to
+  # Diez. M., Burget. L., Landini. F., Cernocky. J.
+  # Analysis of Speaker Diarization based on Bayesian HMM with Eigenvoice Priors
 
   D=X.shape[1]  # feature dimensionality
   C=len(w)      # number of mixture components
   R=V.shape[0]  # subspace rank
   nframes=X.shape[0]
 
-  if VtiEV is None:
-    VtiEV = precalculate_VtiEV(V, iE)
+  if VtinvSigmaV is None:
+    VtinvSigmaV = precalculate_VtinvSigmaV(V, invSigma)
 
   V = V.reshape(V.shape[0],-1)
 
-  if sp is None:
-    sp = np.ones(maxSpeakers)/maxSpeakers
+  if pi is None:
+    pi = np.ones(maxSpeakers)/maxSpeakers
   else:
-    maxSpeakers = len(sp)
+    maxSpeakers = len(pi)
 
-  if q is None:
-    # initialize q from flat Dirichlet prior with concentration parameter alphaQInit
-    q = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers))
-    q = q / q.sum(1, keepdims=True)
+  if gamma is None:
+    # initialize gamma from flat Dirichlet prior with concentration parameter alphaQInit
+    gamma = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers))
+    gamma = gamma / gamma.sum(1, keepdims=True)
 
   # calculate UBM mixture frame posteriors (i.e. per-frame zero order statistics)
-  ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi))
+  ll = (X**2).dot(-0.5*invSigma.T) + X.dot(invSigma.T*m.T)-0.5*((invSigma * m**2 - np.log(invSigma)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi))
   ll *= llScale
-  G = logsumexp(ll, axis=1)
-  NN = np.exp(ll - G[:,np.newaxis]) * statScale
-  NN[NN<sparsityThr] = 0.0
-    if ii > 0 and L - Li[-2][0] < epsilon:
-      if L - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!')
+    if ii > 0 and ELBO - Li[-2][0] < epsilon:
+      if ELBO - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!')
       break
 
   if downsample is not None:
-    #upsample resulting q to match number of frames in the input utterance
-    q = downsampler.T.dot(q)
+    # upsample resulting gamma to match number of frames in the input utterance
+    gamma = downsampler.T.dot(gamma)
 
-  return q, sp, Li
 
+  return gamma, pi, Li
 
-def precalculate_VtiEV(V, iE):
+
+def precalculate_VtinvSigmaV(V, invSigma):
   tril_ind = np.tril_indices(V.shape[0])
-  VtiEV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype)
+  VtinvSigmaV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype)
   for c in range(V.shape[1]):
-    VtiEV[c,:] = np.dot(V[:,c,:]*iE[np.newaxis,c,:], V[:,c,:].T)[tril_ind]
-  return VtiEV
+    VtinvSigmaV[c,:] = np.dot(V[:,c,:]*invSigma[np.newaxis,c,:], V[:,c,:].T)[tril_ind]
+  return VtinvSigmaV
 
 
-# Initialize q (per-frame speaker posteriors) from a reference
+# Initialize gamma (per-frame speaker posteriors) from a reference
 # (vector of per-frame zero based integer speaker IDs)
-def frame_labels2posterior_mx(labels, maxSpeakers):
-  #initialize from reference
-  #pmx = np.zeros((len(labels), labels.max()+1))
-  pmx = np.zeros((len(labels), maxSpeakers))
+def frame_labels2posterior_mx(labels):
+  # initialize from reference
+  pmx = np.zeros((len(labels), labels.max()+1))
   pmx[np.arange(len(labels)), labels] = 1
   return pmx
 
+
 # Calculates Diarization Error Rate (DER) or per-frame cross-entropy between
-# reference (vector of per-frame zero based integer speaker IDs) and q (per-frame
-# speaker posteriors). If expected=False, q is converted into hard labels before
-# calculating DER. If expected=TRUE, posteriors in q are used to calculated
-# "expected" DER.
-def DER(q, ref, expected=True, xentropy=False):
+# reference (vector of per-frame zero based integer speaker IDs) and gamma
+# (per-frame speaker posteriors). If expected=False, gamma is converted into
+# hard labels before calculating DER.
+# If expected=TRUE, posteriors in gamma are used to calculate "expected" DER.
+def DER(gamma, ref, expected=True, xentropy=False):
   from itertools import permutations
 
   if not expected:
-    # replce probabiities in q by zeros and ones
-    hard_labels = q.argmax(1)
-    q = np.zeros_like(q)
-    q[range(len(q)), hard_labels] = 1
+    # replace probabilities in gamma by zeros and ones
+    hard_labels = gamma.argmax(1)
+    gamma = np.zeros_like(gamma)
+    gamma[range(len(gamma)), hard_labels] = 1
 
-  err_mx = np.empty((ref.max()+1, q.shape[1]))
+  err_mx = np.empty((ref.max()+1, gamma.shape[1]))
   for s in range(err_mx.shape[0]):
-    tmpq = q[ref == s,:]
-    err_mx[s] = (-np.log(tmpq) if xentropy else tmpq).sum(0)
+    tmpgamma = gamma[ref == s,:]
+    err_mx[s] = (-np.log(tmpgamma) if xentropy else tmpgamma).sum(0)
 
   if err_mx.shape[0] < err_mx.shape[1]:
     err_mx = err_mx.T
 
   # try all alignments (permutations) of reference and detected speaker
-  #could be written in more efficient way using dynamic programing
+  # could be written in a more efficient way using dynamic programming
   acc = [err_mx[perm[:err_mx.shape[1]], range(err_mx.shape[1])].sum()
          for perm in permutations(range(err_mx.shape[0]))]
   if xentropy:
@@ -342,7 +335,7 @@ def forward_backward(lls, tr, ip):
   lfw[0] = lls[0] + np.log(ip)
   lbw[-1] = 0.0
 
-  for ii in range(1,len(lls)):
+  for ii in range(1,len(lls)):
     lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1)
 
   for ii in reversed(range(len(lls)-1)):
@@ -350,4 +343,4 @@ def forward_backward(lls, tr, ip):
 
   tll = logsumexp(lfw[-1])
   sp = np.exp(lfw + lbw - tll)
-  return sp, tll, lfw, lbw
+  return sp, tll, lfw, lbw
\ No newline at end of file
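For reviewers, a minimal usage sketch of the renamed API above (not part of the patch): it assumes the GMM-UBM parameters m, invSigma, w and the eigenvoice array V are already loaded with the shapes given in the docstring, and X1, X2 are hypothetical T x D feature matrices.

    from VB_diarization import VB_diarization, precalculate_VtinvSigmaV

    # VtinvSigmaV depends only on the model, so compute it once and
    # reuse it across utterances, as the docstring suggests
    VtinvSigmaV = precalculate_VtinvSigmaV(V, invSigma)

    for X in (X1, X2):                      # hypothetical T x D feature arrays
        gamma, pi, Li = VB_diarization(X, m, invSigma, w, V,
                                       VtinvSigmaV=VtinvSigmaV,
                                       maxSpeakers=10, loopProb=0.99,
                                       Fa=1.0, Fb=1.0)
        frame_labels = gamma.argmax(1)      # hard per-frame speaker decisions

Speakers whose pi entries collapse toward zero are effectively pruned, which is the behavior the Fb coefficient regulates.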
diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py
new file mode 100644
index 00000000000..2907cc2d114
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
+# Apache 2.0
+
+# This script is based on the Bayesian HMM-based xvector clustering
+# code released by BUTSpeech at: https://github.com/BUTSpeechFIT/VBx.
+# Note that this assumes that the provided labels are for a single
+# recording. So this should be called from a script such as
+# vb_hmm_xvector.sh which can divide all labels into per recording
+# labels.
+
+import sys, argparse, struct
+import numpy as np
+import itertools
+import kaldi_io
+
+from scipy.special import softmax
+
+import VB_diarization
+
+########### HELPER FUNCTIONS #####################################
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""This script performs Bayesian HMM-based
+        clustering of x-vectors for one recording""",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--init-smoothing", type=float, default=10,
+        help="AHC produces hard assignments of x-vectors to speakers."
+        " These are smoothed to soft assignments as the initialization"
+        " for VB-HMM. This parameter controls the amount of smoothing."
+        " Not so important; a high value (e.g. 10) is OK => keeps the hard assignment")
+    parser.add_argument("--loop-prob", type=float, default=0.80,
+        help="probability of not switching speakers between frames")
+    parser.add_argument("--fa", type=float, default=0.4,
+        help="scale sufficient statistics collected using UBM")
+    parser.add_argument("--fb", type=float, default=11,
+        help="speaker regularization coefficient Fb (controls final # of speakers)")
+    parser.add_argument("xvector_ark_file", type=str,
+        help="Ark file containing xvectors for all subsegments")
+    parser.add_argument("plda", type=str,
+        help="path to PLDA model")
+    parser.add_argument("input_label_file", type=str,
+        help="path of input label file")
+    parser.add_argument("output_label_file", type=str,
+        help="path of output label file")
+    args = parser.parse_args()
+    return args
+
+def read_labels_file(label_file):
+    segments = []
+    labels = []
+    with open(label_file, 'r') as f:
+        for line in f.readlines():
+            segment, label = line.strip().split()
+            segments.append(segment)
+            labels.append(int(label))
+    return segments, labels
+
+def write_labels_file(seg2label, out_file):
+    f = open(out_file, 'w')
+    for seg in sorted(seg2label.keys()):
+        f.write("{} {}\n".format(seg, seg2label[seg]))
+    f.close()
+    return
+
+def read_args(args):
+    segments, labels = read_labels_file(args.input_label_file)
+    xvec_all = dict(kaldi_io.read_vec_flt_ark(args.xvector_ark_file))
+    xvectors = []
+    for segment in segments:
+        xvectors.append(xvec_all[segment])
+    _, _, plda_psi = kaldi_io.read_plda(args.plda)
+    return xvectors, segments, labels, plda_psi
+
+
+###################################################################
+
+def vb_hmm(segments, in_labels, xvectors, plda_psi, init_smoothing, loop_prob, fa, fb):
+    x = np.array(xvectors)
+    dim = x.shape[1]
+
+    # Smooth the hard labels obtained from AHC to soft assignments of x-vectors to speakers
+    q_init = np.zeros((len(in_labels), np.max(in_labels)+1))
+    q_init[range(len(in_labels)), in_labels] = 1.0
+    q_init = softmax(q_init*init_smoothing, axis=1)
+
+    # Prepare model for VB-HMM clustering
+    ubmWeights = np.array([1.0])
+    ubmMeans = np.zeros((1,dim))
+    invSigma = np.ones((1,dim))
+    V = np.diag(np.sqrt(plda_psi[:dim]))[:,np.newaxis,:]
+
+    # Use VB-HMM for x-vector clustering. Instead of an i-vector extractor model, we use PLDA
+    # => GMM with only 1 component, V derived from the across-class covariance, and invSigma
+    # is the inverse within-class covariance (i.e. identity)
+    q, _, _ = VB_diarization.VB_diarization(x, ubmMeans, invSigma, ubmWeights, V, pi=None,
+        gamma=q_init, maxSpeakers=q_init.shape[1], maxIters=40, epsilon=1e-6, loopProb=loop_prob,
+        Fa=fa, Fb=fb)
+
+    labels = np.unique(q.argmax(1), return_inverse=True)[1]
+
+    return {seg:label for seg,label in zip(segments,labels)}
+
+def main():
+    args = get_args()
+    xvectors, segments, labels, plda_psi = read_args(args)
+
+    seg2label_vb = vb_hmm(segments, labels, xvectors, plda_psi, args.init_smoothing,
+        args.loop_prob, args.fa, args.fb)
+    write_labels_file(seg2label_vb, args.output_label_file)
+
+if __name__=="__main__":
+    main()
+
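The --init-smoothing trick in vb_hmm() above is easiest to see on a toy input; a small self-contained sketch, assuming three subsegments with first-pass AHC labels [0, 1, 0] and the default smoothing of 10:

    import numpy as np
    from scipy.special import softmax

    in_labels = np.array([0, 1, 0])          # hard AHC labels for 3 subsegments
    q_init = np.zeros((3, 2))                # 2 = np.max(in_labels) + 1 speakers
    q_init[range(3), in_labels] = 1.0        # one-hot encoding
    q_init = softmax(q_init * 10, axis=1)    # --init-smoothing 10
    # each row is now approx. [0.99995, 0.00005] (or flipped), i.e. the hard
    # assignment is essentially kept, but VB-HMM can still move mass between speakers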
diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh
new file mode 100755
index 00000000000..70cd245e90a
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+# Copyright 2020 Desh Raj
+# Apache 2.0.
+
+# This script performs Bayesian HMM on top of labels produced
+# by a first-pass AHC clustering. See https://arxiv.org/abs/1910.08847
+# for details about the model.
+
+# Begin configuration section.
+cmd="run.pl"
+stage=0
+nj=10
+cleanup=true
+rttm_channel=0
+
+# The hyperparameters used here are taken from the DIHARD
+# optimal hyperparameter values reported in:
+# http://www.fit.vutbr.cz/research/groups/speech/publi/2019/diez_IEEE_ACM_2019_08910412.pdf
+# These may require tuning for different datasets.
+loop_prob=0.85
+fa=0.2
+fb=1
+
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <dir> <xvec-dir> <plda>"
+  echo " e.g.: $0 exp/ exp/xvectors_dev exp/xvector_nnet_1a/plda"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --cleanup <bool|true>                            # If true, remove temporary files"
+  exit 1;
+fi
+
+dir=$1
+xvec_dir=$2
+plda=$3
+
+mkdir -p $dir/tmp
+
+for f in $dir/labels ; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+# check if numexpr is installed. Also install
+# a modified version of kaldi_io with extra functions
+# needed to read the PLDA file
+result=`python3 -c "\
+try:
+  import kaldi_io, numexpr
+  print (int(hasattr(kaldi_io, 'read_plda')))
+except ImportError:
+  print('0')"`
+
+if [ "$result" == "0" ]; then
+  echo "Installing kaldi_io and numexpr"
+  python3 -m pip install git+https://github.com/desh2608/kaldi-io-for-python.git@vbx
+  python3 -m pip install numexpr
+fi
+
+if [ $stage -le 0 ]; then
+  # Mean subtraction (If original x-vectors are high-dim, e.g. 512, you should
+  # consider also applying LDA to reduce dimensionality to, say, 200)
+  $cmd $xvec_dir/log/transform.log \
+    ivector-subtract-global-mean scp:$xvec_dir/xvector.scp ark:$xvec_dir/xvector_norm.ark
+fi
+
+echo -e "Performing Bayesian HMM based x-vector clustering..\n"
+# making a shell script for each job
+for n in `seq $nj`; do
+  cat <<-EOF > $dir/tmp/vb_hmm.$n.sh
+  python3 diarization/vb_hmm_xvector.py \
+    --loop-prob $loop_prob --fa $fa --fb $fb \
+    $xvec_dir/xvector_norm.ark $plda $dir/labels.$n $dir/labels.vb.$n
+EOF
+done
+
+chmod a+x $dir/tmp/vb_hmm.*.sh
+$cmd JOB=1:$nj $dir/log/vb_hmm.JOB.log \
+  $dir/tmp/vb_hmm.JOB.sh
+
+if [ $stage -le 1 ]; then
+  echo "$0: combining labels"
+  for j in $(seq $nj); do cat $dir/labels.vb.$j; done > $dir/labels.vb || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: computing RTTM"
+  diarization/make_rttm.py --rttm-channel $rttm_channel $xvec_dir/plda_scores/segments $dir/labels.vb $dir/rttm.vb || exit 1;
+fi
+
+if $cleanup ; then
+  rm -r $dir/tmp || exit 1;
+fi
diff --git a/egs/libri_css/s5_mono/local/decode.sh b/egs/libri_css/s5_mono/local/decode.sh
index a0d65d47bc8..6c1fe57ff5a 100755
--- a/egs/libri_css/s5_mono/local/decode.sh
+++ b/egs/libri_css/s5_mono/local/decode.sh
@@ -111,7 +111,7 @@ if [ $stage -le 3 ]; then
   [ ! -d exp/xvector_nnet_1a ] && ./local/download_diarizer.sh
 
-  local/diarize.sh --nj $diar_nj --cmd "$train_cmd" --stage $diarizer_stage \
+  local/diarize_bhmm.sh --nj $diar_nj --cmd "$train_cmd" --stage $diarizer_stage \
     --ref-rttm $ref_rttm \
     exp/xvector_nnet_1a \
     data/${datadir} \
@@ -127,7 +127,7 @@ if [ $stage -le 4 ]; then
   asr_nj=$(wc -l < "data/$datadir/wav.scp")
   local/decode_diarized.sh --nj $asr_nj --cmd "$decode_cmd" --stage $decode_diarize_stage \
     --lm-suffix "_tgsmall" \
-    exp/${datadir}_diarization data/$datadir data/lang_nosp_test_tgsmall \
+    exp/${datadir}_diarization/rttm.vb data/$datadir data/lang_test_tgsmall \
     exp/chain${nnet3_affix}/tdnn_${affix}_sp exp/nnet3${nnet3_affix} \
     data/${datadir}_diarized || exit 1
 done
@@ -163,7 +163,7 @@ if $rnnlm_rescore; then
     rnnlm/lmrescore$pruned.sh \
       --cmd "$decode_cmd --mem 8G" \
       --weight 0.45 --max-ngram-order $ngram_order \
-      data/lang_nosp_test_tgsmall $rnnlm_dir \
+      data/lang_test_tgsmall $rnnlm_dir \
      data/${decode_set}_diarized_hires ${decode_dir} \
       ${ac_model_dir}/decode_${decode_set}_diarized_2stage_rescore
   done
@@ -207,7 +207,7 @@ fi
 if [ $stage -le 9 ]; then
   local/decode_oracle.sh --stage $decode_oracle_stage \
     --affix $affix \
-    --lang-dir data/lang_nosp_test_tgsmall \
+    --lang-dir data/lang_test_tgsmall \
     --lm-suffix "_tgsmall" \
     --rnnlm-rescore $rnnlm_rescore \
     --test_sets "$test_sets"
diff --git a/egs/libri_css/s5_mono/local/decode_diarized.sh b/egs/libri_css/s5_mono/local/decode_diarized.sh
index b81515f22b4..6eda74506c5 100755
--- a/egs/libri_css/s5_mono/local/decode_diarized.sh
+++ b/egs/libri_css/s5_mono/local/decode_diarized.sh
@@ -15,7 +15,7 @@ echo "$0 $@"  # Print the command line for logging
 if [ -f path.sh ]; then . ./path.sh; fi
 . utils/parse_options.sh || exit 1;
 if [ $# != 6 ]; then
-  echo "Usage: $0 <rttm-dir> <in-data-dir> <lang-dir> <model-dir> <ivector-dir> <out-dir>"
+  echo "Usage: $0 <rttm-path> <in-data-dir> <lang-dir> <model-dir> <ivector-dir> <out-dir>"
   echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain/tdnn_1a \
    exp/nnet3_cleaned data/dev_diarized"
   echo "Options: "
@@ -24,14 +24,14 @@ if [ $# != 6 ]; then
   exit 1;
 fi
 
-rttm_dir=$1
+rttm=$1
 data_in=$2
 lang_dir=$3
 asr_model_dir=$4
 ivector_extractor=$5
 out_dir=$6
 
-for f in $rttm_dir/rttm $data_in/wav.scp $data_in/text.bak \
+for f in $rttm $data_in/wav.scp $data_in/text.bak \
     $lang_dir/L.fst $asr_model_dir/graph${lm_suffix}/HCLG.fst \
     $asr_model_dir/final.mdl; do
   [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
@@ -46,8 +46,8 @@ fi
 
 if [ $stage -le 1 ]; then
   echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel "
-  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm \
-    <(awk '{print $2" "$2" "$3}' $rttm_dir/rttm |sort -u) \
+  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm \
+    <(awk '{print $2" "$2" "$3}' $rttm |sort -u) \
     ${out_dir}_hires/utt2spk ${out_dir}_hires/segments
 
   utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt
diff --git a/egs/libri_css/s5_mono/local/diarize_bhmm.sh b/egs/libri_css/s5_mono/local/diarize_bhmm.sh
new file mode 100755
index 00000000000..30ae2def3f6
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/diarize_bhmm.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# Copyright   2019   David Snyder
+#             2020   Desh Raj
+
+# Apache 2.0.
+#
+# This script takes an input directory that has a segments file (and
+# a feats.scp file), and performs diarization on it, using BUT's
+# Bayesian HMM-based diarization model. A first pass of AHC is performed,
+# followed by VB-HMM.
+
+stage=0
+nj=10
+cmd="run.pl"
+ref_rttm=
+score_overlaps_only=true
+
+echo "$0 $@"  # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
+  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --ref_rttm ./local/dev_rttm                      # the location of the reference RTTM file"
+  exit 1;
+fi
+
+model_dir=$1
+data_in=$2
+out_dir=$3
+
+name=`basename $data_in`
+
+for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
+  $model_dir/final.raw $model_dir/extract.config; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 1 ]; then
+  echo "$0: computing features for x-vector extractor"
+  utils/fix_data_dir.sh data/${name}
+  rm -rf data/${name}_cmn
+  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
+    data/$name data/${name}_cmn exp/${name}_cmn
+  cp data/$name/segments exp/${name}_cmn/
+  utils/fix_data_dir.sh data/${name}_cmn
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting x-vectors for all segments"
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
+    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
+    --min-segment 0.5 $model_dir \
+    data/${name}_cmn $out_dir/xvectors_${name}
+fi
+
+# Perform PLDA scoring
+if [ $stage -le 3 ]; then
+  # Perform PLDA scoring on all pairs of segments for each recording.
+  echo "$0: performing PLDA scoring between all pairs of x-vectors"
+  diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \
+    --target-energy 0.5 \
+    --nj $nj $model_dir/ $out_dir/xvectors_${name} \
+    $out_dir/xvectors_${name}/plda_scores
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: performing clustering using PLDA scores (threshold tuned on dev)"
+  diarization/cluster.sh --cmd "$cmd" --nj $nj \
+    --rttm-channel 1 --threshold 0.4 \
+    $out_dir/xvectors_${name}/plda_scores $out_dir
+  echo "$0: wrote RTTM to output directory ${out_dir}"
+fi
+
+if [ $stage -le 5 ]; then
+  echo "$0: performing VB-HMM on top of first-pass AHC"
+  diarization/vb_hmm_xvector.sh --nj $nj --rttm-channel 1 \
+    $out_dir $out_dir/xvectors_${name} $model_dir/plda
+fi
+
+hyp_rttm=${out_dir}/rttm.vb
+
+# For scoring the diarization system, we use the same tool that was
+# used in the DIHARD II challenge. This is available at:
+# https://github.com/nryant/dscore
+if [ $stage -le 6 ]; then
+  echo "Diarization results for "${name}
+  if ! [ -d dscore ]; then
+    git clone https://github.com/desh2608/dscore.git -b libricss --single-branch || exit 1;
+    cd dscore
+    python -m pip install --user -r requirements.txt
+    cd ..
+  fi
+
+  # Create per condition ref and hyp RTTM files for scoring per condition
+  mkdir -p tmp
+  conditions="0L 0S OV10 OV20 OV30 OV40"
+  cp $ref_rttm tmp/ref.all
+  cp $hyp_rttm tmp/hyp.all
+  for rttm in ref hyp; do
+    for cond in $conditions; do
+      cat tmp/$rttm.all | grep $cond > tmp/$rttm.$cond
+    done
+  done
+
+  echo "Scoring all regions..."
+  for cond in $conditions 'all'; do
+    echo -n "Condition: $cond: "
+    ref_rttm_path=$(readlink -f tmp/ref.$cond)
+    hyp_rttm_path=$(readlink -f tmp/hyp.$cond)
+    cd dscore && python score.py -r $ref_rttm_path -s $hyp_rttm_path --global_only && cd .. || exit 1;
+  done
+
+  # We also score overlapping regions only
+  if [ $score_overlaps_only == "true" ]; then
+    echo "Scoring overlapping regions..."
+    for cond in $conditions 'all'; do
+      echo -n "Condition: $cond: "
+      ref_rttm_path=$(readlink -f tmp/ref.$cond)
+      hyp_rttm_path=$(readlink -f tmp/hyp.$cond)
+      cd dscore && python score.py -r $ref_rttm_path -s $hyp_rttm_path --overlap_only --global_only && cd .. || exit 1;
+    done
+  fi
+
+  rm -r tmp
+fi
diff --git a/egs/libri_css/s5_mono/local/download_diarizer.sh b/egs/libri_css/s5_mono/local/download_diarizer.sh
index 959e8729215..753475ee3fd 100755
--- a/egs/libri_css/s5_mono/local/download_diarizer.sh
+++ b/egs/libri_css/s5_mono/local/download_diarizer.sh
@@ -31,7 +31,8 @@ tar -xvzf 0012_diarization_v1.tar.gz
 rm -f 0012_diarization_v1.tar.gz
 
 # Download PLDA model trained on augmented Librispeech data
-wget https://desh2608.github.io/static/files/jsalt/plda 0012_diarization_v1/exp/xvector_nnet_1a/
+rm 0012_diarization_v1/exp/xvector_nnet_1a/plda
+wget https://desh2608.github.io/static/files/jsalt/plda -P 0012_diarization_v1/exp/xvector_nnet_1a/
 cd ../..
 cp -r ${dir}/0012_diarization_v1/exp .
 rm -rf ${dir}
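A final side note on the DER() helper in VB_diarization.py above: its comment observes that the exhaustive permutation search could be done more efficiently. One standard replacement is the Hungarian algorithm; a hedged sketch (aligned_score is a hypothetical helper, not part of this patch), operating on the same err_mx accumulator that DER() builds:

    from scipy.optimize import linear_sum_assignment

    def aligned_score(err_mx, xentropy=False):
        # err_mx[s, k] accumulates reference speaker s against cluster k,
        # exactly as inside DER(); minimize total cross-entropy, otherwise
        # maximize the accumulated posterior mass
        rows, cols = linear_sum_assignment(err_mx if xentropy else -err_mx)
        return err_mx[rows, cols].sum()   # equals min(acc) resp. max(acc) in DER()

This avoids the factorial blow-up of itertools.permutations when maxSpeakers is large.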