From 3172d0f67873a9f95687667dca25be3d4f1d4eb0 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 8 Jun 2020 21:38:50 -0400 Subject: [PATCH 1/5] added BUT's VBx diarization --- .../v1/diarization/VB_diarization.py | 249 +++-- .../v1/diarization/kaldi_io.py | 847 ++++++++++++++++++ .../v1/diarization/vb_hmm_xvector.py | 115 +++ .../v1/diarization/vb_hmm_xvector.sh | 93 ++ egs/libri_css/s5_mono/local/decode.sh | 10 +- .../s5_mono/local/decode_diarized.sh | 10 +- egs/libri_css/s5_mono/local/diarize_bhmm.sh | 129 +++ 7 files changed, 1315 insertions(+), 138 deletions(-) create mode 100644 egs/callhome_diarization/v1/diarization/kaldi_io.py create mode 100644 egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py create mode 100755 egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh create mode 100755 egs/libri_css/s5_mono/local/diarize_bhmm.sh diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py index 62676d64510..2b57bd12862 100755 --- a/egs/callhome_diarization/v1/diarization/VB_diarization.py +++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py @@ -1,5 +1,6 @@ -#!/usr/bin/env python3 -# Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz) +#!/usr/bin/env python + +# Copyright 2013-2019 Lukas Burget, Mireia Diez (burget@fit.vutbr.cz, mireia@fit.vutbr.cz) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,63 +16,57 @@ # # Revision History -# L. Burget 16/07/13 01:00AM - original version -# L. Burget 20/06/17 12:07AM - np.asarray replaced by .toarray() -# - minor bug fix in initializing q -# - minor bug fix in ELBO calculation -# - few more optimizations +# 16/07/13 01:00AM - original version +# 20/06/17 12:07AM - np.asarray replaced by .toarray() +# - minor bug fix in initializing q(Z) +# - minor bug fix in ELBO calculation +# - few more optimizations +# 03/10/19 02:27PM - speaker regularization coefficient Fb added +# import numpy as np from scipy.sparse import coo_matrix import scipy.linalg as spl -#import numexpr as ne # the dependency on this modul can be avoided by replacing -# # logsumexp_ne and exp_ne with logsumexp and np.exp +import numexpr as ne # the dependency on this modul can be avoided by replacing + # logsumexp_ne and exp_ne with logsumexp and np.exp -#[q sp Li] = -def VB_diarization(X, m, iE, w, V, sp=None, q=None, +#[gamma pi Li] = +def VB_diarization(X, m, invSigma, w, V, pi=None, gamma=None, maxSpeakers = 10, maxIters = 10, epsilon = 1e-4, loopProb = 0.99, statScale = 1.0, - alphaQInit = 1.0, downsample = None, VtiEV = None, ref=None, - plot=False, sparsityThr=0.001, llScale=1.0, minDur=1): + alphaQInit = 1.0, downsample = None, VtinvSigmaV = None, ref=None, + plot=False, sparsityThr=0.001, llScale=1.0, minDur=1, Fa=1.0, Fb=1.0): """ This a generalized version of speaker diarization described in: - Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, - Montreal, CRIM, May 2008. - - Kenny, P., Reynolds, D., and Castaldo, F. Diarization of Telephone - Conversations using Factor Analysis IEEE Journal of Selected Topics in Signal - Processing, December 2010. + Diez. M., Burget. L., Landini. F., Cernocky. J. + Analysis of Speaker Diarization based on Bayesian HMM with Eigenvoice Priors - The generalization introduced in this implementation lies in using an HMM - instead of the simple mixture model when modeling generation of segments - (or even frames) from speakers. 
HMM limits the probability of switching - between speakers when changing frames, which makes it possible to use - the model on frame-by-frame bases without any need to iterate between - 1) clustering speech segments and 2) re-segmentation (i.e. as it was done in - the paper above). + Variable names and equation numbers refer to those used the paper Inputs: X - T x D array, where columns are D dimensional feature vectors for T frames m - C x D array of GMM component means - iE - C x D array of GMM component inverse covariance matrix diagonals + invSigma - C x D array of GMM component inverse covariance matrix diagonals w - C dimensional column vector of GMM component weights V - R x C x D array of eigenvoices maxSpeakers - maximum number of speakers expected in the utterance maxIters - maximum number of algorithm iterations epsilon - stop iterating, if obj. fun. improvement is less than epsilon loopProb - probability of not switching speakers between frames - statScale - scale sufficient statiscits collected using UBM + statScale - deprecated, use Fa instead + Fa - scale sufficient statiscits collected using UBM + Fb - speaker regularization coefficient Fb (controls final # of speaker) llScale - scale UBM likelihood (i.e. llScale < 1.0 make atribution of frames to UBM componets more uncertain) sparsityThr - set occupations smaller that this threshold to 0.0 (saves memory as the posteriors are represented by sparse matrix) - alphaQInit - Dirichlet concentraion parameter for initializing q + alphaQInit - Dirichlet concentraion parameter for initializing gamma downsample - perform diarization on input downsampled by this factor - VtiEV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when - VtiEV is None. However, it can be pre-calculated using function - precalculate_VtiEV(V) and used across calls of VB_diarization. + VtinvSigmaV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when + VtinvSigmaV is None. However, it can be pre-calculated using function + precalculate_VtinvSigmaV(V) and used across calls of VB_diarization. minDur - minimum number of frames between speaker turns imposed by linear chains of HMM states corresponding to each speaker. All the states in a chain share the same output distribution @@ -80,192 +75,190 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None, plot - if set to True, plot per-frame speaker posteriors. Outputs: - q - S x T matrix of posteriors attribution each frame to one of S possible + gamma - S x T matrix of posteriors attribution each frame to one of S possible speakers, where S is given by opts.maxSpeakers - sp - S dimensional column vector of ML learned speaker priors. Ideally, these + pi - S dimensional column vector of ML learned speaker priors. Ideally, these should allow to estimate # of speaker in the utterance as the probabilities of the redundant speaker should converge to zero. - Li - values of auxiliary function (and DER and frame cross-entropy between q + Li - values of auxiliary function (and DER and frame cross-entropy between gamma and reference if 'ref' is provided) over iterations. """ - # The references to equations corresponds to the technical report: - # Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, - # Montreal, CRIM, May 2008. + # The references to equations corresponds to + # Diez. M., Burget. L., Landini. F., Cernocky. J. 
+ # Analysis of Speaker Diarization based on Bayesian HMM with Eigenvoice Priors D=X.shape[1] # feature dimensionality C=len(w) # number of mixture components R=V.shape[0] # subspace rank nframes=X.shape[0] - if VtiEV is None: - VtiEV = precalculate_VtiEV(V, iE) + if VtinvSigmaV is None: + VtinvSigmaV = precalculate_VtinvSigmaV(V, invSigma) V = V.reshape(V.shape[0],-1) - if sp is None: - sp = np.ones(maxSpeakers)/maxSpeakers + if pi is None: + pi = np.ones(maxSpeakers)/maxSpeakers else: - maxSpeakers = len(sp) + maxSpeakers = len(pi) - if q is None: - # initialize q from flat Dirichlet prior with concentrsaion parameter alphaQInit - q = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers)) - q = q / q.sum(1, keepdims=True) + if gamma is None: + # initialize gamma from flat Dirichlet prior with concentration parameter alphaQInit + gamma = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers)) + gamma = gamma / gamma.sum(1, keepdims=True) # calculate UBM mixture frame posteriors (i.e. per-frame zero order statistics) - ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi)) + ll = (X**2).dot(-0.5*invSigma.T) + X.dot(invSigma.T*m.T)-0.5*((invSigma * m**2 - np.log(invSigma)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi)) ll *= llScale - G = logsumexp(ll, axis=1) - NN = np.exp(ll - G[:,np.newaxis]) * statScale - NN[NN 0 and L - Li[-2][0] < epsilon: - if L - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!') + if ii > 0 and ELBO - Li[-2][0] < epsilon: + if ELBO - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!') break if downsample is not None: - #upsample resulting q to match number of frames in the input utterance - q = downsampler.T.dot(q) + # upsample resulting gamma to match number of frames in the input utterance + gamma = downsampler.T.dot(gamma) - return q, sp, Li + return gamma, pi, Li -def precalculate_VtiEV(V, iE): +def precalculate_VtinvSigmaV(V, invSigma): tril_ind = np.tril_indices(V.shape[0]) - VtiEV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype) + VtinvSigmaV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype) for c in range(V.shape[1]): - VtiEV[c,:] = np.dot(V[:,c,:]*iE[np.newaxis,c,:], V[:,c,:].T)[tril_ind] - return VtiEV + VtinvSigmaV[c,:] = np.dot(V[:,c,:]*invSigma[np.newaxis,c,:], V[:,c,:].T)[tril_ind] + return VtinvSigmaV -# Initialize q (per-frame speaker posteriors) from a reference +# Initialize gamma (per-frame speaker posteriors) from a reference # (vector of per-frame zero based integer speaker IDs) -def frame_labels2posterior_mx(labels, maxSpeakers): - #initialize from reference - #pmx = np.zeros((len(labels), labels.max()+1)) - pmx = np.zeros((len(labels), maxSpeakers)) +def frame_labels2posterior_mx(labels): + # initialize from reference + pmx = np.zeros((len(labels), labels.max()+1)) pmx[np.arange(len(labels)), labels] = 1 return pmx + # Calculates Diarization Error Rate (DER) or per-frame cross-entropy between -# reference (vector of per-frame zero based integer speaker IDs) and q (per-frame -# speaker posteriors). If expected=False, q is converted into hard labels before -# calculating DER. If expected=TRUE, posteriors in q are used to calculated -# "expected" DER. -def DER(q, ref, expected=True, xentropy=False): +# reference (vector of per-frame zero based integer speaker IDs) and gamma +# (per-frame speaker posteriors). If expected=False, gamma is converted into +# hard labels before calculating DER. 
If expected=TRUE, posteriors in gamma +# are used to calculate "expected" DER. +def DER(gamma, ref, expected=True, xentropy=False): from itertools import permutations if not expected: - # replce probabiities in q by zeros and ones - hard_labels = q.argmax(1) - q = np.zeros_like(q) - q[range(len(q)), hard_labels] = 1 + # replace probabiities in gamma by zeros and ones + hard_labels = gamma.argmax(1) + gamma = np.zeros_like(gamma) + gamma[range(len(gamma)), hard_labels] = 1 - err_mx = np.empty((ref.max()+1, q.shape[1])) + err_mx = np.empty((ref.max()+1, gamma.shape[1])) for s in range(err_mx.shape[0]): - tmpq = q[ref == s,:] - err_mx[s] = (-np.log(tmpq) if xentropy else tmpq).sum(0) + tmpgamma = gamma[ref == s,:] + err_mx[s] = (-np.log(tmpgamma) if xentropy else tmpgamma).sum(0) if err_mx.shape[0] < err_mx.shape[1]: err_mx = err_mx.T # try all alignments (permutations) of reference and detected speaker - #could be written in more efficient way using dynamic programing + # could be written in more efficient way using dynamic programing acc = [err_mx[perm[:err_mx.shape[1]], range(err_mx.shape[1])].sum() for perm in permutations(range(err_mx.shape[0]))] if xentropy: @@ -342,7 +335,7 @@ def forward_backward(lls, tr, ip): lfw[0] = lls[0] + np.log(ip) lbw[-1] = 0.0 - for ii in range(1,len(lls)): + for ii in range(1,len(lls)): lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1) for ii in reversed(range(len(lls)-1)): @@ -350,4 +343,4 @@ def forward_backward(lls, tr, ip): tll = logsumexp(lfw[-1]) sp = np.exp(lfw + lbw - tll) - return sp, tll, lfw, lbw + return sp, tll, lfw, lbw \ No newline at end of file diff --git a/egs/callhome_diarization/v1/diarization/kaldi_io.py b/egs/callhome_diarization/v1/diarization/kaldi_io.py new file mode 100644 index 00000000000..9a2d090c01b --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/kaldi_io.py @@ -0,0 +1,847 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Brno University of Technology (author: Karel Vesely) +# Licensed under the Apache License, Version 2.0 (the "License") + +from __future__ import print_function +from __future__ import division + +import numpy as np +import sys, os, re, gzip, struct + +################################################# +# Adding kaldi tools to shell path, + +# Select kaldi, +if not 'KALDI_ROOT' in os.environ: + # Default! To change run python with 'export KALDI_ROOT=/some_dir python' + sys.exit(1) + +# Add kaldi tools to path, +os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] + + +################################################# +# Define all custom exceptions, +class UnsupportedDataType(Exception): pass +class UnknownVectorHeader(Exception): pass +class UnknownMatrixHeader(Exception): pass + +class BadSampleSize(Exception): pass +class BadInputFormat(Exception): pass + +class SubprocessFailed(Exception): pass + +################################################# +# Data-type independent helper functions, + +def open_or_fd(file, mode='rb'): + """ fd = open_or_fd(file) + Open file, gzipped file, pipe, or forward the file-descriptor. 
+ Eventually seeks in the 'file' argument contains ':offset' suffix. + """ + offset = None + try: + # strip 'ark:' prefix from r{x,w}filename (optional), + if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): + (prefix,file) = file.split(':',1) + # separate offset from filename (optional), + if re.search(':[0-9]+$', file): + (file,offset) = file.rsplit(':',1) + # input pipe? + if file[-1] == '|': + fd = popen(file[:-1], 'rb') # custom, + # output pipe? + elif file[0] == '|': + fd = popen(file[1:], 'wb') # custom, + # is it gzipped? + elif file.split('.')[-1] == 'gz': + fd = gzip.open(file, mode) + # a normal file... + else: + fd = open(file, mode) + except TypeError: + # 'file' is opened file descriptor, + fd = file + # Eventually seek to offset, + if offset != None: fd.seek(int(offset)) + return fd + +# based on '/usr/local/lib/python3.6/os.py' +def popen(cmd, mode="rb"): + if not isinstance(cmd, str): + raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) + + import subprocess, io, threading + + # cleanup function for subprocesses, + def cleanup(proc, cmd): + ret = proc.wait() + if ret > 0: + raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) + return + + # text-mode, + if mode == "r": + proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=sys.stderr) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return io.TextIOWrapper(proc.stdout) + elif mode == "w": + proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stderr=sys.stderr) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return io.TextIOWrapper(proc.stdin) + # binary, + elif mode == "rb": + proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=sys.stderr) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return proc.stdout + elif mode == "wb": + proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stderr=sys.stderr) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return proc.stdin + # sanity, + else: + raise ValueError("invalid mode %s" % mode) + + +def read_key(fd): + """ [key] = read_key(fd) + Read the utterance-key from the opened ark/stream descriptor 'fd'. + """ + key = '' + while 1: + char = fd.read(1).decode("latin1") + if char == '' : break + if char == ' ' : break + key += char + key = key.strip() + if key == '': return None # end of file, + assert(re.match('^\S+$',key) != None) # check format (no whitespace!) + return key + + +################################################# +# Integer vectors (alignments, ...), + +def read_ali_ark(file_or_fd): + """ Alias to 'read_vec_int_ark()' """ + return read_vec_int_ark(file_or_fd) + +def read_vec_int_ark(file_or_fd): + """ generator(key,vec) = read_vec_int_ark(file_or_fd) + Create generator of (key,vector) tuples, which reads from the ark file/stream. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
+ + Read ark to a 'dictionary': + d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + ali = read_vec_int(fd) + yield key, ali + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_vec_int(file_or_fd): + """ [int-vec] = read_vec_int(file_or_fd) + Read kaldi integer vector, ascii or binary input, + """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode() + if binary == '\0B': # binary flag + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim + if vec_size == 0: + return np.array([], dtype='int32') + # Elements from int32 vector are sored in tuples: (sizeof(int32), value), + vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) + assert(vec[0]['size'] == 4) # int32 size, + ans = vec[:]['value'] # values are in 2nd column, + else: # ascii, + arr = (binary + fd.readline().decode()).strip().split() + try: + arr.remove('['); arr.remove(']') # optionally + except ValueError: + pass + ans = np.array(arr, dtype=int) + if fd is not file_or_fd : fd.close() # cleanup + return ans + +# Writing, +def write_vec_int(file_or_fd, v, key=''): + """ write_vec_int(f, v, key='') + Write a binary kaldi integer vector to filename or stream. + Arguments: + file_or_fd : filename or opened file descriptor for writing, + v : the vector to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the vector. + + Example of writing single vector: + kaldi_io.write_vec_int(filename, vec) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,vec in dict.iteritems(): + kaldi_io.write_vec_flt(f, vec, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # dim, + fd.write('\4'.encode()) # int32 type, + fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) + # data, + for i in range(len(v)): + fd.write('\4'.encode()) # int32 type, + fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# Float vectors (confidences, ivectors, ...), + +# Reading, +def read_vec_flt_scp(file_or_fd): + """ generator(key,mat) = read_vec_flt_scp(file_or_fd) + Returns generator of (key,vector) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the scp: + for key,vec in kaldi_io.read_vec_flt_scp(file): + ... + + Read scp to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + vec = read_vec_flt(rxfile) + yield key, vec + finally: + if fd is not file_or_fd : fd.close() + +def read_vec_flt_ark(file_or_fd): + """ generator(key,vec) = read_vec_flt_ark(file_or_fd) + Create generator of (key,vector) tuples, reading from an ark file/stream. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
+ + Read ark to a 'dictionary': + d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + ali = read_vec_flt(fd) + yield key, ali + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_vec_flt(file_or_fd): + """ [flt-vec] = read_vec_flt(file_or_fd) + Read kaldi float vector, ascii or binary input, + """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode() + if binary == '\0B': # binary flag + ans = _read_vec_flt_binary(fd) + else: # ascii, + arr = (binary + fd.readline().decode()).strip().split() + try: + arr.remove('['); arr.remove(']') # optionally + except ValueError: + pass + ans = np.array(arr, dtype=float) + if fd is not file_or_fd : fd.close() # cleanup + return ans + +def _read_vec_flt_binary(fd): + header = fd.read(3).decode() + if header == 'FV ' : sample_size = 4 # floats + elif header == 'DV ' : sample_size = 8 # doubles + else : raise UnknownVectorHeader("The header contained '%s'" % header) + assert (sample_size > 0) + # Dimension, + assert (fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim + if vec_size == 0: + return np.array([], dtype='float32') + # Read whole vector, + buf = fd.read(vec_size * sample_size) + if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + return ans + + +# Writing, +def write_vec_flt(file_or_fd, v, key=''): + """ write_vec_flt(f, v, key='') + Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. + Arguments: + file_or_fd : filename or opened file descriptor for writing, + v : the vector to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the vector. + + Example of writing single vector: + kaldi_io.write_vec_flt(filename, vec) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,vec in dict.iteritems(): + kaldi_io.write_vec_flt(f, vec, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # Data-type, + if v.dtype == 'float32': fd.write('FV '.encode()) + elif v.dtype == 'float64': fd.write('DV '.encode()) + else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) + # Dim, + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim + # Data, + fd.write(v.tobytes()) + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# Float matrices (features, transformations, ...), + +# Reading, +def read_mat_scp(file_or_fd): + """ generator(key,mat) = read_mat_scp(file_or_fd) + Returns generator of (key,matrix) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the scp: + for key,mat in kaldi_io.read_mat_scp(file): + ... 
+ + Read scp to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + mat = read_mat(rxfile) + yield key, mat + finally: + if fd is not file_or_fd : fd.close() + +def read_mat_ark(file_or_fd): + """ generator(key,mat) = read_mat_ark(file_or_fd) + Returns generator of (key,matrix) tuples, read from ark file/stream. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the ark: + for key,mat in kaldi_io.read_mat_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + mat = read_mat(fd) + yield key, mat + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_mat(file_or_fd): + """ [mat] = read_mat(file_or_fd) + Reads single kaldi matrix, supports ascii and binary. + file_or_fd : file, gzipped file, pipe or opened file descriptor. + """ + fd = open_or_fd(file_or_fd) + try: + binary = fd.read(2).decode() + if binary == '\0B' : + mat = _read_mat_binary(fd) + else: + assert(binary == ' [') + mat = _read_mat_ascii(fd) + finally: + if fd is not file_or_fd: fd.close() + return mat + +def _read_mat_binary(fd): + # Data type + header = fd.read(3).decode() + # 'CM', 'CM2', 'CM3' are possible values, + if header.startswith('CM'): return _read_compressed_mat(fd, header) + elif header.startswith('SM'): return _read_sparse_mat(fd, header) + elif header == 'FM ': sample_size = 4 # floats + elif header == 'DM ': sample_size = 8 # doubles + else: raise UnknownMatrixHeader("The header contained '%s'" % header) + assert(sample_size > 0) + # Dimensions + s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] + # Read whole matrix + buf = fd.read(rows * cols * sample_size) + if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + mat = np.reshape(vec,(rows,cols)) + return mat + +def _read_mat_ascii(fd): + rows = [] + while 1: + line = fd.readline().decode() + if (len(line) == 0) : raise BadInputFormat # eof, should not happen! + if len(line.strip()) == 0 : continue # skip empty line + arr = line.strip().split() + if arr[-1] != ']': + rows.append(np.array(arr,dtype='float32')) # not last line + else: + rows.append(np.array(arr[:-1],dtype='float32')) # last line + mat = np.vstack(rows) + return mat + + +def _read_compressed_mat(fd, format): + """ Read a compressed matrix, + see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h + methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), + """ + assert(format == 'CM ') # The formats CM2, CM3 are not supported... + + # Format of header 'struct', + global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, + per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) + + # Read global header, + globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] + + # The data is structed as [Colheader, ... , Colheader, Data, Data , .... 
] + # { cols }{ size } + col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) + col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) + data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, + + mat = np.zeros((cols,rows), dtype='float32') + p0 = col_headers[:, 0].reshape(-1, 1) + p25 = col_headers[:, 1].reshape(-1, 1) + p75 = col_headers[:, 2].reshape(-1, 1) + p100 = col_headers[:, 3].reshape(-1, 1) + mask_0_64 = (data <= 64) + mask_193_255 = (data > 192) + mask_65_192 = (~(mask_0_64 | mask_193_255)) + + mat += (p0 + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32) + mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) + mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) + + return mat.T # transpose! col-major -> row-major, + +def _read_sparse_mat(fd, format): + """ Read a sparse matrix, + """ + from scipy.sparse import csr_matrix + assert (format == 'SM ') + + # Mapping for matrix elements, + def read_sparse_vector(fd): + _format = fd.read(3).decode() + assert (_format == 'SV ') + _, dim = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] + _, num_elems = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] + col = [] + data = [] + for j in range(num_elems): + size = np.frombuffer(fd.read(1), dtype='int8', count=1)[0] + dtype = 'int32' if size == 4 else 'int64' + c = np.frombuffer(fd.read(size), dtype=dtype, count=1)[0] + size = np.frombuffer(fd.read(1), dtype='int8', count=1)[0] + dtype = 'float32' if size == 4 else 'float64' + d = np.frombuffer(fd.read(size), dtype=dtype, count=1)[0] + col.append(c) + data.append(d) + return col, data, dim + + _, num_rows = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] + + rows = [] + cols = [] + all_data = [] + max_dim = 0 + for i in range(num_rows): + col, data, dim = read_sparse_vector(fd) + rows += [i] * len(col) + cols += col + all_data += data + max_dim = max(dim, max_dim) + sparse_mat = csr_matrix((all_data, (rows, cols)), shape=(num_rows, max_dim)) + return sparse_mat + +# Writing, +def write_mat(file_or_fd, m, key=''): + """ write_mat(f, m, key='') + Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. + Arguments: + file_or_fd : filename of opened file descriptor for writing, + m : the matrix to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. + + Example of writing single matrix: + kaldi_io.write_mat(filename, mat) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,mat in dict.iteritems(): + kaldi_io.write_mat(f, mat, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! 
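+    # The writes below follow the standard Kaldi binary matrix layout: after
+    # the optional "key " and the '\0B' binary flag emitted above, a 3-byte
+    # type marker ('FM ' for float32, 'DM ' for float64), then for each
+    # dimension a '\04' size byte followed by a uint32, and finally the raw
+    # row-major data from m.tobytes().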
+ # Data-type, + if m.dtype == 'float32': fd.write('FM '.encode()) + elif m.dtype == 'float64': fd.write('DM '.encode()) + else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) + # Dims, + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols + # Data, + fd.write(m.tobytes()) + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) +# Corresponds to: vector > > +# - outer vector: time axis +# - inner vector: records at the time +# - tuple: int = index, float = value +# + +def read_cnet_ark(file_or_fd): + """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ + return read_post_ark(file_or_fd) + +def read_post_rxspec(file_): + """ adaptor to read both 'ark:...' and 'scp:...' inputs of posteriors, + """ + if file_.startswith("ark:"): + return read_post_ark(file_) + elif file_.startswith("scp:"): + return read_post_scp(file_) + else: + print("unsupported intput type: %s" % file_) + print("it should begint with 'ark:' or 'scp:'") + sys.exit(1) + +def read_post_scp(file_or_fd): + """ generator(key,post) = read_post_scp(file_or_fd) + Returns generator of (key,post) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the scp: + for key,post in kaldi_io.read_post_scp(file): + ... + + Read scp to a 'dictionary': + d = { key:post for key,post in kaldi_io.read_post_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + post = read_post(rxfile) + yield key, post + finally: + if fd is not file_or_fd : fd.close() + +def read_post_ark(file_or_fd): + """ generator(key,vec>) = read_post_ark(file) + Returns generator of (key,posterior) tuples, read from ark file. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. + + Iterate the ark: + for key,post in kaldi_io.read_post_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:post for key,post in kaldi_io.read_post_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + post = read_post(fd) + yield key, post + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_post(file_or_fd): + """ [post] = read_post(file_or_fd) + Reads single kaldi 'Posterior' in binary format. + + The 'Posterior' is C++ type 'vector > >', + the outer-vector is usually time axis, inner-vector are the records + at given time, and the tuple is composed of an 'index' (integer) + and a 'float-value'. The 'float-value' can represent a probability + or any other numeric value. + + Returns vector of vectors of tuples. 
+ """ + fd = open_or_fd(file_or_fd) + ans=[] + binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag + assert(fd.read(1).decode() == '\4'); # int-size + outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) + + # Loop over 'outer-vector', + for i in range(outer_vec_size): + assert(fd.read(1).decode() == '\4'); # int-size + inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) + data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) + assert(data[0]['size_idx'] == 4) + assert(data[0]['size_post'] == 4) + ans.append(data[['idx','post']].tolist()) + + if fd is not file_or_fd: fd.close() + return ans + + +################################################# +# Kaldi Confusion Network bin begin/end times, +# (kaldi stores CNs time info separately from the Posterior). +# + +def read_cntime_ark(file_or_fd): + """ generator(key,vec>) = read_cntime_ark(file_or_fd) + Returns generator of (key,cntime) tuples, read from ark file. + file_or_fd : file, gzipped file, pipe or opened file descriptor. + + Iterate the ark: + for key,time in kaldi_io.read_cntime_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:time for key,time in kaldi_io.read_post_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + cntime = read_cntime(fd) + yield key, cntime + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_cntime(file_or_fd): + """ [cntime] = read_cntime(file_or_fd) + Reads single kaldi 'Confusion Network time info', in binary format: + C++ type: vector >. + (begin/end times of bins at the confusion network). + + Binary layout is ' ...' + + file_or_fd : file, gzipped file, pipe or opened file descriptor. + + Returns vector of tuples. 
+ """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary + + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) + + data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) + assert(data[0]['size_beg'] == 4) + assert(data[0]['size_end'] == 4) + ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), + + if fd is not file_or_fd : fd.close() + return ans + + +################################################# +# Segments related, +# + +# Segments as 'Bool vectors' can be handy, +# - for 'superposing' the segmentations, +# - for frame-selection in Speaker-ID experiments, +def read_segments_as_bool_vec(segments_file): + """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) + using kaldi 'segments' file for 1 wav, format : ' ' + - t-beg, t-end is in seconds, + - assumed 100 frames/second, + """ + segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) + # Sanity checks, + assert(len(segs) > 0) # empty segmentation is an error, + assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, + # Convert time to frame-indexes, + start = np.rint([100 * rec[2] for rec in segs]).astype(int) + end = np.rint([100 * rec[3] for rec in segs]).astype(int) + # Taken from 'read_lab_to_bool_vec', htk.py, + frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], + np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) + assert np.sum(end-start) == np.sum(frms) + return frms + +########################################################## +# For reading archieves (eg files) into the feature format +# Not Fully Tested +########################################################## +def read_token(fd, expected_token=None): + token = '' + while True: + char = fd.read(1).decode() + if char == '': + break + if char == ' ': + break + token += char + if expected_token is not None: + assert token == expected_token + return token + + +def read_index_vector(fd): + def read_index(fd, prev_index=None): + c = np.frombuffer(fd.read(1), dtype='int8', count=1)[0] + if prev_index is None: + if abs(c) < 125: + n = x = 0 + t = int(c) + else: + assert c == 127 + _, n, _, t, _, x = np.frombuffer(fd.read(15), dtype='int8,int32,int8,int32,int8,int32', count=1)[0] + else: + if abs(c) < 125: + n, t, x = prev_index[0], prev_index[1] + c, prev_index[2] + else: + assert c == 127 + _, n, _, t, _, x = np.frombuffer(fd.read(15), dtype='int8,int32,int8,int32,int8,int32', count=1)[0] + return n, t, x + + read_token(fd, "") + _, size = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] + prev_index = None + for i in range(size): + prev_index = read_index(fd, prev_index) + +def read_egs_ark(file_or_fd): + """ + THERE MAY BE SOME BUGS ! + generator(key,mat) = read_egs_ark(file_or_fd) + Returns generator of (key, matrix) tuples, read from ark file/stream. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the ark: + for key,mat in kaldi_io.read_egs_ark(file): + ... 
+ + Read ark to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_egs_ark(file) } + """ + fd = kaldi_io.open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + # print(key) + binary = fd.read(2).decode() + assert binary == '\0B' + read_token(fd, "") + read_token(fd, "") + _, examples_count = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] + # assert examples_count == 2 + read_token(fd, "") + read_token(fd, "input") + read_index_vector(fd) + mat = kaldi_io._read_mat_binary(fd) + read_token(fd, "") + # output + read_token(fd, "") + read_token(fd, "output") + read_index_vector(fd) + sparse_lab = kaldi_io._read_mat_binary(fd) + read_token(fd, "") + read_token(fd, "") + yield key, mat + key = read_token(fd) + + finally: + if fd is not file_or_fd : fd.close() +########################################################## + + + +################################################# +# Following code added by Lukas Burget + +def _read_vec_binary(fd): + # Data type, + type = fd.read(3) + if type == b'FV ': sample_size = 4 # floats + if type == b'DV ': sample_size = 8 # doubles + assert(sample_size > 0) + # Dimension, + assert(fd.read(1) == b'\4'); # int-size + vec_size = struct.unpack(' ') + plda_mean = _read_vec_binary(fd) + plda_trans = _read_mat_binary(fd) + plda_psi = _read_vec_binary(fd) + else: + assert(binary+fd.read(5) == b' ') + #plda_mean = _read_vec_ascii(fd, binary) + plda_mean = np.array(fd.readline().strip(' \n[]').split(), dtype=float) + assert(fd.read(2) == b' [') + plda_trans = _read_mat_ascii(fd) + plda_psi = np.array(fd.readline().strip(' \n[]').split(), dtype=float) + assert(fd.read(8) == b' ') + finally: + if fd is not file_or_fd: fd.close() + return plda_mean, plda_trans, plda_psi \ No newline at end of file diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py new file mode 100644 index 00000000000..2907cc2d114 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# Copyright 2020 Johns Hopkins University (Author: Desh Raj) +# Apache 2.0 + +# This script is based on the Bayesian HMM-based xvector clustering +# code released by BUTSpeech at: https://github.com/BUTSpeechFIT/VBx. +# Note that this assumes that the provided labels are for a single +# recording. So this should be called from a script such as +# vb_hmm_xvector.sh which can divide all labels into per recording +# labels. + +import sys, argparse, struct +import numpy as np +import itertools +import kaldi_io + +from scipy.special import softmax + +import VB_diarization + +########### HELPER FUNCTIONS ##################################### + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script performs Bayesian HMM-based + clustering of x-vectors for one recording""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--init-smoothing", type=float, default=10, + help="AHC produces hard assignments of x-vetors to speakers." + " These are smoothed to soft assignments as the initialization" + " for VB-HMM. This parameter controls the amount of smoothing." + " Not so important, high value (e.g. 
10) is OK => keeping hard assigment") + parser.add_argument("--loop-prob", type=float, default=0.80, + help="probability of not switching speakers between frames") + parser.add_argument("--fa", type=float, default=0.4, + help="scale sufficient statistics collected using UBM") + parser.add_argument("--fb", type=float, default=11, + help="speaker regularization coefficient Fb (controls final # of speaker)") + parser.add_argument("xvector_ark_file", type=str, + help="Ark file containing xvectors for all subsegments") + parser.add_argument("plda", type=str, + help="path to PLDA model") + parser.add_argument("input_label_file", type=str, + help="path of input label file") + parser.add_argument("output_label_file", type=str, + help="path of output label file") + args = parser.parse_args() + return args + +def read_labels_file(label_file): + segments = [] + labels = [] + with open(label_file, 'r') as f: + for line in f.readlines(): + segment, label = line.strip().split() + segments.append(segment) + labels.append(int(label)) + return segments, labels + +def write_labels_file(seg2label, out_file): + f = open(out_file, 'w') + for seg in sorted(seg2label.keys()): + f.write("{} {}\n".format(seg, seg2label[seg])) + f.close() + return + +def read_args(args): + segments, labels = read_labels_file(args.input_label_file) + xvec_all = dict(kaldi_io.read_vec_flt_ark(args.xvector_ark_file)) + xvectors = [] + for segment in segments: + xvectors.append(xvec_all[segment]) + _, _, plda_psi = kaldi_io.read_plda(args.plda) + return xvectors, segments, labels, plda_psi + + +################################################################### + +def vb_hmm(segments, in_labels, xvectors, plda_psi, init_smoothing, loop_prob, fa, fb): + x = np.array(xvectors) + dim = x.shape[1] + + # Smooth the hard labels obtained from AHC to soft assignments of x-vectors to speakers + q_init = np.zeros((len(in_labels), np.max(in_labels)+1)) + q_init[range(len(in_labels)), in_labels] = 1.0 + q_init = softmax(q_init*init_smoothing, axis=1) + + # Prepare model for VB-HMM clustering + ubmWeights = np.array([1.0]) + ubmMeans = np.zeros((1,dim)) + invSigma= np.ones((1,dim)) + V=np.diag(np.sqrt(plda_psi[:dim]))[:,np.newaxis,:] + + # Use VB-HMM for x-vector clustering. Instead of i-vector extractor model, we use PLDA + # => GMM with only 1 component, V derived across-class covariance, and invSigma is inverse + # within-class covariance (i.e. identity) + q, _, _ = VB_diarization.VB_diarization(x, ubmMeans, invSigma, ubmWeights, V, pi=None, + gamma=q_init, maxSpeakers=q_init.shape[1], maxIters=40, epsilon=1e-6, loopProb=loop_prob, + Fa=fa, Fb=fb) + + labels = np.unique(q.argmax(1), return_inverse=True)[1] + + return {seg:label for seg,label in zip(segments,labels)} + +def main(): + args = get_args() + xvectors, segments, labels, plda_psi = read_args(args) + + seg2label_vb = vb_hmm(segments, labels, xvectors, plda_psi, args.init_smoothing, + args.loop_prob, args.fa, args.fb) + write_labels_file(seg2label_vb, args.output_label_file) + +if __name__=="__main__": + main() + diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh new file mode 100755 index 00000000000..0512da39df5 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +# Copyright 2020 Desh Raj +# Apache 2.0. + +# This script performs Bayesian HMM on top of labels produced +# by a first-pass AHC clustering. 
See https://arxiv.org/abs/1910.08847 +# for details about the model. + +# Begin configuration section. +cmd="run.pl" +stage=0 +nj=10 +cleanup=true +rttm_channel=0 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/ exp/xvectors_dev exp/xvector_nnet_1a/plda" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --stage # To control partial reruns" + echo " --cleanup # If true, remove temporary files" + exit 1; +fi + +dir=$1 +xvec_dir=$2 +plda=$3 + +mkdir -p $dir/tmp + +for f in $dir/labels ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# check if kaldi_io and numexpr are installed +result=`python3 -c "\ +try: + import kaldi_io, numexpr + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "0" ]; then + echo "Installing kaldi_io and numexpr" + python3 -m pip install kaldi_io + python3 -m pip install numexpr +fi + +if [ $stage -le 0 ]; then + # Mean subtraction (If original x-vectors are high-dim, e.g. 512, you should + # consider also applying LDA to reduce dimensionality to, say, 200) + $cmd $xvec_dir/log/transform.log \ + ivector-subtract-global-mean scp:$xvec_dir/xvector.scp ark:$xvec_dir/xvector_norm.ark +fi + +echo -e "Performing bayesian HMM based x-vector clustering..\n" +# making a shell script for each job +for n in `seq $nj`; do + cat <<-EOF > $dir/tmp/vb_hmm.$n.sh + python3 diarization/vb_hmm_xvector.py \ + --loop-prob 0.85 --fa 0.2 --fb 1 \ + $xvec_dir/xvector_norm.ark $plda $dir/labels.$n $dir/labels.vb.$n +EOF +done + +chmod a+x $dir/tmp/vb_hmm.*.sh +$cmd JOB=1:$nj $dir/log/vb_hmm.JOB.log \ + $dir/tmp/vb_hmm.JOB.sh + +if [ $stage -le 1 ]; then + echo "$0: combining labels" + for j in $(seq $nj); do cat $dir/labels.vb.$j; done > $dir/labels.vb || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing RTTM" + diarization/make_rttm.py --rttm-channel $rttm_channel $xvec_dir/plda_scores/segments $dir/labels.vb $dir/rttm.vb || exit 1; +fi + +if $cleanup ; then + rm -r $dir/tmp || exit 1; +fi diff --git a/egs/libri_css/s5_mono/local/decode.sh b/egs/libri_css/s5_mono/local/decode.sh index a0d65d47bc8..3778003f3e1 100755 --- a/egs/libri_css/s5_mono/local/decode.sh +++ b/egs/libri_css/s5_mono/local/decode.sh @@ -13,7 +13,7 @@ stage=0 score_sad=true diarizer_stage=0 decode_diarize_stage=0 -decode_oracle_stage=0 +decode_oracle_stage=2 score_stage=0 nnet3_affix=_cleaned # affix for the chain directory name affix=1d # affix for the TDNN directory name @@ -111,7 +111,7 @@ if [ $stage -le 3 ]; then [ ! 
-d exp/xvector_nnet_1a ] && ./local/download_diarizer.sh - local/diarize.sh --nj $diar_nj --cmd "$train_cmd" --stage $diarizer_stage \ + local/diarize_bhmm.sh --nj $diar_nj --cmd "$train_cmd" --stage $diarizer_stage \ --ref-rttm $ref_rttm \ exp/xvector_nnet_1a \ data/${datadir} \ @@ -127,7 +127,7 @@ if [ $stage -le 4 ]; then asr_nj=$(wc -l < "data/$datadir/wav.scp") local/decode_diarized.sh --nj $asr_nj --cmd "$decode_cmd" --stage $decode_diarize_stage \ --lm-suffix "_tgsmall" \ - exp/${datadir}_diarization data/$datadir data/lang_nosp_test_tgsmall \ + exp/${datadir}_diarization/rttm.vb data/$datadir data/lang_test_tgsmall \ exp/chain${nnet3_affix}/tdnn_${affix}_sp exp/nnet3${nnet3_affix} \ data/${datadir}_diarized || exit 1 done @@ -163,7 +163,7 @@ if $rnnlm_rescore; then rnnlm/lmrescore$pruned.sh \ --cmd "$decode_cmd --mem 8G" \ --weight 0.45 --max-ngram-order $ngram_order \ - data/lang_nosp_test_tgsmall $rnnlm_dir \ + data/lang_test_tgsmall $rnnlm_dir \ data/${decode_set}_diarized_hires ${decode_dir} \ ${ac_model_dir}/decode_${decode_set}_diarized_2stage_rescore done @@ -207,7 +207,7 @@ fi if [ $stage -le 9 ]; then local/decode_oracle.sh --stage $decode_oracle_stage \ --affix $affix \ - --lang-dir data/lang_nosp_test_tgsmall \ + --lang-dir data/lang_test_tgsmall \ --lm-suffix "_tgsmall" \ --rnnlm-rescore $rnnlm_rescore \ --test_sets "$test_sets" diff --git a/egs/libri_css/s5_mono/local/decode_diarized.sh b/egs/libri_css/s5_mono/local/decode_diarized.sh index b81515f22b4..6eda74506c5 100755 --- a/egs/libri_css/s5_mono/local/decode_diarized.sh +++ b/egs/libri_css/s5_mono/local/decode_diarized.sh @@ -15,7 +15,7 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . utils/parse_options.sh || exit 1; if [ $# != 6 ]; then - echo "Usage: $0 " + echo "Usage: $0 " echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain/tdnn_1a \ exp/nnet3_cleaned data/dev_diarized" echo "Options: " @@ -24,14 +24,14 @@ if [ $# != 6 ]; then exit 1; fi -rttm_dir=$1 +rttm=$1 data_in=$2 lang_dir=$3 asr_model_dir=$4 ivector_extractor=$5 out_dir=$6 -for f in $rttm_dir/rttm $data_in/wav.scp $data_in/text.bak \ +for f in $rttm $data_in/wav.scp $data_in/text.bak \ $lang_dir/L.fst $asr_model_dir/graph${lm_suffix}/HCLG.fst \ $asr_model_dir/final.mdl; do [ ! -f $f ] && echo "$0: No such file $f" && exit 1; @@ -46,8 +46,8 @@ fi if [ $stage -le 1 ]; then echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel " - local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm \ - <(awk '{print $2" "$2" "$3}' $rttm_dir/rttm |sort -u) \ + local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm \ + <(awk '{print $2" "$2" "$3}' $rttm |sort -u) \ ${out_dir}_hires/utt2spk ${out_dir}_hires/segments utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt diff --git a/egs/libri_css/s5_mono/local/diarize_bhmm.sh b/egs/libri_css/s5_mono/local/diarize_bhmm.sh new file mode 100755 index 00000000000..30ae2def3f6 --- /dev/null +++ b/egs/libri_css/s5_mono/local/diarize_bhmm.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# Copyright 2019 David Snyder +# 2020 Desh Raj + +# Apache 2.0. +# +# This script takes an input directory that has a segments file (and +# a feats.scp file), and performs diarization on it, using BUTs +# Bayesian HMM-based diarization model. A first-pass of AHC is performed +# first followed by VB-HMM. 
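+#
+# Stage overview (see the numbered blocks below): CMN feature preparation,
+# x-vector extraction, PLDA scoring of all segment pairs, AHC clustering,
+# VB-HMM resegmentation of the AHC labels, and DER scoring with dscore.
+#
+# A possible invocation, with illustrative paths (mirroring how decode.sh in
+# this patch calls it; the usage message below lists the options):
+#   local/diarize_bhmm.sh --nj 20 --cmd "run.pl" \
+#     --ref-rttm data/dev/rttm.annotation \
+#     exp/xvector_nnet_1a data/dev exp/dev_diarization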
+ +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= +score_overlaps_only=true + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref_rttm ./local/dev_rttm # the location of the reference RTTM file" + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/segments $model_dir/plda \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 1 ]; then + echo "$0: computing features for x-vector extractor" + utils/fix_data_dir.sh data/${name} + rm -rf data/${name}_cmn + local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + cp data/$name/segments exp/${name}_cmn/ + utils/fix_data_dir.sh data/${name}_cmn +fi + +if [ $stage -le 2 ]; then + echo "$0: extracting x-vectors for all segments" + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \ + --nj $nj --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $model_dir \ + data/${name}_cmn $out_dir/xvectors_${name} +fi + +# Perform PLDA scoring +if [ $stage -le 3 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. + echo "$0: performing PLDA scoring between all pairs of x-vectors" + diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \ + --target-energy 0.5 \ + --nj $nj $model_dir/ $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/plda_scores +fi + +if [ $stage -le 4 ]; then + echo "$0: performing clustering using PLDA scores (threshold tuned on dev)" + diarization/cluster.sh --cmd "$cmd" --nj $nj \ + --rttm-channel 1 --threshold 0.4 \ + $out_dir/xvectors_${name}/plda_scores $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" +fi + +if [ $stage -le 5 ]; then + echo "$0: performing VB-HMM on top of first-pass AHC" + diarization/vb_hmm_xvector.sh --nj $nj --rttm-channel 1 \ + $out_dir $out_dir/xvectors_${name} $model_dir/plda +fi + +hyp_rttm=${out_dir}/rttm.vb + +# For scoring the diarization system, we use the same tool that was +# used in the DIHARD II challenge. This is available at: +# https://github.com/nryant/dscore +if [ $stage -le 6 ]; then + echo "Diarization results for "${name} + if ! [ -d dscore ]; then + git clone https://github.com/desh2608/dscore.git -b libricss --single-branch || exit 1; + cd dscore + python -m pip install --user -r requirements.txt + cd .. + fi + + # Create per condition ref and hyp RTTM files for scoring per condition + mkdir -p tmp + conditions="0L 0S OV10 OV20 OV30 OV40" + cp $ref_rttm tmp/ref.all + cp $hyp_rttm tmp/hyp.all + for rttm in ref hyp; do + for cond in $conditions; do + cat tmp/$rttm.all | grep $cond > tmp/$rttm.$cond + done + done + + echo "Scoring all regions..." + for cond in $conditions 'all'; do + echo -n "Condition: $cond: " + ref_rttm_path=$(readlink -f tmp/ref.$cond) + hyp_rttm_path=$(readlink -f tmp/hyp.$cond) + cd dscore && python score.py -r $ref_rttm_path -s $hyp_rttm_path --global_only && cd .. || exit 1; + done + + # We also score overlapping regions only + if [ $score_overlaps_only == "true" ]; then + echo "Scoring overlapping regions..." 
+ for cond in $conditions 'all'; do + echo -n "Condition: $cond: " + ref_rttm_path=$(readlink -f tmp/ref.$cond) + hyp_rttm_path=$(readlink -f tmp/hyp.$cond) + cd dscore && python score.py -r $ref_rttm_path -s $hyp_rttm_path --overlap_only --global_only && cd .. || exit 1; + done + fi + + rm -r tmp +fi From 8c9047b5295d089a616f5032ca89e3910de34c0e Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 8 Jun 2020 21:41:13 -0400 Subject: [PATCH 2/5] minor fix --- egs/libri_css/s5_mono/local/decode.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/libri_css/s5_mono/local/decode.sh b/egs/libri_css/s5_mono/local/decode.sh index 3778003f3e1..6c1fe57ff5a 100755 --- a/egs/libri_css/s5_mono/local/decode.sh +++ b/egs/libri_css/s5_mono/local/decode.sh @@ -13,7 +13,7 @@ stage=0 score_sad=true diarizer_stage=0 decode_diarize_stage=0 -decode_oracle_stage=2 +decode_oracle_stage=0 score_stage=0 nnet3_affix=_cleaned # affix for the chain directory name affix=1d # affix for the TDNN directory name From d75aa0202b0b95de6571e4d390f52e0ef6bc62f4 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Tue, 9 Jun 2020 09:09:06 -0400 Subject: [PATCH 3/5] added pip install for kaldi_io vbx --- .../v1/diarization/kaldi_io.py | 847 ------------------ .../v1/diarization/vb_hmm_xvector.sh | 8 +- 2 files changed, 5 insertions(+), 850 deletions(-) delete mode 100644 egs/callhome_diarization/v1/diarization/kaldi_io.py diff --git a/egs/callhome_diarization/v1/diarization/kaldi_io.py b/egs/callhome_diarization/v1/diarization/kaldi_io.py deleted file mode 100644 index 9a2d090c01b..00000000000 --- a/egs/callhome_diarization/v1/diarization/kaldi_io.py +++ /dev/null @@ -1,847 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2019 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -from __future__ import print_function -from __future__ import division - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! To change run python with 'export KALDI_ROOT=/some_dir python' - sys.exit(1) - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. 
- """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.6/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=sys.stderr) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stderr=sys.stderr) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=sys.stderr) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stderr=sys.stderr) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. - """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
- - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - if vec_size == 0: - return np.array([], dtype='int32') - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
- - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - ans = _read_vec_flt_binary(fd) - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -def _read_vec_flt_binary(fd): - header = fd.read(3).decode() - if header == 'FV ' : sample_size = 4 # floats - elif header == 'DV ' : sample_size = 8 # doubles - else : raise UnknownVectorHeader("The header contained '%s'" % header) - assert (sample_size > 0) - # Dimension, - assert (fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - if vec_size == 0: - return np.array([], dtype='float32') - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... 
- - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header.startswith('SM'): return _read_sparse_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! - if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... 
] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.zeros((cols,rows), dtype='float32') - p0 = col_headers[:, 0].reshape(-1, 1) - p25 = col_headers[:, 1].reshape(-1, 1) - p75 = col_headers[:, 2].reshape(-1, 1) - p100 = col_headers[:, 3].reshape(-1, 1) - mask_0_64 = (data <= 64) - mask_193_255 = (data > 192) - mask_65_192 = (~(mask_0_64 | mask_193_255)) - - mat += (p0 + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32) - mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) - mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) - - return mat.T # transpose! col-major -> row-major, - -def _read_sparse_mat(fd, format): - """ Read a sparse matrix, - """ - from scipy.sparse import csr_matrix - assert (format == 'SM ') - - # Mapping for matrix elements, - def read_sparse_vector(fd): - _format = fd.read(3).decode() - assert (_format == 'SV ') - _, dim = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] - _, num_elems = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] - col = [] - data = [] - for j in range(num_elems): - size = np.frombuffer(fd.read(1), dtype='int8', count=1)[0] - dtype = 'int32' if size == 4 else 'int64' - c = np.frombuffer(fd.read(size), dtype=dtype, count=1)[0] - size = np.frombuffer(fd.read(1), dtype='int8', count=1)[0] - dtype = 'float32' if size == 4 else 'float64' - d = np.frombuffer(fd.read(size), dtype=dtype, count=1)[0] - col.append(c) - data.append(d) - return col, data, dim - - _, num_rows = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] - - rows = [] - cols = [] - all_data = [] - max_dim = 0 - for i in range(num_rows): - col, data, dim = read_sparse_vector(fd) - rows += [i] * len(col) - cols += col - all_data += data - max_dim = max(dim, max_dim) - sparse_mat = csr_matrix((all_data, (rows, cols)), shape=(num_rows, max_dim)) - return sparse_mat - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. - - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_rxspec(file_): - """ adaptor to read both 'ark:...' and 'scp:...' inputs of posteriors, - """ - if file_.startswith("ark:"): - return read_post_ark(file_) - elif file_.startswith("scp:"): - return read_post_scp(file_) - else: - print("unsupported intput type: %s" % file_) - print("it should begint with 'ark:' or 'scp:'") - sys.exit(1) - -def read_post_scp(file_or_fd): - """ generator(key,post) = read_post_scp(file_or_fd) - Returns generator of (key,post) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,post in kaldi_io.read_post_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - post = read_post(rxfile) - yield key, post - finally: - if fd is not file_or_fd : fd.close() - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. 
- """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). -# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. 
- """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - -########################################################## -# For reading archieves (eg files) into the feature format -# Not Fully Tested -########################################################## -def read_token(fd, expected_token=None): - token = '' - while True: - char = fd.read(1).decode() - if char == '': - break - if char == ' ': - break - token += char - if expected_token is not None: - assert token == expected_token - return token - - -def read_index_vector(fd): - def read_index(fd, prev_index=None): - c = np.frombuffer(fd.read(1), dtype='int8', count=1)[0] - if prev_index is None: - if abs(c) < 125: - n = x = 0 - t = int(c) - else: - assert c == 127 - _, n, _, t, _, x = np.frombuffer(fd.read(15), dtype='int8,int32,int8,int32,int8,int32', count=1)[0] - else: - if abs(c) < 125: - n, t, x = prev_index[0], prev_index[1] + c, prev_index[2] - else: - assert c == 127 - _, n, _, t, _, x = np.frombuffer(fd.read(15), dtype='int8,int32,int8,int32,int8,int32', count=1)[0] - return n, t, x - - read_token(fd, "") - _, size = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] - prev_index = None - for i in range(size): - prev_index = read_index(fd, prev_index) - -def read_egs_ark(file_or_fd): - """ - THERE MAY BE SOME BUGS ! - generator(key,mat) = read_egs_ark(file_or_fd) - Returns generator of (key, matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_egs_ark(file): - ... 
- - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_egs_ark(file) } - """ - fd = kaldi_io.open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - # print(key) - binary = fd.read(2).decode() - assert binary == '\0B' - read_token(fd, "") - read_token(fd, "") - _, examples_count = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] - # assert examples_count == 2 - read_token(fd, "") - read_token(fd, "input") - read_index_vector(fd) - mat = kaldi_io._read_mat_binary(fd) - read_token(fd, "") - # output - read_token(fd, "") - read_token(fd, "output") - read_index_vector(fd) - sparse_lab = kaldi_io._read_mat_binary(fd) - read_token(fd, "") - read_token(fd, "") - yield key, mat - key = read_token(fd) - - finally: - if fd is not file_or_fd : fd.close() -########################################################## - - - -################################################# -# Following code added by Lukas Burget - -def _read_vec_binary(fd): - # Data type, - type = fd.read(3) - if type == b'FV ': sample_size = 4 # floats - if type == b'DV ': sample_size = 8 # doubles - assert(sample_size > 0) - # Dimension, - assert(fd.read(1) == b'\4'); # int-size - vec_size = struct.unpack(' ') - plda_mean = _read_vec_binary(fd) - plda_trans = _read_mat_binary(fd) - plda_psi = _read_vec_binary(fd) - else: - assert(binary+fd.read(5) == b' ') - #plda_mean = _read_vec_ascii(fd, binary) - plda_mean = np.array(fd.readline().strip(' \n[]').split(), dtype=float) - assert(fd.read(2) == b' [') - plda_trans = _read_mat_ascii(fd) - plda_psi = np.array(fd.readline().strip(' \n[]').split(), dtype=float) - assert(fd.read(8) == b' ') - finally: - if fd is not file_or_fd: fd.close() - return plda_mean, plda_trans, plda_psi \ No newline at end of file diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh index 0512da39df5..78d63c61cdb 100755 --- a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh +++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh @@ -43,17 +43,19 @@ for f in $dir/labels ; do [ ! -f $f ] && echo "No such file $f" && exit 1; done -# check if kaldi_io and numexpr are installed +# check if numexpr is installed. 
+# Also install a modified version of kaldi_io with extra functions
+# needed to read the PLDA file
 result=`python3 -c "\
 try:
     import kaldi_io, numexpr
-    print('1')
+    print (int(hasattr(kaldi_io, 'read_plda')))
 except ImportError:
     print('0')"`
 
 if [ "$result" == "0" ]; then
   echo "Installing kaldi_io and numexpr"
-  python3 -m pip install kaldi_io
+  python3 -m pip install git+https://github.com/desh2608/kaldi-io-for-python.git@vbx
   python3 -m pip install numexpr
 fi

From 782d26f6ed5c46fa644d3133c5246be62e764121 Mon Sep 17 00:00:00 2001
From: Desh Raj 
Date: Tue, 9 Jun 2020 10:02:31 -0400
Subject: [PATCH 4/5] added comment for hyperparameter tuning for vbx

---
 .../v1/diarization/vb_hmm_xvector.sh          | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh
index 78d63c61cdb..70cd245e90a 100755
--- a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh
+++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh
@@ -13,6 +13,15 @@ stage=0
 nj=10
 cleanup=true
 rttm_channel=0
+
+# The hyperparameters used here are taken from the DIHARD
+# optimal hyperparameter values reported in:
+# http://www.fit.vutbr.cz/research/groups/speech/publi/2019/diez_IEEE_ACM_2019_08910412.pdf
+# These may require tuning for different datasets.
+loop_prob=0.85
+fa=0.2
+fb=1
+
 # End configuration section.
 
 echo "$0 $@" # Print the command line for logging
@@ -71,7 +80,7 @@ echo -e "Performing bayesian HMM based x-vector clustering..\n"
 for n in `seq $nj`; do
   cat <<-EOF > $dir/tmp/vb_hmm.$n.sh
   python3 diarization/vb_hmm_xvector.py \
-    --loop-prob 0.85 --fa 0.2 --fb 1 \
+    --loop-prob $loop_prob --fa $fa --fb $fb \
    $xvec_dir/xvector_norm.ark $plda $dir/labels.$n $dir/labels.vb.$n
 EOF
 done

From ee69d970710fc82a2d36b2bedc07c1080f325b5d Mon Sep 17 00:00:00 2001
From: Desh Raj 
Date: Tue, 9 Jun 2020 13:11:00 -0400
Subject: [PATCH 5/5] bug fix for PLDA download

---
 egs/libri_css/s5_mono/local/download_diarizer.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/libri_css/s5_mono/local/download_diarizer.sh b/egs/libri_css/s5_mono/local/download_diarizer.sh
index 959e8729215..753475ee3fd 100755
--- a/egs/libri_css/s5_mono/local/download_diarizer.sh
+++ b/egs/libri_css/s5_mono/local/download_diarizer.sh
@@ -31,7 +31,8 @@ tar -xvzf 0012_diarization_v1.tar.gz
 rm -f 0012_diarization_v1.tar.gz
 
 # Download PLDA model trained on augmented Librispeech data
-wget https://desh2608.github.io/static/files/jsalt/plda 0012_diarization_v1/exp/xvector_nnet_1a/
+rm 0012_diarization_v1/exp/xvector_nnet_1a/plda
+wget https://desh2608.github.io/static/files/jsalt/plda -P 0012_diarization_v1/exp/xvector_nnet_1a/
 cd ../..
 cp -r ${dir}/0012_diarization_v1/exp .
 rm -rf ${dir}
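
Note on the VB-HMM resegmentation stage: vb_hmm_xvector.sh (PATCH 1 and 4) wraps vb_hmm_xvector.py, which refines the first-pass AHC labels by running VB_diarization() on PLDA-transformed x-vectors with the loop_prob/fa/fb values exposed above. The snippet below is only a rough sketch of that call, not the vb_hmm_xvector.py added by this series: the read_plda() helper comes from the forked kaldi_io installed in PATCH 3, while the PLDA-space projection, the diagonal eigenvoice construction, the placeholder paths, and the toy label array are assumptions made purely for illustration.

# Illustrative sketch of VBx-style resegmentation of x-vectors (NOT the actual
# vb_hmm_xvector.py from this patch series). Paths and the label array are placeholders.
import numpy as np
import kaldi_io          # forked version with read_plda()
import VB_diarization    # egs/callhome_diarization/v1/diarization/VB_diarization.py

loop_prob, fa, fb = 0.85, 0.2, 1.0   # DIHARD-tuned defaults from vb_hmm_xvector.sh

# Placeholder inputs -- in the real script these come from the command line.
plda_file = "exp/xvector_nnet_1a/plda"
xvec_ark = "exp/dev_diarization/xvectors_dev/xvector_norm.ark"
labels = np.array([0, 0, 1, 1, 0])   # first-pass AHC cluster id per x-vector (toy example)

# Read x-vectors and the PLDA model (mean, diagonalizing transform, psi).
xvecs = np.array([vec for _, vec in kaldi_io.read_vec_flt_ark(xvec_ark)])
plda_mean, plda_trans, plda_psi = kaldi_io.read_plda(plda_file)

# Assumed preparation: project x-vectors into the PLDA latent space, where the
# within-class covariance is identity and the across-class covariance is diag(psi).
X = (xvecs - plda_mean).dot(plda_trans.T)
dim = X.shape[1]

# Single "UBM component" with identity covariance; eigenvoices carry sqrt(psi).
m = np.zeros((1, dim))
invSigma = np.ones((1, dim))
w = np.ones(1)
V = np.diag(np.sqrt(plda_psi))[:, np.newaxis, :]   # R x C x D with C = 1

# One-hot initialization of the speaker posteriors from the AHC labels
# (assuming posteriors are stored as frames/segments x speakers).
gamma_init = np.zeros((len(labels), labels.max() + 1))
gamma_init[np.arange(len(labels)), labels] = 1.0

gamma, pi, Li = VB_diarization.VB_diarization(
    X, m, invSigma, w, V, pi=None, gamma=gamma_init,
    maxSpeakers=gamma_init.shape[1], maxIters=10,
    loopProb=loop_prob, Fa=fa, Fb=fb)

new_labels = gamma.argmax(axis=1)   # refined speaker label per x-vector

Since loop_prob, fa and fb are declared above the "# End configuration section." marker, they should also be overridable from the command line in the usual Kaldi way (e.g. diarization/vb_hmm_xvector.sh --loop-prob 0.9 --fa 0.3 --fb 1 ...), assuming the script sources utils/parse_options.sh as its siblings do.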