diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py
index 62676d64510..2b57bd12862 100755
--- a/egs/callhome_diarization/v1/diarization/VB_diarization.py
+++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py
@@ -1,5 +1,6 @@
-#!/usr/bin/env python3
-# Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz)
+#!/usr/bin/env python
+
+# Copyright 2013-2019 Lukas Burget, Mireia Diez (burget@fit.vutbr.cz, mireia@fit.vutbr.cz)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,63 +16,57 @@
 #
 # Revision History
-#   L. Burget   16/07/13 01:00AM - original version
-#   L. Burget   20/06/17 12:07AM - np.asarray replaced by .toarray()
-#                                - minor bug fix in initializing q
-#                                - minor bug fix in ELBO calculation
-#                                - few more optimizations
+#   16/07/13 01:00AM - original version
+#   20/06/17 12:07AM - np.asarray replaced by .toarray()
+#                    - minor bug fix in initializing q(Z)
+#                    - minor bug fix in ELBO calculation
+#                    - few more optimizations
+#   03/10/19 02:27PM - speaker regularization coefficient Fb added
+#
 
 import numpy as np
 from scipy.sparse import coo_matrix
 import scipy.linalg as spl
-#import numexpr as ne # the dependency on this modul can be avoided by replacing
-#                     # logsumexp_ne and exp_ne with logsumexp and np.exp
+import numexpr as ne  # the dependency on this module can be avoided by replacing
+                      # logsumexp_ne and exp_ne with logsumexp and np.exp
 
 
-#[q sp Li] =
-def VB_diarization(X, m, iE, w, V, sp=None, q=None,
+#[gamma pi Li] =
+def VB_diarization(X, m, invSigma, w, V, pi=None, gamma=None,
                    maxSpeakers = 10, maxIters = 10,
                    epsilon = 1e-4, loopProb = 0.99, statScale = 1.0,
-                   alphaQInit = 1.0, downsample = None, VtiEV = None, ref=None,
-                   plot=False, sparsityThr=0.001, llScale=1.0, minDur=1):
+                   alphaQInit = 1.0, downsample = None, VtinvSigmaV = None, ref=None,
+                   plot=False, sparsityThr=0.001, llScale=1.0, minDur=1, Fa=1.0, Fb=1.0):
 
   """
   This is a generalized version of speaker diarization described in:
 
-  Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors,
-  Montreal, CRIM, May 2008.
-
-  Kenny, P., Reynolds, D., and Castaldo, F. Diarization of Telephone
-  Conversations using Factor Analysis IEEE Journal of Selected Topics in Signal
-  Processing, December 2010.
+  Diez. M., Burget. L., Landini. F., Cernocky. J.
+  Analysis of Speaker Diarization based on Bayesian HMM with Eigenvoice Priors
 
-  The generalization introduced in this implementation lies in using an HMM
-  instead of the simple mixture model when modeling generation of segments
-  (or even frames) from speakers. HMM limits the probability of switching
-  between speakers when changing frames, which makes it possible to use
-  the model on frame-by-frame bases without any need to iterate between
-  1) clustering speech segments and 2) re-segmentation (i.e. as it was done in
-  the paper above).
+  Variable names and equation numbers refer to those used in the paper
 
   Inputs:
   X           - T x D array, where rows are D dimensional feature vectors for T frames
   m           - C x D array of GMM component means
-  iE          - C x D array of GMM component inverse covariance matrix diagonals
+  invSigma    - C x D array of GMM component inverse covariance matrix diagonals
   w           - C dimensional column vector of GMM component weights
   V           - R x C x D array of eigenvoices
   maxSpeakers - maximum number of speakers expected in the utterance
   maxIters    - maximum number of algorithm iterations
   epsilon     - stop iterating, if obj. fun. improvement is less than epsilon
   loopProb    - probability of not switching speakers between frames
-  statScale   - scale sufficient statiscits collected using UBM
+  statScale   - deprecated, use Fa instead
+  Fa          - scale sufficient statistics collected using UBM
+  Fb          - speaker regularization coefficient Fb (controls the final # of speakers)
   llScale     - scale UBM likelihood (i.e. llScale < 1.0 makes attribution of frames
                 to UBM components more uncertain)
   sparsityThr - set occupations smaller than this threshold to 0.0 (saves memory
                 as the posteriors are represented by sparse matrix)
-  alphaQInit  - Dirichlet concentraion parameter for initializing q
+  alphaQInit  - Dirichlet concentration parameter for initializing gamma
  downsample  - perform diarization on input downsampled by this factor
-  VtiEV       - C x (R**2+R)/2 matrix normally calculated by VB_diarization when
-                VtiEV is None. However, it can be pre-calculated using function
-                precalculate_VtiEV(V) and used across calls of VB_diarization.
+  VtinvSigmaV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when
+                VtinvSigmaV is None. However, it can be pre-calculated using function
+                precalculate_VtinvSigmaV(V, invSigma) and used across calls of VB_diarization.
   minDur      - minimum number of frames between speaker turns imposed by linear
                 chains of HMM states corresponding to each speaker. All the states
                 in a chain share the same output distribution
@@ -80,192 +75,190 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
   plot        - if set to True, plot per-frame speaker posteriors.
 
   Outputs:
-  q           - S x T matrix of posteriors attribution each frame to one of S possible
+  gamma       - T x S matrix of posteriors attributing each frame to one of S possible
                 speakers, where S is given by opts.maxSpeakers
-  sp          - S dimensional column vector of ML learned speaker priors. Ideally, these
+  pi          - S dimensional column vector of ML learned speaker priors. Ideally, these
                 should allow to estimate the # of speakers in the utterance as the
                 probabilities of the redundant speakers should converge to zero.
-  Li          - values of auxiliary function (and DER and frame cross-entropy between q
+  Li          - values of auxiliary function (and DER and frame cross-entropy between gamma
                 and reference if 'ref' is provided) over iterations.
   """
 
-  # The references to equations corresponds to the technical report:
-  # Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors,
-  # Montreal, CRIM, May 2008.
+  # The references to equations correspond to
+  # Diez. M., Burget. L., Landini. F., Cernocky. J.
+  # Analysis of Speaker Diarization based on Bayesian HMM with Eigenvoice Priors
 
   D=X.shape[1]  # feature dimensionality
   C=len(w)      # number of mixture components
   R=V.shape[0]  # subspace rank
   nframes=X.shape[0]
 
-  if VtiEV is None:
-    VtiEV = precalculate_VtiEV(V, iE)
+  if VtinvSigmaV is None:
+    VtinvSigmaV = precalculate_VtinvSigmaV(V, invSigma)
 
   V = V.reshape(V.shape[0],-1)
 
-  if sp is None:
-    sp = np.ones(maxSpeakers)/maxSpeakers
+  if pi is None:
+    pi = np.ones(maxSpeakers)/maxSpeakers
   else:
-    maxSpeakers = len(sp)
+    maxSpeakers = len(pi)
 
-  if q is None:
-    # initialize q from flat Dirichlet prior with concentration parameter alphaQInit
-    q = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers))
-    q = q / q.sum(1, keepdims=True)
+  if gamma is None:
+    # initialize gamma from flat Dirichlet prior with concentration parameter alphaQInit
+    gamma = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers))
+    gamma = gamma / gamma.sum(1, keepdims=True)
 
   # calculate UBM mixture frame posteriors (i.e. per-frame zero order statistics)
-  ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi))
+  ll = (X**2).dot(-0.5*invSigma.T) + X.dot(invSigma.T*m.T)-0.5*((invSigma * m**2 - np.log(invSigma)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi))
   ll *= llScale
-  G = logsumexp(ll, axis=1)
-  NN = np.exp(ll - G[:,np.newaxis]) * statScale
-  NN[NN<sparsityThr] = 0.0
-    if ii > 0 and L - Li[-2][0] < epsilon:
-      if L - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!')
+    if ii > 0 and ELBO - Li[-2][0] < epsilon:
+      if ELBO - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!')
       break
 
   if downsample is not None:
-    #upsample resulting q to match number of frames in the input utterance
-    q = downsampler.T.dot(q)
+    # upsample resulting gamma to match number of frames in the input utterance
+    gamma = downsampler.T.dot(gamma)
 
-  return q, sp, Li
 
+  return gamma, pi, Li
 
-def precalculate_VtiEV(V, iE):
+
+def precalculate_VtinvSigmaV(V, invSigma):
   tril_ind = np.tril_indices(V.shape[0])
-  VtiEV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype)
+  VtinvSigmaV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype)
   for c in range(V.shape[1]):
-    VtiEV[c,:] = np.dot(V[:,c,:]*iE[np.newaxis,c,:], V[:,c,:].T)[tril_ind]
-  return VtiEV
+    VtinvSigmaV[c,:] = np.dot(V[:,c,:]*invSigma[np.newaxis,c,:], V[:,c,:].T)[tril_ind]
+  return VtinvSigmaV
 
 
-# Initialize q (per-frame speaker posteriors) from a reference
+# Initialize gamma (per-frame speaker posteriors) from a reference
 # (vector of per-frame zero based integer speaker IDs)
-def frame_labels2posterior_mx(labels, maxSpeakers):
-  #initialize from reference
-  #pmx = np.zeros((len(labels), labels.max()+1))
-  pmx = np.zeros((len(labels), maxSpeakers))
+def frame_labels2posterior_mx(labels):
+  # initialize from reference
+  pmx = np.zeros((len(labels), labels.max()+1))
   pmx[np.arange(len(labels)), labels] = 1
   return pmx
 
+
 # Calculates Diarization Error Rate (DER) or per-frame cross-entropy between
-# reference (vector of per-frame zero based integer speaker IDs) and q (per-frame
-# speaker posteriors). If expected=False, q is converted into hard labels before
-# calculating DER. If expected=TRUE, posteriors in q are used to calculated
-# "expected" DER.
-def DER(q, ref, expected=True, xentropy=False):
+# reference (vector of per-frame zero based integer speaker IDs) and gamma
+# (per-frame speaker posteriors). If expected=False, gamma is converted into
+# hard labels before calculating DER.
+# If expected=TRUE, posteriors in gamma are used to calculate "expected" DER.
+def DER(gamma, ref, expected=True, xentropy=False):
   from itertools import permutations
 
   if not expected:
-    # replce probabiities in q by zeros and ones
-    hard_labels = q.argmax(1)
-    q = np.zeros_like(q)
-    q[range(len(q)), hard_labels] = 1
+    # replace probabilities in gamma by zeros and ones
+    hard_labels = gamma.argmax(1)
+    gamma = np.zeros_like(gamma)
+    gamma[range(len(gamma)), hard_labels] = 1
 
-  err_mx = np.empty((ref.max()+1, q.shape[1]))
+  err_mx = np.empty((ref.max()+1, gamma.shape[1]))
   for s in range(err_mx.shape[0]):
-    tmpq = q[ref == s,:]
-    err_mx[s] = (-np.log(tmpq) if xentropy else tmpq).sum(0)
+    tmpgamma = gamma[ref == s,:]
+    err_mx[s] = (-np.log(tmpgamma) if xentropy else tmpgamma).sum(0)
 
   if err_mx.shape[0] < err_mx.shape[1]:
     err_mx = err_mx.T
 
   # try all alignments (permutations) of reference and detected speaker
-  #could be written in more efficient way using dynamic programing
+  # could be written in a more efficient way using dynamic programming
   acc = [err_mx[perm[:err_mx.shape[1]], range(err_mx.shape[1])].sum()
          for perm in permutations(range(err_mx.shape[0]))]
   if xentropy:
@@ -342,7 +335,7 @@ def forward_backward(lls, tr, ip):
   lfw[0] = lls[0] + np.log(ip)
   lbw[-1] = 0.0
 
-  for ii in range(1,len(lls)):
+  for ii in range(1,len(lls)):
     lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1)
 
   for ii in reversed(range(len(lls)-1)):
@@ -350,4 +343,4 @@ def forward_backward(lls, tr, ip):
 
   tll = logsumexp(lfw[-1])
   sp = np.exp(lfw + lbw - tll)
-  return sp, tll, lfw, lbw
+  return sp, tll, lfw, lbw
\ No newline at end of file
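For reviewers, a minimal usage sketch of the renamed API above (not part of the patch): it assumes the GMM-UBM parameters m, invSigma, w and the eigenvoice array V are already loaded with the shapes given in the docstring, and X1, X2 are hypothetical T x D feature matrices.

    from VB_diarization import VB_diarization, precalculate_VtinvSigmaV

    # VtinvSigmaV depends only on the model, so compute it once and
    # reuse it across utterances, as the docstring suggests
    VtinvSigmaV = precalculate_VtinvSigmaV(V, invSigma)

    for X in (X1, X2):                      # hypothetical T x D feature arrays
        gamma, pi, Li = VB_diarization(X, m, invSigma, w, V,
                                       VtinvSigmaV=VtinvSigmaV,
                                       maxSpeakers=10, loopProb=0.99,
                                       Fa=1.0, Fb=1.0)
        frame_labels = gamma.argmax(1)      # hard per-frame speaker decisions

Speakers whose pi entries collapse toward zero are effectively pruned, which is the behavior the Fb coefficient regulates.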
diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py
new file mode 100644
index 00000000000..2907cc2d114
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# Copyright 2020 Johns Hopkins University (Author: Desh Raj)
+# Apache 2.0
+
+# This script is based on the Bayesian HMM-based xvector clustering
+# code released by BUTSpeech at: https://github.com/BUTSpeechFIT/VBx.
+# Note that this assumes that the provided labels are for a single
+# recording. So this should be called from a script such as
+# vb_hmm_xvector.sh which can divide all labels into per recording
+# labels.
+
+import sys, argparse, struct
+import numpy as np
+import itertools
+import kaldi_io
+
+from scipy.special import softmax
+
+import VB_diarization
+
+########### HELPER FUNCTIONS #####################################
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""This script performs Bayesian HMM-based
+        clustering of x-vectors for one recording""",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--init-smoothing", type=float, default=10,
+        help="AHC produces hard assignments of x-vectors to speakers."
+        " These are smoothed to soft assignments as the initialization"
+        " for VB-HMM. This parameter controls the amount of smoothing."
+        " Not so important; a high value (e.g. 10) is OK => keeps the hard assignment")
+    parser.add_argument("--loop-prob", type=float, default=0.80,
+        help="probability of not switching speakers between frames")
+    parser.add_argument("--fa", type=float, default=0.4,
+        help="scale sufficient statistics collected using UBM")
+    parser.add_argument("--fb", type=float, default=11,
+        help="speaker regularization coefficient Fb (controls final # of speakers)")
+    parser.add_argument("xvector_ark_file", type=str,
+        help="Ark file containing xvectors for all subsegments")
+    parser.add_argument("plda", type=str,
+        help="path to PLDA model")
+    parser.add_argument("input_label_file", type=str,
+        help="path of input label file")
+    parser.add_argument("output_label_file", type=str,
+        help="path of output label file")
+    args = parser.parse_args()
+    return args
+
+def read_labels_file(label_file):
+    segments = []
+    labels = []
+    with open(label_file, 'r') as f:
+        for line in f.readlines():
+            segment, label = line.strip().split()
+            segments.append(segment)
+            labels.append(int(label))
+    return segments, labels
+
+def write_labels_file(seg2label, out_file):
+    f = open(out_file, 'w')
+    for seg in sorted(seg2label.keys()):
+        f.write("{} {}\n".format(seg, seg2label[seg]))
+    f.close()
+    return
+
+def read_args(args):
+    segments, labels = read_labels_file(args.input_label_file)
+    xvec_all = dict(kaldi_io.read_vec_flt_ark(args.xvector_ark_file))
+    xvectors = []
+    for segment in segments:
+        xvectors.append(xvec_all[segment])
+    _, _, plda_psi = kaldi_io.read_plda(args.plda)
+    return xvectors, segments, labels, plda_psi
+
+
+###################################################################
+
+def vb_hmm(segments, in_labels, xvectors, plda_psi, init_smoothing, loop_prob, fa, fb):
+    x = np.array(xvectors)
+    dim = x.shape[1]
+
+    # Smooth the hard labels obtained from AHC to soft assignments of x-vectors to speakers
+    q_init = np.zeros((len(in_labels), np.max(in_labels)+1))
+    q_init[range(len(in_labels)), in_labels] = 1.0
+    q_init = softmax(q_init*init_smoothing, axis=1)
+
+    # Prepare model for VB-HMM clustering
+    ubmWeights = np.array([1.0])
+    ubmMeans = np.zeros((1,dim))
+    invSigma = np.ones((1,dim))
+    V = np.diag(np.sqrt(plda_psi[:dim]))[:,np.newaxis,:]
+
+    # Use VB-HMM for x-vector clustering. Instead of an i-vector extractor model, we use PLDA
+    # => GMM with only 1 component, V derived from the across-class covariance, and invSigma
+    # is the inverse within-class covariance (i.e. identity)
+    q, _, _ = VB_diarization.VB_diarization(x, ubmMeans, invSigma, ubmWeights, V, pi=None,
+        gamma=q_init, maxSpeakers=q_init.shape[1], maxIters=40, epsilon=1e-6, loopProb=loop_prob,
+        Fa=fa, Fb=fb)
+
+    labels = np.unique(q.argmax(1), return_inverse=True)[1]
+
+    return {seg:label for seg,label in zip(segments,labels)}
+
+def main():
+    args = get_args()
+    xvectors, segments, labels, plda_psi = read_args(args)
+
+    seg2label_vb = vb_hmm(segments, labels, xvectors, plda_psi, args.init_smoothing,
+        args.loop_prob, args.fa, args.fb)
+    write_labels_file(seg2label_vb, args.output_label_file)
+
+if __name__=="__main__":
+    main()
+
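The --init-smoothing trick in vb_hmm() above is easiest to see on a toy input; a small self-contained sketch, assuming three subsegments with first-pass AHC labels [0, 1, 0] and the default smoothing of 10:

    import numpy as np
    from scipy.special import softmax

    in_labels = np.array([0, 1, 0])          # hard AHC labels for 3 subsegments
    q_init = np.zeros((3, 2))                # 2 = np.max(in_labels) + 1 speakers
    q_init[range(3), in_labels] = 1.0        # one-hot encoding
    q_init = softmax(q_init * 10, axis=1)    # --init-smoothing 10
    # each row is now approx. [0.99995, 0.00005] (or flipped), i.e. the hard
    # assignment is essentially kept, but VB-HMM can still move mass between speakers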
diff --git a/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh
new file mode 100755
index 00000000000..70cd245e90a
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/vb_hmm_xvector.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+# Copyright 2020 Desh Raj
+# Apache 2.0.
+
+# This script performs Bayesian HMM on top of labels produced
+# by a first-pass AHC clustering. See https://arxiv.org/abs/1910.08847
+# for details about the model.
+
+# Begin configuration section.
+cmd="run.pl"
+stage=0
+nj=10
+cleanup=true
+rttm_channel=0
+
+# The hyperparameters used here are taken from the DIHARD
+# optimal hyperparameter values reported in:
+# http://www.fit.vutbr.cz/research/groups/speech/publi/2019/diez_IEEE_ACM_2019_08910412.pdf
+# These may require tuning for different datasets.
+loop_prob=0.85
+fa=0.2
+fb=1
+
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <dir> <xvec-dir> <plda>"
+  echo " e.g.: $0 exp/ exp/xvectors_dev exp/xvector_nnet_1a/plda"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --cleanup <bool|true>                            # If true, remove temporary files"
+  exit 1;
+fi
+
+dir=$1
+xvec_dir=$2
+plda=$3
+
+mkdir -p $dir/tmp
+
+for f in $dir/labels ; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+# check if numexpr is installed. Also install
+# a modified version of kaldi_io with extra functions
+# needed to read the PLDA file
+result=`python3 -c "\
+try:
+  import kaldi_io, numexpr
+  print (int(hasattr(kaldi_io, 'read_plda')))
+except ImportError:
+  print('0')"`
+
+if [ "$result" == "0" ]; then
+  echo "Installing kaldi_io and numexpr"
+  python3 -m pip install git+https://github.com/desh2608/kaldi-io-for-python.git@vbx
+  python3 -m pip install numexpr
+fi
+
+if [ $stage -le 0 ]; then
+  # Mean subtraction (If original x-vectors are high-dim, e.g. 512, you should
+  # consider also applying LDA to reduce dimensionality to, say, 200)
+  $cmd $xvec_dir/log/transform.log \
+    ivector-subtract-global-mean scp:$xvec_dir/xvector.scp ark:$xvec_dir/xvector_norm.ark
+fi
+
+echo -e "Performing Bayesian HMM based x-vector clustering..\n"
+# making a shell script for each job
+for n in `seq $nj`; do
+  cat <<-EOF > $dir/tmp/vb_hmm.$n.sh
+  python3 diarization/vb_hmm_xvector.py \
+    --loop-prob $loop_prob --fa $fa --fb $fb \
+    $xvec_dir/xvector_norm.ark $plda $dir/labels.$n $dir/labels.vb.$n
+EOF
+done
+
+chmod a+x $dir/tmp/vb_hmm.*.sh
+$cmd JOB=1:$nj $dir/log/vb_hmm.JOB.log \
+  $dir/tmp/vb_hmm.JOB.sh
+
+if [ $stage -le 1 ]; then
+  echo "$0: combining labels"
+  for j in $(seq $nj); do cat $dir/labels.vb.$j; done > $dir/labels.vb || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: computing RTTM"
+  diarization/make_rttm.py --rttm-channel $rttm_channel $xvec_dir/plda_scores/segments $dir/labels.vb $dir/rttm.vb || exit 1;
+fi
+
+if $cleanup ; then
+  rm -r $dir/tmp || exit 1;
+fi
diff --git a/egs/libri_css/s5_mono/local/decode.sh b/egs/libri_css/s5_mono/local/decode.sh
index a0d65d47bc8..6c1fe57ff5a 100755
--- a/egs/libri_css/s5_mono/local/decode.sh
+++ b/egs/libri_css/s5_mono/local/decode.sh
@@ -111,7 +111,7 @@ if [ $stage -le 3 ]; then
   [ ! -d exp/xvector_nnet_1a ] && ./local/download_diarizer.sh
 
-  local/diarize.sh --nj $diar_nj --cmd "$train_cmd" --stage $diarizer_stage \
+  local/diarize_bhmm.sh --nj $diar_nj --cmd "$train_cmd" --stage $diarizer_stage \
     --ref-rttm $ref_rttm \
     exp/xvector_nnet_1a \
     data/${datadir} \
@@ -127,7 +127,7 @@ if [ $stage -le 4 ]; then
   asr_nj=$(wc -l < "data/$datadir/wav.scp")
   local/decode_diarized.sh --nj $asr_nj --cmd "$decode_cmd" --stage $decode_diarize_stage \
     --lm-suffix "_tgsmall" \
-    exp/${datadir}_diarization data/$datadir data/lang_nosp_test_tgsmall \
+    exp/${datadir}_diarization/rttm.vb data/$datadir data/lang_test_tgsmall \
     exp/chain${nnet3_affix}/tdnn_${affix}_sp exp/nnet3${nnet3_affix} \
     data/${datadir}_diarized || exit 1
 done
@@ -163,7 +163,7 @@ if $rnnlm_rescore; then
     rnnlm/lmrescore$pruned.sh \
       --cmd "$decode_cmd --mem 8G" \
       --weight 0.45 --max-ngram-order $ngram_order \
-      data/lang_nosp_test_tgsmall $rnnlm_dir \
+      data/lang_test_tgsmall $rnnlm_dir \
      data/${decode_set}_diarized_hires ${decode_dir} \
       ${ac_model_dir}/decode_${decode_set}_diarized_2stage_rescore
   done
@@ -207,7 +207,7 @@ fi
 if [ $stage -le 9 ]; then
   local/decode_oracle.sh --stage $decode_oracle_stage \
     --affix $affix \
-    --lang-dir data/lang_nosp_test_tgsmall \
+    --lang-dir data/lang_test_tgsmall \
     --lm-suffix "_tgsmall" \
     --rnnlm-rescore $rnnlm_rescore \
     --test_sets "$test_sets"
diff --git a/egs/libri_css/s5_mono/local/decode_diarized.sh b/egs/libri_css/s5_mono/local/decode_diarized.sh
index b81515f22b4..6eda74506c5 100755
--- a/egs/libri_css/s5_mono/local/decode_diarized.sh
+++ b/egs/libri_css/s5_mono/local/decode_diarized.sh
@@ -15,7 +15,7 @@ echo "$0 $@"  # Print the command line for logging
 if [ -f path.sh ]; then . ./path.sh; fi
 . utils/parse_options.sh || exit 1;
 if [ $# != 6 ]; then
-  echo "Usage: $0 <rttm-dir> <in-data-dir> <lang-dir> <model-dir> <ivector-dir> <out-dir>"
+  echo "Usage: $0 <rttm-path> <in-data-dir> <lang-dir> <model-dir> <ivector-dir> <out-dir>"
   echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain/tdnn_1a \
    exp/nnet3_cleaned data/dev_diarized"
   echo "Options: "
@@ -24,14 +24,14 @@ if [ $# != 6 ]; then
   exit 1;
 fi
 
-rttm_dir=$1
+rttm=$1
 data_in=$2
 lang_dir=$3
 asr_model_dir=$4
 ivector_extractor=$5
 out_dir=$6
 
-for f in $rttm_dir/rttm $data_in/wav.scp $data_in/text.bak \
+for f in $rttm $data_in/wav.scp $data_in/text.bak \
     $lang_dir/L.fst $asr_model_dir/graph${lm_suffix}/HCLG.fst \
     $asr_model_dir/final.mdl; do
   [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
@@ -46,8 +46,8 @@ fi
 
 if [ $stage -le 1 ]; then
   echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel "
-  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm \
-    <(awk '{print $2" "$2" "$3}' $rttm_dir/rttm |sort -u) \
+  local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm \
+    <(awk '{print $2" "$2" "$3}' $rttm |sort -u) \
     ${out_dir}_hires/utt2spk ${out_dir}_hires/segments
 
   utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt
diff --git a/egs/libri_css/s5_mono/local/diarize_bhmm.sh b/egs/libri_css/s5_mono/local/diarize_bhmm.sh
new file mode 100755
index 00000000000..30ae2def3f6
--- /dev/null
+++ b/egs/libri_css/s5_mono/local/diarize_bhmm.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# Copyright   2019   David Snyder
+#             2020   Desh Raj
+
+# Apache 2.0.
+#
+# This script takes an input directory that has a segments file (and
+# a feats.scp file), and performs diarization on it, using BUT's
+# Bayesian HMM-based diarization model. A first pass of AHC is performed,
+# followed by VB-HMM.
+
+stage=0
+nj=10
+cmd="run.pl"
+ref_rttm=
+score_overlaps_only=true
+
+echo "$0 $@"  # Print the command line for logging
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <model-dir> <in-data-dir> <out-dir>"
+  echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs."
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --ref_rttm ./local/dev_rttm                      # the location of the reference RTTM file"
+  exit 1;
+fi
+
+model_dir=$1
+data_in=$2
+out_dir=$3
+
+name=`basename $data_in`
+
+for f in $data_in/feats.scp $data_in/segments $model_dir/plda \
+  $model_dir/final.raw $model_dir/extract.config; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+if [ $stage -le 1 ]; then
+  echo "$0: computing features for x-vector extractor"
+  utils/fix_data_dir.sh data/${name}
+  rm -rf data/${name}_cmn
+  local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \
+    data/$name data/${name}_cmn exp/${name}_cmn
+  cp data/$name/segments exp/${name}_cmn/
+  utils/fix_data_dir.sh data/${name}_cmn
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: extracting x-vectors for all segments"
+  diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \
+    --nj $nj --window 1.5 --period 0.75 --apply-cmn false \
+    --min-segment 0.5 $model_dir \
+    data/${name}_cmn $out_dir/xvectors_${name}
+fi
+
+# Perform PLDA scoring
+if [ $stage -le 3 ]; then
+  # Perform PLDA scoring on all pairs of segments for each recording.
+  echo "$0: performing PLDA scoring between all pairs of x-vectors"
+  diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \
+    --target-energy 0.5 \
+    --nj $nj $model_dir/ $out_dir/xvectors_${name} \
+    $out_dir/xvectors_${name}/plda_scores
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: performing clustering using PLDA scores (threshold tuned on dev)"
+  diarization/cluster.sh --cmd "$cmd" --nj $nj \
+    --rttm-channel 1 --threshold 0.4 \
+    $out_dir/xvectors_${name}/plda_scores $out_dir
+  echo "$0: wrote RTTM to output directory ${out_dir}"
+fi
+
+if [ $stage -le 5 ]; then
+  echo "$0: performing VB-HMM on top of first-pass AHC"
+  diarization/vb_hmm_xvector.sh --nj $nj --rttm-channel 1 \
+    $out_dir $out_dir/xvectors_${name} $model_dir/plda
+fi
+
+hyp_rttm=${out_dir}/rttm.vb
+
+# For scoring the diarization system, we use the same tool that was
+# used in the DIHARD II challenge. This is available at:
+# https://github.com/nryant/dscore
+if [ $stage -le 6 ]; then
+  echo "Diarization results for "${name}
+  if ! [ -d dscore ]; then
+    git clone https://github.com/desh2608/dscore.git -b libricss --single-branch || exit 1;
+    cd dscore
+    python -m pip install --user -r requirements.txt
+    cd ..
+  fi
+
+  # Create per condition ref and hyp RTTM files for scoring per condition
+  mkdir -p tmp
+  conditions="0L 0S OV10 OV20 OV30 OV40"
+  cp $ref_rttm tmp/ref.all
+  cp $hyp_rttm tmp/hyp.all
+  for rttm in ref hyp; do
+    for cond in $conditions; do
+      cat tmp/$rttm.all | grep $cond > tmp/$rttm.$cond
+    done
+  done
+
+  echo "Scoring all regions..."
+  for cond in $conditions 'all'; do
+    echo -n "Condition: $cond: "
+    ref_rttm_path=$(readlink -f tmp/ref.$cond)
+    hyp_rttm_path=$(readlink -f tmp/hyp.$cond)
+    cd dscore && python score.py -r $ref_rttm_path -s $hyp_rttm_path --global_only && cd .. || exit 1;
+  done
+
+  # We also score overlapping regions only
+  if [ $score_overlaps_only == "true" ]; then
+    echo "Scoring overlapping regions..."
+    for cond in $conditions 'all'; do
+      echo -n "Condition: $cond: "
+      ref_rttm_path=$(readlink -f tmp/ref.$cond)
+      hyp_rttm_path=$(readlink -f tmp/hyp.$cond)
+      cd dscore && python score.py -r $ref_rttm_path -s $hyp_rttm_path --overlap_only --global_only && cd .. || exit 1;
+    done
+  fi
+
+  rm -r tmp
+fi
diff --git a/egs/libri_css/s5_mono/local/download_diarizer.sh b/egs/libri_css/s5_mono/local/download_diarizer.sh
index 959e8729215..753475ee3fd 100755
--- a/egs/libri_css/s5_mono/local/download_diarizer.sh
+++ b/egs/libri_css/s5_mono/local/download_diarizer.sh
@@ -31,7 +31,8 @@ tar -xvzf 0012_diarization_v1.tar.gz
 rm -f 0012_diarization_v1.tar.gz
 
 # Download PLDA model trained on augmented Librispeech data
-wget https://desh2608.github.io/static/files/jsalt/plda 0012_diarization_v1/exp/xvector_nnet_1a/
+rm 0012_diarization_v1/exp/xvector_nnet_1a/plda
+wget https://desh2608.github.io/static/files/jsalt/plda -P 0012_diarization_v1/exp/xvector_nnet_1a/
 cd ../..
 cp -r ${dir}/0012_diarization_v1/exp .
 rm -rf ${dir}
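A final side note on the DER() helper in VB_diarization.py above: its comment observes that the exhaustive permutation search could be done more efficiently. One standard replacement is the Hungarian algorithm; a hedged sketch (aligned_score is a hypothetical helper, not part of this patch), operating on the same err_mx accumulator that DER() builds:

    from scipy.optimize import linear_sum_assignment

    def aligned_score(err_mx, xentropy=False):
        # err_mx[s, k] accumulates reference speaker s against cluster k,
        # exactly as inside DER(); minimize total cross-entropy, otherwise
        # maximize the accumulated posterior mass
        rows, cols = linear_sum_assignment(err_mx if xentropy else -err_mx)
        return err_mx[rows, cols].sum()   # equals min(acc) resp. max(acc) in DER()

This avoids the factorial blow-up of itertools.permutations when maxSpeakers is large.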