Skip to content

Commit

Permalink
Merge pull request #5015 from kamo-naoyuki/test2
Browse files Browse the repository at this point in the history
Modify .pre-commit-config.yaml
  • Loading branch information
mergify[bot] committed Mar 15, 2023
2 parents 7964a2a + 85d9f10 commit b7885c2
Show file tree
Hide file tree
Showing 13 changed files with 143 additions and 87 deletions.
12 changes: 12 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,15 @@ repos:
exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|tools/installers/patch_mwerSegmenter)
- id: check-added-large-files
exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|tools/installers/patch_mwerSegmenter)

- repo: https://github.com/psf/black
rev: 23.1.0
hooks:
- id: black
exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|doc)

- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|doc)
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
[![codecov](https://codecov.io/gh/espnet/espnet/branch/master/graph/badge.svg)](https://codecov.io/gh/espnet/espnet)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/espnet/espnet/master.svg)](https://results.pre-commit.ci/latest/github/espnet/espnet/master)
[![Mergify Status](https://img.shields.io/endpoint.svg?url=https://api.mergify.com/v1/badges/espnet/espnet&style=flat)](https://mergify.com)
[![Gitter](https://badges.gitter.im/espnet-en/community.svg)](https://gitter.im/espnet-en/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)

Expand Down
17 changes: 3 additions & 14 deletions ci/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,12 @@

set -euo pipefail

modules="espnet espnet2 test utils setup.py egs*/*/*/local egs2/TEMPLATE/*/pyscripts tools/*.py ci/*.py"

# black
if ! black --check ${modules}; then
printf '[INFO] Please apply black:\n $ black %s\n' "${modules}"
exit 1
fi
# isort
if ! isort -c -v ${modules}; then
printf '[INFO] Please apply isort:\n $ isort %s\n' "${modules}"
exit 1
fi
exclude="egs2/TEMPLATE/asr1/utils,egs2/TEMPLATE/asr1/steps,egs2/TEMPLATE/tts1/sid,doc,tools,bats-core"

# flake8
"$(dirname $0)"/test_flake8.sh
# "$(dirname $0)"/test_flake8.sh
# pycodestyle
pycodestyle -r ${modules} --show-source --show-pep8
pycodestyle --exclude "${exclude}" --show-source --show-pep8

LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" pytest -q

Expand Down
43 changes: 26 additions & 17 deletions egs/libri_css/asr1/diarization/VB_diarization.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

# Copyright 2013-2019 Lukas Burget, Mireia Diez (burget@fit.vutbr.cz, mireia@fit.vutbr.cz)
# Copyright 2013-2019 Lukas Burget, Mireia Diez
# (burget@fit.vutbr.cz, mireia@fit.vutbr.cz)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -24,13 +25,14 @@
# 03/10/19 02:27PM - speaker regularization coefficient Fb added
#

import numexpr as ne # the dependency on this modul can be avoided by replacing
import numpy as np
from scipy.sparse import coo_matrix
import scipy.linalg as spl
import numexpr as ne # the dependency on this modul can be avoided by replacing
from scipy.sparse import coo_matrix

# logsumexp_ne and exp_ne with logsumexp and np.exp


# [gamma pi Li] =
def VB_diarization(
X,
Expand All @@ -56,7 +58,6 @@ def VB_diarization(
Fa=1.0,
Fb=1.0,
):

"""
This is a generalized version of speaker diarization described in:
Expand Down Expand Up @@ -124,17 +125,18 @@ def VB_diarization(
maxSpeakers = len(pi)

if gamma is None:
# initialize gamma from flat Dirichlet prior with concentration parameter alphaQInit
# initialize gamma from flat Dirichlet prior with concentration parameter
# alphaQInit
gamma = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers))
gamma = gamma / gamma.sum(1, keepdims=True)

# calculate UBM mixture frame posteriors (i.e. per-frame zero order statistics)
ll = (
(X ** 2).dot(-0.5 * invSigma.T)
(X**2).dot(-0.5 * invSigma.T)
+ X.dot(invSigma.T * m.T)
- 0.5
* (
(invSigma * m ** 2 - np.log(invSigma)).sum(1)
(invSigma * m**2 - np.log(invSigma)).sum(1)
- 2 * np.log(w)
+ D * np.log(2 * np.pi)
)
Expand All @@ -152,8 +154,8 @@ def VB_diarization(
LL = np.sum(G) # total log-likelihod as calculated using UBM

mixture_sum = coo_matrix((np.ones(C * D), (np.repeat(range(C), D), range(C * D))))

# G = np.sum((zeta.multiply(ll - np.log(w))).toarray(), 1) + Kx # from eq. (30) # Aleready calculated above
# Already calculated above
# G = np.sum((zeta.multiply(ll - np.log(w))).toarray(), 1) + Kx # from eq. (30)

# Calculate per-frame first order statistics projected into the R-dim. subspace
# V^T \Sigma^{-1} F_m
Expand All @@ -168,15 +170,20 @@ def VB_diarization(
)
rho = F_s.tocsr().dot((invSigma.flat * V).T)
del F_s
## The code above is only efficient implementation of the following comented code
# # The code above is only efficient implementation of the following commented code
# rho = 0;
# for ii in range(C):
# rho = rho + V[ii*D:(ii+1)*D,:].T.dot(zeta[ii,:] * invSigma[:,[ii]] * (X - m[:,[ii]]))
# rho = rho + V[ii*D:(ii+1)*D,:].T.dot(zeta[ii,:] * invSigma[:,[ii]] * \
# (X - m[:,[ii]]))

if downsample is not None:
# Downsample zeta, rho, G and gamma by summing the statistic over 'downsample' frames
# Downsample zeta, rho, G and gamma by summing the statistic
# over 'downsample' frames
# This speeds-up diarization for the price of lowering its frame resolution
# downsampler = coo_matrix((np.ones(nframes), (np.ceil(np.arange(nframes)/downsample).astype(int), np.arange(nframes))))
# downsampler = coo_matrix(
# (np.ones(nframes),
# (np.ceil(np.arange(nframes)/downsample).astype(int), np.arange(nframes)))
# )
downsampler = coo_matrix(
(
np.ones(nframes),
Expand Down Expand Up @@ -230,7 +237,7 @@ def VB_diarization(
)
)
) # eq. (23)
ELBO += Fb * 0.5 * (logdet(invL) - np.sum(np.diag(invL) + a ** 2, 0) + R)
ELBO += Fb * 0.5 * (logdet(invL) - np.sum(np.diag(invL) + a**2, 0) + R)

# Construct transition probability matrix with linear chain of 'minDur'
# states for each of 'maxSpeaker' speaker. The last state in each chain has
Expand All @@ -240,14 +247,16 @@ def VB_diarization(
tr[minDur - 1 :: minDur, 0::minDur] = (1 - loopProb) * pi
tr[(np.arange(1, maxSpeakers + 1) * minDur - 1,) * 2] += loopProb
ip[::minDur] = pi
# per-frame HMM state posteriors. Note that we can have linear chain of minDur states
# per-frame HMM state posteriors. Note that we can have linear
# chain of minDur states
# for each speaker.
gamma, tll, lf, lb = forward_backward(
ln_p.repeat(minDur, axis=1), tr, ip
) # , np.arange(1,maxSpeakers+1)*minDur-1)

# Right after updating q(Z), tll is E{log p(X|,Y,Z)} - KL{q(Z)||p(Z)}.
# ELBO now contains -KL{q(Y)||p(Y)}. Therefore, ELBO+ttl is correct value for ELBO.
# ELBO now contains -KL{q(Y)||p(Y)}. Therefore,
# ELBO+tll is correct value for ELBO.
ELBO += tll
Li.append([ELBO])

Expand Down Expand Up @@ -372,7 +381,7 @@ def logsumexp(x, axis=0):
# the dependency on the module.
def logsumexp_ne(x, axis=0):
xmax = np.array(x).max(axis=axis)
xmax_e = np.expand_dims(xmax, axis)
# xmax_e = np.expand_dims(xmax, axis)
x = ne.evaluate("sum(exp(x - xmax_e), axis=%d)" % axis)
x = ne.evaluate("xmax + log(x)")
infs = np.isinf(xmax)
Expand Down
8 changes: 6 additions & 2 deletions egs/libri_css/asr1/diarization/calc_cossim_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
# Apache 2.0.

import argparse

import numpy as np
from scipy.spatial.distance import cosine, pdist, squareform
from kaldiio import ReadHelper, WriteHelper
from scipy.spatial.distance import pdist, squareform


def LoadReco2Utt(file):
Expand Down Expand Up @@ -54,7 +55,10 @@ def WriteDistMatrices(D, wspec):

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Usage: calc_cossim_scores.py <reco2utt-rspec> <xvec-rspec> <simmat-wspec>\nComputes matrices of the cosine similarity scores between normalized x-vectors for each recording"
description="Usage: calc_cossim_scores.py "
"<reco2utt-rspec> <xvec-rspec> <simmat-wspec>\n"
"Computes matrices of the cosine similarity scores "
"between normalized x-vectors for each recording"
)
parser.add_argument(
"reco2utt",
Expand Down
2 changes: 1 addition & 1 deletion egs/libri_css/asr1/diarization/make_rttm.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
"""

import argparse
import sys
import codecs
import sys


def get_args():
Expand Down
33 changes: 21 additions & 12 deletions egs/libri_css/asr1/diarization/spec_clust.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
# Apache 2.0.

import argparse
import os

import numpy as np
from sklearn.cluster import k_means
from kaldiio import ReadHelper, WriteHelper
import scipy
from kaldiio import ReadHelper
from sklearn.cluster import SpectralClustering

"""
Spectral Clustering based on binarization and automatic thresholding
Paper: T.Park, K.Han, M.Kumar, and S.Narayanan, Auto-tuning spectral clustering for speaker diarization using normalized maximumeigengap, IEEE Signal Processing Letters, vol. 27, pp. 381-385,2019
Paper: T.Park, K.Han, M.Kumar, and S.Narayanan, Auto-tuning spectral clustering
for speaker diarization using normalized maximum eigengap,
IEEE Signal Processing Letters, vol. 27, pp. 381-385,2019
"""

# Input-output routines
Expand Down Expand Up @@ -60,6 +61,7 @@ def SaveLabels(IDs, labels, file):

# NME low-level operations


# Prepares binarized(0/1) affinity matrix with p_neighbors non-zero elements in each row
def get_kneighbors_conn(X_dist, p_neighbors):
X_dist_out = np.zeros_like(X_dist)
Expand Down Expand Up @@ -88,7 +90,8 @@ def Laplacian(A):
return D - A


# Calculates eigengaps (differences between adjacent eigenvalues sorted in descending order)
# Calculates eigengaps
# (differences between adjacent eigenvalues sorted in descending order)
def Eigengap(S):
S = sorted(S)
return np.diff(S)
Expand All @@ -115,7 +118,8 @@ def ComputeNMEParameters(A, p, max_num_clusters):
"""
Performs spectral clustering with Normalized Maximum Eigengap (NME)
Parameters:
A: affinity matrix (matrix of pairwise cosine similarities or PLDA scores between speaker embeddings)
A: affinity matrix
(matrix of pairwise cosine similarities or PLDA scores between speaker embeddings)
num_clusters: number of clusters to generate (if None, determined automatically)
max_num_clusters: maximum allowed number of clusters to generate
pmax: maximum count for matrix binarization (should be at least 2)
Expand Down Expand Up @@ -149,9 +153,11 @@ def NME_SpectralClustering(A, num_clusters=None, max_num_clusters=10, pbest=0, p


"""
Performs spectral clustering with Normalized Maximum Eigengap (NME) with fixed threshold and number of clusters
Performs spectral clustering with Normalized Maximum Eigengap (NME)
with fixed threshold and number of clusters
Parameters:
A: affinity matrix (matrix of pairwise cosine similarities or PLDA scores between speaker embeddings)
A: affinity matrix
(matrix of pairwise cosine similarities or PLDA scores between speaker embeddings)
num_clusters: number of clusters to generate
pbest: best count for matrix binarization
Returns: cluster assignments for every speaker embedding
Expand All @@ -170,9 +176,11 @@ def NME_SpectralClustering_sklearn(A, num_clusters, pbest):

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Usage: spec_clust.py [options] <scores-rspec> <reco2utt-rspec> <labels-wspec>\n"
+ "Performs spectral clustering of xvectors according to pairwise similarity scores\n"
+ "Auto-selects binarization threshold"
description="Usage: spec_clust.py [options] "
"<scores-rspec> <reco2utt-rspec> <labels-wspec>\n"
"Performs spectral clustering of xvectors according "
"to pairwise similarity scores\n"
"Auto-selects binarization threshold"
)
parser.add_argument(
"simmat_rspec",
Expand Down Expand Up @@ -216,7 +224,8 @@ def NME_SpectralClustering_sklearn(A, num_clusters, pbest):
)

print(
"Spectral clustering of xvector according to precomputed similarity scores matrix"
"Spectral clustering of xvector according to precomputed "
"similarity scores matrix"
)
print("Parameters:")
print("Similarity matrix rspecifier: {}".format(args.simmat_rspec))
Expand Down
21 changes: 11 additions & 10 deletions egs/libri_css/asr1/diarization/vb_hmm_xvector.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,14 @@
# vb_hmm_xvector.sh which can divide all labels into per recording
# labels.

import sys, argparse, struct
import numpy as np
import itertools
import kaldi_io

from scipy.special import softmax
import argparse

import kaldi_io
import numpy as np
import VB_diarization
from scipy.special import softmax

########### HELPER FUNCTIONS #####################################
# ########## HELPER FUNCTIONS #####################################


def get_args():
Expand Down Expand Up @@ -102,7 +100,8 @@ def vb_hmm(segments, in_labels, xvectors, plda_psi, init_smoothing, loop_prob, f
x = np.array(xvectors)
dim = x.shape[1]

# Smooth the hard labels obtained from AHC to soft assignments of x-vectors to speakers
# Smooth the hard labels obtained from AHC to soft assignments of
# x-vectors to speakers
q_init = np.zeros((len(in_labels), np.max(in_labels) + 1))
q_init[range(len(in_labels)), in_labels] = 1.0
q_init = softmax(q_init * init_smoothing, axis=1)
Expand All @@ -113,8 +112,10 @@ def vb_hmm(segments, in_labels, xvectors, plda_psi, init_smoothing, loop_prob, f
invSigma = np.ones((1, dim))
V = np.diag(np.sqrt(plda_psi[:dim]))[:, np.newaxis, :]

# Use VB-HMM for x-vector clustering. Instead of i-vector extractor model, we use PLDA
# => GMM with only 1 component, V derived across-class covariance, and invSigma is inverse
# Use VB-HMM for x-vector clustering. Instead of i-vector extractor model,
# we use PLDA
# => GMM with only 1 component, V derived across-class covariance,
# and invSigma is inverse
# within-class covariance (i.e. identity)
q, _, _ = VB_diarization.VB_diarization(
x,
Expand Down
24 changes: 14 additions & 10 deletions egs2/TEMPLATE/asr1/scripts/utils/create_README_file.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import sys

import pandas as pd
from espnet_model_zoo.downloader import ModelDownloader

tts_reference = "@inproceedings{hayashi2020espnet,\n\
title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},\n\
author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},\n\
booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},\n\
pages={7654--7658},\n\
year={2020},\n\
organization={IEEE}\n\
}"
tts_reference = (
"@inproceedings{hayashi2020espnet,\n"
"title={{Espnet-TTS}: Unified, reproducible, "
"and integratable open source end-to-end text-to-speech toolkit},\n"
"author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, "
"Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, "
"Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},\n"
"booktitle={Proceedings of IEEE International Conference on Acoustics, "
"Speech and Signal Processing (ICASSP)},\n"
"pages={7654--7658},\n"
"year={2020},\n"
"organization={IEEE}\n"
"}"
)


def create_Readme_file(repo_name, model_name):
Expand All @@ -24,7 +29,6 @@ def create_Readme_file(repo_name, model_name):
template_Readme = open("TEMPLATE_Readme.md")
new_Readme = open(repo_name + "/README.md", "w")
lines_arr = [line for line in template_Readme]
line_final_arr = []
for line in lines_arr:
if "<add_more_tags>" in line:
if task_name == "asr":
Expand Down
2 changes: 1 addition & 1 deletion egs2/jkac/tts1/local/prep_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def parse_label(book_dict, path):
for path in paths:
wav_scp_f.write(path.wav_scp_str(sample_rate=sample_rate) + "\n")
labels = list(read_label(path))
labels.sort(key=lambda l: l.utt_id())
labels.sort(key=lambda lll: lll.utt_id())
for label in labels:
text_f.write(label.text_file_str() + "\n")
segments_f.write(label.segment_file_str() + "\n")
Expand Down

0 comments on commit b7885c2

Please sign in to comment.