Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
dengkangle
committed
Dec 27, 2019
1 parent
dc80694
commit ddbd908
Showing
68 changed files
with
3,322 additions
and
54 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,234 @@ | ||
import librosa | ||
import librosa.filters | ||
import numpy as np | ||
import tensorflow as tf | ||
from scipy import signal | ||
from scipy.io import wavfile | ||
|
||
|
||
def load_wav(path, sr):
    """Load the audio file at ``path`` via librosa, requesting sample rate ``sr``.

    Only the first element of librosa's return value is handed back.
    """
    loaded = librosa.core.load(path, sr=sr)
    return loaded[0]
|
||
def save_wav(wav, path, sr):
    """Peak-scale ``wav`` to the int16 range and write it to ``path``.

    Args:
        wav: array of audio samples; it is NOT modified by this call.
        path: destination file name passed to ``scipy.io.wavfile.write``.
        sr: sample rate written to the file.
    """
    # The 0.01 floor keeps the gain finite for silent (all-zero) input.
    gain = 32767 / max(0.01, np.max(np.abs(wav)))
    # Multiply out-of-place: the previous ``wav *= ...`` rescaled the caller's
    # array as a side effect and failed outright on integer-typed input.
    scaled = wav * gain
    #proposed by @dsmiller
    wavfile.write(path, sr, scaled.astype(np.int16))
|
||
def save_wavenet_wav(wav, path, sr):
    """Write ``wav`` to ``path`` at sample rate ``sr`` without rescaling.

    Args:
        wav: array of audio samples, written with its own dtype.
        path: destination file name.
        sr: sample rate written to the file.
    """
    # ``librosa.output`` was removed in librosa 0.8, so go through
    # scipy's writer directly; it works with old and new librosa alike.
    wavfile.write(path, sr, wav)
|
||
def preemphasis(wav, k, preemphasize=True):
    """Filter ``wav`` with ``y[n] = x[n] - k * x[n-1]``.

    When ``preemphasize`` is false the input is returned untouched.
    """
    if not preemphasize:
        return wav
    numerator, denominator = [1, -k], [1]
    return signal.lfilter(numerator, denominator, wav)
|
||
def inv_preemphasis(wav, k, inv_preemphasize=True):
    """Filter ``wav`` with ``y[n] = x[n] + k * y[n-1]`` (inverse of ``preemphasis``).

    When ``inv_preemphasize`` is false the input is returned untouched.
    """
    if not inv_preemphasize:
        return wav
    numerator, denominator = [1], [1, -k]
    return signal.lfilter(numerator, denominator, wav)
|
||
#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py | ||
def start_and_end_indices(quantized, silence_threshold=2):
    """Return the indices of the first and last non-silent samples.

    A sample is non-silent when it differs from the mid-level 127 by more
    than ``silence_threshold``.

    Args:
        quantized: 1-D array of quantized sample values.
        silence_threshold: largest distance from 127 still treated as silence.

    Returns:
        ``(start, end)`` as Python ints, both inclusive.

    Raises:
        AssertionError: if no sample exceeds the threshold (including an
            empty input).
    """
    # Widen to float before subtracting so unsigned inputs below 127 cannot
    # wrap around instead of going negative.
    deviation = np.abs(np.asarray(quantized, dtype=np.float64) - 127)
    loud = np.flatnonzero(deviation > silence_threshold)
    # The earlier backward scan stopped at index 2, so a signal whose only
    # loud sample sat at index 0 or 1 was wrongly rejected.
    if loud.size == 0:
        raise AssertionError("no sample exceeds the silence threshold")
    return int(loud[0]), int(loud[-1])
|
||
def get_hop_size(hparams):
    """Return the hop size in samples.

    ``hparams.hop_size`` wins when it is set; otherwise the value is derived
    from ``hparams.frame_shift_ms`` and ``hparams.sample_rate``.
    """
    configured = hparams.hop_size
    if configured is not None:
        return configured
    assert hparams.frame_shift_ms is not None
    return int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
|
||
def linearspectrogram(wav, hparams):
    """Compute the dB-scaled linear-frequency spectrogram of ``wav``.

    Steps: optional pre-emphasis, STFT magnitude, conversion to dB, subtraction
    of ``hparams.ref_level_db``, then normalization when
    ``hparams.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    magnitudes = np.abs(_stft(emphasized, hparams))
    level_db = _amp_to_db(magnitudes, hparams) - hparams.ref_level_db
    return _normalize(level_db, hparams) if hparams.signal_normalization else level_db
|
||
def melspectrogram(wav, hparams):
    """Compute the dB-scaled mel spectrogram of ``wav``.

    Steps: optional pre-emphasis, STFT magnitude, projection onto the mel
    basis, conversion to dB, subtraction of ``hparams.ref_level_db``, then
    normalization when ``hparams.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    magnitudes = np.abs(_stft(emphasized, hparams))
    mel_magnitudes = _linear_to_mel(magnitudes, hparams)
    level_db = _amp_to_db(mel_magnitudes, hparams) - hparams.ref_level_db
    return _normalize(level_db, hparams) if hparams.signal_normalization else level_db
|
||
def inv_linear_spectrogram(linear_spectrogram, hparams):
    """Reconstruct a waveform from a linear spectrogram.

    The spectrogram is denormalized (if normalization is enabled), converted
    from dB back to amplitude, and inverted with lws when ``hparams.use_lws``
    is set or with Griffin-Lim otherwise. Inverse pre-emphasis is applied last.
    """
    if hparams.signal_normalization:
        level_db = _denormalize(linear_spectrogram, hparams)
    else:
        level_db = linear_spectrogram

    # Back from dB to linear amplitude.
    amplitude = _db_to_amp(level_db + hparams.ref_level_db)

    if not hparams.use_lws:
        waveform = _griffin_lim(amplitude ** hparams.power, hparams)
        return inv_preemphasis(waveform, hparams.preemphasis, hparams.preemphasize)

    lws_proc = _lws_processor(hparams)
    lws_output = lws_proc.run_lws(amplitude.astype(np.float64).T ** hparams.power)
    waveform = lws_proc.istft(lws_output).astype(np.float32)
    return inv_preemphasis(waveform, hparams.preemphasis, hparams.preemphasize)
|
||
def inv_mel_spectrogram(mel_spectrogram, hparams):
    """Reconstruct a waveform from a mel spectrogram.

    The spectrogram is denormalized (if normalization is enabled), converted
    from dB to amplitude, mapped back to linear frequency bins, and inverted
    with lws when ``hparams.use_lws`` is set or with Griffin-Lim otherwise.
    Inverse pre-emphasis is applied last.
    """
    if hparams.signal_normalization:
        level_db = _denormalize(mel_spectrogram, hparams)
    else:
        level_db = mel_spectrogram

    # dB -> amplitude, then mel bins -> linear bins.
    mel_amplitude = _db_to_amp(level_db + hparams.ref_level_db)
    amplitude = _mel_to_linear(mel_amplitude, hparams)

    if not hparams.use_lws:
        waveform = _griffin_lim(amplitude ** hparams.power, hparams)
        return inv_preemphasis(waveform, hparams.preemphasis, hparams.preemphasize)

    lws_proc = _lws_processor(hparams)
    lws_output = lws_proc.run_lws(amplitude.astype(np.float64).T ** hparams.power)
    waveform = lws_proc.istft(lws_output).astype(np.float32)
    return inv_preemphasis(waveform, hparams.preemphasis, hparams.preemphasize)
|
||
def _lws_processor(hparams):
    """Build an ``lws.lws`` processor from the FFT, hop and window settings in ``hparams``."""
    # Imported here so the module loads even when lws is not installed.
    import lws

    hop = get_hop_size(hparams)
    return lws.lws(hparams.n_fft, hop, fftsize=hparams.win_size, mode="speech")
|
||
def _griffin_lim(S, hparams):
    """librosa implementation of Griffin-Lim
    Based on https://github.com/librosa/librosa/issues/434

    Starts from random phases and runs ``hparams.griffin_lim_iters`` rounds of
    inverse STFT followed by STFT, keeping the magnitudes of ``S`` fixed and
    taking only the phase from each re-analysis.

    Args:
        S: magnitude spectrogram.
        hparams: hyper-parameters read here and by ``_stft`` / ``_istft``.

    Returns:
        The waveform produced by the last inverse STFT.
    """
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    # ``np.complex`` was only an alias of the builtin and was removed in
    # NumPy 1.24; complex128 is the dtype it used to resolve to.
    S_complex = np.abs(S).astype(np.complex128)
    y = _istft(S_complex * angles, hparams)
    for _ in range(hparams.griffin_lim_iters):
        angles = np.exp(1j * np.angle(_stft(y, hparams)))
        y = _istft(S_complex * angles, hparams)
    return y
|
||
def _stft(y, hparams):
    """Short-time Fourier transform of ``y`` via lws or librosa.

    With ``hparams.use_lws`` the lws result is transposed before returning;
    otherwise ``librosa.stft`` is called with the configured FFT, hop and
    window sizes.
    """
    if not hparams.use_lws:
        hop = get_hop_size(hparams)
        return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=hop, win_length=hparams.win_size)
    return _lws_processor(hparams).stft(y).T
|
||
def _istft(y, hparams):
    """Inverse STFT of ``y`` via librosa with the configured hop and window sizes."""
    hop = get_hop_size(hparams)
    return librosa.istft(y, hop_length=hop, win_length=hparams.win_size)
|
||
########################################################## | ||
#These are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) | ||
def num_frames(length, fsize, fshift):
    """Compute number of time frames of spectrogram

    ``length`` is the signal length, ``fsize`` the frame size and ``fshift``
    the frame shift. One extra frame is counted when ``length`` is not a
    multiple of ``fshift``.
    """
    edge = fsize - fshift
    whole_shifts = (length + edge * 2 - fsize) // fshift
    extra = 1 if length % fshift == 0 else 2
    return whole_shifts + extra
|
||
|
||
def pad_lr(x, fsize, fshift):
    """Compute left and right padding

    Returns ``(left, right)``: the left pad is ``fsize - fshift`` and the
    right pad adds whatever is needed to fill the frame count given by
    ``num_frames``.
    """
    n_samples = len(x)
    n_frames = num_frames(n_samples, fsize, fshift)
    edge = fsize - fshift
    padded_len = n_samples + 2 * edge
    remainder = (n_frames - 1) * fshift + fsize - padded_len
    return edge, edge + remainder
########################################################## | ||
#Librosa correct padding | ||
def librosa_pad_lr(x, fsize, fshift):
    """Return ``(0, right)`` padding that brings ``x`` up to the next multiple of ``fshift``.

    A full extra ``fshift`` is added when the length is already a multiple.
    ``fsize`` is accepted but not used.
    """
    n_samples = x.shape[0]
    padded_len = (n_samples // fshift + 1) * fshift
    return 0, padded_len - n_samples
|
||
# Conversions
# Module-level caches, filled on first use by _linear_to_mel, _mel_to_linear
# and mel80_to_mel40 respectively; None means "not built yet".
_mel_basis = _inv_mel_basis = _mel_basis_40 = None
|
||
def _linear_to_mel(spectogram, hparams):
    """Project a linear-frequency spectrogram onto the mel basis.

    The basis is built from ``hparams`` on the first call and cached in the
    module-level ``_mel_basis``; later calls reuse it as-is.
    """
    global _mel_basis
    basis = _mel_basis
    if basis is None:
        basis = _mel_basis = _build_mel_basis(hparams)
    return np.dot(basis, spectogram)
|
||
def _mel_to_linear(mel_spectrogram, hparams):
    """Map a mel spectrogram back to linear frequency bins.

    Uses the pseudo-inverse of the mel basis, built on the first call and
    cached in the module-level ``_inv_mel_basis``. Results are floored at 1e-10.
    """
    global _inv_mel_basis
    inverse = _inv_mel_basis
    if inverse is None:
        inverse = _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
    projected = np.dot(inverse, mel_spectrogram)
    return np.maximum(1e-10, projected)
|
||
def _build_mel_basis(hparams):
    """Build the mel filterbank matrix described by ``hparams``.

    Reads ``sample_rate``, ``n_fft``, ``num_mels``, ``fmin`` and ``fmax``.

    Raises:
        AssertionError: if ``hparams.fmax`` exceeds half the sample rate.
    """
    assert hparams.fmax <= hparams.sample_rate // 2
    # Pass sr / n_fft by keyword: newer librosa releases made them
    # keyword-only, while older releases accept keywords just as well.
    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft,
                               n_mels=hparams.num_mels,
                               fmin=hparams.fmin, fmax=hparams.fmax)
|
||
def _amp_to_db(x, hparams):
    """Convert amplitudes to decibels, flooring the input at ``hparams.min_level_db``.

    The floor is ``min_level_db`` expressed as an amplitude, so the output
    never drops below ``min_level_db``.
    """
    floor_amplitude = np.exp(hparams.min_level_db / 20 * np.log(10))
    floored = np.maximum(floor_amplitude, x)
    return 20 * np.log10(floored)
|
||
def _db_to_amp(x):
    """Convert decibels to amplitudes: ``10 ** (x / 20)``."""
    exponent = x * 0.05
    return np.power(10.0, exponent)
|
||
def _normalize(S, hparams):
    """Rescale a dB spectrogram from ``[min_level_db, 0]`` to the model's value range.

    With ``hparams.symmetric_mels`` the target range is
    ``[-max_abs_value, max_abs_value]``, otherwise ``[0, max_abs_value]``.
    When ``hparams.allow_clipping_in_normalization`` is set the result is
    clipped to that range; otherwise the input is asserted to already lie in
    ``[min_level_db, 0]``.
    """
    clipping = hparams.allow_clipping_in_normalization
    if not clipping:
        assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0

    peak = hparams.max_abs_value
    # Position of each value between min_level_db (0.0) and 0 dB (1.0).
    unit = (S - hparams.min_level_db) / (-hparams.min_level_db)
    if hparams.symmetric_mels:
        scaled, lower = (2 * peak) * unit - peak, -peak
    else:
        scaled, lower = peak * unit, 0

    return np.clip(scaled, lower, peak) if clipping else scaled
|
||
def _denormalize(D, hparams):
    """Undo ``_normalize``: map model-range values back to dB in ``[min_level_db, 0]``.

    When ``hparams.allow_clipping_in_normalization`` is set, ``D`` is first
    clipped to the normalized range (symmetric or ``[0, max_abs_value]``
    depending on ``hparams.symmetric_mels``).
    """
    peak = hparams.max_abs_value
    floor_db = hparams.min_level_db
    symmetric = hparams.symmetric_mels

    if hparams.allow_clipping_in_normalization:
        lower = -peak if symmetric else 0
        D = np.clip(D, lower, peak)

    if symmetric:
        return ((D + peak) * -floor_db / (2 * peak)) + floor_db
    return (D * -floor_db / peak) + floor_db
|
||
def mel80_to_mel40(mel_spectrogram, hparams):
    """Re-project a mel spectrogram onto a 40-band mel filterbank.

    The input is denormalized (if normalization is enabled), converted from dB
    to amplitude, mapped back to linear frequency bins, squared, and
    multiplied by a 40-band mel basis. That basis is built on the first call
    and cached in the module-level ``_mel_basis_40``.

    Args:
        mel_spectrogram: mel spectrogram in the representation produced by
            ``melspectrogram``.
        hparams: hyper-parameters; ``sample_rate`` and ``n_fft`` define the
            40-band basis.

    Returns:
        The product of the 40-band mel basis with the squared linear spectrogram.
    """
    if hparams.signal_normalization:
        D = _denormalize(mel_spectrogram, hparams)
    else:
        D = mel_spectrogram

    S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams)  # Convert back to linear
    global _mel_basis_40
    if _mel_basis_40 is None:
        # Keyword arguments: newer librosa releases made sr / n_fft
        # keyword-only, and older releases accept keywords just as well.
        _mel_basis_40 = librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=40)
    return np.dot(_mel_basis_40, S**2)
|
||
def wav2seg(wav, step, window):
    """Cut ``wav`` into overlapping segments of ``window`` samples, ``step`` apart.

    Segments that would run past the end of ``wav`` are dropped. The result
    holds one segment per column.
    """
    starts = range(0, len(wav), step)
    candidates = (wav[begin:begin + window] for begin in starts)
    # Slice lengths never grow as the start index advances, so filtering out
    # short slices keeps exactly the leading run of full-length segments.
    full_segments = [chunk for chunk in candidates if len(chunk) >= window]
    return np.array(full_segments).T
|
||
def seg2wav(data, step):
    """Overlap-add the columns of ``data`` into a single float32 waveform.

    Column ``i`` is added at offset ``i * step``; the sum is then divided by
    ``data.shape[0] / step``.
    """
    window, n_segments = data.shape[0], data.shape[1]
    total = np.zeros(window + (n_segments - 1) * step)
    for idx in range(n_segments):
        offset = idx * step
        total[offset:offset + window] += data[:, idx]
    total /= (window / step)
    return total.astype(np.float32)
Oops, something went wrong.