Skip to content

Commit

Permalink
Merge branch 'uhifigan' of https://github.com/jerryuhoo/espnet into uhifigan
Browse files Browse the repository at this point in the history
  • Loading branch information
ftshijt committed May 23, 2023
2 parents bf3a5aa + 989836f commit 912cfe9
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 39 deletions.
2 changes: 1 addition & 1 deletion egs2/opencpop/svs1/conf/tuning/train_visinger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ svs_conf:
use_conformer_conv_in_text_encoder: false
text_encoder_conformer_kernel_size: -1
decoder_kernel_size: 7
decoder_channels: 256
decoder_channels: 512
decoder_upsample_scales: [8, 8, 4, 2]
decoder_upsample_kernel_sizes: [16, 16, 8, 4]
decoder_resblock_kernel_sizes: [3, 7, 11]
Expand Down
1 change: 0 additions & 1 deletion egs2/opencpop/svs1/conf/tuning/train_visinger2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ svs_conf:
decoder_upsample_kernel_sizes: [16, 16, 8, 4]
# visinger2 vocoder
n_harmonic: 64
n_bands: 65
decoder_resblock_kernel_sizes: [3, 7, 11]
decoder_resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
use_weight_norm_in_decoder: true
Expand Down
25 changes: 1 addition & 24 deletions egs2/opencpop/svs1/conf/tuning/train_visinger_avocodo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,38 +42,16 @@ svs_conf:
decoder_kernel_size: 7

# HifiGAN
decoder_channels: 256
decoder_channels: 512
decoder_upsample_scales: [8, 8, 4, 2]
decoder_upsample_kernel_sizes: [16, 16, 8, 4]

# UHifiGAN 0
# decoder_channels: 64
# decoder_downsample_scales: [2, 2, 8, 8]
# decoder_downsample_kernel_sizes: [4, 4, 16, 16]
# decoder_upsample_scales: [8, 8, 2, 2]
# decoder_upsample_kernel_sizes: [16, 16, 4, 4]

# UHifiGAN 1
# decoder_channels: 64
# decoder_downsample_scales: [8, 8, 2, 2]
# decoder_downsample_kernel_sizes: [16, 16, 4, 4]
# decoder_upsample_scales: [2, 2, 8, 8]
# decoder_upsample_kernel_sizes: [4, 4, 16, 16]

# UHifiGAN 2 (better)
# decoder_channels: 128
# decoder_downsample_scales: [8, 8, 4]
# decoder_downsample_kernel_sizes: [16, 16, 8]
# decoder_upsample_scales: [4, 8, 8]
# decoder_upsample_kernel_sizes: [8, 16, 16]

# avocodo
projection_filters: [0, 1, 1, 1]
projection_kernels: [0, 5, 7, 11]

# visinger2 vocoder
n_harmonic: 64
n_bands: 65

decoder_resblock_kernel_sizes: [3, 7, 11]
decoder_resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
Expand Down Expand Up @@ -162,7 +140,6 @@ svs_conf:
pqmf_config:
sbd: [16, 256, 0.03, 10.0]
fsbd: [64, 256, 0.1, 9.0]
segment_size: 10240 # 20 * hop_size
use_spectral_norm: False
pqmf_config:
lv1: [2, 256, 0.25, 10.0] # first value should be the last upsample value 2
Expand Down
20 changes: 18 additions & 2 deletions espnet2/gan_svs/avocodo/avocodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,7 @@ def __init__(
"sbd": [16, 256, 0.03, 10.0],
"fsbd": [64, 256, 0.1, 9.0],
},
"segment_size": "${svs_conf.generator_params.segment_size}",
"segment_size": 8192,
},
pqmf_config: Dict[str, Any] = {
"lv1": [2, 256, 0.25, 10.0],
Expand Down Expand Up @@ -763,7 +763,7 @@ def __init__(
"sbd": [16, 256, 0.03, 10.0],
"fsbd": [64, 256, 0.1, 9.0],
},
"segment_size": "${svs_conf.generator_params.segment_size}",
"segment_size": 8192,
},
pqmf_config: Dict[str, Any] = {
"lv1": [2, 256, 0.25, 10.0],
Expand Down Expand Up @@ -794,6 +794,22 @@ def __init__(
sbd,
use_spectral_norm=sbd["use_spectral_norm"],
)
# Multi-frequency discriminator related
if "hop_lengths" not in multi_freq_disc_params:

Check warning on line 798 in espnet2/gan_svs/avocodo/avocodo.py

View check run for this annotation

Codecov / codecov/patch

espnet2/gan_svs/avocodo/avocodo.py#L798

Added line #L798 was not covered by tests
# Transfer hop lengths factors to hop lengths
multi_freq_disc_params["hop_lengths"] = []

Check warning on line 800 in espnet2/gan_svs/avocodo/avocodo.py

View check run for this annotation

Codecov / codecov/patch

espnet2/gan_svs/avocodo/avocodo.py#L800

Added line #L800 was not covered by tests

for i in range(len(multi_freq_disc_params["hop_length_factors"])):
multi_freq_disc_params["hop_lengths"].append(

Check warning on line 803 in espnet2/gan_svs/avocodo/avocodo.py

View check run for this annotation

Codecov / codecov/patch

espnet2/gan_svs/avocodo/avocodo.py#L802-L803

Added lines #L802 - L803 were not covered by tests
int(
sample_rate
* multi_freq_disc_params["hop_length_factors"][i]
/ 1000
)
)

del multi_freq_disc_params["hop_length_factors"]

Check warning on line 811 in espnet2/gan_svs/avocodo/avocodo.py

View check run for this annotation

Codecov / codecov/patch

espnet2/gan_svs/avocodo/avocodo.py#L811

Added line #L811 was not covered by tests

self.mfd = MultiFrequencyDiscriminator(

Check warning on line 813 in espnet2/gan_svs/avocodo/avocodo.py

View check run for this annotation

Codecov / codecov/patch

espnet2/gan_svs/avocodo/avocodo.py#L813

Added line #L813 was not covered by tests
**multi_freq_disc_params,
)
Expand Down
2 changes: 2 additions & 0 deletions espnet2/gan_svs/visinger2/ddsp.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ def extract_loudness(signal, sampling_rate, block_size, n_fft=2048):
return S

Check warning on line 97 in espnet2/gan_svs/visinger2/ddsp.py

View check run for this annotation

Codecov / codecov/patch

espnet2/gan_svs/visinger2/ddsp.py#L97

Added line #L97 was not covered by tests


# TODO (Yifeng): Some functions are not used here such as crepe,
# maybe we can remove them later or only import used functions.
def extract_pitch(signal, sampling_rate, block_size):
length = signal.shape[-1] // block_size
f0 = crepe.predict(

Check warning on line 104 in espnet2/gan_svs/visinger2/ddsp.py

View check run for this annotation

Codecov / codecov/patch

espnet2/gan_svs/visinger2/ddsp.py#L103-L104

Added lines #L103 - L104 were not covered by tests
Expand Down
1 change: 1 addition & 0 deletions espnet2/gan_svs/visinger2/visinger2_vocoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@ def __init__(

super().__init__()

# TODO (Yifeng): Maybe use LogMelFbank instead of TorchSTFT
self.stfts = torch.nn.ModuleList(
[
TorchSTFT(
Expand Down
28 changes: 21 additions & 7 deletions espnet2/gan_svs/vits/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ def __init__(
projection_kernels: List[int] = [0, 5, 7, 11],
# visinger 2
n_harmonic: int = 64,
n_bands: int = 65,
use_weight_norm_in_decoder: bool = True,
posterior_encoder_kernel_size: int = 5,
posterior_encoder_layers: int = 16,
Expand Down Expand Up @@ -155,13 +154,21 @@ def __init__(
conformer block of text encoder.
decoder_kernel_size (int): Decoder kernel size.
decoder_channels (int): Number of decoder initial channels.
decoder_downsample_scales (List[int]): List of downsampling scales in
decoder.
decoder_downsample_kernel_sizes (List[int]): List of kernel sizes for
downsampling layers in decoder.
decoder_upsample_scales (List[int]): List of upsampling scales in decoder.
decoder_upsample_kernel_sizes (List[int]): List of kernel size for
decoder_upsample_kernel_sizes (List[int]): List of kernel sizes for
upsampling layers in decoder.
decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks
in decoder.
decoder_resblock_kernel_sizes (List[int]): List of kernel sizes for
resblocks in decoder.
decoder_resblock_dilations (List[List[int]]): List of list of dilations for
resblocks in decoder.
use_avocodo (bool): Whether to use Avocodo model in the generator.
projection_filters (List[int]): List of projection filter sizes.
projection_kernels (List[int]): List of projection kernel sizes.
n_harmonic (int): Number of harmonic components.
use_weight_norm_in_decoder (bool): Whether to apply weight normalization in
decoder.
posterior_encoder_kernel_size (int): Posterior encoder kernel size.
Expand All @@ -179,6 +186,16 @@ def __init__(
use_weight_norm_in_flow (bool): Whether to apply weight normalization in
flow.
use_only_mean_in_flow (bool): Whether to use only mean in flow.
generator_type (str): Type of generator to use for the model.
vocoder_generator_type (str): Type of vocoder generator to use for the
model.
fs (int): Sample rate of the audio.
hop_length (int): Number of samples between successive frames in STFT.
win_length (int): Window size of the STFT.
n_fft (int): Length of the FFT window to be used.
use_phoneme_predictor (bool): Whether to use phoneme predictor in the model.
expand_f0_method (str): The method used to expand F0. Use "repeat" or
"interpolation".
"""
super().__init__()
self.aux_channels = aux_channels
Expand Down Expand Up @@ -542,8 +559,6 @@ def forward(
LF0 = LF0 / 500
LF0 = LF0.transpose(1, 2)

# x_mask = torch.unsqueeze(sequence_mask(mel_len, x.size(2)), 1)

predict_lf0, predict_bn_mask = self.f0_decoder(
decoder_input + decoder_input_pitch, feats_lengths, g=g
)
Expand All @@ -570,7 +585,6 @@ def forward(
decoder_output, predict_bn_mask = self.prior_decoder(
decoder_input, feats_lengths, g=g
)
# x_mask = x_mask.to(x.device)

prior_info = decoder_output
prior_mean = prior_info[:, : self.hidden_channels, :]
Expand Down
13 changes: 9 additions & 4 deletions espnet2/gan_svs/vits/vits.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,6 @@ def __init__(
"sbd": [16, 256, 0.03, 10.0],
"fsbd": [64, 256, 0.1, 9.0],
},
"segment_size": 8192, # 32 * hop_size
# TODO(Yifeng): Is it better that segment_size should be
# the same as the one in the generator, which is 32,
# and we should multiply it by hop_size?
},
"pqmf_config": {
"lv1": [2, 256, 0.25, 10.0],
Expand Down Expand Up @@ -297,6 +293,7 @@ def __init__(
sampling_rate (int): Sampling rate, not used for the training but it will
be referred in saving waveform during the inference.
generator_type (str): Generator type.
vocoder_generator_type (str): Type of vocoder generator to use in the model.
generator_params (Dict[str, Any]): Parameter dict for generator.
discriminator_type (str): Discriminator type.
discriminator_params (Dict[str, Any]): Parameter dict for discriminator.
Expand All @@ -311,7 +308,11 @@ def __init__(
lambda_feat_match (float): Loss scaling coefficient for feat match loss.
lambda_dur (float): Loss scaling coefficient for duration loss.
lambda_kl (float): Loss scaling coefficient for KL divergence loss.
lambda_pitch (float): Loss scaling coefficient for pitch loss.
lambda_phoneme (float): Loss scaling coefficient for phoneme loss.
lambda_c_yin (float): Loss scaling coefficient for yin loss.
cache_generator_outputs (bool): Whether to cache generator outputs.
use_phoneme_predictor (bool): Whether to use phoneme predictor in the model.
"""
assert check_argument_types()
Expand Down Expand Up @@ -353,6 +354,10 @@ def __init__(
discriminator_params.update(
projection_filters=generator_params["projection_filters"]
)
discriminator_params["sbd"].update(
segment_size=generator_params["segment_size"]
* mel_loss_params["hop_length"]
)
if "visinger2" in discriminator_type:
discriminator_params["multi_freq_disc_params"].update(
sample_rate=sampling_rate
Expand Down

0 comments on commit 912cfe9

Please sign in to comment.