-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
1,053 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
202 changes: 202 additions & 0 deletions
202
egs2/ofuton_p_utagoe_db/svs1/conf/tuning/train_visinger.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
# This configuration is for ESPnet2 to train VITS, which | ||
# is truely end-to-end text-to-waveform model. To run | ||
# this config, you need to specify "--svs_task gan_svs" | ||
# option for svs.sh at least and use 22050 hz audio as | ||
# the training data (mainly tested on LJspeech). | ||
# This configuration tested on 4 GPUs (V100) with 32GB GPU | ||
# memory. It takes around 2 weeks to finish the training | ||
# but 100k iters model should generate reasonable results. | ||
|
||
########################################################## | ||
# SVS MODEL SETTING # | ||
########################################################## | ||
svs: vits | ||
svs_conf: | ||
# generator related | ||
generator_type: visinger | ||
vocoder_generator_type: hifigan # hifigan, avocodo, uhifigan, visinger2 | ||
generator_params: | ||
hidden_channels: 192 | ||
spks: -1 | ||
global_channels: -1 | ||
segment_size: 20 | ||
text_encoder_attention_heads: 2 | ||
text_encoder_ffn_expand: 4 | ||
text_encoder_blocks: 6 | ||
text_encoder_positionwise_layer_type: "conv1d" | ||
text_encoder_positionwise_conv_kernel_size: 3 | ||
text_encoder_positional_encoding_layer_type: "rel_pos" | ||
text_encoder_self_attention_layer_type: "rel_selfattn" | ||
text_encoder_activation_type: "swish" | ||
text_encoder_normalize_before: true | ||
text_encoder_dropout_rate: 0.1 | ||
text_encoder_positional_dropout_rate: 0.0 | ||
text_encoder_attention_dropout_rate: 0.1 | ||
use_macaron_style_in_text_encoder: true | ||
# NOTE(kan-bayashi): Conformer conv requires BatchNorm1d which causes | ||
# errors when multiple GPUs in pytorch 1.7.1. Therefore, we disable | ||
# it as a default. We need to consider the alternative normalization | ||
# or different version pytorch may solve this issue. | ||
use_conformer_conv_in_text_encoder: false | ||
text_encoder_conformer_kernel_size: -1 | ||
decoder_kernel_size: 7 | ||
decoder_channels: 512 | ||
decoder_upsample_scales: [8, 8, 4, 2] | ||
decoder_upsample_kernel_sizes: [16, 16, 8, 4] | ||
decoder_resblock_kernel_sizes: [3, 7, 11] | ||
decoder_resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] | ||
use_weight_norm_in_decoder: true | ||
posterior_encoder_kernel_size: 3 | ||
posterior_encoder_layers: 8 | ||
posterior_encoder_stacks: 1 | ||
posterior_encoder_base_dilation: 1 | ||
posterior_encoder_dropout_rate: 0.0 | ||
use_weight_norm_in_posterior_encoder: true | ||
flow_flows: -1 # 4 | ||
flow_kernel_size: 5 | ||
flow_base_dilation: 1 | ||
flow_layers: 4 | ||
flow_dropout_rate: 0.0 | ||
use_weight_norm_in_flow: true | ||
use_only_mean_in_flow: true | ||
use_phoneme_predictor: false | ||
# discriminator related | ||
discriminator_type: visinger2 # avocodo, hifigan_multi_scale_multi_period_discriminator, visinger2, avocodo_plus | ||
discriminator_params: | ||
scales: 1 | ||
scale_downsample_pooling: "AvgPool1d" | ||
scale_downsample_pooling_params: | ||
kernel_size: 4 | ||
stride: 2 | ||
padding: 2 | ||
scale_discriminator_params: | ||
in_channels: 1 | ||
out_channels: 1 | ||
kernel_sizes: [15, 41, 5, 3] | ||
channels: 128 | ||
max_downsample_channels: 1024 | ||
max_groups: 256 | ||
bias: True | ||
downsample_scales: [4, 4, 4, 4] | ||
nonlinear_activation: "LeakyReLU" | ||
nonlinear_activation_params: | ||
negative_slope: 0.1 | ||
use_weight_norm: True | ||
use_spectral_norm: False | ||
follow_official_norm: False | ||
periods: [2, 3, 5, 7, 11] | ||
period_discriminator_params: | ||
in_channels: 1 | ||
out_channels: 1 | ||
kernel_sizes: [5, 3] | ||
channels: 32 | ||
downsample_scales: [3, 3, 3, 3, 1] | ||
max_downsample_channels: 1024 | ||
bias: True | ||
nonlinear_activation: "LeakyReLU" | ||
nonlinear_activation_params: | ||
negative_slope: 0.1 | ||
use_weight_norm: True | ||
use_spectral_norm: False | ||
multi_freq_disc_params: | ||
hop_length_factors: [2.5, 5, 7.5, 10, 12.5, 15] | ||
hidden_channels: [256, 256, 256, 256, 256] | ||
domain: "double" | ||
mel_scale: True | ||
divisors: [32, 16, 8, 4, 2, 1, 1] | ||
strides: [1, 2, 1, 2, 1, 2, 1] | ||
|
||
# loss function related | ||
generator_adv_loss_params: | ||
average_by_discriminators: false # whether to average loss value by #discriminators | ||
loss_type: mse # loss type, "mse" or "hinge" | ||
discriminator_adv_loss_params: | ||
average_by_discriminators: false # whether to average loss value by #discriminators | ||
loss_type: mse # loss type, "mse" or "hinge" | ||
feat_match_loss_params: | ||
average_by_discriminators: false # whether to average loss value by #discriminators | ||
average_by_layers: false # whether to average loss value by #layers of each discriminator | ||
include_final_outputs: true # whether to include final outputs for loss calculation | ||
mel_loss_params: | ||
fs: 44100 # must be the same as the training data | ||
n_fft: 2048 # fft points | ||
hop_length: 512 # hop size | ||
win_length: 2048 # window length | ||
window: hann # window type | ||
n_mels: 80 # number of Mel basis | ||
fmin: 0 # minimum frequency for Mel basis | ||
fmax: 22050 # maximum frequency for Mel basis | ||
log_base: null # null represent natural log | ||
lambda_adv: 1.0 # loss scaling coefficient for adversarial loss | ||
lambda_mel: 45.0 # loss scaling coefficient for Mel loss | ||
lambda_feat_match: 2.0 # loss scaling coefficient for feat match loss | ||
lambda_dur: 0.1 # loss scaling coefficient for duration loss | ||
lambda_pitch: 10.0 # loss scaling coefficient for pitch loss | ||
lambda_phoneme: 1.0 # loss scaling coefficient for ctc loss | ||
lambda_kl: 1.0 # loss scaling coefficient for KL divergence loss | ||
# others | ||
sampling_rate: 44100 # needed in the inference for saving wav | ||
cache_generator_outputs: true # whether to cache generator outputs in the training | ||
|
||
# extra module for additional inputs | ||
pitch_extract: dio # pitch extractor type | ||
pitch_extract_conf: | ||
use_token_averaged_f0: false | ||
use_log_f0: false | ||
pitch_normalize: None # normalizer for the pitch feature | ||
|
||
# ying_extract: ying | ||
|
||
########################################################## | ||
# OPTIMIZER & SCHEDULER SETTING # | ||
########################################################## | ||
# optimizer setting for generator | ||
optim: adamw | ||
optim_conf: | ||
lr: 2.0e-4 | ||
betas: [0.8, 0.99] | ||
eps: 1.0e-9 | ||
weight_decay: 0.0 | ||
scheduler: exponentiallr | ||
scheduler_conf: | ||
gamma: 0.998 | ||
# optimizer setting for discriminator | ||
optim2: adamw | ||
optim2_conf: | ||
lr: 2.0e-4 | ||
betas: [0.8, 0.99] | ||
eps: 1.0e-9 | ||
weight_decay: 0.0 | ||
scheduler2: exponentiallr | ||
scheduler2_conf: | ||
gamma: 0.998 | ||
generator_first: false # whether to start updating generator first | ||
|
||
########################################################## | ||
# OTHER TRAINING SETTING # | ||
########################################################## | ||
num_iters_per_epoch: 1000 # number of iterations per epoch | ||
max_epoch: 500 # number of epochs | ||
accum_grad: 1 # gradient accumulation | ||
batch_size: 8 # batch size | ||
batch_type: sorted # how to make batch | ||
grad_clip: -1 # gradient clipping norm | ||
grad_noise: false # whether to use gradient noise injection | ||
sort_in_batch: descending # how to sort data in making batch | ||
sort_batch: descending # how to sort created batches | ||
num_workers: 4 # number of workers of data loader | ||
use_amp: false # whether to use pytorch amp | ||
log_interval: 50 # log interval in iterations | ||
keep_nbest_models: 10 # number of models to keep | ||
num_att_plot: 3 # number of attention figures to be saved in every check | ||
seed: 777 # random seed number | ||
patience: null # patience for early stopping | ||
unused_parameters: true # needed for multi gpu case | ||
best_model_criterion: # criterion to save the best models | ||
- - train | ||
- total_count | ||
- max | ||
cudnn_deterministic: false # setting to false accelerates the training speed but makes it non-deterministic | ||
# in the case of GAN-SVS training, we strongly recommend setting to false | ||
cudnn_benchmark: false # setting to true might acdelerate the training speed but sometimes decrease it | ||
# therefore, we set to false as a default (recommend trying both cases) |
Oops, something went wrong.