[GAN SVS] Add VISinger2, UHifiGAN, Avocodo #5123

Merged: 65 commits, May 23, 2023
09a6e59 add uhifigan (jerryuhoo, Mar 18, 2023)
0f4f19c update parameters and improve compatibility (jerryuhoo, Mar 19, 2023)
a948b13 add avocodo and improve code structure (jerryuhoo, Mar 21, 2023)
db7869a fix avocodo inference (jerryuhoo, Mar 22, 2023)
29e98bf fix compatibility of different vocoders (jerryuhoo, Mar 23, 2023)
e67d66e add visinger2 vocoder (jerryuhoo, Mar 23, 2023)
7c0d234 fix visinger2 vocoder bug (jerryuhoo, Mar 24, 2023)
e9d4994 add visinger2 vocoder discriminator (jerryuhoo, Mar 25, 2023)
b11d74a add teacher forcing inference in visinger (jerryuhoo, Mar 25, 2023)
cfed0fe Fix teacher forcing SVS bug in last commit. (jerryuhoo, Mar 25, 2023)
fd06cd1 Refactor length regulator and fix VISinger bug. (jerryuhoo, Mar 26, 2023)
3cf9852 remove decoder_input_pitch in the last commit (jerryuhoo, Mar 26, 2023)
e0a7118 visinger2 generator draft (jerryuhoo, Mar 26, 2023)
f072f02 add uhifigan avocodo mfd vocoder (jerryuhoo, Mar 27, 2023)
815bdf9 fix visinger2 inference (jerryuhoo, Mar 27, 2023)
308a129 fix uhifigan-avocodo inference (jerryuhoo, Mar 27, 2023)
bd4204d add pisinger draft (jerryuhoo, Mar 27, 2023)
ad7c2cd update pisinger (jerryuhoo, Mar 28, 2023)
3f0b78d fix pisinger inference (jerryuhoo, Mar 28, 2023)
1e8489b Merge branch 'espnet:master' into uhifigan (jerryuhoo, Apr 9, 2023)
05c8545 Merge branch 'espnet:master' into uhifigan (jerryuhoo, Apr 9, 2023)
5c1102b fix loading ying feature (jerryuhoo, Apr 9, 2023)
6e3fd96 update visinger (jerryuhoo, Apr 11, 2023)
e06bb2f fix visinger2 vocoder unit test (jerryuhoo, Apr 16, 2023)
b2b0ea1 Refactor test data into function (jerryuhoo, Apr 16, 2023)
39549aa add unit test for avocodo (jerryuhoo, Apr 17, 2023)
b283814 add unit test for ddsp and uhifigan (jerryuhoo, Apr 17, 2023)
74dc72d update VISinger 2 (jerryuhoo, Apr 17, 2023)
9fea2ae add unit test for flow and phoneme (jerryuhoo, Apr 17, 2023)
f1c3552 Sort imports using isort (jerryuhoo, Apr 17, 2023)
12c62c4 Merge branch 'master' into uhifigan (jerryuhoo, Apr 17, 2023)
0750c0c [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 17, 2023)
d103699 fix CI errors (jerryuhoo, Apr 17, 2023)
b8b0b86 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 17, 2023)
9f54d1a fix parameter names (jerryuhoo, Apr 17, 2023)
ea07410 Merge branch 'master' into uhifigan (jerryuhoo, Apr 23, 2023)
e3adcb7 Update configs for VISinger 1 and VISinger 2 (jerryuhoo, Apr 23, 2023)
f950d25 Add slur into VISinger model (jerryuhoo, Apr 23, 2023)
6f1b0f8 fix inference (jerryuhoo, Apr 23, 2023)
af14c6d Add slur to unit test (jerryuhoo, Apr 23, 2023)
03553c4 Update comments for gan_svs modules (jerryuhoo, Apr 28, 2023)
5ad654d [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 28, 2023)
5caf097 fix MFD parameters (jerryuhoo, Apr 28, 2023)
6762164 update config name and remove unused variables (jerryuhoo, Apr 29, 2023)
99130df update visinger configs (jerryuhoo, May 6, 2023)
3c0c592 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 6, 2023)
63d3530 fix visinger 2 mfd bug (jerryuhoo, May 15, 2023)
1b45bdc fix hifigan weight norm bug (jerryuhoo, May 15, 2023)
264b67e improve uhifigan sine signal expand option (jerryuhoo, May 15, 2023)
627bdf4 clean gan_svs code (jerryuhoo, May 15, 2023)
231f660 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 15, 2023)
2c86f85 fix svs f0 bug (jerryuhoo, May 20, 2023)
2aee662 code clean (jerryuhoo, May 21, 2023)
e378a16 update gan_svs configs (jerryuhoo, May 21, 2023)
fe8adad fix multi-frequency discriminator sample rate bug (jerryuhoo, May 22, 2023)
4e2b95a fix import (ftshijt, May 22, 2023)
6fa5fba fix comment (ftshijt, May 22, 2023)
3e7164a fix comment (ftshijt, May 22, 2023)
5899bcc Add TODOs (jerryuhoo, May 22, 2023)
4d5e56c Unified segment size in avocodo config (jerryuhoo, May 22, 2023)
154f1dc [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 22, 2023)
73f980c update gan_svs config (jerryuhoo, May 22, 2023)
989836f update comments for gan_svs (jerryuhoo, May 22, 2023)
bf3a5aa Merge branch 'master' of https://github.com/ftshijt/espnet into uhifigan (ftshijt, May 23, 2023)
912cfe9 Merge branch 'uhifigan' of https://github.com/jerryuhoo/espnet into u… (ftshijt, May 23, 2023)
23 changes: 22 additions & 1 deletion egs2/TEMPLATE/svs1/svs.sh
@@ -65,9 +65,10 @@ n_shift=256 # The number of shift points.
 win_length=null # Window length.
 score_feats_extract=frame_score_feats # The type of music score feats (frame_score_feats or syllable_score_feats)
 pitch_extract=None
+ying_extract=None
 # Only used for the model using pitch features (e.g. FastSpeech2)
 f0min=80 # Minimum f0 for pitch extraction.
-f0max=400 # Maximum f0 for pitch extraction.
+f0max=800 # Maximum f0 for pitch extraction.
 
 oov="<unk>" # Out of vocabulary symbol.
 blank="<blank>" # CTC blank symbol.
@@ -527,6 +528,9 @@ if ! "${skip_train}"; then
 _opts+="--pitch_extract_conf hop_length=${n_shift} "
 _opts+="--pitch_extract_conf f0max=${f0max} "
 _opts+="--pitch_extract_conf f0min=${f0min} "
+_opts+="--ying_extract ${ying_extract} "
+_opts+="--ying_extract_conf fs=${fs} "
+_opts+="--ying_extract_conf w_step=${n_shift} "
 _opts+="--energy_extract_conf fs=${fs} "
 _opts+="--energy_extract_conf n_fft=${n_fft} "
 _opts+="--energy_extract_conf hop_length=${n_shift} "
@@ -669,6 +673,7 @@ if ! "${skip_train}"; then
 _opts+="--feats_extract_conf hop_length=${n_shift} "
 _opts+="--feats_extract_conf win_length=${win_length} "
 _opts+="--pitch_extract ${pitch_extract} "
+_opts+="--ying_extract ${ying_extract} "
 if [ "${feats_extract}" = fbank ]; then
 _opts+="--feats_extract_conf fs=${fs} "
 _opts+="--feats_extract_conf fmin=${fmin} "
@@ -682,6 +687,10 @@ if ! "${skip_train}"; then
 _opts+="--pitch_extract_conf f0max=${f0max} "
 _opts+="--pitch_extract_conf f0min=${f0min} "
 fi
+if [ "${ying_extract}" = ying ]; then
+    _opts+="--ying_extract_conf fs=${fs} "
+    _opts+="--ying_extract_conf w_step=${n_shift} "
+fi
 
 if [ "${num_splits}" -gt 1 ]; then
 # If you meet a memory error when parsing text files, this option may help you.
@@ -800,6 +809,14 @@ if ! "${skip_train}"; then
 _opts+="--train_data_path_and_name_and_type ${_train_collect_dir}/${_scp},feats,${_type} "
 _opts+="--valid_data_path_and_name_and_type ${_valid_collect_dir}/${_scp},feats,${_type} "
 fi
+if [ -e "${svs_stats_dir}/train/collect_feats/ying.scp" ]; then
+    _scp=ying.scp
+    _type=npy
+    _train_collect_dir=${svs_stats_dir}/train/collect_feats
+    _valid_collect_dir=${svs_stats_dir}/valid/collect_feats
+    _opts+="--train_data_path_and_name_and_type ${_train_collect_dir}/${_scp},ying,${_type} "
+    _opts+="--valid_data_path_and_name_and_type ${_valid_collect_dir}/${_scp},ying,${_type} "
+fi
 
 # Check extra statistics
 if [ -e "${svs_stats_dir}/train/pitch_stats.npz" ]; then
@@ -817,6 +834,10 @@ if ! "${skip_train}"; then
 _opts+="--energy_extract_conf win_length=${win_length} "
 _opts+="--energy_normalize_conf stats_file=${svs_stats_dir}/train/energy_stats.npz "
 fi
+if [ -e "${svs_stats_dir}/train/ying_stats.npz" ]; then
+    _opts+="--ying_extract_conf fs=${fs} "
+    _opts+="--ying_extract_conf w_step=${n_shift} "
+fi
 
 
 # Add X-vector to the inputs if needed
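All of the svs.sh hunks above follow one pattern: conditionally append `--ying_extract` / `--ying_extract_conf` flags to the accumulated option string, reusing `fs` and `n_shift` from the feature front-end. A rough Python paraphrase of that logic (the function name and the `exp/svs_stats` layout are illustrative, not part of the script):

```python
from pathlib import Path


def build_ying_opts(ying_extract: str, fs: int, n_shift: int, stats_dir: str) -> list:
    """Mirror the shell logic: add ying options only when the extractor is enabled."""
    opts = []
    if ying_extract == "ying":
        # Same two conf flags the script appends in each branch.
        opts.append(f"--ying_extract_conf fs={fs}")
        opts.append(f"--ying_extract_conf w_step={n_shift}")
    # The script also reuses collected ying features when the stats stage
    # produced them (the collect_feats/ying.scp existence check).
    scp = Path(stats_dir) / "train" / "collect_feats" / "ying.scp"
    if scp.exists():
        opts.append(f"--train_data_path_and_name_and_type {scp},ying,npy")
    return opts


opts = build_ying_opts("ying", fs=44100, n_shift=512, stats_dir="exp/svs_stats")
```

Note that `w_step` is deliberately tied to `n_shift`, so the ying frame rate matches the other frame-level features.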
@@ -13,12 +13,13 @@
 svs: vits
 svs_conf:
     # generator related
-    generator_type: vits_generator
+    generator_type: visinger
+    vocoder_generator_type: hifigan # hifigan, avocodo, uhifigan, visinger2
    generator_params:
         hidden_channels: 192
         spks: -1
         global_channels: -1
-        segment_size: 32
+        segment_size: 20
         text_encoder_attention_heads: 2
         text_encoder_ffn_expand: 4
         text_encoder_blocks: 6
@@ -40,26 +41,27 @@ svs_conf:
         text_encoder_conformer_kernel_size: -1
         decoder_kernel_size: 7
         decoder_channels: 512
-        decoder_upsample_scales: [8, 8, 2, 2]
-        decoder_upsample_kernel_sizes: [16, 16, 4, 4]
+        decoder_upsample_scales: [8, 8, 4, 2]
+        decoder_upsample_kernel_sizes: [16, 16, 8, 4]
         decoder_resblock_kernel_sizes: [3, 7, 11]
         decoder_resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
         use_weight_norm_in_decoder: true
-        posterior_encoder_kernel_size: 5
-        posterior_encoder_layers: 16
+        posterior_encoder_kernel_size: 3
+        posterior_encoder_layers: 8
         posterior_encoder_stacks: 1
         posterior_encoder_base_dilation: 1
         posterior_encoder_dropout_rate: 0.0
         use_weight_norm_in_posterior_encoder: true
-        flow_flows: 4
+        flow_flows: -1 # 4
         flow_kernel_size: 5
         flow_base_dilation: 1
         flow_layers: 4
         flow_dropout_rate: 0.0
         use_weight_norm_in_flow: true
         use_only_mean_in_flow: true
+        use_phoneme_predictor: false
     # discriminator related
-    discriminator_type: hifigan_multi_scale_multi_period_discriminator
+    discriminator_type: visinger2 # avocodo, hifigan_multi_scale_multi_period_discriminator, visinger2, avocodo_plus
     discriminator_params:
         scales: 1
         scale_downsample_pooling: "AvgPool1d"
@@ -73,9 +75,9 @@ svs_conf:
             kernel_sizes: [15, 41, 5, 3]
             channels: 128
             max_downsample_channels: 1024
-            max_groups: 16
+            max_groups: 256
             bias: True
-            downsample_scales: [2, 2, 4, 4, 1]
+            downsample_scales: [4, 4, 4, 4]
             nonlinear_activation: "LeakyReLU"
             nonlinear_activation_params:
                 negative_slope: 0.1
@@ -96,6 +98,14 @@ svs_conf:
                 negative_slope: 0.1
             use_weight_norm: True
             use_spectral_norm: False
+    multi_freq_disc_params:
+        hop_length_factors: [2.5, 5, 7.5, 10, 12.5, 15]
+        hidden_channels: [256, 256, 256, 256, 256]
+        domain: "double"
+        mel_scale: True
+        divisors: [32, 16, 8, 4, 2, 1, 1]
+        strides: [1, 2, 1, 2, 1, 2, 1]
+
     # loss function related
     generator_adv_loss_params:
         average_by_discriminators: false # whether to average loss value by #discriminators
@@ -108,31 +118,34 @@ svs_conf:
        average_by_layers: false # whether to average loss value by #layers of each discriminator
        include_final_outputs: true # whether to include final outputs for loss calculation
     mel_loss_params:
-        fs: 22050 # must be the same as the training data
-        n_fft: 1024 # fft points
-        hop_length: 256 # hop size
-        win_length: null # window length
+        fs: 44100 # must be the same as the training data
+        n_fft: 2048 # fft points
+        hop_length: 512 # hop size
+        win_length: 2048 # window length
         window: hann # window type
         n_mels: 80 # number of Mel basis
         fmin: 0 # minimum frequency for Mel basis
-        fmax: null # maximum frequency for Mel basis
+        fmax: 22050 # maximum frequency for Mel basis
         log_base: null # null represent natural log
     lambda_adv: 1.0 # loss scaling coefficient for adversarial loss
     lambda_mel: 45.0 # loss scaling coefficient for Mel loss
     lambda_feat_match: 2.0 # loss scaling coefficient for feat match loss
     lambda_dur: 0.1 # loss scaling coefficient for duration loss
-    lambda_pitch: 1.0 # loss scaling coefficient for pitch loss
+    lambda_pitch: 10.0 # loss scaling coefficient for pitch loss
     lambda_phoneme: 1.0 # loss scaling coefficient for ctc loss
     lambda_kl: 1.0 # loss scaling coefficient for KL divergence loss
     # others
-    sampling_rate: 22050 # needed in the inference for saving wav
+    sampling_rate: 44100 # needed in the inference for saving wav
     cache_generator_outputs: true # whether to cache generator outputs in the training
 
 # extra module for additional inputs
-pitch_extract: dio # pitch extractor type
+pitch_extract: dio   # pitch extractor type
 pitch_extract_conf:
     use_token_averaged_f0: false
-pitch_normalize: global_mvn # normalizer for the pitch feature
+    use_log_f0: false
+pitch_normalize: None # normalizer for the pitch feature
+
+# ying_extract: ying
 
 ##########################################################
 #             OPTIMIZER & SCHEDULER SETTING              #
@@ -146,7 +159,7 @@ optim_conf:
     weight_decay: 0.0
 scheduler: exponentiallr
 scheduler_conf:
-    gamma: 0.999875
+    gamma: 0.998
 # optimizer setting for discriminator
 optim2: adamw
 optim2_conf:
@@ -156,17 +169,17 @@ optim2_conf:
     weight_decay: 0.0
 scheduler2: exponentiallr
 scheduler2_conf:
-    gamma: 0.999875
+    gamma: 0.998
 generator_first: false # whether to start updating generator first
 
 ##########################################################
 #                OTHER TRAINING SETTING                  #
 ##########################################################
 num_iters_per_epoch: 1000 # number of iterations per epoch
-max_epoch: 600 # number of epochs
+max_epoch: 500 # number of epochs
 accum_grad: 1 # gradient accumulation
-batch_bins: 500000 # batch bins (feats_type=raw)
-batch_type: numel # how to make batch
+batch_size: 8 # batch size
+batch_type: sorted # how to make batch
 grad_clip: -1 # gradient clipping norm
 grad_noise: false # whether to use gradient noise injection
 sort_in_batch: descending # how to sort data in making batch
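The config changes move the recipe from a 22.05 kHz to a 44.1 kHz setup, and the updated numbers hang together. A small sanity-check sketch (values copied from the config above; treating the exponential LR scheduler as stepped once per epoch is an assumption, not something the config states):

```python
import math

# Values from the updated VISinger config in this PR (44.1 kHz setup).
fs = 44100
hop_length = 512
fmax = 22050
decoder_upsample_scales = [8, 8, 4, 2]
gamma = 0.998
max_epoch = 500

# The decoder expands one latent frame into one hop of waveform samples,
# so the product of the upsample scales must equal the hop length.
assert math.prod(decoder_upsample_scales) == hop_length

# fmax for the Mel loss is the Nyquist frequency of 44.1 kHz audio.
assert fmax == fs // 2

frames_per_second = fs / hop_length   # ~86 Mel frames per second
final_lr_factor = gamma ** max_epoch  # overall decay under the per-epoch assumption
print(round(frames_per_second, 1), round(final_lr_factor, 3))
```

This also shows why `decoder_upsample_scales` had to change from `[8, 8, 2, 2]` (product 256, matching the old `hop_length: 256`) to `[8, 8, 4, 2]` (product 512) alongside the hop-size change.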