# Imports & preparatory steps

In [1]:
import os
import torch
import sys
import shutil

# Check the number of CPUs
# $PBS_NUM_PPN vs $OMP_NUM_THREADS?
N_CPUS = int(os.environ["PBS_NUM_PPN"])
print(f"> Number of CPUs: {N_CPUS}")

# Limit CPU operation in pytorch to `N_CPUS`

torch.set_num_threads(N_CPUS)
torch.set_num_interop_threads(N_CPUS)

# Set username
USER = os.environ["USER"]

> Number of CPUs: 1


# Settings

In [2]:
# Non-Coqui-TTS parameters
COPY_TO_SCRATCH = False

In [3]:
# General params & paths
run_name = "test"
run_description = "test"
project_name = "TTS"
output_path = f"/storage/plzen4-ntis/home/dcifka20/hds/tmp_files"
continue_path = ""
restore_path = "/storage/plzen4-ntis/projects/korpusy-public/vyuka/HDS2024/SP2/model.pth"
best_path = ""
grad_accum_steps = 1     # Number of gradient accumulation steps. It is used to accumulate gradients over multiple batches (1)
small_run = 100          # Number of samples to use (suitable for debugging) - defaults to None (all samples are used)
coqui_path = f"/storage/plzen4-ntis/home/dcifka20/GIT_repos/Coqui-TTS"
trainer_path = f"/storage/plzen4-ntis/home/dcifka20/GIT_repos/Trainer"

# AUDIO PARAMS
audio = {
    # STFT params
    "fft_size": 1024,     # number of stft frequency levels. Size of the linear spectogram frame.
    "win_length": 1024,   # STFT window length
    "hop_length": 256,    # STFT window hop-lengh
    # Audio processing parameters
    "sample_rate": 24000, # DATASET-RELATED: wav sample-rate.
    # MelSpectrogram params
    "num_mels": 80,       # size of the mel spec frame (80)
    "mel_fmin": 0,        # DATASET-RELATED: minimum freq level for mel-spec (0). ~50 for male and ~95 for female voices.
    "mel_fmax": 12000,    # DATASET-RELATED: maximum freq level for mel-spec (None)
}

# DATASET
datasets = [   # List of datasets. They all merged and they get different speaker_ids.
    {"formatter": "artic",
     "path": f"/storage/plzen4-ntis/home/dcifka20/hds/datasets/FulTo.cs-CZ.m",
     "meta_file_train": "train.ph-redu.epa.csv", # for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
     "ignored_speakers": None,              # List of speakers IDs that are not used at the training (None).
     "language": "cs-cz",                   # Language code of the dataset (None). If defined, it overrides `phoneme_language`.
     "meta_file_val": None,                 # Name of the dataset meta file that defines the instances used at validation.
     "meta_file_attn_mask": "",             # Path to the file that lists the attention mask files used with models that require attention masks to train the duration predictor.
     # "meta_file_dur": meta_file_dur,        # duration of particular speech units (including punctuation, breaks etc.)
    }
]

# VOCABULARY PARAMS
characters = {     # Defines character or phoneme set used by the model
    "pad": "<PAD>",    # characters in place of empty padding (None)
    "eos": "<EOS>",    # characters showing the end of a sentence (None)
    "bos": "<BOS>",    # characters showing the beginning of a sentence (None)
    "blank": "<BLNK>", # Optional character used between characters by some models for better prosody.
    # character set used by the model. Characters not in this list are ignored when converting input text to a list of sequence IDs (None).
    # "characters": "AÁÄBCČDĎEÉĚËFGHIÍJKLMNŇOÓÖPQRŘSŠTŤUÚŮÜVWXYÝZŽaáäbcčdďeéěëfghiíjklmnňoóöpqrřsštťuúůüvwxyýzž",    # Czech graphemes
    #"characters": "0=abcdfijklmnoprstuvxzŋřɛɟɡɦɪɲʃʊʒʔː", # Czech IPA
    "characters": "ACDEIJOPRSTUZabcdefghijklmnopqrstuvxz@#$*%Ç",
    # characters considered as punctuation as parsing the input sentence (None)
    "punctuations": "!,-.:;–/()?ˈ„“”\"‚‘’ˌː… ",
    # characters considered as parsing phonemes (None)
    # "phonemes": "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
    "phonemes": None,
}

# VITS MODEL ARGS
model_args = {
    "num_chars": None,                          # Number of characters in the vocabulary (100)
    "out_channels": 513,                        # Number of output channels (513)
    "spec_segment_size": 32,                    # Decoder input segment size (32). `(32 * hoplength": waveform length)`.
    "hidden_channels": 192,                     # Number of hidden channels of the model (192)
    "hidden_channels_ffn_text_encoder": 768,    # Number of hidden channels of the feed-forward layers of the text encoder transformer (768)
    "num_heads_text_encoder": 2,                # Number of attention heads of the text encoder transformer (2)
    "num_layers_text_encoder": 6,               # Number of transformer layers in the text encoder (6)
    "kernel_size_text_encoder": 3,              # Kernel size of the text encoder transformer FFN layers (3)
    "dropout_p_text_encoder": 0.1,              # Dropout rate of the text encoder (0.1)
    "dropout_p_duration_predictor": 0.5,        # Dropout rate of the duration predictor (0.5)
    "kernel_size_posterior_encoder": 5,         # Kernel size of the posterior encoder's WaveNet layers (5)
    "dilation_rate_posterior_encoder": 1,       # Dilation rate of the posterior encoder's WaveNet layers (1)
    "num_layers_posterior_encoder": 16,         # Number of posterior encoder's WaveNet layers (16)
    "kernel_size_flow": 5,                      # Kernel size of the Residual Coupling layers of the flow network (5)
    "dilation_rate_flow": 1,                    # Dilation rate of the Residual Coupling WaveNet layers of the flow network (1)     
    "num_layers_flow": 4,                       # Number of Residual Coupling WaveNet layers of the flow network (4)
    "resblock_type_decoder": "2",               # Type of the residual block in the decoder network ("1")
    "resblock_kernel_sizes_decoder": [          # Kernel sizes of the residual blocks in the decoder network (`[3, 7, 11]`).
        3, 5, 7
    ],   
    "resblock_dilation_sizes_decoder": [        # Dilation sizes of the residual blocks in the decoder network
        [1, 2],                                 # (`[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`)
        [2, 6],
        [3, 12],
    ],
    "upsample_rates_decoder": [8, 8, 4],        # Upsampling rates for each concecutive upsampling layer in the decoder network.
                                                # The multiply of these values must be equal to the kop length used for computing spectrograms
                                                # (`[8, 8, 2, 2]`)
    "upsample_initial_channel_decoder": 128,    # Number of hidden channels of the first upsampling convolution layer of the decoder network (512)
    "upsample_kernel_sizes_decoder": [          # Kernel sizes for each upsampling layer of the decoder network (`[16, 16, 4, 4]`)
        16, 16, 8
    ],
    "periods_multi_period_discriminator": [     # Periods values for Vits Multi-Period Discriminator (`[2, 3, 5, 7, 11]`)
        2, 3, 5, 7, 11
    ],
    "use_sdp": False,                           # Use Stochastic Duration Predictor (True)
    "noise_scale": 1.0,                         # Noise scale used for the sample noise tensor in training (1.0)
    "inference_noise_scale": 0.667,             # Noise scale used for the sample noise tensor in inference (0.667)
    "length_scale": 1,                          # Scale factor for the predicted duration values (1). Smaller values result faster speech.
    "noise_scale_dp": 1.0,                      # Noise scale used by the Stochastic Duration Predictor sample noise in training (1.0)
    "inference_noise_scale_dp": 0.8,            # Noise scale for the Stochastic Duration Predictor in inference (0.8)
    "max_inference_len": None,                  # Maximum inference length to limit the memory use (None)
    "init_discriminator": True,                 # Initialize the disciminator network if set True. Set False for inference.
    "use_spectral_norm_discriminator": False,   # Use spectral normalization over weight norm in the discriminator (False)
    "detach_dp_input": True,                    # Detach duration predictor's input from the network for stopping the gradients (True)
    "freeze_encoder": False,                    # Freeze the encoder weigths during training (False)
    "freeze_DP": False,                         # Freeze the duration predictor weigths during training (False)
    "freeze_PE": False,                         # Freeze the posterior encoder weigths during training (False)
    "freeze_flow_decoder": False,               # Freeze the flow encoder weigths during training (False)
    "freeze_waveform_decoder": False,           # Freeze the waveform decoder weigths during training (False)
    "encoder_sample_rate": None,                # If not None this sample rate will be used for training the Posterior Encoder, flow, text_encoder and duration predictor.
                                                # The decoder part (vocoder) will be trained with the `config.audio.sample_rate` (None).
    "interpolate_z": True,                      # If `encoder_sample_rate` not None and this parameter True the nearest interpolation will be used
                                                # to upsampling the latent variable z with the sampling rate `encoder_sample_rate` to the `config.audio.sample_rate` (True).
                                                # If it is False you will need to add extra `upsample_rates_decoder` to match the shape.
    # MULTI-SPEAKER
    "num_speakers": 0,                      # Number of speakers for the speaker embedding layer
    "use_speaker_embedding": False,         # Enable/disable using speaker embeddings for multi-speaker models (False). If set True, the model is in the multi-speaker mode.
    "speakers_file": None,                  # Path to the speaker mapping file for the Speaker Manager
    "speaker_embedding_channels": 256,      # Number of speaker embedding channels (256)    
    "use_d_vector_file": False,             # Enable/disable using external speaker embeddings in place of the learned embeddings (False)
    "d_vector_file": None,                  # Path to the file including pre-computed speaker embeddings (None)
    "d_vector_dim": None,                   # Channels of external speaker embedding vectors (0)
    "use_speaker_encoder_as_loss": False,   # Enable/Disable Speaker Consistency Loss (SCL) (False)
    "speaker_encoder_config_path": None,    # Path to the file speaker encoder config file, to use for SCL ("").
    "speaker_encoder_model_path": None,     # Path to the file speaker encoder checkpoint file, to use for SCL ("").
    "condition_dp_on_speaker": True,        # Condition the duration predictor on the speaker embedding (True)
    # MULTI-LANGUAGE
    "use_language_embedding": False,    # Enable/Disable language embedding for multilingual models (False)
    "embedded_language_dim": 4,         # Number of language embedding channels (4)
    "num_languages": 0,                 # Number of languages for the language embedding layer (0)
    "language_ids_file": None,          # Path to the language mapping file for the Language Manager (None)
}

# CONFIG
model_config = {
    # DATA LOADING
    "num_loader_workers": 1,               # number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_eval_loader_workers": 1,          # number of evaluation data loader processes.
    "text_cleaner": "no_cleaners",       # Name of the text cleaner used for cleaning and formatting transcripts.
    "enable_eos_bos_chars": False,         # enable/disable beginning of sentence and end of sentence chars.
    "batch_group_size": 5,                 # Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
                                           # length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
                                           # prevent using the same batches for each epoch.
    "min_text_len": 1,                     # Minimum length of input text to be used (0). All shorter samples will be ignored.
    "max_text_len": 999,                   # Maximum length of input text to be used (float("inf")). All longer samples will be ignored.
    "min_audio_len": 7200,                 # Minimum length of input audio to be used (0). All shorter samples will be ignored.
    "max_audio_len": 480000,               # Maximum length of input audio to be used (float("inf")). All longer samples will be ignored.
                                           # The maximum length in the dataset defines the VRAM used in the training.
                                           # Hence, pay attention to this value if you encounter an OOM error in training.
                                           # For FS=24kHz and max audio length 15s: # 360000 = 24000 * 15
    "start_by_longest": True,              # Start by longest sequence. It is especially useful to check OOM (False)
    "compute_input_seq_cache": True,       # If true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
    "use_noise_augment": False,            # Augment the input audio with random noise
    "add_blank": False,                    # Add blank characters between each other two characters (True). It improves performance for some models at expense of slower run-time due to the longer input sequence.
    "compute_linear_spec": True,           # If True, the linear spectrogram is computed and returned alongside the mel output (True). Do not change.
    "return_wav": True,                    # If true, data loader returns the waveform as well as the other outputs (True). Do not change.
    "compute_f0": False,

    # PHONEMES
    "phoneme_cache_path": "phoneme_cache", # phoneme computation is slow, therefore, it caches results in the given folder
    "use_phonemes": False,                 # use phonemes instead of raw characters. It is suggested for better pronounciation.
    "phoneme_language": "cz-cz",           # depending on your target language, pick one from  https"://github.com/bootphon/phonemizer#languages

    # DISTRIBUTED TRAINING
    "distributed_backend": "gloo",
    "distributed_url": "tcp://localhost:54321",

    # TRAINING
    "epochs": 10,              # total number of epochs to train (10000)
    "use_total_epochs": True,  # JMa: Compute the number of epochs done as a total number across continue runs (False). If True, total number of epochs is added to checkpoint path.
    "stop_after_steps": False, # JMa: Stop training after defined step (False)
    "steps": 1000000,          # JMa: "Number of steps to stop training when `stop_after_steps` is True (1000000).
    "batch_size": 16,          # Batch size for training. Lower values than 32 might cause hard to learn attention.
    "mixed_precision": True,   # level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
    "loss_masking": None,

    # VALIDATION
    "run_eval": True,               # Run evaluation after each epoch.
    "eval_batch_size": 16,          # Validation batch size.
    "eval_split_max_size": 256,     # Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
    "eval_split_size": 0.01,        # If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set. 
                                    # If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
    "test_delay_epochs": -1,        # Until attention is aligned, testing only wastes computation time.
    "test_epoch_step": 1,           # JMa: Number of epochs to run test and generate testing files (1)
    "save_test_files": True,        # JMa: Save test files (False)
    "test_sentences_file": None,    # set a file to load sentences to be used for testing. If it is null then we use default english sentences.
    "test_sentences": [             # sentences to be used for testing"
        ["$ pRIliZ ZluTouCkI kUJ Upjel DAbelskE Odi. $"],
        ["$ strC prst skrs krk, # nebo Ti ho tam strCIm sAm. $"],
        ["$ tohle je pokus, # snat to vijde. $"],
        ["$ omlouvAm se, # tohle se stAvA maksimAlJe jednou za deset let. $"],
        ["$ tRista tRicet tRi stRIbrnIx stRIkaCek stRIkalo pRes tRista tRicet tRi stRIbrnIx stRex. $"],
        ["$ pRAl bix si, # abix bil uZ doma. $"],
        ["$ pUjdeme zItra do kina? $"],
        ["$ proC to DelAS? $"],
    ],
    # OPTIMIZER
    "lr": 0.001,                           # Learning rate for each optimizer (0.001)
    "lr_scheduler": None,                  # Learning rate scheduler(s) to use (None)
    "lr_scheduler_params": None,           # Learning rate scheduler(s) arguments (None)
    "optimizer": "AdamW",                  # Optimizer used for the training.
    "optimizer_params": {                  # Optimizer kwargs.
        "betas": [0.8, 0.99],
        "eps": 0.000000001,
        "weight_decay": 0.01,              # Weight decay weight.
    },
    "use_grad_scaler": False,              # Enable/disable gradient scaler explicitly. It is enabled by default with AMP training (False)
    "lr_gen": 0.0002,                      # Initial learning rate for the generator (0.0002)
    "lr_disc": 0.0002,                     # Initial learning rate for the discriminator (0.0002)
    "lr_scheduler_gen": "ExponentialLR",   # Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*` (`ExponentialLR`).
    "lr_scheduler_gen_params": {           # Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma'": 0.999875, "last_epoch":-1}`.
        "gamma": 0.999875,
        "last_epoch": -1,
    },
    "lr_scheduler_disc": "ExponentialLR",  # Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*` (`ExponentialLR`).
    "lr_scheduler_disc_params": {          # Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma'": 0.999875, "last_epoch":-1}`.
        "gamma": 0.999875,
        "last_epoch": -1,
    },
    "grad_clip":  [1000, 1000],            # Gradient clipping thresholds for each optimizer
    "scheduler_after_epoch": True,         # If true, step the scheduler after each epoch else after each step (True).

    # LOSS PARAMS
    "kl_loss_alpha": 5.0,               # Loss weight for KL loss (1.0)
    "disc_loss_alpha": 1.0,             # Loss weight for the discriminator loss (1.0)
    "gen_loss_alpha": 1.0,              # Loss weight for the generator loss (1.0)
    "feat_loss_alpha": 1.0,             # Loss weight for the feature matching loss (1.0)
    "mel_loss_alpha": 45.0,             # Loss weight for the mel loss (45.0)
    "dur_loss_alpha": 1.0,              # Loss weight for duration loss (1.0)
    "speaker_encoder_loss_alpha": 9.0,  # Speaker Consistency Loss (SCL) α to 9 like in the YourTTS paper
                                        # (used when `use_speaker_encoder_as_loss = True`)

    # SAMPLE BALANCING
    "use_speaker_weighted_sampler": False,  # Enable/Disable the batch balancer by speaker (False).
    "speaker_weighted_sampler_alpha": 1.0,  # Number that control the influence of the speaker sampler weights (1.0)
    "use_language_weighted_sampler": False, # Enable/Disable the batch balancer by language (False)
    "language_weighted_sampler_alpha": 1.0, # Number that control the influence of the language sampler weights (1.0)
    "use_length_weighted_sampler": False,   # Enable/Disable the batch balancer by audio length (False). If enabled the dataset will be divided into 10 buckets
                                            # considering the min and max audio of the dataset. The sampler weights will be computed forcing to have
                                            # the same quantity of data for each bucket in each training batch.
    "length_weighted_sampler_alpha": 1.0,   # Number that control the influence of the length sampler weights (1.0)
    "use_weighted_sampler": False,          # If true, use weighted sampler with bucketing for balancing samples between datasets used in training (False)
    "weighted_sampler_attrs": {             # Key returned by the formatter to be used for weighted sampler ({}).
        "speaker_name": 1.0,                # For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
    },
                                            # by overweighting `root_path` by 2.0.
    "weighted_sampler_multipliers": {},     # Weight each unique value of a key returned by the formatter for weighted sampling ({}).
                                            # For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/": 1.0,
                                            #                            "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
                                            # It will sample instances from `train-clean-100` 2 times more than `train-clean-360`.

    # TENSORBOARD, LOGGING & CHECKPOINTING
    "print_step": 25,                  # Number of steps to log training on console.
    "plot_step": 25,                   # Number of steps required to print the next training log.
    "dashboard_logger": "tensorboard", # "tensorboard" or "wandb"
    "print_eval": True,                # If True, it prints intermediate loss values in evalulation
    "save_checkpoints": True,          # If true, it saves checkpoints per "save_step"
    "save_on_epochs": True,            # JMa: If True, checkpoints/models are saved based on epochs (False)
    "save_step": 5000,                 # Number of training steps expected to save training stats and checkpoints (10000)
    "save_epoch": 10,                  # Number of training epochs expected to save training stats and checkpoints (25). Used instead of `save_steps` when `use_total_epochs == True`.
    "log_model_step": None,            # Save checkpoint to the logger every `log_model_step`` steps (None). If not defined `log_model_step == save_step`.
    "log_model_epoch": None,           # Save checkpoint to the logger every `log_model_epoch`` epochs (None). If not defined `log_model_epoch == save_epoch`. Used instead of `log_model_step` when `use_total_epochs == True`.
    "save_n_checkpoints": 2,           # Keep n local checkpoints (5).
    "save_all_best": False,            # If true, save all best checkpoints and keep the older ones.
    "save_best_after": 10000,          # Global step after which to save best models if save_all_best is true (10000)
    "model_param_stats": False,        # Enable/Disable logging internal model stats for model diagnostic. It might be useful for model debugging. Defaults to False.
    "log_test_files": False,           # JMa: Log test files (True)
    "use_epoch_in_path": True          # JMa: If True, total number of epochs is added to checkpoint/model and test file path (False)
}

In [4]:
# Parameters

# Non-Coqui-TTS parameters
COPY_TO_SCRATCH = True

# Copy data

In [5]:
if COPY_TO_SCRATCH:
    # Copy datasets
    DATASETS_SCRATCH = []
    for d in datasets:
        # Prepare dataset dir in the scratch
        dataset_scratch = os.path.join(os.environ["SCRATCHDIR"], os.path.basename(d["path"]))
        # Copy dataset to local scratch
        print(f"> Copying data to local scratch: {dataset_scratch}")
        shutil.copytree(d["path"], dataset_scratch)
        # Store the scratch dataset so that it is used for training
        d["path"] = dataset_scratch
        # Store paths to individual datasets in scratch so that they can be deleted in the end
        DATASETS_SCRATCH.append(dataset_scratch)

> Copying data to local scratch: /scratch.ssd/dcifka20/job_1688648.pbs-m1.metacentrum.cz/FulTo.cs-CZ.m


FileExistsError: [Errno 17] File exists: '/scratch.ssd/dcifka20/job_1688648.pbs-m1.metacentrum.cz/FulTo.cs-CZ.m'

# Set path to training framework

In [6]:
# Set path to (modified) Coqui-TTS
sys.path.insert(0, coqui_path)
# Set path to (modified) Coqui-Trainer
sys.path.insert(0, trainer_path)

# Training

In [None]:
import os
# Trainer: Where the ✨️ happens.
# TrainerArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs
# VitsConfig: all model related values for training, validating and testing.
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
from TTS.hds.vits import VitsHDS
# BaseDatasetConfig: defines name, formatter and path of the dataset
from TTS.config.shared_configs import BaseDatasetConfig
# TTSTokenizer: defines tokens
from TTS.tts.utils.text.tokenizer import TTSTokenizer
# CharactersConfig: defines characters/phonemes
from TTS.tts.configs.shared_configs import CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.languages import LanguageManager
# To check module version
from TTS import __version__ as coqui_tts_version
from trainer import __version__ as trainer_version
from torch import __version__ as torch_version
from platform import python_version

print(" > Python & module versions...")
print(f" | > Python:    {python_version()}")
print(f" | > PyTorch:   {torch_version}")
print(f" | > Coqui-TTS: {coqui_tts_version}")
print(f" | > Trainer:   {trainer_version}")

# Set audio config
audio_config = VitsAudioConfig(**audio)
# Set dataset config
dataset_config = [BaseDatasetConfig(**d) for d in datasets]
# Set characters config
character_config = CharactersConfig(**characters)

# VITS model args
vits_args = VitsArgs(**model_args)

# VITS config
config = VitsConfig(
    # General params and paths
    run_name=run_name,
    run_description=run_description,
    project_name=project_name,
    output_path=output_path,
    # Model args
    model_args=vits_args,
    # Audio config
    audio=audio_config,
    # Datasets config
    datasets=dataset_config,
    # Character config
    characters=character_config,
    **model_config
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config,
                                               eval_split=True,
                                               eval_split_max_size=config.eval_split_max_size,
                                               eval_split_size=config.eval_split_size)
print(f" | > # training files:      {len(train_samples)}")
print(f" | > # evaluation files:    {len(eval_samples)}")
print(f" | > evaluation split size: {config.eval_split_size}")

# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
# Speaker manager is used by multi-speaker models.
model = VitsHDS(config, ap, tokenizer)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
# Trainer arguments
trainer_args = {
    "continue_path": continue_path,
    "restore_path": restore_path,
    "best_path": best_path,
    "use_unique_model_folder": False,
    "grad_accum_steps": grad_accum_steps,
    "small_run": small_run,
}
trainer = Trainer(
    TrainerArgs(**trainer_args),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

print(" > Training arguments...")
print(f" | > gradient accumulation steps: {grad_accum_steps}")
print(f" | > true batch size: {int(grad_accum_steps*int(config.batch_size))}")
print(f" | > learning rate: {config.lr}")
print(f" | > save test files: {config.save_test_files} (each {config.test_epoch_step} epochs)")
print(f" | > log test files: {config.log_test_files}")
print(f" | > use total epochs: {config.use_total_epochs}")
if config.save_on_epochs:
    print(f" | > checkpoint model: {config.save_epoch} epochs")
else:
    print(f" | > checkpoint model: {config.save_step} steps")
if config.stop_after_steps:
    print(f" | > stop training: {config.steps} steps")
else:
    print(f" | > stop training: {config.epochs} epochs")

# AND... 3,2,1... 🚀
trainer.fit()

 > Python & module versions...
 | > Python:    3.10.12
 | > PyTorch:   2.1.2+cu121
 | > Coqui-TTS: 0.22.0
 | > Trainer:   v0.0.36
 > Setting up Audio Processor...
 | > sample_rate:24000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:12000
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > ARTIC dataset: voice=FulTo, sex=m, language=cs-CZ
 | > Found 1000 files in /auto/plzen4-ntis/home/dcifka20/hds/datasets/F

fatal: not a git repository (or any parent up to mount point /auto/plzen4-ntis)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 64
 | > Num. of Torch Threads: 1
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
2024-05-10 13:41:34.141236: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-10 13:41:34.141307: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-10 13:41:34.141336: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.

 > Training arguments...
 | > gradient accumulation steps: 1
 | > true batch size: 16
 | > learning rate: 0.001
 | > save test files: True (each 1 epochs)
 | > log test files: False
 | > use total epochs: True
 | > checkpoint model: 10 epochs
 | > stop training: 10 epochs


> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 100



[1m > TRAINING (2024-05-10 13:41:39) [0m


 | > Preprocessing samples
 | > Max text length: 145
 | > Min text length: 47
 | > Avg text length: 84.91
 | 
 | > Max audio length: 312000
 | > Min audio length: 117600
 | > Avg audio length: 192455.88
 | > Num. instances discarded samples: 0
 | > Batch group size: 80.
$ ve zvjeTe se fSak podle mlufCI eurotelu ivi taTounovE v posledJI dobje objevila dalSI noviNka. $
 [!] Character 'N' not found in the vocabulary. Discarding it.
$ jiRI jirous pozastavI zvUj pRedimenzovanI vulgarostroj, # a pRemItA o hodnotAdzh po~ezije a jImavje recituje spatra. $
 [!] Character '~' not found in the vocabulary. Discarding it.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.
$ sedmaosmdesATiletI bAsJIk ji zIskal za zvou posledJI kJihu nazvanou po^uliCJI psIk. $
 [!] Character '^' not found in the voca





> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 10
 | > Preprocessing samples
 | > Max text length: 121
 | > Min text length: 56
 | > Avg text length: 95.2
 | 
 | > Max audio length: 300000
 | > Min audio length: 132000
 | > Avg audio length: 221039.6
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
$ podle andreje staNkoviCe je lukeSUv urAZlivI ClAnek pouhou metaforou, # vijadRujIcI rozhoRCeJI nad dzhaosem. $
 [!] Character 'N' not found in the vocabulary. Discarding it.



[1m > EVALUATION [0m



$ podle andreje staNkoviCe je lukeSUv urAZlivI ClAnek pouhou metaforou, # vijadRujIcI rozhoRCeJI nad dzhaosem. $
 [!] Character 'N' not found in the vocabulary. Discarding it.
Volám diskriminátor pro výpočet loss funkce


[1m   --> STEP: 0[0m
     | > loss_disc: 1.3587108850479126  (1.3587108850479126)
     | > loss_disc_real_0: 0.12047269195318222  (0.12047269195318222)
     | > loss_disc_real_1: 0.1178659126162529  (0.1178659126162529)
     | > loss_disc_real_2: 0.20523667335510254  (0.20523667335510254)
     | > loss_disc_real_3: 0.11560378223657608  (0.11560378223657608)
     | > loss_disc_real_4: 0.15837617218494415  (0.15837617218494415)
     | > loss_disc_real_5: 0.09131519496440887  (0.09131519496440887)
     | > loss_0: 1.3587108850479126  (1.3587108850479126)
     | > loss_gen: 3.8523104190826416  (3.8523104190826416)
     | > loss_kl: 22.580995559692383  (22.580995559692383)
     | > loss_feat: 71.39514923095703  (71.39514923095703)
     | > loss_mel: 38.266658782958984  (38.266658782958984)
     | > loss_1: 136.09510803222656  (136.09510803222656)



Volám generátor pro výpočet loss funkcí.
 | > Synthesizing test sentences.
 | > Saving 8 test audio files at step/epoch 000061008-00000
 | > Saving 8 test figures at step/epoch 000061008-00000



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.3341529369354248 [0m(+0)
     | > avg_loss_disc: 1.3587108850479126 [0m(+0)
     | > avg_loss_disc_real_0: 0.12047269195318222 [0m(+0)
     | > avg_loss_disc_real_1: 0.1178659126162529 [0m(+0)
     | > avg_loss_disc_real_2: 0.20523667335510254 [0m(+0)
     | > avg_loss_disc_real_3: 0.11560378223657608 [0m(+0)
     | > avg_loss_disc_real_4: 0.15837617218494415 [0m(+0)
     | > avg_loss_disc_real_5: 0.09131519496440887 [0m(+0)
     | > avg_loss_0: 1.3587108850479126 [0m(+0)
     | > avg_loss_gen: 3.8523104190826416 [0m(+0)
     | > avg_loss_kl: 22.580995559692383 [0m(+0)
     | > avg_loss_feat: 71.39514923095703 [0m(+0)
     | > avg_loss_mel: 38.266658782958984 [0m(+0)
     | > avg_loss_1: 136.09510803222656 [0m(+0)

 > BEST MODEL : /storage/plzen4-ntis/home/dcifka20/hds/tmp_files/test/best_model_61008-0.pth

[4m[1m > EPOCH: 1/10[0m
 --> /storage/plzen4-ntis/home/dcifka20/hds/tmp_files/test

[1m > TRAINING (202

$ ve zvjeTe se fSak podle mlufCI eurotelu ivi taTounovE v posledJI dobje objevila dalSI noviNka. $
 [!] Character 'N' not found in the vocabulary. Discarding it.
$ jiRI jirous pozastavI zvUj pRedimenzovanI vulgarostroj, # a pRemItA o hodnotAdzh po~ezije a jImavje recituje spatra. $
 [!] Character '~' not found in the vocabulary. Discarding it.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.
$ sedmaosmdesATiletI bAsJIk ji zIskal za zvou posledJI kJihu nazvanou po^uliCJI psIk. $
 [!] Character '^' not found in the vocabulary. Discarding it.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss f


[1m > EVALUATION [0m



$ podle andreje staNkoviCe je lukeSUv urAZlivI ClAnek pouhou metaforou, # vijadRujIcI rozhoRCeJI nad dzhaosem. $
 [!] Character 'N' not found in the vocabulary. Discarding it.
Volám diskriminátor pro výpočet loss funkce
Volám generátor pro výpočet loss funkcí.


[1m   --> STEP: 0[0m
     | > loss_disc: 1.5713304281234741  (1.5713304281234741)
     | > loss_disc_real_0: 0.11404550075531006  (0.11404550075531006)
     | > loss_disc_real_1: 0.162921741604805  (0.162921741604805)
     | > loss_disc_real_2: 0.3405941128730774  (0.3405941128730774)
     | > loss_disc_real_3: 0.16517972946166992  (0.16517972946166992)
     | > loss_disc_real_4: 0.1545741707086563  (0.1545741707086563)
     | > loss_disc_real_5: 0.20709426701068878  (0.20709426701068878)
     | > loss_0: 1.5713304281234741  (1.5713304281234741)
     | > loss_gen: 4.681225299835205  (4.681225299835205)
     | > loss_kl: 7.366035461425781  (7.366035461425781)
     | > loss_feat: 92.36017608642578  (92.36017608642578)
     | > loss_mel: 28.425506591796875  (28.425506591796875)
     | > loss_1: 132.83294677734375  (132.83294677734375)



 | > Synthesizing test sentences.
 | > Saving 8 test audio files at step/epoch 000061015-00001
 | > Saving 8 test figures at step/epoch 000061015-00001



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.206404447555542 [0m(-0.1277484893798828)
     | > avg_loss_disc:[91m 1.5713304281234741 [0m(+0.21261954307556152)
     | > avg_loss_disc_real_0:[92m 0.11404550075531006 [0m(-0.006427191197872162)
     | > avg_loss_disc_real_1:[91m 0.162921741604805 [0m(+0.045055828988552094)
     | > avg_loss_disc_real_2:[91m 0.3405941128730774 [0m(+0.13535743951797485)
     | > avg_loss_disc_real_3:[91m 0.16517972946166992 [0m(+0.04957594722509384)
     | > avg_loss_disc_real_4:[92m 0.1545741707086563 [0m(-0.003802001476287842)
     | > avg_loss_disc_real_5:[91m 0.20709426701068878 [0m(+0.11577907204627991)
     | > avg_loss_0:[91m 1.5713304281234741 [0m(+0.21261954307556152)
     | > avg_loss_gen:[91m 4.681225299835205 [0m(+0.8289148807525635)
     | > avg_loss_kl:[92m 7.366035461425781 [0m(-15.214960098266602)
     | > avg_loss_feat:[91m 92.36017608642578 [0m(+20.96502685546875)
     | > avg_loss_mel:[92m 28.425

# Cleanup

In [None]:
if COPY_TO_SCRATCH:
    for d in DATASETS_SCRATCH:
        # Delete local dataset directory
        shutil.rmtree(d)