In [1]:
import os

# Move the working directory up one level, but only once per kernel session:
# the chdir is relative, so re-running this cell unguarded would climb one
# more directory each time and break every relative path used afterwards.
if "_NOTEBOOK_CWD_SET" not in globals():
    os.chdir("../")
    _NOTEBOOK_CWD_SET = True

In [2]:
# Show the current working directory to confirm the chdir in the previous cell.
%pwd


'd:\\office\\vc'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    """Immutable settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_training_config``
    from the ``model_training`` section of the config read via ``read_yaml``.
    NOTE(review): fields are annotated as ``Path`` but the values are passed
    through from the loaded config without conversion, so they are presumably
    plain strings at runtime -- confirm before relying on ``Path`` methods.
    """

    # Stage directory; handed to create_directories() by ConfigurationManager.
    root_dir: Path
    # Passed to the Trainer as ``output_path``.
    output_dir: Path
    # Carried in the config; not read by ModelConfig in this notebook.
    phoneme_cache_path: Path
    # Used as the ``formatter`` of the BaseDatasetConfig.
    dataset_name: str
    # Used as the ``path`` of the BaseDatasetConfig.
    dataset_path: Path
    # Used as the ``meta_file_train`` of the BaseDatasetConfig.
    metadata_path: Path

In [4]:
from cloner.constants import *
from cloner.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Load the project config/params files and hand out typed stage configs."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and pass the artifacts root to create_directories.

        Args:
            config_filepath: Location of the config YAML
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Location of the params YAML
                (defaults to ``PARAMS_FILE_PATH``).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        """Build the ``ModelTrainingConfig`` from the ``model_training`` section.

        The stage's ``root_dir`` is passed to ``create_directories`` before the
        config entity is constructed.

        Returns:
            A ``ModelTrainingConfig`` populated field-for-field from the section.
        """
        section = self.config.model_training
        create_directories([section.root_dir])

        return ModelTrainingConfig(
            root_dir=section.root_dir,
            output_dir=section.output_dir,
            phoneme_cache_path=section.phoneme_cache_path,
            dataset_name=section.dataset_name,
            dataset_path=section.dataset_path,
            metadata_path=section.metadata_path,
        )

In [6]:
from pathlib import Path
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from cloner.utils.common import read_yaml
from cloner.pipeline.stage_02_data_preprocessing import DataPreprocessor
from cloner.entity.config_entity import DataPreProcessConfig

import os

[2025-05-22 13:27:31,934: DEBUG: __init__: pydot initializing]
[2025-05-22 13:27:31,936: DEBUG: __init__: pydot 4.0.0]
[2025-05-22 13:27:31,952: DEBUG: core: pydot core module initializing]
[2025-05-22 13:27:37,715: DEBUG: utils: Loading FFmpeg6]
[2025-05-22 13:27:37,735: DEBUG: utils: Failed to load FFmpeg6 extension.]
Traceback (most recent call last):
  File "d:\miniconda\envs\vc\lib\site-packages\torio\_extension\utils.py", line 116, in _find_ffmpeg_extension
    ext = _find_versionsed_ffmpeg_extension(ffmpeg_ver)
  File "d:\miniconda\envs\vc\lib\site-packages\torio\_extension\utils.py", line 108, in _find_versionsed_ffmpeg_extension
    _load_lib(lib)
  File "d:\miniconda\envs\vc\lib\site-packages\torio\_extension\utils.py", line 94, in _load_lib
    torch.ops.load_library(path)
  File "d:\miniconda\envs\vc\lib\site-packages\torch\_ops.py", line 1350, in load_library
    ctypes.CDLL(path)
  File "d:\miniconda\envs\vc\lib\ctypes\__init__.py", line 374, in __init__
    self._handle 

In [None]:
class ModelConfig:
    """Assemble the configs and lazily-built objects needed to train a VITS model.

    The audio, dataset and VITS configs are built eagerly in ``__init__``. The
    audio processor, tokenizer, sample split, model and trainer are created on
    first request and cached on the instance, so repeated ``get_*`` calls
    return the same objects.
    """

    def __init__(self, config: "ModelTrainingConfig"):
        """Store the stage config, load the params file and build the static configs.

        Args:
            config: Model-training settings (paths and dataset name).
        """
        self.config = config
        self.params = read_yaml(PARAMS_FILE_PATH)

        # Order matters: the VITS config embeds the audio and dataset configs.
        self.audio_config = self.get_audio_config()
        self.dataset_config = self.get_dataset_config()
        self.vits_config = self.get_vits_config()

        # Caches for the lazily-created objects.
        self._audio_processor = None
        self._tokenizer = None
        self._model = None
        self._trainer_instance = None
        self._train_samples = None
        self._eval_samples = None

    def get_audio_config(self):
        """Return the ``audio_config`` section of the loaded params."""
        return self.params["audio_config"]

    def get_dataset_config(self):
        """Build a ``BaseDatasetConfig`` from the stage config's dataset fields."""
        return BaseDatasetConfig(
            formatter=self.config.dataset_name,
            meta_file_train=self.config.metadata_path,
            path=self.config.dataset_path,
        )

    def get_vits_config(self):
        """Build a ``VitsConfig`` from the ``model_config`` section of the params.

        Requires ``self.audio_config`` and ``self.dataset_config`` to be set.
        The phoneme cache is placed in a ``phoneme_cache`` folder under the
        params' ``output_path``.
        NOTE(review): ``self.config.phoneme_cache_path`` is not used here --
        confirm which of the two locations is intended.
        """
        params = self.params["model_config"]
        return VitsConfig(
            audio=self.audio_config,
            run_name=params["run_name"],
            batch_size=params["batch_size"],
            eval_batch_size=params["eval_batch_size"],
            batch_group_size=params["batch_group_size"],
            num_loader_workers=params["num_loader_workers"],
            num_eval_loader_workers=params["num_eval_loader_workers"],
            run_eval=params["run_eval"],
            test_delay_epochs=params["test_delay_epochs"],
            epochs=params["epochs"],
            text_cleaner=params["text_cleaner"],
            use_phonemes=params["use_phonemes"],
            phoneme_language=params["phoneme_language"],
            phoneme_cache_path=os.path.join(params["output_path"], "phoneme_cache"),
            compute_input_seq_cache=params["compute_input_seq_cache"],
            print_step=params["print_step"],
            print_eval=params["print_eval"],
            mixed_precision=params["mixed_precision"],
            output_path=params["output_path"],
            datasets=[self.dataset_config],
            cudnn_benchmark=params["cudnn_benchmark"],
        )

    def get_audio_processor(self):
        """Return the audio processor, creating it via ``DataPreprocessor`` on first use."""
        if self._audio_processor is None:
            # Only root_dir is meaningful here; the two audio paths are left
            # empty -- presumably unused by get_audio_processor(), TODO confirm.
            data_preprocess_config = DataPreProcessConfig(
                root_dir=self.config.root_dir,
                processed_audio_dir="",
                audio_path="",
            )
            processor = DataPreprocessor(config=data_preprocess_config)
            self._audio_processor = processor.get_audio_processor()
        return self._audio_processor

    def get_tokenizer(self):
        """Return the tokenizer, initialising it from the VITS config on first use."""
        if self._tokenizer is None:
            # init_from_config also returns a config object; it is discarded
            # here and self.vits_config is left as built.
            # NOTE(review): confirm the returned config is not needed downstream.
            tokenizer, _ = TTSTokenizer.init_from_config(self.vits_config)
            self._tokenizer = tokenizer
        return self._tokenizer

    def get_data_split(self):
        """Return ``(train_samples, eval_samples)``, loading them on first use."""
        if self._train_samples is None or self._eval_samples is None:
            self._train_samples, self._eval_samples = load_tts_samples(
                self.dataset_config,
                eval_split=True,
                eval_split_max_size=self.vits_config.eval_split_max_size,
                eval_split_size=self.vits_config.eval_split_size,
            )
        return self._train_samples, self._eval_samples

    def get_model(self):
        """Return the ``Vits`` model, constructing it on first use.

        The cached model is returned on every call, not only on the call that
        creates it.
        """
        if self._model is None:
            self._model = Vits(
                self.vits_config,
                self.get_audio_processor(),
                self.get_tokenizer(),
                speaker_manager=None,
            )
        return self._model

    def get_trainer(self):
        """Return the ``Trainer``, wiring config, model and samples on first use."""
        if self._trainer_instance is None:
            train_samples, eval_samples = self.get_data_split()
            self._trainer_instance = Trainer(
                TrainerArgs(),
                config=self.vits_config,
                output_path=self.config.output_dir,
                model=self.get_model(),
                train_samples=train_samples,
                eval_samples=eval_samples,
                parse_command_line_args=False,
            )
        return self._trainer_instance

    def get_fit(self):
        """Run training by calling ``fit()`` on the trainer.

        Raises:
            ValueError: If no trainer instance is available.
        """
        trainer = self.get_trainer()
        if trainer is None:
            raise ValueError("Trainer instance is None. Cannot start training.")
        trainer.fit()

: 

In [None]:
# Build the stage config and start training.
# ModelConfig.__init__ already builds the audio, dataset and VITS configs, so
# they are not rebuilt here; exceptions propagate unchanged, which is what the
# previous catch-and-re-raise wrapper did as well.
config = ConfigurationManager()
model_training_config = config.get_model_training_config()
model_training = ModelConfig(config=model_training_config)
model_training.get_fit()

 | > Found 829 files in D:\office\vc\artifacts\data_ingestion\LJSpeech-1.1
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:12.5
 | > frame_length_ms:50
 | > ref_level_db:0
 | > fft_size:2400
 | > power:1.5
 | > preemphasis:0.98
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > win_length:1102
 | > hop_length:275


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 12
 | > Num. of Torch Threads: 6
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=artifacts/model_training/output\vits_ljspeech-May-22-2025_01+27PM-f4cb597

 > Model has 83059180 parameters

[4m[1m > EPOCH: 0/200[0m
 --> artifacts/model_training/output\vits_ljspeech-May-22-2025_01+27PM-f4cb597


[*] Pre-computing phonemes...


  0%|          | 3/821 [00:02<10:03,  1.35it/s]

hi ɪsnt æz bɪɡ bʌt hz stɪl kwaɪt pɑpjəlɚ ɪv hɚd ðə seɪm θɪŋ əbaʊt hɪz kɑntɛnt nɛvɚ wɔt͡ʃt hɪm mʌt͡ʃ
 [!] Character '͡' not found in the vocabulary. Discarding it.


100%|██████████| 821/821 [02:14<00:00,  6.12it/s]




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 1 not found characters:
	| > ͡
| > Number of instances : 821



[1m > TRAINING (2025-05-22 13:30:22) [0m


 | > Preprocessing samples
 | > Max text length: 173
 | > Min text length: 6
 | > Avg text length: 55.60535931790499
 | 
 | > Max audio length: 358870.0
 | > Min audio length: 25558.0
 | > Avg audio length: 110688.91352009744
 | > Num. instances discarded samples: 0
 | > Batch group size: 32.


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\SpectralOps.cpp:878.)
  return _VF.stft(  # type: ignore[attr-defined]
  with autocast(enabled=False):  # use float32 for the criterion
  with autocast(enabled=False):
  with autocast(enabled=False):  # use float32 for the criterion

[1m   --> TIME: 2025-05-22 13:30:35 -- STEP: 0/52 -- GLOBAL_STEP: 0[0m
     | > loss_disc: 5.949109077453613  (5.949109077453613)
     | > loss_disc_real_0: 0.9778444170951843  (0.9778444170951843)
     | > loss_disc_real_1: 0.9998602271080017  (0.9998602271080017)
     | > loss_disc_real_2: 1.0166256427764893  (1.0166256427764893)
     | > loss_disc_real_3: 0.9675069451332092  (0.9675069451332092)
     | > loss_disc_real_4: 0.9778657555580139  (0.9778657555580139)
     | > loss_disc_real_5: 1.0087900161743164  (1.0087900161743164)
     | > loss_0:

 | > Synthesizing test sentences.


  test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)

  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.16208044220419485 [0m(+0)
     | > avg_step_time: 12.575639149721932 [0m(+0)
     | > avg_loss_disc: 3.0456783117032518 [0m(+0)
     | > avg_loss_disc_real_0: 0.25952082960044637 [0m(+0)
     | > avg_loss_disc_real_1: 0.2691716201895592 [0m(+0)
     | > avg_loss_disc_real_2: 0.2705564628921303 [0m(+0)
     | > avg_loss_disc_real_3: 0.27019652081470863 [0m(+0)
     | > avg_loss_disc_real_4: 0.2752129537802115 [0m(+0)
     | > avg_loss_disc_real_5: 0.2832507356709125 [0m(+0)
     | > avg_loss_0: 3.0456783117032518 [0m(+0)
     | > avg_grad_norm_0: tensor(3.7854, device='cuda:0') [0m(+0)
     | > avg_loss_gen: 1.6618466096765854 [0m(+0)
     | > avg_loss_kl: 9.548747137481094 [0m(+0)
     | > avg_loss_feat: 0.42215345171736735 [0m(+0)
     | > avg_loss_mel: 67.6496855791877 [0m(+0)
     | > avg_loss_duration: 2.19770551662818