In [2]:
import os

# Move the kernel's working directory one level up from where the notebook started.
# A bare relative chdir climbs one more level every time the cell is re-run, so the
# start directory is remembered on first execution and the move is always made
# relative to it. The first run behaves exactly like `os.chdir("../")`.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [1]:
%pwd


'd:\\office\\vc\\research'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    """Immutable settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_training_config`` from
    the ``model_training`` section of the project config and consumed by
    ``ModelConfig``.
    """

    # Stage directory; created by ConfigurationManager before the config is returned.
    root_dir: Path
    # Passed to the Trainer as its output path.
    output_dir: Path
    # NOTE(review): not read by the code in this notebook -- the VITS config derives
    # its phoneme cache location from the params file instead. Confirm which is intended.
    phoneme_cache_path: Path
    # Used as the dataset formatter name when building the dataset config.
    dataset_name: str
    # Dataset location handed to the dataset config.
    dataset_path: Path
    # Training metadata file handed to the dataset config.
    metadata_path: Path
    # Checkpoint handed to the Trainer as its restore path.
    restore_path: Path

In [4]:
from cloner.constants import *
from cloner.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Reads the project config/params YAML files and builds stage configs."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        """Create the training stage directory and return its settings.

        Returns:
            A ``ModelTrainingConfig`` filled from the ``model_training`` section
            of the loaded config.
        """
        section = self.config.model_training
        create_directories([section.root_dir])

        # Every dataclass field is copied verbatim from the same-named config key.
        field_names = (
            "root_dir",
            "output_dir",
            "phoneme_cache_path",
            "dataset_name",
            "dataset_path",
            "metadata_path",
            "restore_path",
        )
        return ModelTrainingConfig(
            **{name: getattr(section, name) for name in field_names}
        )

In [6]:
from pathlib import Path
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from cloner.utils.common import read_yaml
from cloner.pipeline.stage_02_data_preprocessing import DataPreprocessor
from cloner.entity.config_entity import DataPreProcessConfig
import torch
import os

[2025-05-22 16:06:05,506: DEBUG: __init__: pydot initializing]
[2025-05-22 16:06:05,508: DEBUG: __init__: pydot 4.0.0]
[2025-05-22 16:06:05,524: DEBUG: core: pydot core module initializing]
[2025-05-22 16:06:11,311: DEBUG: utils: Loading FFmpeg6]
[2025-05-22 16:06:11,332: DEBUG: utils: Failed to load FFmpeg6 extension.]
Traceback (most recent call last):
  File "d:\miniconda\envs\vc\lib\site-packages\torio\_extension\utils.py", line 116, in _find_ffmpeg_extension
    ext = _find_versionsed_ffmpeg_extension(ffmpeg_ver)
  File "d:\miniconda\envs\vc\lib\site-packages\torio\_extension\utils.py", line 108, in _find_versionsed_ffmpeg_extension
    _load_lib(lib)
  File "d:\miniconda\envs\vc\lib\site-packages\torio\_extension\utils.py", line 94, in _load_lib
    torch.ops.load_library(path)
  File "d:\miniconda\envs\vc\lib\site-packages\torch\_ops.py", line 1350, in load_library
    ctypes.CDLL(path)
  File "d:\miniconda\envs\vc\lib\ctypes\__init__.py", line 374, in __init__
    self._handle 

In [None]:
class ModelConfig:
    """Assembles the VITS configuration, data, model and trainer for training.

    The audio, dataset and VITS configs are built eagerly in ``__init__``; the
    audio processor, tokenizer, model and data split are created lazily on first
    request and then reused.
    """

    def __init__(self, config: "ModelTrainingConfig"):
        """Store the stage config, load the params file and build the configs.

        Args:
            config: Training-stage settings (paths, dataset name, restore path).
        """
        self.config = config
        self.params = read_yaml(PARAMS_FILE_PATH)

        # Order matters: the VITS config embeds the audio and dataset configs.
        self.audio_config = self.get_audio_config()
        self.dataset_config = self.get_dataset_config()
        self.vits_config = self.get_vits_config()

        # Lazily created objects, filled in by the corresponding getters.
        self._audio_processor = None
        self._tokenizer = None
        self._model = None
        self._trainer_instance = None
        self._train_samples = None
        self._eval_samples = None

    def get_audio_config(self):
        """Return the ``audio_config`` section of the params file."""
        return self.params["audio_config"]

    def get_dataset_config(self):
        """Build the dataset config from the stage settings.

        Returns:
            A ``BaseDatasetConfig`` whose formatter is the configured dataset name.
        """
        return BaseDatasetConfig(
            formatter=self.config.dataset_name,
            meta_file_train=self.config.metadata_path,
            path=self.config.dataset_path,
        )

    def get_vits_config(self):
        """Build a ``VitsConfig`` from the ``model_config`` section of the params.

        Returns:
            A new ``VitsConfig`` using the already-built audio and dataset configs.
        """
        params = self.params["model_config"]
        # NOTE(review): the phoneme cache and output path come from the params file,
        # while ``self.config.phoneme_cache_path`` is never read and the Trainer is
        # given ``self.config.output_dir``. Confirm which source is intended.
        return VitsConfig(
            audio=self.audio_config,
            run_name=params["run_name"],
            batch_size=params["batch_size"],
            eval_batch_size=params["eval_batch_size"],
            batch_group_size=params["batch_group_size"],
            num_loader_workers=params["num_loader_workers"],
            num_eval_loader_workers=params["num_eval_loader_workers"],
            run_eval=params["run_eval"],
            test_delay_epochs=params["test_delay_epochs"],
            epochs=params["epochs"],
            text_cleaner=params["text_cleaner"],
            use_phonemes=params["use_phonemes"],
            phoneme_language=params["phoneme_language"],
            phoneme_cache_path=os.path.join(params["output_path"], "phoneme_cache"),
            compute_input_seq_cache=params["compute_input_seq_cache"],
            print_step=params["print_step"],
            print_eval=params["print_eval"],
            mixed_precision=params["mixed_precision"],
            output_path=params["output_path"],
            datasets=[self.dataset_config],
            cudnn_benchmark=params["cudnn_benchmark"],
        )

    def get_audio_processor(self):
        """Return the audio processor, creating it on first use.

        The processor is obtained from the project's ``DataPreprocessor``; only
        ``root_dir`` is filled in, the remaining preprocessing paths are left empty.
        """
        if self._audio_processor is None:
            data_preprocess_config = DataPreProcessConfig(
                root_dir=self.config.root_dir,
                processed_audio_dir="",
                audio_path="",
            )
            processor = DataPreprocessor(config=data_preprocess_config)
            self._audio_processor = processor.get_audio_processor()
        return self._audio_processor

    def get_tokenizer(self):
        """Return the text tokenizer, creating it from the VITS config on first use."""
        if self._tokenizer is None:
            # init_from_config returns (tokenizer, config); only the tokenizer is kept.
            tokenizer, _ = TTSTokenizer.init_from_config(self.vits_config)
            self._tokenizer = tokenizer
        return self._tokenizer

    def get_data_split(self):
        """Return ``(train_samples, eval_samples)``, loading them on first use."""
        if self._train_samples is None or self._eval_samples is None:
            self._train_samples, self._eval_samples = load_tts_samples(
                self.dataset_config,
                eval_split=True,
                eval_split_max_size=self.vits_config.eval_split_max_size,
                eval_split_size=self.vits_config.eval_split_size,
            )
        return self._train_samples, self._eval_samples

    def get_model(self, checkpoint_path=None):
        """Return the VITS model, optionally loading weights from a checkpoint.

        Args:
            checkpoint_path: Optional checkpoint file. When given, its weights are
                loaded into the model on every call, including the call that
                first creates the model.

        Returns:
            The cached ``Vits`` instance.

        Raises:
            KeyError: If the checkpoint has neither a ``"model"`` nor a
                ``"model_state_dict"`` entry.
        """
        if self._model is None:
            config = self.vits_config
            ap = self.get_audio_processor()
            tokenizer = self.get_tokenizer()
            self._model = Vits(config, ap, tokenizer, speaker_manager=None)
        if checkpoint_path:
            # Load onto CPU so a checkpoint saved on GPU also works on a CPU-only
            # machine; load_state_dict copies the values into the model's tensors.
            checkpoint = torch.load(checkpoint_path, map_location="cpu")
            # Accept both layouts used in this class: "model" (as read by
            # load_model_from_checkpoint) and the older "model_state_dict".
            state_key = "model" if "model" in checkpoint else "model_state_dict"
            self._model.load_state_dict(checkpoint[state_key])
            epoch = checkpoint.get('epoch', 0)
            print(f"Resuming from checkpoint at epoch{epoch}")
        return self._model

    def get_trainer(self, restore_path=None):
        """Create a ``Trainer`` for the model and the train/eval split.

        Args:
            restore_path: Optional checkpoint to restore training from. When empty
                or ``None`` the ``TrainerArgs`` default is left untouched.

        Returns:
            A new ``Trainer``; the most recent one is also kept on
            ``self._trainer_instance``.
        """
        train_samples, eval_samples = self.get_data_split()
        model = self.get_model()

        trainer_args = TrainerArgs()
        if restore_path:
            trainer_args.restore_path = str(restore_path)

        self._trainer_instance = Trainer(
            trainer_args,
            config=self.vits_config,
            output_path=self.config.output_dir,
            model=model,
            train_samples=train_samples,
            eval_samples=eval_samples,
            parse_command_line_args=False,
        )
        return self._trainer_instance

    def load_model_from_checkpoint(self, restore_path):
        """Load model weights and training progress from a checkpoint file.

        Args:
            restore_path: Checkpoint file; may be ``None``, empty or missing.

        Returns:
            ``(model, optimizer_state, epoch, step)`` read from the checkpoint, or
            ``(None, None, 0, 0)`` when there is no checkpoint at ``restore_path``.
        """
        # os.path.exists raises TypeError on None, so rule that out first.
        if not restore_path or not os.path.exists(restore_path):
            return None, None, 0, 0

        checkpoint = torch.load(restore_path, map_location="cpu")
        model = self.get_model()
        model.load_state_dict(checkpoint["model"])
        optimizer = checkpoint["optimizer"]
        epoch = checkpoint["epoch"]
        step = checkpoint["step"]
        return model, optimizer, epoch, step

    def get_fit(self):
        """Run training, restoring from the configured ``restore_path`` if present."""
        restore_path = getattr(self.config, "restore_path", None)
        trainer = self.get_trainer(restore_path)
        trainer.fit()

In [16]:
# Build the training-stage configuration and run (or resume) VITS training.
# ModelConfig.__init__ already builds the audio, dataset and VITS configs, so they
# are not rebuilt here. Exceptions propagate unchanged, so no try/except that only
# re-raises is needed.
config = ConfigurationManager()
model_training_config = config.get_model_training_config()
model_training = ModelConfig(config=model_training_config)
model_training.get_fit()

  checkpoint = torch.load(restore_path, map_location="cpu")


 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:12.5
 | > frame_length_ms:50
 | > ref_level_db:0
 | > fft_size:2400
 | > power:1.5
 | > preemphasis:0.98
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > win_length:1102
 | > hop_length:275
 | > Found 829 files in D:\office\vc\artifacts\data_ingestion\LJSpeech-1.1


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 12
 | > Num. of Torch Threads: 6
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=artifacts/model_training/output\vits_ljspeech-May-22-2025_04+12PM-b0c777f
 > Restoring from best_model.pth ...
 > Restoring Model...
 > Restoring Optimizer...
 > Restoring Scaler...
 > Model restored from step 927

 > Model has 83059180 parameters

[4m[1m > EPOCH: 0/10[0m
 --> artifacts/model_training/output\vits_ljspeech-May-22-2025_04+12PM-b0c777f




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 821



[1m > TRAINING (2025-05-22 16:12:36) [0m


 | > Preprocessing samples
 | > Max text length: 173
 | > Min text length: 6
 | > Avg text length: 55.60535931790499
 | 
 | > Max audio length: 358870.0
 | > Min audio length: 25558.0
 | > Avg audio length: 110688.91352009744
 | > Num. instances discarded samples: 0
 | > Batch group size: 16.


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\SpectralOps.cpp:878.)
  return _VF.stft(  # type: ignore[attr-defined]
  with autocast(enabled=False):  # use float32 for the criterion
  with autocast(enabled=False):
  with autocast(enabled=False):  # use float32 for the criterion

[1m   --> TIME: 2025-05-22 16:15:09 -- STEP: 22/103 -- GLOBAL_STEP: 950[0m
     | > loss_disc: 2.0995562076568604  (2.1857545538382093)
     | > loss_disc_real_0: 0.10675552487373352  (0.13845491646365682)
     | > loss_disc_real_1: 0.21112851798534393  (0.23630940677090126)
     | > loss_disc_real_2: 0.2091480791568756  (0.2078784436664798)
     | > loss_disc_real_3: 0.19782444834709167  (0.20781989666548642)
     | > loss_disc_real_4: 0.24726741015911102  (0.19930819896134463)
     | > loss_disc_real_5: 0.20502349734306335  (0.22608573429963805)




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 8
 | > Preprocessing samples
 | > Max text length: 115
 | > Min text length: 17
 | > Avg text length: 63.5
 | 
 | > Max audio length: 242166.0
 | > Min audio length: 56246.0
 | > Avg audio length: 131930.0
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


[1m   --> STEP: 0[0m
     | > loss_disc: 2.206242322921753  (2.206242322921753)
     | > loss_disc_real_0: 0.08161921054124832  (0.08161921054124832)
     | > loss_disc_real_1: 0.34966135025024414  (0.34966135025024414)
     | > loss_disc_real_2: 0.09165249019861221  (0.09165249019861221)
     | > loss_disc_real_3: 0.19713346660137177  (0.19713346660137177)
     | > loss_disc_real_4: 0.08970574289560318  (0.08970574289560318)
     | > loss_disc_real_5: 0.01622418873012066  (0.01622418873012066)
     | > loss_0: 2.206242322921753  (2.206242322921753)
     | > loss_gen: 2.1133594512939453  (2.1133594512939453)
     | > loss_kl: 1.7751399278640747  (1.7751399278640747)
     | > loss_feat: 4.37363338470459  (4.37363338470459)
     | > loss_mel: 33.62900161743164  (33.62900161743164)
     | > loss_duration: 2.6557092666625977  (2.6557092666625977)
     | > loss_1: 44.546844482421875  (44.546844482421875)

[1m   --> STEP: 1[0m
     | > loss_disc: 2.233877420425415  (2.233877420425415)
  

 | > Synthesizing test sentences.


  test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)

  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.008476734161376953 [0m(+0)
     | > avg_loss_disc: 2.4435532093048096 [0m(+0)
     | > avg_loss_disc_real_0: 0.14168976247310638 [0m(+0)
     | > avg_loss_disc_real_1: 0.4144046902656555 [0m(+0)
     | > avg_loss_disc_real_2: 0.14893151447176933 [0m(+0)
     | > avg_loss_disc_real_3: 0.28080060333013535 [0m(+0)
     | > avg_loss_disc_real_4: 0.13804887235164642 [0m(+0)
     | > avg_loss_disc_real_5: 0.03378514014184475 [0m(+0)
     | > avg_loss_0: 2.4435532093048096 [0m(+0)
     | > avg_loss_gen: 2.233404517173767 [0m(+0)
     | > avg_loss_kl: 1.6888187527656555 [0m(+0)
     | > avg_loss_feat: 3.1055901050567627 [0m(+0)
     | > avg_loss_mel: 36.26587963104248 [0m(+0)
     | > avg_loss_duration: 2.1618176698684692 [0m(+0)
     | > avg_loss_1: 45.455509185791016 [0m(+0)

 > BEST MODEL : artifacts/model_training/output\vits_

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.03846466541290283 [0m(+0.02998793125152588)
     | > avg_loss_disc:[92m 2.0661057233810425 [0m(-0.3774474859237671)
     | > avg_loss_disc_real_0:[92m 0.11948908492922783 [0m(-0.022200677543878555)
     | > avg_loss_disc_real_1:[92m 0.2024657428264618 [0m(-0.21193894743919373)
     | > avg_loss_disc_real_2:[91m 0.20842939615249634 [0m(+0.059497881680727005)
     | > avg_loss_disc_real_3:[92m 0.16866377741098404 [0m(-0.1121368259191513)
     | > avg_loss_disc_real_4:[91m 0.14864657074213028 [0m(+0.010597698390483856)
     | > avg_loss_disc_real_5:[91m 0.175958551466465 [0m(+0.14217341132462025)
     | > avg_loss_0:[92m 2.0661057233810425 [0m(-0.3774474859237671)
     | > avg_loss_gen:[92m 2.212968111038208 [0m(-0.020436406135559082)
     | > avg_loss_kl:[91m 1.99964839220047 [0m(+0.31082963943481445)
     | > avg_loss_feat:[91m 3.640218138694763 [0m(+0.5346280336380005)
     | > avg_loss_mel:[92m 32

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.006102442741394043 [0m(-0.03236222267150879)
     | > avg_loss_disc:[92m 1.7560534477233887 [0m(-0.3100522756576538)
     | > avg_loss_disc_real_0:[92m 0.059926118701696396 [0m(-0.05956296622753143)
     | > avg_loss_disc_real_1:[91m 0.20665249228477478 [0m(+0.004186749458312988)
     | > avg_loss_disc_real_2:[92m 0.10108508169651031 [0m(-0.10734431445598602)
     | > avg_loss_disc_real_3:[92m 0.15697727352380753 [0m(-0.011686503887176514)
     | > avg_loss_disc_real_4:[92m 0.08059157803654671 [0m(-0.06805499270558357)
     | > avg_loss_disc_real_5:[92m 0.15280016139149666 [0m(-0.023158390074968338)
     | > avg_loss_0:[92m 1.7560534477233887 [0m(-0.3100522756576538)
     | > avg_loss_gen:[91m 2.4465034008026123 [0m(+0.2335352897644043)
     | > avg_loss_kl:[92m 1.78236323595047 [0m(-0.21728515625)
     | > avg_loss_feat:[91m 5.2194483280181885 [0m(+1.5792301893234253)
     | > avg_loss_mel:[91m 41

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.006189465522766113 [0m(+8.702278137207031e-05)
     | > avg_loss_disc:[91m 2.2555936574935913 [0m(+0.49954020977020264)
     | > avg_loss_disc_real_0:[91m 0.07379247061908245 [0m(+0.013866351917386055)
     | > avg_loss_disc_real_1:[92m 0.16813065856695175 [0m(-0.03852183371782303)
     | > avg_loss_disc_real_2:[91m 0.10239987075328827 [0m(+0.001314789056777954)
     | > avg_loss_disc_real_3:[91m 0.2646452561020851 [0m(+0.10766798257827759)
     | > avg_loss_disc_real_4:[91m 0.16814588010311127 [0m(+0.08755430206656456)
     | > avg_loss_disc_real_5:[91m 0.21179333329200745 [0m(+0.05899317190051079)
     | > avg_loss_0:[91m 2.2555936574935913 [0m(+0.49954020977020264)
     | > avg_loss_gen:[92m 2.0132200717926025 [0m(-0.43328332901000977)
     | > avg_loss_kl:[92m 1.712370753288269 [0m(-0.06999248266220093)
     | > avg_loss_feat:[92m 3.540803909301758 [0m(-1.6786444187164307)
     | > avg_loss_mel:

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.00573122501373291 [0m(-0.0004582405090332031)
     | > avg_loss_disc:[91m 2.46270751953125 [0m(+0.2071138620376587)
     | > avg_loss_disc_real_0:[91m 0.1624918282032013 [0m(+0.08869935758411884)
     | > avg_loss_disc_real_1:[91m 0.20869767665863037 [0m(+0.04056701809167862)
     | > avg_loss_disc_real_2:[91m 0.2200298011302948 [0m(+0.11762993037700653)
     | > avg_loss_disc_real_3:[91m 0.2838771119713783 [0m(+0.019231855869293213)
     | > avg_loss_disc_real_4:[91m 0.3158450573682785 [0m(+0.14769917726516724)
     | > avg_loss_disc_real_5:[91m 0.23288539052009583 [0m(+0.02109205722808838)
     | > avg_loss_0:[91m 2.46270751953125 [0m(+0.2071138620376587)
     | > avg_loss_gen:[91m 2.210547924041748 [0m(+0.1973278522491455)
     | > avg_loss_kl:[91m 2.040818750858307 [0m(+0.32844799757003784)
     | > avg_loss_feat:[92m 3.2374942302703857 [0m(-0.30330967903137207)
     | > avg_loss_mel:[92m 30.44

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.008914351463317871 [0m(+0.003183126449584961)
     | > avg_loss_disc:[91m 2.4890326261520386 [0m(+0.026325106620788574)
     | > avg_loss_disc_real_0:[92m 0.07139933854341507 [0m(-0.09109248965978622)
     | > avg_loss_disc_real_1:[92m 0.11725185438990593 [0m(-0.09144582226872444)
     | > avg_loss_disc_real_2:[92m 0.19327986985445023 [0m(-0.026749931275844574)
     | > avg_loss_disc_real_3:[92m 0.12165478989481926 [0m(-0.16222232207655907)
     | > avg_loss_disc_real_4:[92m 0.12238585948944092 [0m(-0.19345919787883759)
     | > avg_loss_disc_real_5:[92m 0.0875511672347784 [0m(-0.14533422328531742)
     | > avg_loss_0:[91m 2.4890326261520386 [0m(+0.026325106620788574)
     | > avg_loss_gen:[92m 1.5411856770515442 [0m(-0.6693622469902039)
     | > avg_loss_kl:[92m 1.859679937362671 [0m(-0.181138813495636)
     | > avg_loss_feat:[91m 5.3430198431015015 [0m(+2.1055256128311157)
     | > avg_loss_mel:[

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.01544344425201416 [0m(+0.006529092788696289)
     | > avg_loss_disc:[92m 2.0811506509780884 [0m(-0.4078819751739502)
     | > avg_loss_disc_real_0:[91m 0.15190864354372025 [0m(+0.08050930500030518)
     | > avg_loss_disc_real_1:[91m 0.24006538838148117 [0m(+0.12281353399157524)
     | > avg_loss_disc_real_2:[91m 0.21978536248207092 [0m(+0.026505492627620697)
     | > avg_loss_disc_real_3:[91m 0.2552306056022644 [0m(+0.13357581570744514)
     | > avg_loss_disc_real_4:[91m 0.1808614805340767 [0m(+0.05847562104463577)
     | > avg_loss_disc_real_5:[91m 0.16872962936758995 [0m(+0.08117846213281155)
     | > avg_loss_0:[92m 2.0811506509780884 [0m(-0.4078819751739502)
     | > avg_loss_gen:[91m 2.614359140396118 [0m(+1.073173463344574)
     | > avg_loss_kl:[92m 1.6693935990333557 [0m(-0.19028633832931519)
     | > avg_loss_feat:[92m 4.709496021270752 [0m(-0.6335238218307495)
     | > avg_loss_mel:[92m 34

: 