In [None]:
import os
from TTS.config import BaseAudioConfig
from TTS.trainer import Trainer, TrainerArgs
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.xtts import Xtts

In [None]:
# Process-level environment switches, applied in a single update.
# NOTE(review): TORCH_CUDNN_BENCHMARK does not look like a variable PyTorch
# itself reads (the documented switch is torch.backends.cudnn.benchmark) -
# confirm something downstream actually consumes it.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",  # restrict CUDA to device index 0
        "TORCH_CUDNN_BENCHMARK": "1",
    }
)

In [None]:
# Read the experiment settings from the JSON file next to this notebook.
CONFIG_JSON = "config.json"

print(">> Loading configuration from config.json...")
config = XttsConfig()
# load_json fills the existing config object from the file.
config.load_json(CONFIG_JSON)


In [None]:
# Build a BaseAudioConfig from the audio section of the loaded config.
# NOTE(review): the ** unpacking needs config.audio to behave like a mapping
# whose keys are all valid BaseAudioConfig fields - confirm this holds for the
# audio section of an XTTS config, otherwise this line raises TypeError.
audio_config = BaseAudioConfig(**config.audio)

In [None]:
# Load the training metadata and carve out an evaluation split.
print(">> Loading dataset...")
# load_tts_samples takes the dataset definition(s) as its first parameter
# (`datasets`); there is no `dataset_config` keyword, and the config stores the
# definitions under `config.datasets`.
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_max_size=512,  # Limit eval set size for faster evals
    eval_split_size=0.1,  # hold out 10% of the samples (capped by the max size above)
)

In [None]:
print(">> Initializing XTTS-v2 model for fine-tuning...")
# Build the XTTS model object from the loaded configuration.
# NOTE(review): nothing in this call visibly downloads or loads pretrained
# weights - confirm whether a separate checkpoint load is needed before
# fine-tuning.
# NOTE(review): the Coqui docs describe a dedicated GPT trainer for XTTS
# fine-tuning - confirm that a plain Xtts instance supports training steps.
model = Xtts.init_from_config(config)


In [None]:
# Wire the model, config and data splits into the training loop driver.
# Trainer.__init__ has no `audio_config` parameter (audio settings travel inside
# `config`), so passing one raises TypeError; it is therefore not forwarded.
trainer = Trainer(
    TrainerArgs(),  # Use default trainer args, settings are in the config
    config,
    output_path=config.output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)


In [None]:
print(">> Starting Varhadi Marathi fine-tuning on XTTS-v2...")
# With default TrainerArgs() this starts a fresh run. Resuming an earlier run is
# not automatic: it requires TrainerArgs(continue_path=...) (or restore_path=...)
# when the Trainer is constructed.
trainer.fit()

# The Trainer writes checkpoints into its own run folder, which can differ from
# the base directory in the config; report that folder when it is exposed and
# fall back to the configured base path otherwise.
run_dir = getattr(trainer, "output_path", config.output_path)
print(f"✅ Fine-tuning complete! Best model saved in: {run_dir}")

## Using the saved model for inference

In [None]:
from TTS.api import TTS
import torch

# Location of the fine-tuned model. Replace with the actual run folder produced
# by training.
# NOTE(review): for XTTS the documented usage passes the checkpoint *directory*
# as model_path (the loader looks for model.pth and vocab.json inside it) -
# confirm against the installed TTS version and rename/copy the best checkpoint
# accordingly.
model_path = "output"
config_path = "output/config.json"  # The config file saved in the output folder
speaker_wav = "dataset/wavs/varhadi_0001.wav"  # Provide one of your voice samples
language = "mr"

# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the trained model. TTS.api.TTS has no `load_from_checkpoint`; a local
# model is loaded through the constructor and then moved to the target device.
tts = TTS(model_path=model_path, config_path=config_path).to(device)

# Generate speech. The reference voice and language are per-synthesis arguments,
# so they are passed here rather than at load time.
text_to_speak = "माझं नाव विदर्भ एआय हाये. तुमची काय मदत करू शकतो मी?"
output_file = "generated_speech.wav"
tts.tts_to_file(
    text=text_to_speak,
    speaker_wav=speaker_wav,
    language=language,
    file_path=output_file,
)

print(f"Speech generated and saved to {output_file}")