In [10]:
# Install coqui-tts fork that supports Python 3.12
!pip install coqui-tts g2p-id soundfile


print("\n Installation complete!")


 Installation complete!


In [22]:
import os
import shutil

import gdown

# Correct URLs from v1.2 release
# Keys are the local filenames to write; values are the release-asset URLs.
urls = {
    'checkpoint.pth': 'https://github.com/Wikidepia/indonesian-tts/releases/download/v1.2/checkpoint_1260000-inference.pth',
    'config.json': 'https://github.com/Wikidepia/indonesian-tts/releases/download/v1.2/config.json',
    'speakers.pth': 'https://github.com/Wikidepia/indonesian-tts/releases/download/v1.2/speakers.pth'
}

model_dir = "wikidepia_id_tts"
os.makedirs(model_dir, exist_ok=True)

# Maps each local filename to its path under model_dir.
local_paths = {}
for filename, url in urls.items():
    local_path = os.path.join(model_dir, filename)
    # A missing or zero-byte file counts as "not downloaded yet", so re-running
    # the cell skips files that are already present.
    if not os.path.exists(local_path) or os.path.getsize(local_path) == 0:
        print(f"⬇ Downloading {filename}...")
        gdown.download(url, local_path, quiet=False)
        # Verify the file actually landed; fail here with a clear message
        # instead of an unrelated error further down.
        if not os.path.exists(local_path) or os.path.getsize(local_path) == 0:
            raise RuntimeError(
                f"Download of {filename} from {url} failed or produced an empty file"
            )
    print(f" {filename}: {os.path.getsize(local_path):,} bytes")
    local_paths[filename] = local_path

# Copy speakers.pth to /content/ (required by config)
target_speakers = '/content/speakers.pth'
if not os.path.exists(target_speakers):
    # shutil.copy raises on failure and needs no shell quoting, so the
    # "Copied" message below is only printed when the copy really happened.
    shutil.copy(local_paths['speakers.pth'], target_speakers)
    print(f" Copied speakers.pth to {target_speakers}")

 checkpoint.pth: 345,999,149 bytes
 config.json: 9,071 bytes
 speakers.pth: 1,839 bytes


In [31]:
import torch
from TTS.api import TTS
from g2p_id import G2P

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

g2p = G2P()

# Build the synthesizer from the downloaded checkpoint/config, then move it
# to the selected device.
tts = TTS(
    model_path=local_paths['checkpoint.pth'],
    config_path=local_paths['config.json'],
    speakers_file_path='/content/speakers.pth',
)
tts = tts.to(device)

# Get available speakers
# NOTE(review): torch.load unpickles the file, and this one was fetched from a
# release URL — consider weights_only=True if the contents allow it.
speakers_data = torch.load('/content/speakers.pth', map_location='cpu')
available_speakers = list(speakers_data.keys())
print(f" Loaded! Speakers: {available_speakers}")

 Loaded! Speakers: ['JV-00027', 'JV-00264', 'JV-00658', 'JV-01392', 'JV-01519', 'JV-01932', 'JV-02059', 'JV-02326', 'JV-02884', 'JV-03187', 'JV-03314', 'JV-03424', 'JV-03727', 'JV-04175', 'JV-04285', 'JV-04588', 'JV-04679', 'JV-04715', 'JV-04982', 'JV-05219', 'JV-05522', 'JV-05540', 'JV-05667', 'JV-05970', 'JV-06080', 'JV-06207', 'JV-06383', 'JV-06510', 'JV-06941', 'JV-07335', 'JV-07638', 'JV-07765', 'JV-07875', 'JV-08002', 'JV-08178', 'JV-08305', 'JV-08736', 'JV-09039', 'JV-09724', 'SU-00060', 'SU-00297', 'SU-00454', 'SU-00600', 'SU-00691', 'SU-00994', 'SU-01038', 'SU-01056', 'SU-01359', 'SU-01552', 'SU-01596', 'SU-01855', 'SU-01899', 'SU-02092', 'SU-02395', 'SU-02716', 'SU-02953', 'SU-03391', 'SU-03650', 'SU-03694', 'SU-03712', 'SU-03887', 'SU-04190', 'SU-04208', 'SU-04511', 'SU-04646', 'SU-04748', 'SU-05051', 'SU-05186', 'SU-05507', 'SU-06003', 'SU-06047', 'SU-06543', 'SU-07302', 'SU-07842', 'SU-08338', 'SU-08659', 'SU-08703', 'SU-09243', 'SU-09637', 'SU-09757', 'ardi', 'gadis', 'wi

In [29]:
import random

text = "STT adalah teknologi yang memungkinkan kita untuk mengubah ucapan menjadi teks tertulis. Teknologi ini sangat berguna bagi mereka yang ingin mengetik secara lebih cepat dan efisien dengan suara."

# The synthesizer is fed the g2p output, not the raw text.
phonemes = g2p(text)
text_preview = text[:60]
print(f" Text: {text_preview}...")

# Pick one speaker at random from the names in the speakers file.
speaker_pool = list(speakers_data.keys())
speaker = random.choice(speaker_pool)
print(f" Speaker: {speaker}")

tts.tts_to_file(
    text=phonemes,
    speaker=speaker,
    file_path="output.wav",
)
print(" Saved: output.wav")

 Text: STT adalah teknologi yang memungkinkan kita untuk mengubah u...
 Speaker: SU-01596
 Saved: output.wav


In [30]:
# Show an inline audio player for the synthesized clip.
from IPython.display import Audio, display
audio_widget = Audio("output.wav")
display(audio_widget)

# Then hand the same file to the browser via the Colab files helper.
from google.colab import files
files.download("output.wav")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>