# SdG based on phoneme generation and text-to-speech (TTS)

In [9]:
import os
from espeak_phonemizer import Phonemizer
import numpy as np
from collections import defaultdict
import random
from TTS.api import TTS
import IPython.display as ipd

In [2]:
def train_markov_model(phoneme_sequences, n=2):
    """Train an n-gram Markov chain on a list of phoneme sequences."""
    transitions = defaultdict(lambda: defaultdict(int))
    for seq in phoneme_sequences:
        tokens = seq.split()
        for i in range(len(tokens) - n):
            prefix = tuple(tokens[i:i+n-1])
            next_token = tokens[i+n-1]
            transitions[prefix][next_token] += 1
    # Normalize to probabilities
    model = {
        prefix: {k: v / sum(d.values()) for k, v in d.items()}
        for prefix, d in transitions.items()
    }
    return model

def sample_markov(model, n=2, max_len=50, temperature=1.0):
    """Generate a random phoneme sequence."""
    prefix = random.choice(list(model.keys()))
    result = list(prefix)
    for _ in range(max_len - n):
        next_probs = model.get(prefix)
        if not next_probs:
            break
        tokens, probs = zip(*next_probs.items())
        # Apply temperature for randomness control
        probs = np.array(probs) ** (1/temperature)
        probs /= probs.sum()
        next_token = np.random.choice(tokens, p=probs)
        result.append(next_token)
        prefix = tuple(result[-(n-1):])
    return " ".join(result)


## 1. Test on sample texts

In [None]:
# 1. Phonemize sample German texts
sample_texts = [
    "Das Wetter ist heute schön.",
    "Ich gehe gerne im Wald spazieren.",
    "Die Sonne scheint über den Bergen.",
    "Wir trinken Kaffee am Nachmittag."
]
P = Phonemizer()
phonemes = [P.phonemize(text, voice='de') for text in sample_texts]  # Example usage
for t, p in zip(sample_texts, phonemes):
    print(t)
    print(p)
    print()

# 2. Train Markov model on sampled phonemes
model = train_markov_model(phonemes, n=3)  # trigram

# 3. Sample
for temp in [0.7, 1.0, 1.5]:
    print(f"\nTemperature={temp}")
    print(sample_markov(model, n=3, temperature=temp))

# 4. Synthesize with TTS
# Load a German VITS model (downloads automatically on first run)
tts = TTS("tts_models/de/thorsten/vits", progress_bar=False, gpu=False)
# Generate audio (though model expects text, not phonemes directly)
wav = tts.tts(phonemes[0])
# Save to file and play
tts.tts_to_file(text=phonemes[0], file_path="pseudo_german.wav")
ipd.Audio("pseudo_german.wav")

Das Wetter ist heute schön.
das vˈɛtɜ ɪst hˈɔøtə ʃˈøːn

Ich gehe gerne im Wald spazieren.
ɪç ɡˈeːə ɡˈɛɾnə ɪm vˈalt ʃpatsˈiːrən

Die Sonne scheint über den Bergen.
diː zˈɔnə ʃˈaɪnt ˌyːbɜ deːn bˈɛɾɡən

Wir trinken Kaffee am Nachmittag.
viːɾ tɾˈɪŋkən kˈafeː am nˈaxmɪtˌɑːk


Temperature=0.7
viːɾ tɾˈɪŋkən kˈafeː am

Temperature=1.0
ɡˈeːə ɡˈɛɾnə ɪm vˈalt

Temperature=1.5
ɡˈeːə ɡˈɛɾnə ɪm vˈalt
 > Downloading model to C:\Users\Gerardo\AppData\Local\tts\tts_models--de--thorsten--vits
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:Non

## 2. Train model on large text sample

In [None]:
def tts_pipeline(sample_text="german_text.txt", n=3, temperature=1.0, phonemize=True):
    # Check if output file already exists
    if phonemize:
        file_path = f"german_phonemes_t{temperature}_n{n}.wav"
    else:
        file_path = f"german_text_t{temperature}_n{n}.wav"
    if os.path.exists(file_path):
        return ipd.Audio(file_path)
    
    # 1. Load text sample
    if sample_text.endswith(".txt"):
        with open('audio/' + sample_text, "r", encoding="utf-8") as f:
            german_text = f.read()
    else:
        german_text = sample_text  # assume direct text input

    # 2. Phonemize if needed
    if phonemize:
        P = Phonemizer()
        phonemes = P.phonemize(german_text, voice='de')
    else:
        phonemes = german_text

    # 2. Train Markov model on large text sample
    model = train_markov_model([phonemes], n=n)  # trigram model

    # 3. Sample longer pseudo-German phoneme sequences
    sample = sample_markov(model, n=n, max_len=100, temperature=temperature)
    print(sample)

    # 4. Synthesize sampled phonemes with TTS
    tts = TTS("tts_models/de/thorsten/vits", progress_bar=False, gpu=False)
    tts.tts_to_file(text=sample, file_path='audio/' + file_path)
    return ipd.Audio('audio/' + file_path)

tts_pipeline(sample_text="german_text.txt", n=2, temperature=3, phonemize=True)

lˈɔøtə mɪt deːm rˈykən sˈɒɹi rˈʊft dɛɾ kˈɛnt ʃˌoːn zaɪt tsvˈantsɪç miːnˈuːtən ʃˈɪl mˈɑːl vɑːrən blˈɪnt ˈax diː mˈʊtɜ klˈatʃt ɪm hˈoːf dˈoː vˈeː ɡˈeːt lˈaŋzɑːm lˈeːɾt zɪç ˈaɪn fˈaɪeːrˌɑːbənt zˈɑːɡt ɛɾ dɾˈykt aʊf ˈɛŋlɪʃ dɛɾ hˈant diː ʃtɾˈɑːsə muːzˈiːk diː mˈaxən kˈaɪnən ˈʊnzˌɪn hˈɪntɜ ˌiːnən kvˈɛkt aɪn lˈiːfɜfˌɑːrɜ ʃlˈɛŋəlt zɪç diː ʃtˈat ɪn deːn fˈoːtɾʊks ʊnt dɛɾ ɡˌiːtarˈɪst rˈɔlt mɪt deːn hˈuːt aɪn klˈaɪnɜ ʃtˈamkʊndə vˈas dˈɛn dɛɾ bˈʊs fˈɛːɾt foːɾbˈaɪ ʃˈaʊ bɛɾlˈiːn ɪn ˌiːrɜ tˈaʃə nˈɪçt dɛɾ fˈɛɾnə hˈœɾt man jˈeːmandən lˈaxən am ʃvˈɪtsən ʊnt aɪn ʃtɾˈɑsənfɜkˌɔøfɜ ʃlˈiːst zˌaɪnən ʃtˈant pˈakt ɡətɾˈɛŋkə ʊnt zˈʊmt dɑːtsˈuː ˌaɪnə ˈax
 > tts_models/de/thorsten/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:No