In [85]:
import os
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "/opt/homebrew/lib/libespeak.1.dylib"  # adjust if path differs
os.environ["PATH"] += os.pathsep + "/opt/homebrew/bin"
import numpy as np
from collections import defaultdict
import random
from phonemizer import phonemize
from phonemizer.separator import Separator
from TTS.api import TTS
import IPython.display as ipd

## Functions

In [None]:
def train_markov_model(phoneme_sequences, n=2):
    """Train an n-gram Markov chain on a list of phoneme sequences."""
    transitions = defaultdict(lambda: defaultdict(int))
    for seq in phoneme_sequences:
        tokens = seq.split()
        for i in range(len(tokens) - n):
            prefix = tuple(tokens[i:i+n-1])
            next_token = tokens[i+n-1]
            transitions[prefix][next_token] += 1
    # Normalize to probabilities
    model = {
        prefix: {k: v / sum(d.values()) for k, v in d.items()}
        for prefix, d in transitions.items()
    }
    return model

def sample_markov(model, n=2, max_len=50, temperature=1.0):
    """Generate a random phoneme sequence."""
    prefix = random.choice(list(model.keys()))
    result = list(prefix)
    for _ in range(max_len - n):
        next_probs = model.get(prefix)
        if not next_probs:
            break
        tokens, probs = zip(*next_probs.items())
        # Apply temperature for randomness control
        probs = np.array(probs) ** (1/temperature)
        probs /= probs.sum()
        next_token = np.random.choice(tokens, p=probs)
        result.append(next_token)
        prefix = tuple(result[-(n-1):])
    return " ".join(result)

## Test TTS with phonemes based on real text

In [None]:
german_texts = [
    "Das Wetter ist heute schön.",
    "Ich gehe gerne im Wald spazieren.",
    "Die Sonne scheint über den Bergen.",
    "Wir trinken Kaffee am Nachmittag."
]

# German phonemization
phonemes = phonemize(
    german_texts,
    language="de",
    backend="espeak",
    separator=Separator(phone=" ", word=" | ", syllable=""),
)

for t, p in zip(german_texts, phonemes):
    print(t)
    print(p)
    print()

# Load a German VITS model (downloads automatically on first run)
tts = TTS("tts_models/de/thorsten/vits", progress_bar=False, gpu=False)
wav = tts.tts(phonemes[0], use_phonemes=True)
# Save to file
tts.tts_to_file(text=phonemes[0], file_path="pseudo_german.wav", use_phonemes=True)
# In a notebook, play it inline:
ipd.Audio("pseudo_german.wav")

Das Wetter ist heute schön.
d a s  | v ɛ t ɜ  |  ɪ s t  | h ɔø t ə  | ʃ øː n  | 

Ich gehe gerne im Wald spazieren.
 ɪ ç  | ɡ eː ə  | ɡ ɛ ɾ n ə  |  ɪ m  | v a l t  | ʃ p a ts iː r ə n  | 

Die Sonne scheint über den Bergen.
d iː  | z ɔ n ə  | ʃ aɪ n t  |  ʏ b ɜ  | d eː n  | b ɛ ɾ ɡ ə n  | 

Wir trinken Kaffee am Nachmittag.
v iː ɾ  | t ɾ ɪ ŋ k ə n  | k a f eː  |  a m  | n a x m ɪ t ɑː k  | 

 > Text splitted to sentences.
['d a s  | v ɛ t ɜ  |  ɪ s t  | h ɔø t ə  | ʃ øː n  |']
 > Processing time: 2.7086379528045654
 > Real-time factor: 0.6552293625958912
 > Text splitted to sentences.
['d a s  | v ɛ t ɜ  |  ɪ s t  | h ɔø t ə  | ʃ øː n  |']
 > Processing time: 2.644711971282959
 > Real-time factor: 0.6257070704591121


In [9]:
# 1. Collect phoneme sequences
phoneme_sequences = phonemes  # from step 2

# 2. Train Markov model
model = train_markov_model(phoneme_sequences, n=3)  # trigram

# 3. Sample
for temp in [0.7, 1.0, 1.5]:
    print(f"\nTemperature={temp}")
    print(sample_markov(model, n=3, temperature=temp))



Temperature=0.7
| d eː n | b ɛ ɾ ɡ ə n | k a f eː | a m | n a x m ɪ t ɑː k

Temperature=1.0
ɾ | t ɾ ɪ ŋ k ə n | b ɛ ɾ ɡ ə n | b ɛ ɾ n ə | ɡ ɛ ɾ ɡ ə n | k a f eː | a m | n a x m ɪ t ɑː k

Temperature=1.5
k ə n | k a f eː | a m | n a x m ɪ t ɑː k


In [83]:
# Load text.txt and split string by periods
with open("text.txt", "r", encoding="utf-8") as f:
    text_data = f.read()
    sentences = text_data.split(". ")
    # remove nonalphabetic characters except spaces
    sentences = [''.join(c for c in s if c.isalpha() or c.isspace()).strip() for s in sentences if s.strip()]
    # remove \n characters
    sentences = [s.replace("\n", " ") for s in sentences if s]


# Phonemize
phonemes = phonemize(
    sentences,
    language="de",
    backend="espeak",
    separator=Separator(phone=" ", word=" | ", syllable=""),
)

# 2. Train Markov model
model = train_markov_model(phonemes, n=3)  # trigram
letter_model = train_markov_model(sentences, n=3)  # trigram

# 3. Sample
markov_phonemes = sample_markov(model, n=3, max_len=100, temperature=0.7)
markov_phonemes
markov_letters = sample_markov(letter_model, n=1, max_len=100, temperature=1)
markov_letters




'Ein kurzer Windstoß'

In [69]:
wav = tts.tts(
    text=markov_letters,
    speaker=None,
    language=None,
    use_phonemes=False,
)

# Save to file
tts.tts_to_file(text=markov_letters, file_path="markov_german.wav", use_phonemes=False)

 > Text splitted to sentences.
['du wieder Nachtschicht oder was Nee heute Grillen im Hof Na dann viel Spaß Und mach nicht so laut Musik die Nachbarn meckern wieder Schon gut Chef Ein junger Typ telefoniert laut Ja ich weiß du wartest schon seit zwanzig']
 > Processing time: 4.706494092941284
 > Real-time factor: 0.42714107157291453
 > Text splitted to sentences.
['du wieder Nachtschicht oder was Nee heute Grillen im Hof Na dann viel Spaß Und mach nicht so laut Musik die Nachbarn meckern wieder Schon gut Chef Ein junger Typ telefoniert laut Ja ich weiß du wartest schon seit zwanzig']
 > Processing time: 4.3707709312438965
 > Real-time factor: 0.3868886048956577


'markov_german.wav'

In [None]:
# TODO>: play with different models or encodings
# Add different voices, e.g. F5-TTS-German
# Create platform for generating sounds based on parameters (volume, duration, pitch, speed)