# Sproochmaschinn Lab (TTS + STT)

This notebook uses the Sproochmaschinn API.

**What you edit:** small values like `MODEL`, `TEXT`, and filenames.  
**What you don't touch:** the helper functions.

---


In [None]:
import time, base64
import requests

BASE_URL = "https://sproochmaschinn.lu"

# Create a session (expires after inactivity). Re-running this cell requests a
# fresh session id from the API.
session_response = requests.post(f"{BASE_URL}/api/session")
# Stop here with the real HTTP error rather than an opaque JSON/KeyError below.
session_response.raise_for_status()
SESSION_ID = session_response.json()["session_id"]
print("SESSION_ID:", SESSION_ID)

def wait_result(request_id, poll_s=1.0, timeout_s=1800, request_timeout_s=30.0):
    """Poll the result endpoint until the request reports ``"completed"``.

    Args:
        request_id: Id returned when the job was submitted; interpolated into
            ``{BASE_URL}/api/result/{request_id}``.
        poll_s: Seconds to sleep between two polls.
        timeout_s: Overall budget in seconds before giving up.
        request_timeout_s: Timeout passed to each individual GET, so a single
            stalled connection cannot block past the overall budget forever.

    Returns:
        The parsed JSON payload (a dict) whose ``"status"`` is ``"completed"``.

    Raises:
        TimeoutError: if no completed result arrived within ``timeout_s``.
        RuntimeError: if the payload reports a failure status.
        requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
    """
    # Monotonic clock: elapsed time is unaffected by system clock adjustments.
    deadline = time.monotonic() + timeout_s
    while True:
        response = requests.get(
            f"{BASE_URL}/api/result/{request_id}",
            timeout=request_timeout_s,
        )
        # An HTTP error will not fix itself by polling; fail immediately.
        response.raise_for_status()
        payload = response.json()
        status = payload.get("status")
        if status == "completed":
            return payload
        # NOTE(review): "failed"/"error" are assumed failure status names --
        # TODO confirm against the API documentation.
        if status in ("failed", "error"):
            raise RuntimeError(f"Request {request_id} ended with status {status!r}: {payload}")
        if time.monotonic() > deadline:
            raise TimeoutError("Timed out waiting for result")
        time.sleep(poll_s)

def save_wav_from_base64(b64_data, outfile):
    """Decode base64 data and write the resulting bytes to ``outfile``.

    The payload is decoded before the file is opened, so a malformed payload
    raises without creating or truncating ``outfile``. The bytes are written
    as-is; nothing checks that they actually form a WAV file.

    Args:
        b64_data: Base64 text or bytes, as accepted by ``base64.b64decode``.
        outfile: Path of the file to write in binary mode (overwritten if it
            already exists).

    Raises:
        binascii.Error: if ``b64_data`` is incorrectly padded.
        TypeError: if ``b64_data`` is not a str or bytes-like object.
        OSError: if ``outfile`` cannot be opened for writing.
    """
    audio_bytes = base64.b64decode(b64_data)
    with open(outfile, "wb") as f:
        f.write(audio_bytes)


---

# Part A — Text-to-Speech (TTS)

Edit the **three** variables below:
- `TEXT` (Luxembourgish text)
- `MODEL` (`"claude"`, `"max"`, or `"maxine"`)
- `OUTFILE` (where to save the wav)

Then run the cell.

---


In [None]:

from IPython.display import Audio, display

TEXT = "Moien, wéi geet et?"
MODEL = "maxine"     # "claude" | "max" | "maxine"
OUTFILE = "tts.wav"

# Submit the synthesis job; the JSON answer carries the request id to poll.
tts_response = requests.post(
    f"{BASE_URL}/api/tts/{SESSION_ID}",
    json={"text": TEXT, "model": MODEL},
)
# Report a rejected request as an HTTP error instead of a later KeyError.
tts_response.raise_for_status()
req = tts_response.json()

# Block until the job is done, then pull the base64 audio out of the result.
res = wait_result(req["request_id"], poll_s=1.0, timeout_s=300)
b64 = res["result"]["data"]

save_wav_from_base64(b64, OUTFILE)
print("Saved:", OUTFILE)

# Show an inline audio player for the file that was just written.
display(Audio(OUTFILE))


---

# Part B — Speech-to-Text (STT)

You have two choices:

## Option 1: Use a sample file from the repo
Put wav files in a folder called `data/` (for Binder), then set:
- `INPUT_FILE = "data/audio1.wav"` (replace `audio1.wav` with the name of your file)

## Option 2: Upload a file manually
Drag and drop your file into the file menu on the left side of the screen, then set:
- `INPUT_FILE = "whatever_you_uploaded.wav"`

You can also toggle diarization (speaker identification) by setting `ENABLE_SPEAKER_ID` to `True` or `False`.

---


In [None]:
INPUT_FILE = "data/audio1.wav"   # e.g. "data/audio1.wav" or "my_audio.wav"
ENABLE_SPEAKER_ID = True          # True / False

# The upload happens inside the `with` block so the file is still open while
# requests reads it.
with open(INPUT_FILE, "rb") as f:
    stt_response = requests.post(
        f"{BASE_URL}/api/stt/{SESSION_ID}",
        files={"audio": f},
        # The Python boolean is sent as the text "true" / "false".
        data={"enable_speaker_identification": str(ENABLE_SPEAKER_ID).lower()},
    )

# Report a rejected upload as an HTTP error instead of a later KeyError.
stt_response.raise_for_status()
req = stt_response.json()

res = wait_result(req["request_id"], poll_s=1.0, timeout_s=1800)

print("\nTRANSCRIPT:\n")
# Falls back to an empty string when the result has no "text" entry.
print(res["result"].get("text", ""))


---

# Part C — Exports (optional)

Edit the options and run to generate files:
- `transcript.txt`
- `transcript_sentence.txt` (or `transcript_word.txt` / `transcript_segment.txt`, depending on `TIMESTAMPS_LEVEL`)
- `transcript.srt` (only when `DO_SRT = True`)

---


In [None]:
# Export options (edit these).
# TIMESTAMPS_LEVEL is sent as "level" to the timestamps export and is also
# used in that export's output filename.
TIMESTAMPS_LEVEL = "sentence"     # "word" | "sentence" | "segment"
# Sent as "include_speakers" to the plaintext, timestamps and SRT exports.
INCLUDE_SPEAKERS = True           # True / False
# Sent as "include_confidence" to the timestamps export only.
INCLUDE_CONFIDENCE = True         # True / False
# When False, the SRT export is skipped.
DO_SRT = True                     # True / False

# Uses the most recent STT request from the previous cell.
# NOTE: `req` is simply whichever request was submitted last -- the TTS cell
# assigns the same name, so run the STT cell right before this one.
request_id = req["request_id"]
# Common URL prefix for every export endpoint of this request.
base = f"{BASE_URL}/api/result/{request_id}/export"

def download_text(url, params, outfile):
    """Download one export and save it as a UTF-8 text file.

    If the response's content type contains ``application/json``, the text is
    taken from the ``"content"`` key of the JSON body (empty string when the
    key is missing); otherwise the raw response text is used.

    Args:
        url: Export endpoint to GET.
        params: Query parameters passed to ``requests.get``.
        outfile: Path of the text file to write (overwritten if it exists).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never saved as if it were a transcript.
    """
    response = requests.get(url, params=params)
    response.raise_for_status()
    content_type = (response.headers.get("content-type") or "").lower()
    if "application/json" in content_type:
        content = response.json().get("content", "")
    else:
        content = response.text
    # The content is fully resolved before the file is opened, so a parsing
    # failure above cannot leave a truncated output file behind.
    with open(outfile, "w", encoding="utf-8") as f:
        f.write(content)
    print("Saved:", outfile)

# The speaker flag is sent as the lowercase text "true" / "false".
speakers_flag = str(INCLUDE_SPEAKERS).lower()

# One (url, query parameters, output file) entry per export, in download order.
export_jobs = [
    (
        f"{base}/plaintext",
        {"include_speakers": speakers_flag},
        "transcript.txt",
    ),
    (
        f"{base}/timestamps",
        {
            "level": TIMESTAMPS_LEVEL,
            "include_speakers": speakers_flag,
            "include_confidence": str(INCLUDE_CONFIDENCE).lower(),
        },
        f"transcript_{TIMESTAMPS_LEVEL}.txt",
    ),
]

# The SRT export is optional and always comes last.
if DO_SRT:
    export_jobs.append(
        (
            f"{base}/srt",
            {"include_speakers": speakers_flag},
            "transcript.srt",
        )
    )

for export_url, export_params, export_path in export_jobs:
    download_text(export_url, export_params, export_path)
