# Indic Parler-TTS - Geeta Dhyanam Audio Generation

Generate Sanskrit recitations for the 9 sacred Geeta Dhyanam (invocation) verses.

**Workflow:**
1. Run setup cells (install, login, load model)
2. Upload the dhyanam metadata JSON (`dhyanam_metadata.json`, produced by the export command below)
3. Generate audio (~3 minutes for 9 verses)
4. Preview and download WAV files
5. **Locally:** Run `process_dhyanam_audio.py` to convert to MP3

**Prerequisites:**
- Runtime → Change runtime type → T4 GPU
- Hugging Face account with an access token (create one at https://huggingface.co/settings/tokens)

**Export command (run locally):**
```bash
docker compose exec backend python /app/scripts/export_dhyanam_metadata.py --pretty > dhyanam_metadata.json
```

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 1: Install Dependencies
# ═══════════════════════════════════════════════════════════════

!pip install -q git+https://github.com/huggingface/parler-tts.git
!pip install -q soundfile accelerate tqdm

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 2: HuggingFace Login
# ═══════════════════════════════════════════════════════════════

from getpass import getpass

from huggingface_hub import login

# Create an access token at: https://huggingface.co/settings/tokens
# getpass is used so the token is typed into a prompt rather than stored in the cell source.
token_prompt = "Enter your HuggingFace token: "
HF_TOKEN = getpass(token_prompt)
login(token=HF_TOKEN)
print("✓ Logged in successfully!")

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 3: Load Model (~2-3 minutes)
# ═══════════════════════════════════════════════════════════════

import json
from pathlib import Path

import soundfile as sf
import torch
from IPython.display import Audio, display
from parler_tts import ParlerTTSForConditionalGeneration
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

# Pick the device once: first CUDA GPU when available, otherwise CPU.
has_cuda = torch.cuda.is_available()
gpu_label = torch.cuda.get_device_name(0) if has_cuda else "CPU"
print(f"GPU: {gpu_label}")
device = "cuda:0" if has_cuda else "cpu"

print("Loading Indic Parler-TTS model...")
checkpoint_id = "ai4bharat/indic-parler-tts"
model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint_id)
model = model.to(device)

# Two tokenizers are needed: one loaded from the TTS checkpoint itself (used for
# the text to be spoken) and one loaded from the checkpoint named in the model's
# text-encoder config (used for the voice description).
tokenizer = AutoTokenizer.from_pretrained(checkpoint_id)
text_encoder_checkpoint = model.config.text_encoder._name_or_path
desc_tokenizer = AutoTokenizer.from_pretrained(text_encoder_checkpoint)


def generate_audio(text: str, description: str, output_path: "str | Path | None" = None):
    """Generate audio for Sanskrit text with voice description.

    Args:
        text: Text to be spoken. Tokenized with ``tokenizer`` and passed to the
            model as the prompt.
        description: Voice description. Tokenized with ``desc_tokenizer`` and
            passed to the model as its main input.
        output_path: Optional destination file (string or ``Path``). When given
            and non-empty, the audio is also written there with ``soundfile``
            at the model's sampling rate.

    Returns:
        A tuple ``(audio, sampling_rate)``: the generated waveform as a NumPy
        array with singleton dimensions squeezed out, and
        ``model.config.sampling_rate``.
    """
    sampling_rate = model.config.sampling_rate
    desc_ids = desc_tokenizer(description, return_tensors="pt").to(device)
    prompt_ids = tokenizer(text, return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generation = model.generate(
            input_ids=desc_ids.input_ids,
            attention_mask=desc_ids.attention_mask,
            prompt_input_ids=prompt_ids.input_ids,
            prompt_attention_mask=prompt_ids.attention_mask,
        )

    # Cast to float32 before leaving torch: bfloat16 tensors cannot be converted
    # to NumPy and soundfile does not write float16. This is a no-op when the
    # model already produces float32.
    audio = generation.cpu().float().numpy().squeeze()
    if output_path:
        sf.write(str(output_path), audio, sampling_rate)
    return audio, sampling_rate


print("✓ Model loaded and ready!")

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 4: Upload Dhyanam Metadata
# ═══════════════════════════════════════════════════════════════

from google.colab import files

print("Upload your dhyanam_metadata.json file:")
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was cancelled): fail with a clear message
# instead of an IndexError when picking the first file name below.
if not uploaded:
    raise ValueError(
        "No file uploaded - re-run this cell and select dhyanam_metadata.json"
    )

# Only one file is expected; use the first uploaded file name.
metadata_file = next(iter(uploaded))
# The metadata carries Sanskrit text, so read it explicitly as UTF-8 rather than
# relying on the platform's default encoding.
with open(metadata_file, encoding="utf-8") as f:
    metadata = json.load(f)

# Extract from metadata:
#   verses_data    - looked up by canonical id to get the text to synthesize
#   verse_metadata - iterated per verse (canonical_id, theme, tts_description)
#   export_info    - summary fields printed below (and optional output_dir)
verses_data = metadata["verses"]
verse_metadata = metadata["metadata"]
export_info = metadata["export_info"]

print(f"\n✓ Geeta Dhyanam: {export_info['verse_count']} verses")
print(f"   {export_info['description']}")
print(f"   Voice: {export_info['voice']}")

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 5: Generate Audio (9 verses, ~3 minutes)
# ═══════════════════════════════════════════════════════════════

output_dir = Path(export_info.get("output_dir", "audio/dhyanam"))
output_dir.mkdir(parents=True, exist_ok=True)

# Resume from checkpoint if exists
checkpoint_file = output_dir / ".checkpoint.json"
completed = set()
if checkpoint_file.exists():
    try:
        with open(checkpoint_file, encoding="utf-8") as ckpt:
            completed = set(json.load(ckpt))
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (e.g. a truncated file left by an
        # interrupted run); TypeError covers content that is not a list of ids.
        print("⚠ Checkpoint unreadable - starting from scratch")
        completed = set()

    # Only trust ids that belong to the current metadata and whose WAV file is
    # still on disk, so deleted or stale outputs are regenerated and the
    # counts below stay accurate.
    known_ids = {m["canonical_id"] for m in verse_metadata}
    completed = {
        done_id
        for done_id in completed
        if done_id in known_ids and (output_dir / f"{done_id}.wav").exists()
    }
    print(f"Resuming: {len(completed)}/{len(verse_metadata)} already done")

# Estimate time: ~20s per verse
remaining = sum(1 for m in verse_metadata if m["canonical_id"] not in completed)
print(f"Generating {remaining} verses (~{remaining * 20 // 60} minutes)...\n")

results = {"success": 0, "failed": [], "skipped": len(completed)}

for verse_meta in tqdm(verse_metadata, desc="Geeta Dhyanam"):
    cid = verse_meta["canonical_id"]

    # Already generated in a previous run
    if cid in completed:
        continue

    if cid not in verses_data:
        results["failed"].append(f"{cid}: no text")
        continue

    try:
        # Get normalized text and TTS description
        text = verses_data[cid]
        description = verse_meta["tts_description"]

        print(f"\n{cid} - {verse_meta['theme']}")
        print(f"  Text: {text[:60]}...")

        generate_audio(text, description, output_dir / f"{cid}.wav")

        # Persist progress after every verse so an interrupted session can resume
        completed.add(cid)
        with open(checkpoint_file, "w", encoding="utf-8") as ckpt:
            json.dump(list(completed), ckpt)
        results["success"] += 1

    except Exception as e:
        # Per-verse boundary: record the failure and keep going with the
        # remaining verses; all failures are reported in the summary below.
        results["failed"].append(f"{cid}: {e}")

print("\n" + "═" * 50)
print(f"✓ Success: {results['success']}")
print(f"⏭ Skipped: {results['skipped']}")
print(f"✗ Failed: {len(results['failed'])}")
if results["failed"]:
    for failure in results["failed"]:
        print(f"  - {failure}")

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 6: Preview All 9 Verses
# ═══════════════════════════════════════════════════════════════

print("Preview - Geeta Dhyanam:")
print("═" * 50)

# WAV files are listed in sorted file-name order; the verse number shown is the
# position in that listing.
wav_paths = sorted(output_dir.glob("*.wav"))
for verse_number, wav_path in enumerate(wav_paths, start=1):
    # Use the theme of the first metadata entry whose canonical id matches the
    # file name (without extension); fall back to "Unknown" when none matches.
    theme = "Unknown"
    for entry in verse_metadata:
        if entry["canonical_id"] == wav_path.stem:
            theme = entry["theme"]
            break

    print(f"\nVerse {verse_number}: {theme}")
    display(Audio(str(wav_path)))

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 7: Download WAV Files
# ═══════════════════════════════════════════════════════════════

import shutil

wav_zip = "dhyanam_wav"
# Archive the directory the generation step actually wrote to. `output_dir` may
# come from export_info["output_dir"], so do not hard-code "audio/dhyanam".
# With the default output_dir this yields the same archive layout as before:
# a top-level folder named after the output directory (everything in that
# directory is included, not only the WAV files).
shutil.make_archive(
    wav_zip,
    "zip",
    root_dir=str(output_dir.parent),
    base_dir=output_dir.name,
)
files.download(f"{wav_zip}.zip")
print(f"\n✓ Downloaded: {wav_zip}.zip")
print("\nNext step (run locally):")
print(
    f"  docker compose exec backend python /app/scripts/process_dhyanam_audio.py /app/{wav_zip}.zip"
)