# Indic Parler-TTS - Geetanjali Audio Generation

Generate Sanskrit verse recitations for the Bhagavad Gita using AI4Bharat's Indic Parler-TTS.

**Workflow:**
1. Run setup cells (install, login, load model)
2. Upload chapter metadata JSON
3. Generate audio (with checkpointing)
4. Preview and download WAV files
5. **Locally:** Run `process_tts_audio.py` to convert the downloaded WAV files to MP3

**Prerequisites:**
- Google Colab: Runtime → Change runtime type → T4 GPU
- Hugging Face account with an access token (create one at https://huggingface.co/settings/tokens)

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 1: Install Dependencies
# ═══════════════════════════════════════════════════════════════

# parler-tts is installed directly from its GitHub repository.
!pip install -q git+https://github.com/huggingface/parler-tts.git
# soundfile writes the generated WAV files and tqdm draws the progress bar.
# accelerate is not imported directly in this notebook — presumably needed by
# the model-loading stack; TODO confirm.
!pip install -q soundfile accelerate tqdm

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 2: HuggingFace Login
# ═══════════════════════════════════════════════════════════════

from huggingface_hub import login
from getpass import getpass

# Get token at: https://huggingface.co/settings/tokens
# getpass reads the token without echoing it into the notebook output.
# Surrounding whitespace (easy to pick up when copy-pasting) is stripped so it
# is not sent as part of the token.
HF_TOKEN = getpass("Enter your HuggingFace token: ").strip()
if not HF_TOKEN:
    # Fail early with an actionable message instead of passing an empty token on.
    raise ValueError("No HuggingFace token entered; re-run this cell and paste your token.")
login(token=HF_TOKEN)
print("✓ Logged in successfully!")

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 3: Load Model (~2-3 minutes)
# ═══════════════════════════════════════════════════════════════

import torch
import soundfile as sf
import json
from pathlib import Path
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from IPython.display import Audio, display
from tqdm.notebook import tqdm

# Choose the compute device: the first CUDA GPU when one is available,
# otherwise fall back to the CPU. The chosen hardware is reported either way.
if torch.cuda.is_available():
    device = "cuda:0"
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    device = "cpu"
    print("GPU: CPU")

print("Loading model...")
# Load the pretrained checkpoint, then move it onto the selected device.
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
model = model.to(device)

# Two tokenizers are needed: one for the text to be spoken, and one (named by
# the model's text-encoder config) for the voice description.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
desc_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)

def generate_audio(text: str, description: str, output_path: str = None):
    """Synthesize speech for ``text`` in the voice given by ``description``.

    Args:
        text: Text to be spoken; tokenized with ``tokenizer``.
        description: Voice description; tokenized with ``desc_tokenizer``.
        output_path: Optional destination. When truthy, the audio is also
            written there with ``soundfile`` (the value is passed through
            ``str()``, so path objects are accepted).

    Returns:
        A ``(audio, sampling_rate)`` tuple: the generated audio as a squeezed
        NumPy array, and ``model.config.sampling_rate``.
    """
    description_inputs = desc_tokenizer(description, return_tensors="pt").to(device)
    prompt_inputs = tokenizer(text, return_tensors="pt").to(device)
    sample_rate = model.config.sampling_rate

    # Pure inference: gradient tracking is switched off for the generate call.
    with torch.no_grad():
        waveform = model.generate(
            input_ids=description_inputs.input_ids,
            attention_mask=description_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
        )

    # Bring the result back to host memory and drop singleton dimensions.
    audio = waveform.cpu().numpy().squeeze()
    if not output_path:
        return audio, sample_rate

    sf.write(str(output_path), audio, sample_rate)
    return audio, sample_rate

print("✓ Model loaded and ready!")

---
# Upload & Generate

**Export command (run locally):**
```bash
docker compose exec backend python /app/scripts/export_tts_metadata.py --chapter 2 --pretty > chapter_2_metadata.json
```

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 4: Upload Chapter Metadata
# ═══════════════════════════════════════════════════════════════

from google.colab import files

print("Upload your chapter metadata JSON file:")
uploaded = files.upload()

# The upload result may come back empty (e.g. the dialog was dismissed);
# report that clearly instead of failing with a bare IndexError below.
if not uploaded:
    raise RuntimeError("No file uploaded; re-run this cell and select the chapter metadata JSON.")

# Only the first uploaded file is used.
metadata_file = next(iter(uploaded))
# Read explicitly as UTF-8 rather than the platform default: the metadata
# carries Sanskrit verse text.
with open(metadata_file, encoding="utf-8") as f:
    metadata = json.load(f)

# Extract chapter from metadata (single source of truth)
CHAPTER = metadata["export_info"]["chapters"][0]
# Looked up by canonical id in the generation step.
verses_data = metadata["verses"]
# Per-verse entries for this chapter; the "chapters" mapping is keyed by the
# chapter number as a string.
chapter_metadata = metadata["chapters"][str(CHAPTER)]

print(f"\n✓ Chapter {CHAPTER}: {len(chapter_metadata)} verses, {len(verses_data)} texts")

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 5: Generate Audio (with checkpointing)
# ═══════════════════════════════════════════════════════════════

# WAV files for this chapter are written under audio/verses/<two-digit chapter>.
output_dir = Path(f"audio/verses/{CHAPTER:02d}")
output_dir.mkdir(parents=True, exist_ok=True)

# Resume from checkpoint if exists
checkpoint_file = output_dir / ".checkpoint.json"
# The checkpoint is written here first and then renamed over the real file, so
# an interruption mid-write cannot leave a truncated checkpoint behind.
checkpoint_tmp = output_dir / ".checkpoint.json.tmp"
completed = set()
if checkpoint_file.exists():
    try:
        with open(checkpoint_file, encoding="utf-8") as f:
            completed = set(json.load(f))
    except (OSError, ValueError, TypeError) as e:
        # An unreadable or malformed checkpoint should not block the run:
        # warn and regenerate everything instead of crashing the cell.
        print(f"⚠ Ignoring unreadable checkpoint ({e}); starting from scratch")
        completed = set()
    else:
        print(f"Resuming: {len(completed)}/{len(chapter_metadata)} already done")

# Estimate time: ~20s per verse
remaining = len(chapter_metadata) - len(completed)
print(f"Generating {remaining} verses (~{remaining * 20 // 60} minutes)...\n")

# "skipped" counts the verses already recorded in the checkpoint.
results = {"success": 0, "failed": [], "skipped": len(completed)}

for verse_meta in tqdm(chapter_metadata, desc=f"Chapter {CHAPTER}"):
    cid = verse_meta["canonical_id"]

    # Already generated in a previous run.
    if cid in completed:
        continue

    if cid not in verses_data:
        results["failed"].append(f"{cid}: no text")
        continue

    try:
        generate_audio(verses_data[cid], verse_meta["tts_description"], output_dir / f"{cid}.wav")
        completed.add(cid)
        # Persist progress after every verse: write the temp file, then swap
        # it into place.
        with open(checkpoint_tmp, "w", encoding="utf-8") as f:
            json.dump(list(completed), f)
        checkpoint_tmp.replace(checkpoint_file)
        results["success"] += 1
    except Exception as e:
        # Best-effort batch: record the failure and move on to the next verse.
        results["failed"].append(f"{cid}: {e}")

print("\n" + "═" * 50)
print(f"✓ Success: {results['success']}")
print(f"⏭ Skipped: {results['skipped']}")
print(f"✗ Failed: {len(results['failed'])}")
for failure in results["failed"]:
    print(f"  - {failure}")

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 6: Preview Audio (first 5 files)
# ═══════════════════════════════════════════════════════════════

print(f"Preview - Chapter {CHAPTER}:")
print("═" * 50)

# Take the first five WAV files (in sorted path order) from the chapter
# directory and show an inline audio player for each.
preview_clips = sorted(output_dir.glob("*.wav"))[:5]
for clip in preview_clips:
    print(f"\n{clip.stem}:")
    player = Audio(str(clip))
    display(player)

In [None]:
# ═══════════════════════════════════════════════════════════════
# STEP 7: Download WAV Files
# ═══════════════════════════════════════════════════════════════

import shutil

# Archive base name; make_archive appends the ".zip" extension itself.
wav_zip = f"chapter_{CHAPTER:02d}_wav"
# Zip the chapter directory relative to the audio/ root, so paths inside the
# archive start at "verses/<chapter>".
shutil.make_archive(
    base_name=wav_zip,
    format="zip",
    root_dir="audio",
    base_dir=f"verses/{CHAPTER:02d}",
)
files.download(f"{wav_zip}.zip")
print(f"\n✓ Downloaded: {wav_zip}.zip")