In [None]:
import os
import requests
import sounddevice as sd
from piper.voice import PiperVoice
import re
import queue
import time
import threading
import wave
import numpy as np

class Text_to_Speech:
    """Streaming text-to-speech pipeline built on a Piper voice model.

    Text is split into sentences, each sentence is synthesized on a worker
    thread, and the resulting audio is played back on a second worker thread
    while the next sentence is being synthesized. The complete session can
    optionally be written to a mono 16-bit WAV file.

    Attributes:
        PIPER_MODEL: Path to the Piper ``.onnx`` voice model.
        PIPER_CONFIG: Path to the matching Piper JSON config.
        SAMPLE_RATE: Sample rate (Hz) used for playback and for WAV output.
        OVERLAP: Overlap duration in seconds. Currently unused by every
            method of this class; kept for backward compatibility.
        model: The loaded ``PiperVoice`` instance.
    """

    def __init__(self,
                 model_path: str = "scripts/exported_models/lj-med.onnx",
                 config_path: str = None):
        """Load the Piper voice.

        Args:
            model_path: Path to the ``.onnx`` voice model.
            config_path: Path to the voice's JSON config. Defaults to
                ``model_path + ".json"``.

        Raises:
            FileNotFoundError: If the model or config file does not exist.
        """
        #hi/hi_IN/pratham/medium
        # self.PIPER_MODEL = "./piper_model/hi_IN-priyamvada-medium.onnx"
        # self.PIPER_CONFIG = "./piper_model/hi_IN-priyamvada-medium.json"
        self.PIPER_MODEL = model_path
        self.PIPER_CONFIG = config_path if config_path is not None else f"{model_path}.json"
        self.SAMPLE_RATE = 22050  # fallback; replaced below by the voice's own rate when available
        self.OVERLAP = 0.15  # seconds; not used by any method in this class

        # Fail fast with a clear message. No automatic download is performed
        # (the download helper below is disabled), so announcing a download
        # here would be misleading and the loader error that followed was opaque.
        missing = [p for p in (self.PIPER_MODEL, self.PIPER_CONFIG) if not os.path.exists(p)]
        if missing:
            raise FileNotFoundError(
                "Piper model files not found: " + ", ".join(missing)
            )

        self.model = PiperVoice.load(model_path=self.PIPER_MODEL, config_path=self.PIPER_CONFIG)

        # Play and save at the rate the voice was trained at, so that a
        # non-22050 Hz voice is not played at the wrong speed/pitch.
        voice_rate = getattr(getattr(self.model, "config", None), "sample_rate", None)
        if voice_rate:
            self.SAMPLE_RATE = int(voice_rate)

    # NOTE: disabled download helper, kept for reference. Its URLs point at the
    # hi_IN-rohan-medium voice, which is not the default model configured above.
    # def download_model(self):

    #     os.makedirs("./piper_model", exist_ok=True)

    #     model_url = "https://huggingface.co/rhasspy/piper-voices/resolve/main/hi/hi_IN/rohan/medium/hi_IN-rohan-medium.onnx?download=true"
    #     model_path = "./piper_model/hi_IN-rohan-medium.onnx"

    #     print("üîΩ Downloading piper ONNX Quantized (60MB) model...")
    #     with requests.get(model_url, stream=True) as r:
    #         r.raise_for_status()
    #         with open(model_path, "wb") as f:
    #             for chunk in r.iter_content(chunk_size=8192):
    #                 f.write(chunk)
    #     print(f"‚úÖ Model saved to {model_path} ({os.path.getsize(model_path)//1_000_000} MB)\n")

    #     voice_url = "https://huggingface.co/rhasspy/piper-voices/resolve/main/hi/hi_IN/rohan/medium/hi_IN-rohan-medium.onnx.json?download=true"
    #     voice_path = "./piper_model/hi_IN-rohan-medium.json"

    #     print("üîΩ Downloading voice: piper config ...")
    #     with requests.get(voice_url, stream=True) as r:
    #         r.raise_for_status()
    #         with open(voice_path, "wb") as f:
    #             for chunk in r.iter_content(chunk_size=8192):
    #                 f.write(chunk)
    #     print(f"‚úÖ Voice saved to {voice_path} ({os.path.getsize(voice_path)//1_000_000} MB)")
    #     return "Done"

    def synthesizer_worker(self,q_text: queue.Queue, q_audio: queue.Queue, logs: list, full_audio_buffer: list = None):
        """Pull sentences from ``q_text``, synthesize them, and queue the audio.

        Runs until a ``None`` sentinel is read from ``q_text``. A ``None``
        sentinel is always forwarded to ``q_audio`` on exit, including when
        this worker exits because of an unexpected error, so the consumer of
        ``q_audio`` is never left waiting forever.

        Args:
            q_text: Queue of sentence strings, terminated by ``None``.
            q_audio: Queue receiving one list of float samples per sentence,
                terminated by ``None``.
            logs: List that receives one dict per sentence: type
                ``"synthesis"`` on success, ``"synthesis_error"`` on failure.
            full_audio_buffer: Optional list extended with every synthesized
                sample, in order.
        """
        try:
            while True:
                sentence = q_text.get()
                if sentence is None:
                    print("[Synthesizer] Got None, stopping...")
                    break

                print(f"[Synthesizer] Processing: '{sentence}'")
                synth_start = time.time()
                audio_data = []
                try:
                    for chunk in self.model.synthesize(sentence):
                        audio_data.extend(chunk.audio_float_array)
                except Exception as exc:
                    # Skip the failing sentence but keep consuming q_text;
                    # otherwise the producer blocks once the bounded queue fills.
                    print(f"[Synthesizer] Failed on '{sentence}': {exc}")
                    logs.append({
                        "type": "synthesis_error",
                        "text": sentence,
                        "error": str(exc),
                    })
                    continue
                synth_time = time.time() - synth_start
                print(f"[Synthesizer] Generated {len(audio_data)} samples in {synth_time:.3f}s")

                logs.append({
                    "type": "synthesis",
                    "text": sentence,
                    "duration_sec": synth_time,
                    "samples": len(audio_data),
                })

                # Compare against None explicitly: an empty list is a valid
                # (still-empty) buffer that must be filled.
                if full_audio_buffer is not None:
                    full_audio_buffer.extend(audio_data)

                q_audio.put(audio_data)
        finally:
            # Always release the consumer, even on an unexpected failure.
            q_audio.put(None)

    def player_worker(self,q_audio: queue.Queue, logs: list):
        """Pull audio chunks from ``q_audio`` and play them one after another.

        Each chunk is played to completion before the next one starts (no
        overlap or cross-fade is applied). Runs until a ``None`` sentinel is
        read. Playback errors are reported and logged, and the worker keeps
        draining the queue so the producer cannot block on a full queue.

        Args:
            q_audio: Queue of sample sequences, terminated by ``None``.
            logs: List that receives one dict per chunk: type ``"playback"``
                on success, ``"playback_error"`` on failure.
        """
        print("üéß Player thread started...")

        while True:
            audio_chunk = q_audio.get()
            if audio_chunk is None:
                print("üõë Player thread stopping.")
                break
            if len(audio_chunk) == 0:
                # Nothing to play for this sentence.
                continue

            play_start = time.time()
            try:
                sd.play(audio_chunk, samplerate=self.SAMPLE_RATE)
                # Block until the chunk has finished playing.
                sd.wait()
            except Exception as exc:
                print(f"[Player] Playback failed: {exc}")
                logs.append({
                    "type": "playback_error",
                    "error": str(exc),
                    "samples": len(audio_chunk),
                })
                continue
            play_end = time.time()

            logs.append({
                "type": "playback",
                "duration_sec": play_end - play_start,
                "samples": len(audio_chunk),
            })

    def save_wav(self, file_path, audio_data):
        """Write float audio samples to a mono 16-bit PCM WAV file.

        Samples are scaled by 32767 and clipped to the int16 range. Errors
        while writing are reported with a printed message rather than raised.

        Args:
            file_path: Destination path; parent directories are created.
            audio_data: Sequence (list or NumPy array) of float samples,
                nominally in ``[-1.0, 1.0]``.
        """
        # len() works for lists and NumPy arrays alike; truth-testing a
        # multi-element array raises ValueError.
        if audio_data is None or len(audio_data) == 0:
            print("‚ö†Ô∏è No audio data to save.")
            return

        print(f"üíæ Saving audio to {file_path}...")
        try:
            # Ensure directory exists
            os.makedirs(os.path.dirname(os.path.abspath(file_path)), exist_ok=True)

            # Flatten to 1-D because the file is written as a single channel.
            audio_np = np.asarray(audio_data, dtype=np.float32).ravel()

            # Convert to int16 PCM (scale -1.0 to 1.0 -> -32767 to 32767).
            # Clipping before the cast avoids integer wraparound.
            audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)

            with wave.open(file_path, "wb") as wf:
                wf.setnchannels(1)  # Mono
                wf.setsampwidth(2)  # 2 bytes (16 bit)
                wf.setframerate(self.SAMPLE_RATE)
                wf.writeframes(audio_int16.tobytes())

            print(f"‚úÖ Audio successfully saved to {file_path}")
        except Exception as e:
            print(f"‚ùå Error saving WAV file: {e}")

    def text_to_speech_stream(self,text: str, output_wav: str = None):
        """Speak a block of text, synthesizing and playing sentence by sentence.

        The text is split after ``.``, ``!`` or ``?`` followed by any
        whitespace (spaces, tabs or newlines). Synthesis and playback run on
        separate threads connected by bounded queues. If ``output_wav`` is
        given, all synthesized audio is also saved to that path.

        Args:
            text: Text to speak.
            output_wav: Optional path of a WAV file for the whole session.
        """
        print("üß© Starting Text ‚Üí Speech pipeline...\n")

        q_text = queue.Queue(maxsize=5)
        q_audio = queue.Queue(maxsize=5)
        logs = []
        full_audio_buffer = [] if output_wav else None

        # Daemon threads: an interrupted session must not keep the
        # interpreter alive.
        synth_thread = threading.Thread(
            target=self.synthesizer_worker,
            args=(q_text, q_audio, logs, full_audio_buffer),
            daemon=True,
        )
        play_thread = threading.Thread(
            target=self.player_worker, args=(q_audio, logs), daemon=True
        )

        synth_thread.start()
        play_thread.start()

        start_time = time.time()

        pieces = re.split(r'(?<=[.!?])\s+', (text or "").strip())
        sentences = [piece.strip() for piece in pieces if piece.strip()]

        sentinel_sent = False
        try:
            for sentence in sentences:
                q_text.put(sentence)

            q_text.put(None)
            sentinel_sent = True
            synth_thread.join()
            play_thread.join()
        finally:
            if not sentinel_sent:
                # Interrupted while enqueuing: drop the pending sentences and
                # tell the workers to stop so they are not left blocked on
                # their queues. This thread is the only producer, so the put
                # below cannot block after the drain.
                try:
                    while True:
                        q_text.get_nowait()
                except queue.Empty:
                    pass
                q_text.put(None)

        end_time = time.time()
        print("\n‚úÖ Pipeline complete.")
        print(f"‚è±Ô∏è Total runtime: {end_time - start_time:.2f}s\n")

        if output_wav and full_audio_buffer:
            self.save_wav(output_wav, full_audio_buffer)

In [14]:
# Build the TTS engine once for the notebook session; the constructor loads
# the Piper voice via PiperVoice.load, so later cells can reuse `tts`.
tts = Text_to_Speech()

In [15]:
# Demo: speak a five-sentence paragraph and also save the session to a WAV file.
# Adjacent string literals are concatenated into a single text argument.
tts.text_to_speech_stream(
    "My name is Priya and I work as a teacher in Mumbai. "
    "Every morning, I take the local train to reach my school. "
    "The weather here is very humid, especially during the monsoon season. "
    "After work, I often visit the market to buy vegetables and other groceries. "
    "My mother and father live nearby, so I visit them regularly on weekends.",
    output_wav="prove_4.wav",
)

üß© Starting Text ‚Üí Speech pipeline...

üéß Player thread started...
[Synthesizer] Processing: 'My name is Priya and I work as a teacher in Mumbai.'
[Synthesizer] Generated 69632 samples in 0.450s
[Synthesizer] Processing: 'Every morning, I take the local train to reach my school.'
[Synthesizer] Generated 59136 samples in 0.305s
[Synthesizer] Processing: 'The weather here is very humid, especially during the monsoon season.'
[Synthesizer] Generated 93184 samples in 0.434s
[Synthesizer] Processing: 'After work, I often visit the market to buy vegetables and other groceries.'
[Synthesizer] Generated 115712 samples in 0.554s
[Synthesizer] Processing: 'My mother and father live nearby, so I visit them regularly on weekends.'
[Synthesizer] Generated 89600 samples in 0.419s
[Synthesizer] Got None, stopping...


KeyboardInterrupt: 