In [7]:
import numpy as np
import re
from transformers import pipeline
from IPython.display import Audio, display


In [8]:
def split_sentences(text):
    """Split ``text`` into sentences at terminal punctuation.

    A sentence boundary is any run of whitespace (spaces or newlines) that
    directly follows ``.``, ``!``, ``?``, or the Devanagari danda / double
    danda (U+0964 / U+0965). The terminator stays attached to the sentence
    it ends.

    Args:
        text: The string to split. Leading and trailing whitespace is ignored.

    Returns:
        A list of non-empty sentence strings. Empty or whitespace-only input
        yields an empty list.
    """
    # \u0964 / \u0965 are the danda and double danda, which end sentences in
    # Hindi text and would otherwise leave a whole paragraph as one sentence.
    parts = re.split(r'(?<=[.!?\u0964\u0965])\s+', text.strip())
    # re.split returns [''] for an empty string; drop such fragments so callers
    # never receive an empty sentence.
    return [part for part in parts if part]

In [9]:
def tts_play_clean(text, model_name, pause_sec=0.4):
    """Synthesize ``text`` sentence by sentence and play it inline.

    The text is split with ``split_sentences``; each sentence is synthesized
    separately, followed by a block of silence, and the concatenated waveform
    is peak-normalized before being shown as an ``Audio`` widget.

    Args:
        text: The text to speak.
        model_name: Model identifier passed to the ``text-to-speech`` pipeline.
        pause_sec: Seconds of silence appended after every sentence.

    Raises:
        ValueError: If ``text`` contains no non-blank sentence.
    """
    # Drop blank fragments so the model is never asked to speak an empty string.
    sentences = [sent for sent in split_sentences(text) if sent.strip()]
    if not sentences:
        # Fail before loading the model: there is nothing to synthesize.
        raise ValueError("text contains no sentences to synthesize")

    tts = pipeline("text-to-speech", model=model_name)

    chunks = []
    sr = None

    for sent in sentences:
        out = tts(sent)

        # Force 1-D audio regardless of the shape the pipeline returns.
        audio = np.asarray(out["audio"], dtype=np.float32).ravel()
        sr = out["sampling_rate"]
        chunks.append(audio)

        # Silence between sentences.
        chunks.append(np.zeros(int(sr * pause_sec), dtype=np.float32))

    final_audio = np.concatenate(chunks)

    # Light peak normalization. Skip it when the signal is empty or all zeros,
    # otherwise the division would fill the waveform with NaNs.
    peak = float(np.max(np.abs(final_audio))) if final_audio.size else 0.0
    if peak > 0.0:
        final_audio = final_audio / peak

    display(Audio(final_audio, rate=sr))


In [10]:
# Build the NLLB-200 (distilled, 600M) translation pipeline once at cell scope;
# translate() below reuses this single instance for every target language.
translator = pipeline("translation", model="facebook/nllb-200-distilled-600M")

def translate(text):
    """Translate English ``text`` into Hindi and Gujarati.

    Uses the module-level ``translator`` pipeline with ``eng_Latn`` as the
    source language, first for ``hin_Deva`` and then for ``guj_Gujr``.

    Returns:
        A ``(hindi, gujarati)`` tuple of translated strings.
    """
    def _translate_into(target_code):
        # The pipeline returns a list of results; take the first one's text.
        results = translator(text, src_lang="eng_Latn", tgt_lang=target_code)
        return results[0]["translation_text"]

    hindi = _translate_into("hin_Deva")
    gujarati = _translate_into("guj_Gujr")
    return hindi, gujarati


Device set to use cpu


In [11]:
# Sample English input: four short sentences, one per line, each ending with a
# period. The literal also carries a leading and a trailing newline.
text = """
Hello, my name is Diaa.
I am an AI and Machine Learning enthusiast.
I enjoy building real world AI applications.
I am continuously learning new technologies.
"""


In [12]:
# Translate once, then speak the original and both translations in order.
hi_text, gu_text = translate(text)

# (heading, content, TTS model, pause between sentences in seconds)
playback_plan = (
    ("🔊 English (clean)", text, "facebook/mms-tts-eng", 0.45),
    ("🔊 Hindi (clean)", hi_text, "facebook/mms-tts-hin", 0.5),
    ("🔊 Gujarati (clean)", gu_text, "facebook/mms-tts-guj", 0.5),
)

for heading, content, tts_model, gap in playback_plan:
    print(heading)
    tts_play_clean(content, tts_model, pause_sec=gap)


🔊 English (clean)


Device set to use cpu


🔊 Hindi (clean)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Device set to use cpu


🔊 Gujarati (clean)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/765 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]

Device set to use cpu
