In [None]:
import os
import warnings

# The Hugging Face libraries read these variables when they are first imported,
# so they have to be in the environment BEFORE `transformers` is imported;
# setting them afterwards leaves the cache location and the symlink warning unchanged.
# NOTE(review): the cache path is machine-specific - adjust it for your own setup.
os.environ["HF_HOME"] = r"G:\huggingface-cache"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import torch
from transformers import logging

# Keep the notebook output quiet: only errors from transformers, no Python warnings.
logging.set_verbosity_error()
warnings.filterwarnings("ignore")

# Quick environment report.
print(torch.__version__)
print(torch.cuda.is_available())
if torch.cuda.is_available():
    # get_device_name() raises when no CUDA device is present, so only call it on GPU machines.
    print(torch.cuda.get_device_name())
else:
    print("No CUDA device available; running on CPU")

In [None]:
# pip install transformers torch soundfile  # For TTS models
# pip install langchain langchain-core langchain-huggingface  # For the LLM chain
# pip install sentencepiece
# pip install "huggingface_hub[hf_xet]"  # or: pip install hf_xet
# pip install datasets

In [None]:
from transformers import pipeline
import torch
import soundfile as sf  # To save audio

# Choose the compute device: "cuda" when torch reports a usable GPU, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the SpeechT5 processor and text-to-spectrogram model straight from the Hub.
from transformers import AutoModelForTextToSpectrogram, AutoProcessor

# Both objects come from the same checkpoint, so name it once.
SPEECHT5_CHECKPOINT = "microsoft/speecht5_tts"

processor = AutoProcessor.from_pretrained(SPEECHT5_CHECKPOINT)
model = AutoModelForTextToSpectrogram.from_pretrained(SPEECHT5_CHECKPOINT)

In [None]:
from datasets import load_dataset
from transformers import pipeline
import soundfile as sf

# Build the text-to-speech pipeline for the SpeechT5 checkpoint.
synthesiser = pipeline(task="text-to-speech", model="microsoft/speecht5_tts")

# Take one x-vector from the embeddings dataset as the speaker embedding and add a
# leading dimension to it. You can replace this embedding with your own as well.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    0,
)

# Synthesise a sample sentence with that speaker embedding.
speech = synthesiser(
    "Hello, my dog is cooler than you!",
    forward_params=dict(speaker_embeddings=speaker_embedding),
)

# Save the audio at the sampling rate reported by the pipeline.
sf.write(
    "speech.wav",
    speech["audio"],
    samplerate=speech["sampling_rate"],
)


In [None]:
# Some models need speaker embeddings; SpeechT5 uses a dataset for this
from functools import lru_cache

from datasets import load_dataset


@lru_cache(maxsize=1)
def _load_synthesiser():
    """Build the SpeechT5 text-to-speech pipeline once and reuse it on later calls.

    The pipeline is placed on the notebook-level `device` so that it runs on the
    GPU when one is available instead of always staying on the CPU.
    """
    return pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)


@lru_cache(maxsize=1)
def _load_speaker_embedding():
    """Load the speaker x-vector once; returns it with a leading dimension, on `device`."""
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)


def generate_speech(text: str, output_path: str = "output.wav"):
    """Synthesise `text` with SpeechT5 and write the audio to `output_path`.

    The pipeline and the speaker embedding are loaded on the first call and cached,
    so repeated calls no longer rebuild the model or reload the dataset.
    Re-running this cell redefines the helpers and therefore resets the caches.

    Args:
        text: The text to convert to speech.
        output_path: Destination audio file path (defaults to "output.wav").

    Returns:
        None. Callers may rely on the falsy return value (e.g. `generate_speech(...) or "..."`).
    """
    synthesiser = _load_synthesiser()
    speaker_embedding = _load_speaker_embedding()

    speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    sf.write(output_path, speech["audio"], samplerate=speech["sampling_rate"])
    print(f"Joke : {text} and Audio saved to {output_path}")

In [None]:
from langchain_core.runnables import RunnableLambda
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
import random

# Example: generate text with an LLM reached through HuggingFaceEndpoint, then convert it to speech
llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2.5-7B-Instruct",
    task="text-generation",
    temperature=1.5,
)
chat_model = ChatHuggingFace(llm=llm)

# Raw string: the model must be shown the two characters "\n", not an actual line break
# (a plain "...'\n'..." literal would embed real newlines in the system message).
prompt = ChatPromptTemplate.from_messages([
    ("system", r"Remove any characters like '\n' or '\n\n' from output generated."),
    ("human", "{input}"),
])

# Random numeric prefix for the output file; a repeated number overwrites the earlier file.
output_file_name = random.randint(0, 1000)


def _clean_reply(message) -> str:
    """Return the chat reply text with every run of whitespace collapsed to one space.

    The system prompt only asks the model to avoid newlines; this enforces it
    before the text is handed to the speech synthesiser.
    """
    return " ".join(message.content.split())


def _speak(text: str) -> str:
    """Synthesise `text` to "<output_file_name>_output.wav" and return a status string."""
    # generate_speech returns None, so the `or` supplies the value the chain returns.
    return generate_speech(text, f"{output_file_name}_output.wav") or "Audio generated!"


tts_runnable = RunnableLambda(_speak)

# prompt -> chat model -> cleaned plain text -> speech file
chain = prompt | chat_model | RunnableLambda(_clean_reply) | tts_runnable

result = chain.invoke({"input": "What is LLM in Machine Learning and Data Science ? explain it in 5 lines."})

result