In [None]:
import os
import getpass

# Ask for the Google API key only when it is not already in the environment,
# so "Restart Kernel -> Run All" does not stall on a prompt when the key is
# preconfigured. The explicit prompt text replaces getpass's generic
# "Password: " label, which gave no hint about which secret was expected.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

In [None]:
import whisper
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from elevenlabs import play
import warnings
warnings.filterwarnings("ignore")

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Read the whole prompt file as UTF-8, then trim surrounding whitespace so the
# text handed to the rest of the notebook has no leading/trailing blank lines.
with open("my words.txt", encoding="utf-8") as prompt_file:
    raw_text = prompt_file.read()
text_input = raw_text.strip()

print("📄 Text loaded from file:\n", text_input)

# Resolve the ElevenLabs key up front. ELEVEN_API_KEY is the name this notebook
# has always used; ELEVENLABS_API_KEY (the name used in the ElevenLabs SDK
# docs) is accepted as a fallback.
eleven_api_key = os.getenv("ELEVEN_API_KEY") or os.getenv("ELEVENLABS_API_KEY")
if not eleven_api_key:
    # Without this check the client would be built with api_key=None and the
    # problem would only surface at the text-to-speech call, after the LLM
    # request below had already been made.
    raise RuntimeError(
        "ElevenLabs API key not found: set ELEVEN_API_KEY (for example in .env)."
    )

# Text-to-speech client used further down in this cell.
elevenlabs = ElevenLabs(
    api_key=eleven_api_key,
)

# Gemini chat model wrapped in a conversation chain whose history is kept in a
# buffer memory object.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
memory = ConversationBufferMemory()
conversation = ConversationChain(llm=llm, memory=memory)

# Send the file's text to the chain and keep only the reply stored under the
# "response" key. The (misspelled) name is kept because later code uses it.
responce1 = conversation.invoke(text_input)["response"]

# Convert the LLM reply to speech. The result is consumed below as an iterable
# of byte chunks.
audio = elevenlabs.text_to_speech.convert(
    text=responce1,
    voice_id="JBFqnCBsd6RMkjVDRZzb",
    model_id="eleven_multilingual_v2",
    output_format="mp3_44100_128",
)

# Join all chunks into a single bytes object; it is written to disk here and
# kept in `audio_bytes` for later use in the notebook.
audio_bytes = b"".join(audio)

# Single source of truth for the output file name, so the status message below
# cannot drift from the file actually written (it used to say "output.mp3"
# while the file written was "output2.mp3").
output_path = "output2.mp3"
with open(output_path, "wb") as audio_file:
    audio_file.write(audio_bytes)

print(f"Audio saved as {output_path}")

# Run the saved MP3 back through Whisper. The "tiny" checkpoint was chosen by
# the original author as the fastest option on CPU.
model = whisper.load_model("tiny")
result = model.transcribe("output2.mp3")

# Report the transcript and the language taken from the transcription result.
for label, field in (("Detected text:", "text"), ("Detected language:", "language")):
    print(label, result[field])

In [None]:
from IPython.display import Audio

# Final expression of the cell: the notebook displays this Audio object, built
# from the MP3 bytes held in `audio_bytes`.
Audio(data=audio_bytes)