In [1]:
import sounddevice as sd
from scipy.io.wavfile import write
import whisper
import google.generativeai as genai
from gtts import gTTS
from IPython.display import Audio, display
import os
import random
import string
from datetime import datetime
from dotenv import load_dotenv

In [2]:
# Configuration: load_dotenv() is called first so that a key kept in a local
# .env file (presumably; see python-dotenv) is visible in the environment.
load_dotenv()

# The Gemini key is read from the GENAI_API_KEY environment variable; the
# value is None when the variable is not set.
api_key = os.environ.get("GENAI_API_KEY")
genai.configure(api_key=api_key)

# Module-level Gemini handle used by generate_response().
model = genai.GenerativeModel("models/gemini-1.5-flash")


In [None]:
def record_audio(duration=10, fs=44100, filename="input.wav"):
    """Record a single-channel clip with sounddevice and save it as a WAV file.

    Args:
        duration: Length of the recording in seconds.
        fs: Sample rate in Hz, used both for capture and for the WAV header.
        filename: Path of the WAV file to write.

    Returns:
        None. The recording is written to ``filename``.
    """
    print("🎙️ Speak now...")
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=1)
    # Wait for the capture to complete before the buffer is written out.
    sd.wait()
    write(filename, fs, samples)
    print("✅ Recording finished.")

In [None]:
def unique_filename():
    """Build an MP3 file name that is unlikely to collide with earlier ones.

    The name has the form ``response_<YYYYMMDD>_<HHMMSS>_<5 letters>.mp3``,
    combining the current local time with five random ASCII letters.

    Returns:
        str: The generated file name (no directory component).
    """
    suffix = "".join(random.choices(string.ascii_letters, k=5))
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    return "response_" + stamp + "_" + suffix + ".mp3"


In [None]:
# Loaded Whisper models keyed by model name, so repeated transcriptions do not
# reload the model on every call.
_WHISPER_MODELS = {}


def transcribe_audio(filename, model_name="base"):
    """Transcribe an audio file to text with Whisper.

    The Whisper model is loaded on first use and then reused for later calls
    with the same ``model_name``.

    Args:
        filename: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``; defaults to "base".

    Returns:
        The ``"text"`` entry of the Whisper transcription result.
    """
    # Named stt_model (not `model`) to avoid shadowing the module-level Gemini
    # `model` used by generate_response().
    stt_model = _WHISPER_MODELS.get(model_name)
    if stt_model is None:
        stt_model = whisper.load_model(model_name)
        _WHISPER_MODELS[model_name] = stt_model
    result = stt_model.transcribe(filename)
    return result["text"]

In [None]:
def generate_response(prompt):
    """Send ``prompt`` to the module-level Gemini ``model`` and return its reply.

    Args:
        prompt: Content passed unchanged to ``model.generate_content``.

    Returns:
        The ``text`` attribute of the generated response.
    """
    return model.generate_content(prompt).text


In [None]:
def speak(text):
    """Synthesize ``text`` to an MP3 with gTTS and show an audio player for it.

    Args:
        text: The text to convert to speech.

    Returns:
        The name of the MP3 file that was written (from ``unique_filename()``).
    """
    out_path = unique_filename()
    # English voice; change `tld` to pick a different regional accent.
    gTTS(text=text, lang='en', tld='ie').save(out_path)
    player = Audio(out_path)
    display(player)
    return out_path

In [None]:
# End-to-end round trip: record -> transcribe -> ask Gemini -> speak the reply.
# The recording path is named once so the recorder and the transcriber cannot
# drift apart.
input_path = "input.wav"
record_audio(filename=input_path)
transcribed = transcribe_audio(input_path)
print(f"📝 You said: {transcribed}")

# A silent clip can transcribe to an empty or whitespace-only string. Sending
# that on would fail in the Gemini call or in gTTS, so stop here with a
# readable message instead of a traceback.
if transcribed.strip():
    response = generate_response(transcribed)
    print(f"💬 Gemini: {response}")

    speak(response)
else:
    print("⚠️ No speech detected; skipping the Gemini request.")