In [None]:
import wave
import re
import numpy as np
import torchaudio
import openai
import edge_tts
import asyncio
import time
import sounddevice as sd
from speechbrain.inference.classifiers import EncoderClassifier


In [None]:
# Microphone capture parameters
SAMPLE_RATE = 16_000  # samples per second used when recording
RECORD_SECONDS = 3  # length of one recording, in seconds

# Per-language options, keyed by language code. Each entry carries the language
# name passed to Whisper, the Edge TTS voice, and the code used to pick the GPT prompt.
LANGUAGE_SETTINGS = {
    'ur': dict(
        whisper_language='urdu',
        tts_voice='ur-PK-AsadNeural',
        gpt_language_code='ur',
    ),
    'en': dict(
        whisper_language='english',
        tts_voice='en-US-JennyNeural',
        gpt_language_code='en',
    ),
}

# Instantiate the SpeechBrain language-identification classifier once at module
# level so every call to the detection function reuses the same model object.
# `savedir` is the directory handed to SpeechBrain for the model's files.
language_id_model = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa",
    savedir="tmp",
)


In [None]:

# Function to detect language from audio using SpeechBrain
def detect_language(audio_file_path):
    """Detect the spoken language of an audio file with the SpeechBrain classifier.

    The audio is loaded with ``torchaudio.load``, reduced to a single mono
    utterance, resampled to 16 kHz if needed, and passed to
    ``language_id_model.classify_batch``.

    Args:
        audio_file_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        str: The language code parsed from the top predicted label
        (e.g. ``'ur'``, ``'en'``).
    """
    signal, sample_rate = torchaudio.load(audio_file_path)

    # Ensure signal is in batch format (batch_size, num_samples)
    if signal.dim() == 1:
        signal = signal.unsqueeze(0)
    elif signal.size(0) > 1:
        # Multi-channel file: average the channels so the classifier receives one
        # utterance instead of treating each channel as a separate batch item.
        signal = signal.mean(dim=0, keepdim=True)

    # The VoxLingua107 model card states the model expects 16 kHz input.
    if sample_rate != 16000:
        signal = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(signal)

    # Use SpeechBrain model to detect language
    prediction = language_id_model.classify_batch(signal)

    # prediction[3] holds the predicted text labels; the first entry belongs to
    # our single utterance and normally looks like "ur: Urdu".
    label = prediction[3][0]
    language_code, _, language_name = label.partition(":")
    language_code = language_code.strip()
    # Fall back to the code when the label carries no "code: name" separator,
    # instead of raising IndexError.
    language_name = language_name.strip() or language_code

    print(f"Detected language: {language_name} ({language_code})")
    return language_code

# Function to select language (switches to Urdu if detected language is not English)
def select_language(language_code):
    """Pick the Whisper/GPT/Edge TTS settings for a detected language code.

    Only ``'en'`` maps to the English settings; every other code is routed
    to the Urdu settings.
    """
    settings_key = 'en' if language_code == 'en' else 'ur'
    return LANGUAGE_SETTINGS[settings_key]

def clean_text(text):
    """Strip everything except ASCII letters, digits and whitespace from ``text``.

    NOTE: the character class is ASCII-only, so non-Latin text (including Urdu)
    is removed entirely. Do not apply this to text that must keep Urdu content.

    Args:
        text: The string to clean.

    Returns:
        str: The filtered text with leading/trailing whitespace removed.
    """
    try:
        # Encoding is the step that can fail (e.g. on lone surrogates), and it
        # raises UnicodeEncodeError, so catch the shared UnicodeError base class.
        text.encode('utf-8')
    except UnicodeError:
        # Replace un-encodable code points so the text is valid UTF-8.
        text = text.encode('utf-8', 'replace').decode('utf-8')

    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text.strip()  # Strip leading/trailing whitespace

def transcribe_audio(audio_file, whisper_language):
    """Transcribe an audio file with the Whisper model.

    NOTE(review): this function relies on module-level ``processor`` and
    ``model`` objects that are not defined in the visible cells; presumably a
    Whisper processor/model pair. Confirm they are loaded before this is called.

    Args:
        audio_file: Path to an audio file readable by ``torchaudio.load``.
        whisper_language: Language name passed to the processor when building
            the decoder prompt (e.g. ``'urdu'``, ``'english'``).

    Returns:
        str: The decoded transcription of the audio.
    """
    print("Transcribing audio using Whisper model...")
    waveform, sample_rate = torchaudio.load(audio_file)

    # Mix multi-channel audio down to mono; otherwise squeeze() leaves a 2-D
    # array and the channels are handed to the processor as separate items.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to 16kHz if necessary
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    # Process the audio with Whisper
    forced_decoder_ids = processor.get_decoder_prompt_ids(language=whisper_language, task="transcribe")
    input_features = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    # Only one utterance is submitted, so take the first decoded string.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    print(f"Transcribed Text: {transcription}")
    return transcription

def generate_gpt_response(transcribed_text, language_code):
    """Ask the chat model for a reply to the transcribed text.

    An Urdu instruction is used when ``language_code`` is ``'ur'``; any other
    code gets the English instruction.

    Args:
        transcribed_text: The user's transcribed speech, embedded in the prompt.
        language_code: Language code that selects the prompt (``'ur'`` or other).

    Returns:
        str: The model's reply with surrounding whitespace stripped (an empty
        string if the API returned no content).
    """
    if language_code == 'ur':
        instruction = "آپ ایک مددگار اسسٹنٹ ہیں۔ براہ کرم مندرجہ ذیل معلومات پر ایک واضح اور جامع جواب فراہم کریں: \n\n"
    else:
        instruction = "You are a helpful assistant. Based on the following input, please provide a clear and concise response: \n\n"
    prompt = f"{instruction}Input: {transcribed_text}"

    request_kwargs = dict(
        model="gpt-4o-mini",  # Use the appropriate model
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
    )

    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed openai.ChatCompletion; use the module-level client.
        response = openai.chat.completions.create(**request_kwargs)
        content = response.choices[0].message.content
    else:
        # Legacy (<1.0) SDK interface.
        response = openai.ChatCompletion.create(**request_kwargs)
        content = response['choices'][0]['message']['content']

    # content may be None; avoid calling .strip() on it.
    gpt_response = (content or "").strip()
    print("gpt_response", gpt_response)
    return gpt_response

async def edge_tts_speech(text, voice, output_file='response-brain-sp.mp3'):
    """Synthesize ``text`` with the given Edge TTS ``voice`` and write it to ``output_file``.

    This is a coroutine: it must be awaited (or run on an event loop) for the
    audio file to actually be produced.
    """
    print("Converting text to speech using Edge TTS...")
    await edge_tts.Communicate(text, voice).save(output_file)
    print(f"Audio file saved as {output_file}")

# Function to record audio
def record_audio(filename='output.wav', duration=RECORD_SECONDS, sample_rate=SAMPLE_RATE):
    """Record mono audio from the default input device and save it as 16-bit WAV.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.
        sample_rate: Sampling rate in Hz, used for both capture and the WAV header.

    Returns:
        str: ``filename``, so the caller can pass it to later pipeline steps.
    """
    print("Recording...")
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # Wait until the recording is finished
    print("Recording finished.")

    # Clip to [-1, 1] before scaling: out-of-range float samples would otherwise
    # wrap around when cast to int16 and produce loud clicks.
    pcm16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

    # Save the recorded data to a WAV file
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)  # Mono
        wf.setsampwidth(2)  # 16-bit audio
        wf.setframerate(sample_rate)
        wf.writeframes(pcm16.tobytes())

    return filename  # Return the filename for later use





In [None]:
def _run_coroutine_blocking(coro):
    """Run ``coro`` to completion from synchronous code and return its result."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread (plain script): asyncio.run can own one.
        return asyncio.run(coro)

    # A loop is already running in this thread (e.g. inside a Jupyter kernel),
    # where asyncio.run() would raise. Run the coroutine on a worker thread with
    # its own loop and wait for it to finish.
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


def main():
    """Run one full interaction: record, detect language, transcribe, respond, speak.

    Each stage is timed and its duration printed. The TTS step is awaited to
    completion, so its reported duration covers the actual synthesis and the
    output file exists when this function returns.
    """
    # Record the audio
    audio_file_path = record_audio()

    # Detect language from the recorded audio file
    detect_start = time.perf_counter()
    detected_language_code = detect_language(audio_file_path)
    print("Language detection took:", time.perf_counter() - detect_start)

    # Get language-specific settings based on detected language
    language_settings = select_language(detected_language_code)
    whisper_language = language_settings['whisper_language']
    tts_voice = language_settings['tts_voice']
    gpt_language_code = language_settings['gpt_language_code']

    # Transcribe the audio using Whisper
    transcribe_start = time.perf_counter()
    transcribed_text = transcribe_audio(audio_file_path, whisper_language)
    print("Transcription took:", time.perf_counter() - transcribe_start)

    # Generate GPT response with language-based prompt
    gpt_start = time.perf_counter()
    gpt_response = generate_gpt_response(transcribed_text, gpt_language_code)
    print("GPT response generation took:", time.perf_counter() - gpt_start)

    # Synthesize speech using the selected TTS voice. asyncio.create_task() would
    # need an already-running loop and would return before synthesis finished.
    tts_start = time.perf_counter()
    _run_coroutine_blocking(edge_tts_speech(gpt_response, tts_voice))
    print("TTS generation took:", time.perf_counter() - tts_start)


In [None]:
# Entry point: run the record -> detect -> transcribe -> respond -> speak pipeline
# once when this code executes as the main module.
if __name__ == "__main__":
    main()