# The Entropy Archivist: Final Prototype
A multi-modal system using OpenRouter, Gradio, and persistent persona-based logic.

In [None]:
import gradio as gr
import os
import base64
from openai import OpenAI

In [None]:
# The OpenAI SDK client is pointed at the OpenRouter endpoint; the key is
# read from the OPENROUTER_API_KEY environment variable (None when unset).
OR_API_KEY = os.environ.get('OPENROUTER_API_KEY')
client = OpenAI(
    api_key=OR_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

# Earlier, entropy-specific persona prompt, kept for reference:
# SYSTEM_PROMPT = """
# Act as a brilliant polymath and storyteller. Explain entropy in thermodynamics through the lens of a crumbling ancient library.
# Use the library as an analogy for order/disorder, fading ink for energy dissipation, and remain scientifically accurate.
# """

# System message sent with every chat request. The leading and trailing
# newlines are part of the prompt text.
SYSTEM_PROMPT = (
    "\n"
    "Act as a brilliant polymath and storyteller. The user is about to ask you anything, answer scientifically accurately and clearly.\n"
)

In [None]:
def get_or_models(model_id: str) -> str:
    """Return the model id to use for a request.

    This is currently a pass-through placeholder: the id is handed back
    unchanged and no validation is performed.

    Args:
        model_id: Model identifier chosen by the caller.

    Returns:
        The same ``model_id`` that was passed in.
    """
    # TODO: validate model_id here in a real implementation.
    # NOTE(review): the original note said the check should ensure an *audio*
    # model, but the visible caller passes the chat model picked in the
    # dropdown -- confirm which kind of validation is intended.
    return model_id

In [None]:
def transcribe_audio(audio_path):
    """Transcribe a recorded audio file to text with an audio-capable chat model.

    Args:
        audio_path: Filesystem path of the recording, or a falsy value
            (``None`` / ``""``) when nothing was recorded.

    Returns:
        The transcription text returned by the model. When ``audio_path`` is
        falsy, the default query ``"Explain entropy."`` is returned without
        calling the API. If the model replies with no text content, an empty
        string is returned.

    Raises:
        OSError: If the audio file cannot be opened or read.
    """
    if not audio_path:
        return "Explain entropy."

    # Read and encode first so the file handle is closed before the
    # (potentially slow) network request is made.
    with open(audio_path, "rb") as audio_file:
        base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")

    # Declare the container format from the file extension instead of always
    # claiming "wav"; anything that is not .mp3 keeps the previous default.
    extension = os.path.splitext(audio_path)[1].lower()
    audio_format = "mp3" if extension == ".mp3" else "wav"

    response = client.chat.completions.create(
        model='openai/gpt-4o-audio-preview',
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please transcribe this audio file."},
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": base64_audio,
                            "format": audio_format,
                        },
                    },
                ],
            }
        ],
    )

    # The message content can be None; normalise it so the log line below
    # does not raise and callers always receive a string.
    transcript = response.choices[0].message.content or ""
    print(f"Transcribed: {transcript}")
    return transcript

In [None]:
import wave
import base64
import os
def generate_speech(text, output_path="output.wav"):
    """Synthesize speech for ``text`` and save it as a WAV file.

    A streaming chat completion with audio output is requested in ``pcm16``
    format; the base64-encoded audio fragments found in the stream are decoded
    and appended to a mono, 16-bit, 24 kHz WAV file.

    Args:
        text: The text the model is asked to say.
        output_path: Where to write the WAV file. Defaults to ``"output.wav"``
            in the current working directory (the previous fixed location).

    Returns:
        ``output_path``. The file is created even if the stream carried no
        audio data, in which case it contains zero frames.
    """
    response = client.chat.completions.create(
        # Provider-prefixed id, matching the one transcribe_audio() sends
        # through the same OpenRouter-backed client. Ensure this matches your
        # provider's requirement.
        model='openai/gpt-4o-audio-preview',
        messages=[{"role": "user", "content": 'Say the following: ' + text}],
        modalities=['text', "audio"],
        audio={"format": "pcm16", "voice": "alloy"},
        stream=True,
    )

    with wave.open(output_path, 'wb') as wav_file:
        wav_file.setnchannels(1)      # Mono
        wav_file.setsampwidth(2)      # 16-bit samples, matching "pcm16"
        wav_file.setframerate(24000)  # Standard for gpt-4o-audio

        for chunk in response:
            # Dump the Pydantic object to a dict so nested fields can be
            # looked up by key.
            choices = chunk.model_dump().get("choices") or []
            if not choices:
                continue

            # `or {}` also covers keys that are present but set to None,
            # which a plain .get(key, {}) default would not.
            delta = choices[0].get("delta") or {}
            audio_data = (delta.get("audio") or {}).get("data")

            if audio_data:
                wav_file.writeframes(base64.b64decode(audio_data))

    return output_path

In [None]:
def stream_agent_response(audio_input, model_choice):
    """Transcribe the recording, query the chosen model, and stream the answer.

    This is a generator intended as a Gradio event handler: each yielded value
    is the full answer accumulated so far, so the bound output component can
    be refreshed as text arrives.

    Args:
        audio_input: Path of the recorded audio, or a falsy value when nothing
            was recorded (``transcribe_audio`` then supplies a default query).
        model_choice: Model identifier selected in the UI; passed through
            ``get_or_models`` before being sent to the API.

    Yields:
        The cumulative response text after each content-bearing chunk, then
        the final text once more when the stream ends (so at least one value
        is always produced, even for an empty response).
    """
    print(f"Received audio input: {audio_input}, model choice: {model_choice}")
    user_query = transcribe_audio(audio_input)
    stream = client.chat.completions.create(
        model=get_or_models(model_choice),
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_query},
        ],
        stream=True,
    )

    full_text = ""
    for chunk in stream:
        # Some stream chunks carry an empty `choices` list; indexing [0] on
        # those would raise IndexError and abort the whole response.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            full_text += piece
            yield full_text  # , None
    # Final emission; a second (audio) output such as
    # generate_speech(full_text) is currently disabled.
    yield full_text  # , generate_speech(full_text)

In [None]:
# Gradio UI: a microphone input and a model picker feed
# stream_agent_response, whose streamed text is rendered as Markdown.
with gr.Blocks(title="Entropy Archivist") as demo:
    # The heading emoji (classical building, U+1F3DB + variation selector
    # U+FE0F) is written with escapes: the literal character had been
    # corrupted into mojibake by a wrong-encoding round trip.
    gr.Markdown("# \U0001F3DB\uFE0F The Entropy Archivist")
    with gr.Row():
        # type="filepath" makes the handler receive a path to the recording.
        mic_in = gr.Audio(sources="microphone", type="filepath")
        model_sel = gr.Dropdown(
            ["openai/gpt-4o-mini", "meta-llama/llama-3.2-3b-instruct"],
            value="openai/gpt-4o-mini",
        )

    text_out = gr.Markdown()
    # Spoken output is currently disabled:
    # audio_out = gr.Audio(autoplay=True, type="filepath", streaming=True)

    btn = gr.Button("Seek Wisdom")
    btn.click(stream_agent_response, inputs=[mic_in, model_sel], outputs=text_out)

demo.launch(inbrowser=True)