In [None]:
from openai import OpenAI
from pydub import AudioSegment
import os
import io
from dotenv import load_dotenv
# Load environment variables from config.env; OPENAI_API_KEY is read from the
# environment just below.
load_dotenv("config.env")

# OpenAI client shared by every text-to-speech request in this notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Podcast script: an ordered list of exchanges. In each dict, key "A" holds the
# host's line and key "B" holds the interviewee's reply; dict order (A, then B)
# is the order in which the lines are synthesised.
content = [
    {
        "A": "Hello, dear listeners, and welcome to this episode of the Technology Interview program. Today, we have invited Mr. Yucheng, the Head of Technology at SynthMind AI, to discuss the hot topic of AI Agents with us. Hello, Yucheng!",
        "B": "Hello, host, and hello to all the listeners! I'm delighted to be here to discuss AI Agents with everyone."
    },
    {
        "A": "AI Agents have been making waves lately, and many believe they are a crucial step towards achieving general artificial intelligence. Yucheng, could you please briefly introduce what exactly an AI Agent is?",
        "B": "Sure. An AI Agent is an artificial intelligence entity capable of perceiving its environment, making decisions, and taking actions like humans. They can be software programs or embodied robots. Unlike traditional AI systems, AI Agents are more autonomous and flexible, capable of operating independently in complex and dynamic environments."
    },
    {
        "A": "That sounds impressive! What are the specific characteristics of AI Agents?",
        "B": "AI Agents mainly have four characteristics: Autonomy: AI Agents can operate independently without human intervention, making decisions, and taking actions autonomously. For example, Auto-GPT requires you to set a goal, and it will independently plan and execute actions to achieve that goal. Reactivity: AI Agents can perceive changes in their surroundings and respond to them. For instance, in games, AI Agents can adjust their strategies based on opponents' actions. Proactivity: AI Agents can not only respond to the environment passively but also take proactive actions to change the environment. For example, a learning AI Agent will actively search for learning materials to enhance its skills. Social Ability: AI Agents can communicate and collaborate with other AI Agents or humans. For instance, in software development, multiple AI Agents can communicate in natural language to complete tasks together."
    },
    {
        "A": "Wow, AI Agents are quite capable! What are their current application scenarios?",
        "B": "AI Agents have a wide range of applications, such as: Rule-based Agents: These are relatively simple AI Agents whose decisions and actions follow predefined rules. They can be used in scenarios like traffic light control and automated production lines. Reactive Agents: These AI Agents react directly to environmental stimuli without complex planning or reasoning. They can be used in scenarios like robot navigation and game AI. Goal-oriented Agents: These AI Agents have clear objectives and use planning and decision algorithms to achieve these goals. They can be used in scenarios like chess games and path planning. Learning Agents: These AI Agents can learn from experience and improve their performance over time, such as in recommendation systems and personalized education."
    },
    {
        "A": "It sounds like the internal mechanisms of AI Agents are quite complex! How do they actually work?",
        "B": "The working principle of an AI Agent can be likened to the human brain. It usually consists of several core modules: 'Brain' Module: This is the core of the AI Agent, responsible for high-level cognition and decision-making, such as understanding human instructions, planning, reasoning, etc. Large Language Models (LLMs) are typically the core component of this module, providing strong language understanding and generation capabilities. 'Perception' Module: This module receives and processes information from the external environment, such as text, images, sound, etc. It's like the eyes, ears, and nose of the AI Agent, helping it understand its surroundings. 'Action' Module: This module executes decisions, such as controlling robot movements, generating text, using tools, etc. It's like the hands and feet of the AI Agent, enabling it to perform tasks in the real or virtual world. 'Memory' Module: This module stores the AI Agent's experiences and knowledge, which the 'Brain' module references during reasoning and planning. It's like the memory repository of the AI Agent's brain, helping it learn from past experiences and make better decisions in the future. 'Planning' Module: This module formulates specific steps to achieve goals and breaks down high-level plans into executable action sequences. It's like the planner for the AI Agent, helping it complete tasks efficiently. These modules collaborate to allow the AI Agent to process information, make decisions, and take actions like humans."
    },
    {
        "A": "Wow, that's amazing! It seems AI Agents have great potential for future development! What do you think are the future trends for AI Agents?",
        "B": "I think the future development trends for AI Agents are mainly in the following areas: LLM-Based Agents: In the future, more AI Agents will use large language models (LLMs) as their core components, giving them stronger language understanding and generation capabilities, allowing for more natural and fluent interactions with humans. Multi-modal Agents: Future AI Agents will be able to handle multiple modalities of information, such as images, sound, video, etc., enabling a more comprehensive perception of the world and completion of more complex tasks. Embodied Agents: Combining the intelligence of AI Agents with the physical world, allowing them to perform tasks in real environments, such as controlling robots for navigation or object manipulation in the real world, will be an important development direction. Agent Society: Exploring interactions and social phenomena among AI Agents and researching how to build smarter and more collaborative AI systems are also important directions for future development."
    },
    {
        "A": "Thank you very much for your wonderful insights, Yucheng! I believe that as AI technology continues to evolve, AI Agents will play an increasingly important role in the future, bringing more convenience and surprises to our lives.",
        "B": "Thank you, host! And thank you to all the listeners for tuning in!"
    }
]

def split_text(text, max_length=4096):
    """Split text into consecutive chunks of at most ``max_length`` characters.

    Chunk boundaries are placed just after whitespace whenever the window
    contains any, so words are not cut in half (which would be audible when
    each chunk is synthesised separately). A run of characters longer than
    ``max_length`` with no whitespace is hard-split at the limit. No character
    is dropped or added: ``"".join(split_text(text, n)) == text``.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunks in order; an empty list when ``text`` is empty.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_length, total)
        # Only look for a better boundary when the window would otherwise end
        # in the middle of a word (the next character is not whitespace).
        if end < total and not text[end].isspace():
            for idx in range(end - 1, start, -1):
                if text[idx].isspace():
                    # Keep the whitespace at the end of the current chunk.
                    end = idx + 1
                    break
        chunks.append(text[start:end])
        start = end
    return chunks

def generate_speech_chunk(client, model, input_text, voice, response_format='mp3', speed=1.0):
    """Synthesise one piece of text and return the resulting audio payload.

    Args:
        client: Object exposing ``audio.speech.create`` (the OpenAI client).
        model: Name of the TTS model to use.
        input_text: The text to be spoken.
        voice: Name of the voice to synthesise with.
        response_format: Audio format requested from the API.
        speed: Playback speed multiplier passed to the API.

    Returns:
        The ``content`` attribute of the API response.
    """
    request_params = {
        "model": model,
        "voice": voice,
        "input": input_text,
        "response_format": response_format,
        "speed": speed,
    }
    speech = client.audio.speech.create(**request_params)
    return speech.content

def merge_audio_files(audio_files, output_path):
    """Concatenate in-memory MP3 payloads, in order, and write one MP3 file.

    Args:
        audio_files: Iterable of MP3-encoded byte strings.
        output_path: Destination path of the merged MP3 file.
    """
    # Decode lazily so each payload is parsed right before it is appended.
    decoded_segments = (
        AudioSegment.from_file(io.BytesIO(raw_bytes), format="mp3")
        for raw_bytes in audio_files
    )
    # Start from an empty segment so the result is an AudioSegment even when
    # there is nothing to append.
    merged = sum(decoded_segments, AudioSegment.empty())

    merged.export(output_path, format="mp3")
    print(f"Audio file has been saved as {output_path}")

def generate_podcast_from_content(client, content, podcast_title, model, voice_a, voice_b,
                                  output_dir="/Users/ycyang/code/test/data/audio/"):
    """Generate a complete podcast audio file from the conversation content.

    Every line of every exchange is synthesised with the voice of its speaker
    and the resulting clips are merged, in order, into
    ``<output_dir>/<podcast_title>.mp3``.

    Args:
        client: OpenAI client used for the text-to-speech requests.
        content: List of dicts mapping a speaker key to that speaker's text.
        podcast_title: Used as the output file name (without extension).
        model: Name of the TTS model.
        voice_a: Voice used for speaker ``'A'``.
        voice_b: Voice used for every other speaker key.
        output_dir: Directory the MP3 is written to; created if missing.

    Returns:
        The path of the merged MP3, or ``None`` when no audio could be
        generated (nothing is written in that case).
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    audio_path = os.path.join(output_dir, f"{podcast_title}.mp3")

    audio_files = []

    for i, conversation in enumerate(content):
        # Generate speech for each speaker
        for speaker, text in conversation.items():
            print(f"Generating speech for segment {i+1} by speaker {speaker}")
            # Generate speech for different speakers using different voices
            voice = voice_a if speaker == 'A' else voice_b
            # Long turns are sent in pieces so a single request never exceeds
            # the chunk size enforced by split_text.
            for chunk in split_text(text):
                if not chunk.strip():
                    continue  # nothing to speak
                try:
                    audio_files.append(generate_speech_chunk(client, model, chunk, voice))
                except Exception as e:
                    # Best effort: report the failure and keep going so one bad
                    # request does not discard the rest of the episode.
                    print(f"Failed to generate speech for segment {i+1} by speaker {speaker}: {e}")

    if not audio_files:
        print("No audio was generated; skipping merge.")
        return None

    # Merge audio segments
    merge_audio_files(audio_files, audio_path)
    return audio_path


# Synthesis settings: the TTS model plus one voice per speaker.
model = 'tts-1'
voice_a, voice_b = 'nova', 'onyx'  # host's voice, interviewee's voice

# Build the whole episode from the script defined above.
generate_podcast_from_content(
    client=client,
    content=content,
    podcast_title="Tech Sharing Podcast",
    model=model,
    voice_a=voice_a,
    voice_b=voice_b,
)

Generating speech for segment 1 by speaker A
Generating speech for segment 1 by speaker B
Generating speech for segment 2 by speaker A
Generating speech for segment 2 by speaker B
Generating speech for segment 3 by speaker A
Generating speech for segment 3 by speaker B
Generating speech for segment 4 by speaker A
Generating speech for segment 4 by speaker B
Generating speech for segment 5 by speaker A
Generating speech for segment 5 by speaker B
Generating speech for segment 6 by speaker A
Generating speech for segment 6 by speaker B
Generating speech for segment 7 by speaker A
