In [None]:
import functools
import os
import re
from datetime import datetime
from io import BytesIO
from typing import List, Dict
from xml.sax.saxutils import escape as xml_escape

import dotenv
import requests
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.schema import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import GoogleGenerativeAI
from pydantic import BaseModel, Field
from pydub import AudioSegment

dotenv.load_dotenv()

In [None]:
# Gemini credentials come from the environment (None when the variable is unset).
api_key = os.environ.get("GOOGLE_API_KEY")

# Source articles for the episode; commented-out entries are skipped.
urls = [
    # "https://kellblog.com/2024/10/12/design-your-organization-for-the-conflicts-you-want-to-hear-about/",
    # "https://peterszasz.com/engineering-managers-guide-to-effective-annual-feedback/",
    "https://dennisnerush.medium.com/my-top-10-favorite-leadership-and-management-books-87178902826e",
]

In [None]:
def memoize(func):
    """Decorator that caches the results of ``func`` per distinct call signature.

    The cache lives for as long as the decorated function object does and is
    never evicted. Keys are built from the ``repr`` of the arguments so that
    unhashable arguments (lists, dicts) can be cached too; as a consequence,
    two different arguments that share the same ``repr`` share a cache entry.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same name/docstring as ``func`` that returns the
        cached result when called again with equal-looking arguments.
    """
    cache = {}

    @functools.wraps(func)  # keep __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # Sort keyword arguments so f(a=1, b=2) and f(b=2, a=1) hit the same
        # entry; keep positional and keyword parts as separate tuple members
        # so they can never run into each other.
        key = (repr(args), repr(sorted(kwargs.items())))
        if key not in cache:
            # Only successful calls are stored: if func raises, nothing is cached.
            cache[key] = func(*args, **kwargs)
        return cache[key]

    return wrapper

In [None]:
@memoize
def fetch_url(url, timeout=30):
    """Fetch ``url`` through the r.jina.ai endpoint and return the parsed JSON.

    Results are memoized, so each distinct URL is requested at most once per
    kernel session. Failed requests raise and are therefore not cached.

    Args:
        url: Address of the article to fetch; appended to ``https://r.jina.ai/``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(
        f"https://r.jina.ai/{url}",
        headers={"Accept": "application/json"},
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of caching an error payload as content.
    response.raise_for_status()
    return response.json()


In [None]:
# One fetched JSON payload per entry in `urls`, in the same order.
content = list(map(fetch_url, urls))

In [None]:
# System message: fixes the analyst persona and the exact output layout
# (CONCEPTS / HOOKS / SEGMENTS sections) the model must follow.
CONTENT_ANALYSIS_SYSTEM_TEMPLATE = """You are an expert content strategist specializing in creating engaging educational content.
        Your strength lies in breaking down complex topics into clear, relatable concepts while maintaining intellectual depth.
        
        Approach the analysis with:
        1. Systems thinking - identify interconnections and patterns
        2. Multi-level abstraction - from high-level principles to practical implementation
        3. Engaging storytelling - find hooks and analogies that make concepts stick
        4. Dialectical thinking - explore tensions and competing viewpoints
        
        Structure your analysis in this exact format:
    
        === CONCEPTS ===
        [Each concept includes 3 depth levels marked with -]
        ### [Concept Name]
        - Strategic: [High level insight]
        - Tactical: [Mid level approach] 
        - Practice: [Concrete examples]
    
        === HOOKS ===
        [Each hook includes story + debate]
        ### [Topic]
        Story: [Engaging narrative]
        Debate: [Key discussion points]
    
        === SEGMENTS ===
        [List of main segments, one per line]"""

# Human message: carries the article text via the {article_contents} variable.
CONTENT_ANALYSIS_HUMAN_TEMPLATE = """Analyze these articles through multiple lenses to create rich podcast material:
    
        {article_contents}
    
        Create a layered analysis that:
        1. Breaks down complex ideas through progressive levels of detail
        2. Identifies natural conversation flows and engaging discussion points
        3. Maps out competing viewpoints and their nuances
        4. Groups related concepts into potential podcast segments
        """

content_analysis_prompt = ChatPromptTemplate.from_messages([
    ("system", CONTENT_ANALYSIS_SYSTEM_TEMPLATE),
    ("human", CONTENT_ANALYSIS_HUMAN_TEMPLATE),
])

# Gemini model instance; reused later by the script-generation chain.
model = GoogleGenerativeAI(google_api_key=api_key, model="gemini-1.5-pro")

# Pipeline: fill the prompt -> call the model -> return the reply as a plain string.
content_analysis_chain = content_analysis_prompt | model | StrOutputParser()

# Usage: join the text of every fetched article, separated by a blank line.
article_contents = "\n\n".join(page["data"]["content"] for page in content)
result = content_analysis_chain.invoke({"article_contents": article_contents})

In [None]:
# Show the analysis text produced by content_analysis_chain.
# print() is used so the newlines in the string are rendered rather than escaped.
print(result)

In [None]:
# One spoken turn of the podcast script, parsed from the LLM output.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic copies a model's docstring into its generated schema, which
# would presumably change the format instructions sent to the LLM; confirm before
# converting these to docstrings.
class PodcastSegment(BaseModel):
    # Which host speaks; TTSEngine uses this value as the key into its voice table.
    speaker: str = Field(description="HOST1 or HOST2")
    # Delivery mood; TTSEngine.get_tone maps it to an SSML pitch (unknown -> "+0st").
    tone: str = Field(description="EXCITED|CALM|SERIOUS|THOUGHTFUL")
    # The words to be spoken.
    text: str = Field(description="Raw text content")
    # Speaking speed; TTSEngine.get_pace maps it to an SSML rate (unknown -> "100%").
    pace: str = Field(description="FAST|MEDIUM|SLOW")
    # Substrings of `text` that TTSEngine wraps in <emphasis> tags.
    emphasis_words: List[str] = Field(description="Words to emphasize")
    # Silence appended after the segment, in milliseconds (emitted as an SSML <break>).
    pause_after: int = Field(description="Pause duration in ms")

# Complete episode script: the target type of podcast_script_parser.
# Documented with `#` comments instead of a docstring so the schema generated
# from this model is left untouched.
class PodcastScript(BaseModel):
    # Episode title.
    title: str
    # Spoken turns, in playback order.
    segments: List[PodcastSegment]

# Parses the model reply into a PodcastScript and supplies the matching
# format instructions for the prompt.
podcast_script_parser = PydanticOutputParser(pydantic_object=PodcastScript)

# System message: host persona, episode structure and, via {format_instructions},
# the required output format.
PODCAST_SCRIPT_SYSTEM_TEMPLATE = """
        You are an expert podcast host duo creating deep-dive episodes. Structure your conversation to:
    
        1. Start with a hook that captures attention
        2. Layer concepts from surface to core insights
        3. Use the Feynman technique to break down complex ideas
        4. Challenge assumptions and explore counterpoints
        5. Share concrete examples and case studies
        6. Connect ideas across different contexts
        7. End with actionable takeaways
    
        You are an expert podcast host duo creating full-length episodes. Generate a complete 5-30 minute episode with:

        1. Opening [2-3 segments]
        - Hook and episode preview
        - Quick host banter
        - Topic introduction
    
        2. Main Discussion [5-20 segments]
        - Layer 1: Surface overview
        - Layer 2: Core concepts unpacked
        - Layer 3: Deep analysis
        - Layer 4: Implementation details
        - Regular transitions between hosts
        - Examples and case studies
        - Counterpoints and debates
    
        3. Closing [3-4 segments]
        - Key takeaways
        - Action items
    
        Each segment should be 1-2 minutes of spoken content.
        Create a natural flow between segments:
        - Build on previous points
        - Ask probing questions
        - Share relevant examples
        - Challenge and debate ideas
        - Synthesize insights
        
        Format each segment as:
        {format_instructions}
        """

# Human message: passes both the earlier analysis and the raw article text.
PODCAST_SCRIPT_HUMAN_TEMPLATE = """Generate a podcast script using:
        # Analysis result
        {analysis_result}
        
        # Original content
        {article_contents}
        """

podcast_script_prompt = ChatPromptTemplate.from_messages([
    ("system", PODCAST_SCRIPT_SYSTEM_TEMPLATE),
    ("human", PODCAST_SCRIPT_HUMAN_TEMPLATE),
])

# Pre-fill {format_instructions} so callers only supply the analysis and the articles.
format_instructions = podcast_script_parser.get_format_instructions()
script_prompt = podcast_script_prompt.partial(format_instructions=format_instructions)

# Pipeline: fill the prompt -> call the model -> parse the reply into a PodcastScript.
script_chain = script_prompt | model | podcast_script_parser

script = script_chain.invoke({
    "analysis_result": result,
    "article_contents": article_contents,
})

In [None]:
# Sanity check: number of generated segments, followed by the parsed script object.
print(len(script.segments))
print(script)

# Text-to-speech (TTS)

In [None]:
from IPython.display import Audio
from google.cloud import texttospeech

In [None]:
class TTSEngine:
    """Render podcast script segments to MP3 bytes with Google Cloud Text-to-Speech.

    Each segment is turned into SSML (prosody rate and pitch, emphasised words,
    trailing pause) and synthesized with the voice configured for its speaker.
    """

    # Pace label -> value of the SSML <prosody rate="..."> attribute.
    _PACE_RATES = {
        "FAST": "120%",
        "MEDIUM": "100%",
        "SLOW": "85%",
        "VERY_SLOW": "75%",
        "VERY_FAST": "140%",
    }
    _DEFAULT_RATE = "100%"

    # Tone label -> value of the SSML <prosody pitch="..."> attribute (semitones).
    _TONE_PITCHES = {
        "EXCITED": "+4st",
        "CALM": "-1st",
        "SERIOUS": "-2st",
        "THOUGHTFUL": "+0st",
        "WORRIED": "-3st",
        "INTENSE": "+2st",
        "ENTHUSIASTIC": "+3st",
        "SKEPTICAL": "-1.5st",
        "CURIOUS": "+1st",
        "AMUSED": "+2.5st",
    }
    _DEFAULT_PITCH = "+0st"

    def __init__(self):
        """Create the API client and the per-speaker voice / audio configuration."""
        self.client = texttospeech.TextToSpeechClient()
        self.voices = {
            'HOST1': texttospeech.VoiceSelectionParams(
                language_code='en-US',
                name='en-US-Neural2-I',
                ssml_gender=texttospeech.SsmlVoiceGender.MALE
            ),
            'HOST2': texttospeech.VoiceSelectionParams(
                language_code='en-US',
                name='en-US-Neural2-F',
                ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
            )
        }
        self.audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            effects_profile_id=['headphone-class-device']
        )

    @staticmethod
    def _normalize_label(label) -> str:
        """Canonical form of an LLM-produced label: trimmed and upper-cased."""
        return str(label).strip().upper()

    @staticmethod
    def _emphasize(text: str, words) -> str:
        """Return ``text`` XML-escaped, with every occurrence of ``words`` emphasised.

        Matching is done in a single pass over the raw text, so markup that has
        already been inserted is never re-scanned (a word such as "strong"
        cannot corrupt an earlier ``<emphasis level="strong">`` tag) and
        duplicate or empty entries in ``words`` are harmless.
        """
        candidates = {w for w in (words or []) if isinstance(w, str) and w.strip()}
        if not candidates:
            return xml_escape(text)

        # Longest first, so a phrase wins over a shorter word contained in it.
        ordered = sorted(candidates, key=lambda w: (-len(w), w))
        pattern = re.compile("|".join(re.escape(w) for w in ordered))

        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            # Escape plain and emphasised parts separately; escaping first would
            # let a word like "amp" match inside an "&amp;" entity.
            pieces.append(xml_escape(text[cursor:match.start()]))
            pieces.append(
                f'<emphasis level="strong">{xml_escape(match.group(0))}</emphasis>'
            )
            cursor = match.end()
        pieces.append(xml_escape(text[cursor:]))
        return "".join(pieces)

    # Annotations naming PodcastSegment are quoted so that defining this class
    # does not depend on the cell that declares PodcastSegment having run.
    def process_segment(self, segment: "PodcastSegment") -> bytes:
        """Synthesize one script segment and return its MP3 bytes."""
        ssml = self.generate_ssml(segment)
        return self.synthesize_speech(ssml, segment.speaker)

    def get_pace(self, pace: str) -> str:
        """Map a pace label (case-insensitive) to an SSML rate; unknown -> "100%"."""
        return self._PACE_RATES.get(self._normalize_label(pace), self._DEFAULT_RATE)

    def get_tone(self, tone: str) -> str:
        """Map a tone label (case-insensitive) to an SSML pitch; unknown -> "+0st"."""
        return self._TONE_PITCHES.get(self._normalize_label(tone), self._DEFAULT_PITCH)

    def generate_ssml(self, segment: "PodcastSegment") -> str:
        """Build the SSML document for ``segment``.

        The segment text is XML-escaped (it comes from an LLM and may contain
        ``&``, ``<`` or ``>``), emphasis words are wrapped in
        ``<emphasis level="strong">``, and a ``<break>`` of ``pause_after``
        milliseconds (clamped to be non-negative) is appended.
        """
        body = self._emphasize(segment.text, segment.emphasis_words)
        pause_ms = max(0, int(segment.pause_after))
        return (
            f'<speak><prosody rate="{self.get_pace(segment.pace)}" '
            f'pitch="{self.get_tone(segment.tone)}">{body}</prosody>'
            f'<break time="{pause_ms}ms"/></speak>'
        )

    def synthesize_speech(self, ssml: str, speaker: str) -> bytes:
        """Send ``ssml`` to the TTS API using ``speaker``'s voice; return MP3 bytes.

        Raises:
            KeyError: If ``speaker`` (also tried trimmed/upper-cased) has no
                configured voice.
        """
        voice = self.voices.get(speaker)
        if voice is None:
            # Tolerate labels such as "Host1" or " host2 " coming from the LLM.
            voice = self.voices.get(self._normalize_label(speaker))
        if voice is None:
            raise KeyError(
                f"Unknown speaker {speaker!r}; expected one of {sorted(self.voices)}"
            )

        synthesis_input = texttospeech.SynthesisInput(ssml=ssml)
        response = self.client.synthesize_speech(
            input=synthesis_input,
            voice=voice,
            audio_config=self.audio_config
        )
        return response.audio_content


In [None]:
tts_engine = TTSEngine()

# MP3 bytes for each script segment, in script order.
audio_segments = list(map(tts_engine.process_segment, script.segments))

In [None]:
# Make sure the destination folder exists.
output_dir = "generated_podcasts"
os.makedirs(output_dir, exist_ok=True)

# A second-resolution timestamp gives each run its own file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
podcast_path = f"{output_dir}/podcast_{timestamp}.mp3"

# Decode every synthesized MP3 chunk and concatenate them in script order,
# starting from an empty track.
decoded_chunks = (AudioSegment.from_mp3(BytesIO(chunk)) for chunk in audio_segments)
combined = sum(decoded_chunks, AudioSegment.empty())

# Write the finished episode to disk.
combined.export(podcast_path, format="mp3")

# Embed a player for the exported file in the notebook output.
display(Audio(podcast_path, autoplay=False))