In [19]:
import concurrent.futures as cf
import glob
import io
import os
import time
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import List, Literal

from dotenv import load_dotenv
from loguru import logger
from openai import OpenAI
from promptic import llm
from pydantic import BaseModel, ValidationError
from pypdf import PdfReader
from tenacity import retry, retry_if_exception_type

# Load environment variables

In [20]:
# Locate the project folder relative to the notebook's working directory and
# load credentials from the .env file stored inside it.
current_dir = Path.cwd()
BASE_DIR = current_dir / "docs-to-audio"
load_dotenv(BASE_DIR / ".env")

# Both providers are needed: OpenAI for speech synthesis, Gemini for the
# dialogue model configured further down.
required_keys = ("OPENAI_API_KEY", "GEMINI_API_KEY")
keys_present = all(os.getenv(key_name) for key_name in required_keys)
print("API keys loaded" if keys_present else "WARNING: API keys not found")


API keys loaded


# Core Functions

In [21]:
# Speaker label -> voice name passed to the text-to-speech call.
_SPEAKER_VOICES = {
    "female-1": "alloy",
    "male-1": "onyx",
    "female-2": "shimmer",
}


class DialogueItem(BaseModel):
    # One spoken line of the generated dialogue.
    # NOTE: documented with comments rather than a class docstring so the
    # JSON schema pydantic derives from this model stays unchanged.

    # The words to be spoken.
    text: str
    # Which of the three fixed speakers says the line.
    speaker: Literal["female-1", "male-1", "female-2"]

    @property
    def voice(self):
        """Return the voice name assigned to this line's speaker."""
        return _SPEAKER_VOICES[self.speaker]


class Dialogue(BaseModel):
    # Structured result type that the `generate_dialogue` prompt function is
    # annotated to return.

    # Free-form planning text; presumably mirrors the <scratchpad> section of
    # the prompt. It is not read anywhere in the visible code.
    scratchpad: str
    # Spoken lines in order; `generate_audio` iterates these to synthesize
    # audio and build the transcript.
    dialogue: List[DialogueItem]


def get_mp3(text: str, voice: str, api_key: str = None) -> bytes:
    """Synthesize ``text`` with OpenAI's ``tts-1`` model and return the audio bytes.

    Args:
        text: The text to be spoken.
        voice: Voice name forwarded to the speech endpoint.
        api_key: OpenAI key to use; when falsy, the ``OPENAI_API_KEY``
            environment variable is used instead.

    Returns:
        The complete streamed response body as a single ``bytes`` object.
    """
    resolved_key = api_key or os.getenv("OPENAI_API_KEY")
    client = OpenAI(api_key=resolved_key)

    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice=voice,
        input=text,
    ) as response:
        # Drain the stream chunk by chunk and stitch the pieces together.
        return b"".join(response.iter_bytes())


def generate_audio(file: str, openai_api_key: str = None) -> tuple[str, str]:
    """Turn a PDF into a podcast-style audio file and return its path and transcript.

    The PDF's text is extracted page by page, rewritten as a multi-speaker
    dialogue by the nested ``generate_dialogue`` prompt function, and every
    dialogue line is synthesized concurrently with ``get_mp3``. The audio
    chunks are concatenated in dialogue order and written to
    ``./sandbox_examples/audio/``.

    Args:
        file: Path to the input PDF.
        openai_api_key: Key forwarded to ``get_mp3``. May be omitted when the
            ``OPENAI_API_KEY`` environment variable is set.

    Returns:
        ``(output_filepath, transcript)``: the path of the written ``.mp3``
        file and the transcript, one ``"<speaker>: <text>"`` entry per line
        separated by blank lines.

    Raises:
        ValueError: If no OpenAI API key is available from either source.

    Side effects:
        Creates the output directory if needed, writes the new audio file and
        deletes ``*.mp3`` files in that directory last modified more than 24
        hours ago.
    """
    if not (os.getenv("OPENAI_API_KEY") or openai_api_key):
        # ValueError rather than gradio's ``gr.Error``: ``gr`` is not imported
        # in the visible code, so referencing it raised NameError instead of
        # reporting the missing key.
        raise ValueError("OpenAI API key is required")

    with Path(file).open("rb") as f:
        reader = PdfReader(f)
        text = "\n\n".join(page.extract_text() for page in reader.pages)

    # The docstring below is the prompt template sent to the model, so it must
    # stay verbatim. ValidationError (model output that does not parse into
    # ``Dialogue``) triggers another attempt.
    # NOTE(review): no ``stop`` condition is passed to ``retry``, so this
    # appears to retry without limit -- confirm that is intended.
    @retry(retry=retry_if_exception_type(ValidationError))
    @llm(
        model="gemini/gemini-1.5-flash-002",
    )
    def generate_dialogue(text: str) -> Dialogue:
        """
        Your task is to take the input text provided and turn it into an engaging, informative podcast dialogue. The input text may be messy or unstructured, as it could come from a variety of sources like PDFs or web pages. Don't worry about the formatting issues or any irrelevant information; your goal is to extract the key points and interesting facts that could be discussed in a podcast.

        Here is the input text you will be working with:

        <input_text>
        {text}
        </input_text>

        First, carefully read through the input text and identify the main topics, key points, and any interesting facts or anecdotes. Think about how you could present this information in a fun, engaging way that would be suitable for an audio podcast.

        <scratchpad>
        Brainstorm creative ways to discuss the main topics and key points you identified in the input text. Consider using analogies, storytelling techniques, or hypothetical scenarios to make the content more relatable and engaging for listeners.

        Keep in mind that your podcast should be accessible to a general audience, so avoid using too much jargon or assuming prior knowledge of the topic. If necessary, think of ways to briefly explain any complex concepts in simple terms.

        Use your imagination to fill in any gaps in the input text or to come up with thought-provoking questions that could be explored in the podcast. The goal is to create an informative and entertaining dialogue, so feel free to be creative in your approach.

        Write your brainstorming ideas and a rough outline for the podcast dialogue here. Be sure to note the key insights and takeaways you want to reiterate at the end.
        </scratchpad>

        Now that you have brainstormed ideas and created a rough outline, it's time to write the actual podcast dialogue. Aim for a natural, conversational flow between the host and any guest speakers. Incorporate the best ideas from your brainstorming session and make sure to explain any complex topics in an easy-to-understand way.

        <podcast_dialogue>
        Write your engaging, informative podcast dialogue here, based on the key points and creative ideas you came up with during the brainstorming session. Use a conversational tone and include any necessary context or explanations to make the content accessible to a general audience. Use made-up names for the hosts and guests to create a more engaging and immersive experience for listeners. Do not include any bracketed placeholders like [Host] or [Guest]. Design your output to be read aloud -- it will be directly converted into audio.

        Make the dialogue as long and detailed as possible, while still staying on topic and maintaining an engaging flow. Aim to use your full output capacity to create the longest podcast episode you can, while still communicating the key information from the input text in an entertaining way.

        At the end of the dialogue, have the host and guest speakers naturally summarize the main insights and takeaways from their discussion. This should flow organically from the conversation, reiterating the key points in a casual, conversational manner. Avoid making it sound like an obvious recap - the goal is to reinforce the central ideas one last time before signing off.
        </podcast_dialogue>
        """

    llm_output = generate_dialogue(text)

    with cf.ThreadPoolExecutor() as executor:
        # Submit every line up front so the speech requests overlap; each
        # future is paired with its transcript line to keep the two aligned.
        pending = [
            (
                executor.submit(get_mp3, line.text, line.voice, openai_api_key),
                f"{line.speaker}: {line.text}",
            )
            for line in llm_output.dialogue
        ]
        # Collect in submission order (not completion order) so the audio
        # follows the dialogue; a single join avoids repeated bytes copies.
        audio = b"".join(future.result() for future, _ in pending)

    transcript = "".join(f"{transcript_line}\n\n" for _, transcript_line in pending)
    characters = sum(len(line.text) for line in llm_output.dialogue)

    logger.info(f"Generated {characters} characters of audio")

    # Get the original filename without extension
    input_filename = Path(file).stem

    # Sanitize: keep alphanumerics, '-' and '_'; turn whitespace into '-';
    # drop everything else; strip trailing dashes.
    safe_filename = "".join(
        c if c.isalnum() or c in ('-', '_') else '-' if c.isspace() else ''
        for c in input_filename
    ).rstrip('-')

    # Create output directory if it doesn't exist
    output_directory = "./sandbox_examples/audio/"
    os.makedirs(output_directory, exist_ok=True)

    # Create output filepath with timestamp to ensure uniqueness
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
    output_filepath = os.path.join(output_directory, f"{safe_filename}_{timestamp}.mp3")

    # Write the audio to the file
    with open(output_filepath, "wb") as f:
        f.write(audio)

    # Clean up old files (files over a day old). A dedicated loop name keeps
    # the ``file`` parameter from being rebound.
    max_age_seconds = 24 * 60 * 60
    for old_file in glob.glob(f"{output_directory}*.mp3"):
        if os.path.isfile(old_file) and time.time() - os.path.getmtime(old_file) > max_age_seconds:
            os.remove(old_file)

    return output_filepath, transcript

In [6]:
# Gather the example PDFs shipped with the project and run the whole
# pipeline on the first one found.
examples_dir = BASE_DIR / "examples"
example_list = [str(pdf_path) for pdf_path in examples_dir.glob("*.pdf")]
example = example_list[0]
generate_audio(example)


[32m2025-01-07 16:57:05.927[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_audio[0m:[36m101[0m - [1mGenerated 3313 characters of audio[0m


('./sandbox_examples/audio/Gene-therapy-for-deafness_2025-01-07_16-57-05.mp3',
 'female-1: Welcome to "BioBreakthroughs," the podcast that explores the amazing advancements in the world of biology and medicine. Today, we\'re diving deep into a truly remarkable story of hope and healing.\n\nfemale-1: And I\'m thrilled to be joined by Dr. Anya Sharma, a leading geneticist who\'s been at the forefront of this incredible work.\n\nfemale-2: It\'s a pleasure to be here. Thanks for having me.\n\nfemale-1: Dr. Sharma, let\'s talk about this groundbreaking gene therapy for hearing loss.  I\'ve heard stories about children born deaf who are now hearing, and it\'s absolutely incredible.\n\nfemale-2: It is. We\'ve made significant strides in treating DFNB9, a type of hereditary deafness that affects about 2-8% of people born with hearing loss. Imagine your ears are a beautifully complex machine, and the OTOF gene is like a tiny, essential gear.  In DFNB9, this gear is broken, hindering the transmi

# Split main function up into smaller functions

In [51]:
# Re-run just the text-extraction step on the first example so its output can
# be inspected on its own.
file = example_list[0]
with Path(file).open("rb") as f:
    reader = PdfReader(f)
    print(type(reader.pages))
    tmp = reader.pages[0]
    text = "\n\n".join(page.extract_text() for page in reader.pages)

# Keep the first 500 characters around for comparison with other extractors.
text1 = text[:500]
print(text1)

<class 'pypdf._page._VirtualList'>
Cosmos » Biology
Gene therapy restores hearing to children with inherited deafness
Credit: Nick Dolding/Getty Images
 e  rst clinical trial to administer gene therapy to both ears in one person has restored hearing function to 5
children born with a form of inherited deafness, astounding the research team..
Two of the children even gained an ability to appreciate music.
 e success of the new approach is detailed in a new study published in Nature M edicin e.  e work builds on the  rst
phase of t


In [1]:
import asyncio
from io import BytesIO
from src.test_gmail_fetch import test_fetch

# Assuming you have your PDF as bytes in a variable called pdf_bytes
async def run_fetch():
    return await test_fetch(verbose=True)

documents = await run_fetch()
print(f'{len(documents)} found, with titles:')
print('\n'.join(doc.title for doc in documents))


[32m2025-01-08 16:36:33.301[0m | [1mINFO    [0m | [36msrc.fetchers.gmail[0m:[36m_log[0m:[36m60[0m - [1mSearching with query: has:attachment after:2025/01/01[0m
[32m2025-01-08 16:36:33.497[0m | [1mINFO    [0m | [36msrc.fetchers.gmail[0m:[36m_log[0m:[36m60[0m - [1mFound 3 emails with attachments[0m
[32m2025-01-08 16:36:33.593[0m | [1mINFO    [0m | [36msrc.fetchers.gmail[0m:[36m_log[0m:[36m60[0m - [1m
Processing email: gene therapy article - deafness[0m
[32m2025-01-08 16:36:33.594[0m | [1mINFO    [0m | [36msrc.fetchers.gmail[0m:[36m_log[0m:[36m60[0m - [1mMessage ID: 19447ad609e9df2e[0m
[32m2025-01-08 16:36:33.596[0m | [1mINFO    [0m | [36msrc.fetchers.gmail[0m:[36m_log[0m:[36m60[0m - [1m=== VERIFICATION LINE ===[0m
[32m2025-01-08 16:36:33.597[0m | [1mINFO    [0m | [36msrc.fetchers.gmail[0m:[36m_log[0m:[36m60[0m - [1mFound 3 parts in the email[0m
[32m2025-01-08 16:36:33.599[0m | [1mINFO    [0m | [36msrc.fetchers.g

7 found, with titles:
Gene therapy for deafness.pdf
Attention is all you need.pdf
Accessible Quantum Field Theory.pdf
en-evolved-circuit.pdf
s41586-024-07953-5.pdf
2205.14135v2.pdf
science.adn2600.pdf


In [47]:
# Summarize what was fetched, then look closer at the first document.
doc_count = len(documents)
print(doc_count)
title_listing = '\n'.join(entry.title for entry in documents)
print(title_listing)

doc = documents[0]
# print(doc.model_fields)
print(doc.source, doc.title)
# The leading bytes expose the file signature of the raw content.
print(doc.content[:10])
# Use BytesIO instead of file path:
# reader = PdfReader(BytesIO(doc.content))
# tmp = reader.pages[0]
# text = "\n\n".join([page.extract_text() for page in reader.pages])
# print(tmp.extract_text())

7
Gene therapy for deafness.pdf
Attention is all you need.pdf
Accessible Quantum Field Theory.pdf
en-evolved-circuit.pdf
s41586-024-07953-5.pdf
2205.14135v2.pdf
science.adn2600.pdf
gmail Gene therapy for deafness.pdf
b'%PDF-1.4\n%'


In [53]:
from io import BytesIO
import base64
from pypdf import PdfReader
from loguru import logger

# First, let's check the content
def inspect_pdf_content(doc):
    """Check that ``doc.content`` is a readable PDF and return a reader for it.

    As a quick sanity check, the page count and the first 100 characters of
    the first page's extracted text are logged.

    Args:
        doc: Object exposing the raw file bytes as ``doc.content``.

    Returns:
        A ``PdfReader`` over the content, or ``None`` when the bytes do not
        start with the PDF signature or the PDF cannot be read. Both failure
        paths return ``None`` (the signature check used to return ``False``),
        so callers get one consistent "no reader" value; truthiness checks
        such as ``if pdf:`` behave as before.
    """
    # Check the first few bytes to verify it's a PDF
    pdf_signature = b'%PDF-'
    if not doc.content.startswith(pdf_signature):
        logger.error(f"Content does not appear to be a valid PDF. First bytes: {doc.content[:20]}")
        return None

    # Try to read the PDF
    try:
        pdf = PdfReader(BytesIO(doc.content))
        # Get number of pages
        num_pages = len(pdf.pages)
        logger.info(f"Successfully opened PDF with {num_pages} pages")

        # Try to extract text from first page
        first_page_text = pdf.pages[0].extract_text()
        logger.info(f"First 100 characters of text: {first_page_text[:100]}")
    except Exception as e:
        # Deliberately broad: this is a diagnostic helper, and any failure
        # while parsing (including a PDF with no pages) is reported rather
        # than raised.
        logger.error(f"Error reading PDF: {e}")
        return None
    else:
        return pdf

# Use the function
pdf = inspect_pdf_content(doc)
if pdf:
    # Extract text properly
    # NOTE(review): `layout` and `line_margin` do not appear among pypdf's
    # documented `extract_text` parameters (layout mode is selected with
    # `extraction_mode="layout"`), so they may be silently ignored -- confirm
    # before relying on this comparison.
    text = "\n\n".join(
        page.extract_text(
            layout=True,  # Maintain text layout
            space_width=1,  # Adjust space between words
            line_margin=0.5  # Adjust line spacing
        ) for page in pdf.pages
    )
    print("\nExtracted text:")
    # print(text[:500])  # Print first 500 characters
    text2 = text[:500]

    # Compare only when extraction actually ran; otherwise `text2` would be
    # undefined (or left over from an earlier run of this cell).
    if text1 == text2:
        print("Texts are the same")
    else:
        print("Texts are different")
else:
    print("PDF could not be read; skipping comparison")



Extracted text:
Texts are the same


In [17]:
import pdfplumber

# Extract the first page of the same example with pdfplumber so its output
# can be compared by eye with the pypdf result.
file_path = example_list[0]

with pdfplumber.open(file_path) as pdf:
    tmp = pdf.pages[0]
    first_page_text = tmp.extract_text()
    print(first_page_text)



6/9/24, 2:01 PM Gene therapy restores hearing to children with inherited deafness
Join Us Login Newsletters
Biology Chemistry Engineering Mathematics Physics
Cosmos » Biology
Gene therapy restores hearing to children with inherited deafness
Credit: Nick Dolding/Getty Images
June 7, 2024
By Imma Perfetto
 e  rst clinical trial to administer gene therapy to both ears in one person has restored hearing function to 5
children born with a form of inherited deafness, astounding the research team..
Two of the children even gained an ability to appreciate music.
 e success of the new approach is detailed in a new study published in Nature Medicine.  e work builds on the  rst
phase of the trial, published earlier this year, in which children were treated in a single ear.
“ e results from these studies are astounding,” says study co-senior author Zheng-Yi Chen, an associate scientist in
the Eaton-Peabody Laboratories at Massachusetts Eye and Ear in the US.
“We continue to see the hearing ability

In [None]:
 # @retry(retry=retry_if_exception_type(ValidationError))
    # @llm(
    #     model="gemini/gemini-1.5-flash-002",
    # )
    # def generate_dialogue(text: str) -> Dialogue:
    #     """
    #     Your task is to take the input text provided and turn it into an engaging, informative podcast dialogue. The input text may be messy or unstructured, as it could come from a variety of sources like PDFs or web pages. Don't worry about the formatting issues or any irrelevant information; your goal is to extract the key points and interesting facts that could be discussed in a podcast.

    #     Here is the input text you will be working with:

    #     <input_text>
    #     {text}
    #     </input_text>

    #     First, carefully read through the input text and identify the main topics, key points, and any interesting facts or anecdotes. Think about how you could present this information in a fun, engaging way that would be suitable for an audio podcast.

    #     <scratchpad>
    #     Brainstorm creative ways to discuss the main topics and key points you identified in the input text. Consider using analogies, storytelling techniques, or hypothetical scenarios to make the content more relatable and engaging for listeners.

    #     Keep in mind that your podcast should be accessible to a general audience, so avoid using too much jargon or assuming prior knowledge of the topic. If necessary, think of ways to briefly explain any complex concepts in simple terms.

    #     Use your imagination to fill in any gaps in the input text or to come up with thought-provoking questions that could be explored in the podcast. The goal is to create an informative and entertaining dialogue, so feel free to be creative in your approach.

    #     Write your brainstorming ideas and a rough outline for the podcast dialogue here. Be sure to note the key insights and takeaways you want to reiterate at the end.
    #     </scratchpad>

    #     Now that you have brainstormed ideas and created a rough outline, it's time to write the actual podcast dialogue. Aim for a natural, conversational flow between the host and any guest speakers. Incorporate the best ideas from your brainstorming session and make sure to explain any complex topics in an easy-to-understand way.

    #     <podcast_dialogue>
    #     Write your engaging, informative podcast dialogue here, based on the key points and creative ideas you came up with during the brainstorming session. Use a conversational tone and include any necessary context or explanations to make the content accessible to a general audience. Use made-up names for the hosts and guests to create a more engaging and immersive experience for listeners. Do not include any bracketed placeholders like [Host] or [Guest]. Design your output to be read aloud -- it will be directly converted into audio.

    #     Make the dialogue as long and detailed as possible, while still staying on topic and maintaining an engaging flow. Aim to use your full output capacity to create the longest podcast episode you can, while still communicating the key information from the input text in an entertaining way.

    #     At the end of the dialogue, have the host and guest speakers naturally summarize the main insights and takeaways from their discussion. This should flow organically from the conversation, reiterating the key points in a casual, conversational manner. Avoid making it sound like an obvious recap - the goal is to reinforce the central ideas one last time before signing off.
    #     </podcast_dialogue>
    #     """

    # llm_output = generate_dialogue(text)