In [68]:
# Read the content of transcription_test.md
with open('transcription_test.md', 'r', encoding='utf-8') as file:
    data = file.read();

print(f"Loaded {len(data)} characters from transcription_test.md")



Loaded 8180 characters from transcription_test.md


In [81]:
import re
def split(text: str):
    """
    Split the text into sentences, preserving speaker labels and line breaks.

    Args:
        text (str): The input text to be split.

    Returns:
        List[str]: A list of sentences.
    """
    # Split by newlines first to preserve speaker labels and line breaks
    lines = text.split('\n')
    sentences = []
    for line in lines:
        if line.strip().startswith('**[SPEAKER'):
            sentences.append(line)
        elif line.strip() == '':
            sentences.append('\n')  # Preserve empty lines
        else:
            # Use regex to split into sentences
            line_sentences = re.split(r'(?<=[.!?])\s+', line)
            sentences.extend(line_sentences)
    return [s for s in sentences if s]  # Remove empty strings but keep '\n'

In [85]:
def create_chunks(sentences, max_tokens: int = 600):
    """
    Create chunks of sentences that fit within the max_tokens limit.

    Args:
        sentences (List[str]): List of sentences to be chunked.
        max_tokens (int): Maximum number of tokens per chunk.

    Returns:
        List[str]: A list of chunks.
    """
    chunks = []
    current_chunk = []
    current_token_count = 0

    for sentence in sentences:
        sentence_tokens = len(sentence.split())
        if current_token_count + sentence_tokens > max_tokens:
            chunks.append("".join(current_chunk))
            current_chunk = []
            current_token_count = 0

        current_chunk.append(sentence)
        current_token_count += sentence_tokens

    if current_chunk:
        chunks.append("".join(current_chunk))

    return chunks

In [86]:
sentences = split(data)

print(sentences)

chunks = create_chunks(sentences)

with open('chunks.md', 'w', encoding='utf-8') as f:
    for i, chunk in enumerate(chunks):
        f.write(f"Chunk {i+1}:\n{chunk}\n\n")

['Transcript before formatting:', '\n', '\n', '\n', '**[SPEAKER 1]**: ', "Hey, this is Here This Idea and in this episode I spoke with Jacob Trofeathern, who oversees Open Philanthropy's science and science policy programs.", 'Open Philanthropy is a philanthropic thunder, which I would describe as being unusually serious about researching and identifying causes where funding is likely to help people the most.', 'And in this episode, we spoke about global health research and development.', 'So when I think about philanthropy or charity funding in the context of global health, I normally think about things like, you know, humanitarian aid or just direct public health spending on things like malaria nets or distributing diet supplements or vaccines.', 'But vaccines and life saving drugs are health technologies.', "Often, they're really quite advanced technologies and, like, we all saw how it was only possible to spend money on buying up and distributing vaccines for COVID once we actually

In [92]:
from dotenv import load_dotenv
import time
from openai import OpenAI, RateLimitError
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def process_chunk_with_gpt4(chunk):
    """
    Process a chunk of text with GPT-4.

    Args:
        chunk (str): The chunk of text to be processed.

    Returns:
        str: The processed chunk of text.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            print(f"Processing chunk: {chunk[:200]}...")
            response = client.chat.completions.create(
                model="gpt-4-0613",  # Updated to a standard GPT-4 model
                messages=[
                    {
                        "role": "system",
                        "content": "You are an AI assistant that receives a verbatim transcript of an interview. You respond with the same text, lightly edited for clarity. You remove filler words, correct grammatical mistakes, and replace obvious transcription mistakes with the more likely alternative given the context. You remove obvious repetition. You add links in markdown format to resources mentioned where you are confident of the link (so 'World Health Organisation' could become '[World Health Organisation](https://www.who.int/)'). When an obvious new topic is introduced, you may add a markdown-formated h3 header (### Topic) before the next speaker is introduced in bold. You DO NOT invent any new sentences. You do NOT modify the speaker names in bold. The text should be returned in just the same format as it was received.",
                    },
                    {"role": "user", "content": chunk},
                ],
                max_tokens=4000,
                n=1,
                temperature=0.5,
            )

            if response.choices and len(response.choices) > 0:
                processed_chunk = response.choices[0].message.content.strip()
                print(f"Processed chunk: {processed_chunk[:200]}...")
                return processed_chunk
            else:
                print(f"Unexpected response structure: {response}")
                return None
        except Exception as e:
            if isinstance(e, RateLimitError) and attempt < max_retries - 1:
                print(f"Rate limit error: {e}")
                time.sleep(2**attempt)  # Exponential backoff
            else:
                print(f"Error processing chunk: {e}")
                return None

In [93]:
def post_process_transcript(transcript: str) -> str:
    """
    Post-process a transcript by splitting it into chunks and processing each chunk with GPT-4.

    Args:
        transcript (str): The input transcript to be processed.

    Returns:
        str: The processed transcript.
    """
    sentences = split(transcript)
    chunks = create_chunks(sentences)

    processed_chunks = []

    for chunk in chunks:
        processed_chunk = process_chunk_with_gpt4(chunk)
        processed_chunks.append(processed_chunk)

    return "\n\n".join(processed_chunks)

post_process_transcript(data)

Processing chunk: Transcript before formatting:


**[SPEAKER 1]**: Hey, this is Here This Idea and in this episode I spoke with Jacob Trofeathern, who oversees Open Philanthropy's science and science policy programs.Op...
Processed chunk: Formatted Transcript:

**[SPEAKER 1]**: Welcome to this episode of Here This Idea. Today, I spoke with Jacob Trefethen, who oversees the science and science policy programs at Open Philanthropy. Open ...
Processing chunk: And I think a big reason I wanted to talk to you was this blog you recently started, and it started with this post on health technologies, which probably won't exist in 5 years.Maybe a place to start ...
Processed chunk: I believe a significant reason I wanted to discuss with you is your recently started blog, particularly your post on health technologies that may not exist in five years. Maybe a good place to start i...
Processing chunk: The statistic you'll see if you Google it that gets bandied around is that a quarter of people a

"Formatted Transcript:\n\n**[SPEAKER 1]**: Welcome to this episode of Here This Idea. Today, I spoke with Jacob Trefethen, who oversees the science and science policy programs at Open Philanthropy. Open Philanthropy is a philanthropic fund that is notably serious about researching and identifying causes where funding is likely to have the most impact. \n\n### Global Health Research and Development\n\nIn this episode, we discussed global health research and development. When considering philanthropy or charity funding in the context of global health, one might typically think about humanitarian aid or direct public health spending on things like malaria nets, diet supplements, or vaccines. However, vaccines and life-saving drugs are advanced health technologies. For instance, it was only possible to spend money on purchasing and distributing vaccines for COVID-19 once we had demonstrably safe vaccines. Similarly, innovations in delivering, transporting, and storing smallpox vaccines led

NameError: name 'processed_chunks' is not defined