In [11]:
from pydantic import BaseModel, Field
from typing import List, Dict, Any

class Section(BaseModel):
    # One section of the line-numbered document: a title plus the line numbers
    # that bound it.
    # NOTE(review): a `#` comment is used instead of a class docstring on
    # purpose; a docstring would presumably be copied into the generated schema
    # and change what the model is shown. Confirm before adding one.
    title: str = Field(description="main topic of this section of the document")
    start_index: int = Field(description="line number where the section begins")
    # NOTE(review): the description does not state whether end_index is
    # inclusive or exclusive; confirm it matches how the consumer slices lines.
    end_index: int = Field(description="line number where the section ends")


class StructuredDocument(BaseModel):
    """obtains meaningful sections, each centered around a single concept/topic"""
    # Container model used as the response_model for the extraction call: it
    # holds the list of Section objects found in the document.
    # NOTE(review): the docstring above is presumably forwarded to the model as
    # the schema/tool description, so treat it as prompt text, not just
    # documentation. Confirm before rewording it.
    sections: List[Section] = Field(description="a list of sections of the document")

In [12]:
def doc_with_lines(document):
    """Prefix every line of ``document`` with its zero-based line number.

    The document is split on "\n"; each piece is rendered as ``[i] text``
    followed by a newline.

    Returns:
        A tuple ``(numbered_text, line2text)`` where ``numbered_text`` is the
        concatenation of all numbered lines and ``line2text`` maps each line
        number to the original, unnumbered line.
    """
    raw_lines = document.split("\n")
    numbered_text = "".join(
        f"[{number}] {text}\n" for number, text in enumerate(raw_lines)
    )
    return numbered_text, dict(enumerate(raw_lines))

### Document Segmentation

In [13]:
import instructor
import vertexai  
from vertexai.generative_models import GenerativeModel  

# Load transcript
def load_transcript(file_path, encoding="utf-8"):
    """Read a transcript file and return its whole contents as a single string.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    # An explicit encoding avoids locale-dependent decoding (e.g. cp1252 on
    # Windows), which can garble or reject non-ASCII transcript text.
    with open(file_path, "r", encoding=encoding) as f:
        return f.read()

transcript = load_transcript("test.txt")

# Plain string on purpose: the prompt contains no placeholders, so no f-prefix
# is needed (and a stray brace added later would not be interpolated).
system_prompt = """\
You are a world class educator working on organizing your lecture notes.
Read the document below and extract a StructuredDocument object from it where each section of the document is centered around a single concept/topic that can be taught in one lesson.
Each line of the document is marked with its line number in square brackets (e.g. [1], [2], [3], etc). Use the line numbers to indicate section start and end.
"""

# Wrap the Vertex AI Gemini model with instructor; the wrapped client is what
# accepts the response_model keyword in client.create(...).
# NOTE(review): vertexai.init(...) is not called in the visible cells;
# presumably project/location come from the environment. Confirm.
client = instructor.from_vertexai(
    client=GenerativeModel(
        model_name="gemini-1.5-pro",
        system_instruction=system_prompt,
    ),
    mode=instructor.Mode.VERTEXAI_TOOLS,
)

def get_structured_document(document_with_line_numbers) -> StructuredDocument:
    """Ask the module-level ``client`` to segment a line-numbered document.

    The numbered document is sent as the content of a single user message, and
    ``StructuredDocument`` is passed as the ``response_model``.

    Args:
        document_with_line_numbers: The document text with line-number prefixes.

    Returns:
        Whatever ``client.create`` returns for the given ``response_model``.
    """
    user_message = {
        "role": "user",
        "content": document_with_line_numbers,
    }
    structured = client.create(
        response_model=StructuredDocument,
        messages=[user_message],
    )
    return structured  # type: ignore

In [14]:
def get_sections_text(structured_doc, line2text, inclusive_end=True):
    """Materialise the text of every section in ``structured_doc``.

    Args:
        structured_doc: Object exposing ``sections``; each section provides
            ``title``, ``start_index`` and ``end_index``.
        line2text: Mapping from line number to the text of that line. Line
            numbers missing from the mapping contribute an empty string.
        inclusive_end: When True (default), the line at ``end_index`` is part
            of the section, matching the schema wording "line number where the
            section ends". Pass False for the half-open range
            ``[start_index, end_index)``.

    Returns:
        A list of dicts, one per section, with keys ``title``, ``content``
        (section lines joined with newlines), ``start`` and ``end``.
    """
    segments = []
    for s in structured_doc.sections:
        # range() excludes its stop value, so step one past end_index to keep
        # the section's final line instead of silently dropping it.
        stop = s.end_index + 1 if inclusive_end else s.end_index
        contents = [
            line2text.get(line_id, '')
            for line_id in range(s.start_index, stop)
        ]
        segments.append({
            "title": s.title,
            "content": "\n".join(contents),
            "start": s.start_index,
            "end": s.end_index
        })
    return segments

In [15]:
# End-to-end run; each step feeds the next, so the order matters.
# 1) Number the transcript lines and keep a line-number -> text lookup.
document_with_line_numbers, line2text = doc_with_lines(transcript)
# 2) Ask the model for section boundaries (this is the remote call).
structured_doc = get_structured_document(document_with_line_numbers)
# 3) Turn the returned line ranges back into section text.
segments = get_sections_text(structured_doc, line2text)

In [16]:
# Show how many sections were produced.
display(len(segments))

23

In [17]:
# Show each section title in order so the segmentation can be checked by eye.
for segment in segments:
    display(segment.get("title"))

'The Trillion Dollar Cluster'

'AI Progress in the Next Few Years'

'Societal and Economic Impact of AI'

'Geopolitical Implications of AI: CCP and National Security'

'Historical Perspective on Stakes and Potential for Dictatorship'

'Chinese AI Researchers and the Role of Westernized Elites'

'Building AI Clusters in the US vs. Middle East: National Security and System Competition'

'Protecting Algorithmic Secrets: Importance of Security and Potential for Espionage'

'International Cooperation and Arms Control in the Age of AGI'

'Vulnerability of Data Centers and Potential for Conflict over Taiwan'

'State-led vs. Private-led AI Development: Arguments for and Against Nationalization'

'Government vs. Private Control of ASI: Checks and Balances, Corporate Governance, and Potential for Coups'

"Leopold's Background: Valedictorian at Columbia, Future Fund, and OpenAI"

"Leopold's Interest in Economics and Peak Productivity"

'Leaving Economics and Discovering the \\"Twitter Weirdos\\"'

'OpenAI: Superalignment Team and Reasons for its Dissolution'

'Accelerating AI Research Progress: Automated AI Researchers and the Data Wall'

'Alignment: Challenges, Potential Solutions, and Dual-Use Nature'

'Germany: Post-WWII Recovery, Societal Norms, and Political Landscape'

"Dwarkesh's Immigration Story and Path to the Podcast"

'Launching an AGI Hedge Fund: Motivation, Investment Strategy, and Potential Risks'

'Lessons from WWII: Systems Competition, Industrial Capacity, and the Importance of Timing'

'Situational Awareness: A Continuous Process of Adaptation and Response to New Information'