In [1]:
from pydantic import BaseModel, Field
from typing import List

In [2]:
class NodeMetadata(BaseModel):
    """Node metadata."""

    # Structured record extracted for one text chunk. All three fields are
    # required (`...` means "no default"). pydantic publishes the class
    # docstring and every `description` below in the model's JSON schema, so
    # treat those strings as prompt text rather than as ordinary comments.

    # Distinct entities mentioned in the chunk.
    entities: List[str] = Field(..., description="Unique entities in this text chunk.")

    # Short summary of the chunk.
    summary: str = Field(..., description="A concise summary of this text chunk.")

    # True when the chunk contains any numeric value.
    contains_number: bool = Field(
        ...,
        description="Whether the text chunk contains any numbers (ints, floats, etc.)",
    )

In [3]:
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core.extractors import PydanticProgramExtractor

# Prompt that wraps each text chunk before it is handed to the program.
# `{context_str}` and `{class_name}` are placeholders filled in at extraction
# time; the trailing backslashes suppress the leading/trailing newlines.
EXTRACT_TEMPLATE_STR = """\
Here is the content of the section:
----------------
{context_str}
----------------
Given the contextual information, extract out a {class_name} object.\
"""

# The program's own prompt is a bare passthrough: whatever is supplied under
# the "input" variable becomes the prompt, and the reply is parsed into a
# NodeMetadata instance.
openai_program = OpenAIPydanticProgram.from_defaults(
    output_cls=NodeMetadata,
    prompt_template_str="{input}",
)

# The extraction template is an option of the extractor, not of the program,
# so it is wired in here instead of being left as a commented-out kwarg above.
# `input_key` must name the same variable used in `prompt_template_str`.
program_extractor = PydanticProgramExtractor(
    program=openai_program,
    input_key="input",
    extract_template_str=EXTRACT_TEMPLATE_STR,
    show_progress=True,
)

In [4]:
# load in blog

from llama_index.readers.web import SimpleWebPageReader
from llama_index.core.node_parser import SentenceSplitter

# Download the blog post; `html_to_text=True` asks the reader to hand back
# plain text instead of raw HTML.
reader = SimpleWebPageReader(html_to_text=True)
docs = reader.load_data(
    urls=[
        "https://eugeneyan.com/writing/llm-patterns/",
    ]
)

In [9]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered. Presumably required
# because the extractor drives async LLM calls from synchronous code while the
# notebook kernel's own loop is already running.
nest_asyncio.apply()

from llama_index.core.ingestion import IngestionPipeline

# Stage 1: split each document into chunks (chunk_size=1024).
node_parser = SentenceSplitter(chunk_size=1024)

# Stage 2: run the structured-metadata extractor over every chunk.
pipeline = IngestionPipeline(transformations=[node_parser, program_extractor])

# `IngestionPipeline.run` is synchronous and returns the nodes directly, so it
# must not be awaited (awaiting its return value raises TypeError). The
# coroutine variant is `arun`, should a fully async call be wanted instead.
orig_nodes = pipeline.run(documents=docs)




 26%|██▌       | 8/31 [00:06<00:15,  1.49it/s]

TypeError: No constructor defined