## Introduction
This notebook guides you through building a schema extraction application using LangChain and OpenAI's GPT-4o model. Schema extraction means identifying structured information in unstructured text and pulling it out into a predefined format, which makes the data easier to understand, organize, and work with.

In [1]:
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by ChatOpenAI
# further down comes from -- confirm the .env file defines it.
load_dotenv()

True

In [2]:
from typing import List, Optional

from langchain_core.pydantic_v1 import BaseModel, Field


class Agent_Feature(BaseModel):
    """Information about feature of AI agents. AI Agents are a really amazing capability. It consists of various features, this defines those features"""

    # The class docstring above doubles as the schema description that is sent
    # to the LLM, so its wording can influence extraction quality.
    #
    # Field conventions:
    #   * every field is Optional with a None default, which lets the model
    #     decline to extract a value it cannot find;
    #   * every field carries a `description`, which is also used by the LLM,
    #     so a good description can improve extraction results.
    name: Optional[str] = Field(
        default=None,
        description="The name of the agent feature",
    )
    definition: Optional[str] = Field(
        default=None,
        description="A concise definition of this feature of AI agents",
    )


class Data(BaseModel):
    """Extracted data about AI Agent Features."""

    # Wrapper model: holding a list lets a single extraction call return
    # several Agent_Feature entities instead of just one.
    features: List[Agent_Feature]

In [3]:
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field


# System instructions for the extractor, assembled from adjacent string literals.
_system_instructions = (
    "You are an expert extraction algorithm. "
    "Only extract relevant information from the text. "
    "If you do not know the value of an attribute asked to extract, "
    "return null for the attribute's value."
)

# Two-message chat template: the fixed system instructions followed by a human
# turn whose content is filled in from the `text` input variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", _system_instructions), ("human", "{text}")]
)

In [4]:
from langchain_openai import ChatOpenAI
# Chat model used by the extraction chain; "gpt-4o" is the model name requested.
model = ChatOpenAI(model="gpt-4o")

In [8]:
# Bind the Data schema to the model so that its reply comes back as a Data
# object, then pipe the formatted prompt into that structured model.
structured_model = model.with_structured_output(schema=Data)
runnable = prompt | structured_model

# Small hand-written sample to sanity-check the chain before using a real page.
text = "Tool use a key capability of AI agents. It let's the agent invoke methods that can calls APIs, read docs, perform actions, etc."
runnable.invoke({"text": text})

Data(features=[Agent_Feature(name='Tool use', definition='A key capability of AI agents. It lets the agent invoke methods that can call APIs, read docs, perform actions, etc.')])

In [6]:
from langchain_community.document_loaders import WebBaseLoader
# Source page whose contents will be fed to the extraction chain.
url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
loader = WebBaseLoader(url)
# Fetch the page; the loaded result is kept in `docs` for the next cell.
docs = loader.load()



In [7]:
# `docs` is a list of loaded Document objects, not a string. Passing the list
# straight into the `{text}` slot would interpolate its repr (metadata, escaped
# newlines and all), so join the page contents into plain text first.
page_text = "\n\n".join(doc.page_content for doc in docs)
runnable.invoke({"text": page_text})

Data(features=[Agent_Feature(name='Planning', definition='The agent breaks down large tasks into smaller, manageable subgoals, enables efficient handling of complex tasks, and can self-reflect and refine actions based on past performance.'), Agent_Feature(name='Memory', definition='The agent retains and recalls information over extended periods. It is divided into short-term memory (in-context learning) and long-term memory (external vector store for fast retrieval).'), Agent_Feature(name='Tool Use', definition='The agent calls external APIs for additional information, code execution, and access to proprietary data sources, extending its capabilities beyond the pre-trained model.')])