In [1]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Register the global defaults used by the rest of the notebook:
# the chat model for generation and the embedding model for the vector index.
Settings.llm = OpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0.2,
)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)

In [2]:
from llama_index.core import SimpleDirectoryReader

# Read every file under ./data/pg into Document objects
documents = SimpleDirectoryReader(input_dir="./data/pg").load_data()

In [3]:
from llama_index.core import Settings

# Set the global chunk size, then split the loaded documents into nodes
# with the default node parser from Settings.
Settings.chunk_size = 1024
nodes = Settings.node_parser.get_nodes_from_documents(documents=documents)

In [5]:
from llama_index.core import StorageContext

# Create a storage context with default (in-memory) stores and register the
# parsed nodes in its docstore. The same storage_context is passed to every
# index built below, so they all share this docstore.
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

In [6]:
from llama_index.core import SummaryIndex
from llama_index.core import VectorStoreIndex

# Build two indexes over the same nodes, both backed by the shared storage context:
# a summary index for whole-document questions and a vector index for targeted retrieval.
summary_index = SummaryIndex(
    nodes=nodes,
    storage_context=storage_context,
)
vector_index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
)

In [7]:
# Summary engine: tree-summarize over the nodes, issuing the calls asynchronously.
list_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True)

# Vector engine: default retrieval settings.
vector_query_engine = vector_index.as_query_engine()

In [9]:
from llama_index.core.tools import QueryEngineTool


# Tool wrapping the summary query engine. The description is the text the
# router's selector reads when choosing a tool, so it must be spelled correctly
# and stay consistent with the other tools' wording ("essay").
list_tool = QueryEngineTool.from_defaults(
    query_engine=list_query_engine,
    description=(
        "Useful for summarization questions related to Paul Graham essay on"
        " What I Worked On."
    ),
)

# Tool wrapping the vector query engine; the description is what the selector
# sees when deciding which tool should answer a query.
vector_tool = QueryEngineTool.from_defaults(
    description=(
        "Useful for retrieving specific context from Paul Graham essay"
        " on What I Worked On."
    ),
    query_engine=vector_query_engine,
)

In [10]:
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector
from llama_index.core.selectors import (
    PydanticMultiSelector,
    PydanticSingleSelector,
)


# Router that picks exactly one tool per query (single selector).
query_engine = RouterQueryEngine(
    selector=PydanticSingleSelector.from_defaults(),
    query_engine_tools=[list_tool, vector_tool],
)

In [12]:
import nest_asyncio

# Patch asyncio so that event loops can be nested.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the summary query engine was created with use_async=True — confirm.
nest_asyncio.apply()

In [13]:
# Summarization-style question
response = query_engine.query(
    "What is the summary of the document?"
)
print(response)

The document provides a comprehensive account of the author's diverse experiences and reflections, encompassing writing, programming, founding startups, and investing in startups. It details the challenges and successes encountered throughout the author's career, including the establishment and growth of the Summer Founders Program, running Y Combinator, personal experiences with family and health, the development of a new programming language called Bel, and the author's transition to painting and writing essays. The narrative also offers insights into decision-making, challenges faced, and the evolution of personal and professional interests.


In [14]:
# Specific-context question
response = query_engine.query(
    "What did Paul Graham do after RICS?"
)
print(response)

After RICS, Paul Graham got a job at a company called Interleaf, which made software for creating documents. He learned some useful things at Interleaf, such as the importance of being the "entry level" option and the dangers of letting low-end software eat high-end software. He also did freelance work for the group that did projects for customers and later moved back to Providence to continue at RISD.


In [15]:
# [optional] inspect which tool the selector picked and why
print(response.metadata["selector_result"])

selections=[SingleSelection(index=1, reason="The question requires retrieving specific context from Paul Graham's essay on What I Worked On.")]


In [16]:
from llama_index.core import SimpleKeywordTableIndex

# Keyword-table index over the same nodes, sharing the storage context used by
# the summary and vector indexes.
keyword_index = SimpleKeywordTableIndex(nodes, storage_context=storage_context)

# The keyword tool must be backed by the keyword index's own query engine.
# Wrapping vector_query_engine here would leave keyword_index unused and make
# this tool behave exactly like vector_tool.
keyword_query_engine = keyword_index.as_query_engine()

keyword_tool = QueryEngineTool.from_defaults(
    query_engine=keyword_query_engine,
    description=(
        "Useful for retrieving specific context using keywords from Paul"
        " Graham essay on What I Worked On."
    ),
)

In [17]:
# Rebuild the router with a multi selector so that more than one tool can be
# chosen for a single query; all three tools are candidates.
query_engine = RouterQueryEngine(
    selector=PydanticMultiSelector.from_defaults(),
    query_engine_tools=[list_tool, vector_tool, keyword_tool],
)

In [18]:
# More than one tool is a plausible match for this question, so the
# multi-selector router may pick several and combine their responses.
response = query_engine.query(
    "What were noteable events and people from the authors time at"
    " Interleaf and YC?"
)
print(response)

The author's time at Interleaf and Y Combinator involved notable events such as the development of a code editor in Viaweb for users to define their own page styles, the renaming of the company to Y Combinator after a trick in the lambda calculus, and the transition of YC into a fund for a couple of years before becoming self-funded again. Additionally, notable people from the author's time at Y Combinator included startup founders, Justin Kan and Emmett Shear who went on to found Twitch, Aaron Swartz, and Sam Altman.


In [19]:
# [optional] inspect which tools the multi selector picked and why
print(response.metadata["selector_result"])

selections=[SingleSelection(index=0, reason='Useful for summarization questions related to Paul Graham eassy on What I Worked On.'), SingleSelection(index=2, reason='Useful for retrieving specific context using keywords from Paul Graham essay on What I Worked On.')]
