In [1]:
import os

# os.environ["OPENAI_API_KEY"] = "INSERT OPENAI KEY"
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
# https://github.com/run-llama/llama_index/blob/main/examples/paul_graham_essay/TestEssay.ipynb
# print(os.environ["OPENAI_API_KEY"])

In [3]:

from llama_index import TreeIndex, SimpleDirectoryReader
from IPython.display import Markdown, display

In [4]:
# 计数

import tiktoken
from llama_index.llms import Anthropic
from llama_index.callbacks import CallbackManager, TokenCountingHandler

from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
    set_global_service_context,
)

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext

# 设置tokenizer和TokenCountingHandler
tokenizer = tiktoken.encoding_for_model("text-davinci-003").encode

token_counter = TokenCountingHandler(tokenizer=tokenizer)
callback_manager = CallbackManager([token_counter])

service_context = ServiceContext.from_defaults(callback_manager=callback_manager)
set_global_service_context(service_context)


In [5]:
documents = SimpleDirectoryReader("data").load_data()


In [6]:
new_index = TreeIndex.from_documents(documents)


INFO:llama_index.indices.common_tree.base:> Building index from nodes: 1 chunks
> Building index from nodes: 1 chunks


In [7]:
print(
    "Embedding Tokens: ",
    token_counter.total_embedding_token_count,
    "\n",
    "LLM Prompt Tokens: ",
    token_counter.prompt_llm_token_count,
    "\n",
    "LLM Completion Tokens: ",
    token_counter.completion_llm_token_count,
    "\n",
    "Total LLM Token Count: ",
    token_counter.total_llm_token_count,
    "\n",
)

Embedding Tokens:  0 
 LLM Prompt Tokens:  7004 
 LLM Completion Tokens:  407 
 Total LLM Token Count:  7411 



In [8]:
# set Logging to DEBUG for more detailed outputs
query_engine = new_index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 0] Selected node: [1]/[1]
>[Level 0] Selected node: [1]/[1]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 1] Selected node: [1]/[1]
>[Level 1] Selected node: [1]/[1]


In [9]:
display(Markdown(f"<b>{response}</b>"))


<b>The author wrote short stories and also worked on programming, specifically on an IBM 1401 computer in their junior high school's basement. They used an early version of Fortran and typed programs on punch cards. Later, the author got a microcomputer, a TRS-80, and started programming on it, writing simple games and a word processor.</b>

In [10]:
# set Logging to DEBUG for more detailed outputs
response = query_engine.query("What did the author do after his time at Y Combinator?")

INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 0] Selected node: [1]/[1]
>[Level 0] Selected node: [1]/[1]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 1] Selected node: [10]/[10]
>[Level 1] Selected node: [10]/[10]


In [11]:
display(Markdown(f"<b>{response}</b>"))


<b>After his time at Y Combinator, the author worked on building a new dialect of Lisp called Arc.</b>

In [12]:
print(
    "Embedding Tokens: ",
    token_counter.total_embedding_token_count,
    "\n",
    "LLM Prompt Tokens: ",
    token_counter.prompt_llm_token_count,
    "\n",
    "LLM Completion Tokens: ",
    token_counter.completion_llm_token_count,
    "\n",
    "Total LLM Token Count: ",
    token_counter.total_llm_token_count,
    "\n",
)

Embedding Tokens:  0 
 LLM Prompt Tokens:  17632 
 LLM Completion Tokens:  815 
 Total LLM Token Count:  18447 



# Build Tree Index with a custom Summary Prompt, directly retrieve answer from root node

In [13]:
from llama_index.prompts import PromptTemplate


In [14]:
documents = SimpleDirectoryReader("data").load_data()

query_str = "What did the author do growing up?"
SUMMARY_PROMPT_TMPL = (
    "Context information is below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given the context information and not prior knowledge, "
    f"answer the question: {query_str}\n"
)
SUMMARY_PROMPT = PromptTemplate(SUMMARY_PROMPT_TMPL)
index_with_query = TreeIndex.from_documents(documents, summary_template=SUMMARY_PROMPT)

INFO:llama_index.indices.common_tree.base:> Building index from nodes: 1 chunks
> Building index from nodes: 1 chunks


In [15]:
# directly retrieve response from root nodes instead of traversing tree
query_engine = index_with_query.as_query_engine(retriever_mode="root")
response = query_engine.query(query_str)

INFO:llama_index.indices.tree.tree_root_retriever:> Starting query: What did the author do growing up?
> Starting query: What did the author do growing up?


In [16]:
display(Markdown(f"<b>{response}</b>"))


<b>The author engaged in activities such as writing short stories and programming, including working on an IBM 1401 computer in 9th grade and teaching themselves Lisp. They also worked on reverse-engineering a program called SHRDLU for their undergraduate thesis.</b>

# Using GPT Keyword Table Index

In [17]:
from llama_index import KeywordTableIndex, SimpleDirectoryReader
from IPython.display import Markdown, display

In [18]:
# build keyword index
documents = SimpleDirectoryReader("data").load_data()
index = KeywordTableIndex.from_documents(documents)

In [19]:
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do after his time at Y Combinator?")

INFO:llama_index.indices.keyword_table.retrievers:> Starting query: What did the author do after his time at Y Combinator?
> Starting query: What did the author do after his time at Y Combinator?
INFO:llama_index.indices.keyword_table.retrievers:query keywords: ['author', 'combinator', 'time', 'y combinator']
query keywords: ['author', 'combinator', 'time', 'y combinator']
INFO:llama_index.indices.keyword_table.retrievers:> Extracted keywords: ['combinator', 'time', 'y combinator']
> Extracted keywords: ['combinator', 'time', 'y combinator']


In [20]:
display(Markdown(f"<b>{response}</b>"))


<b>After his time at Y Combinator, the author pursued a new project in Cambridge. He formed a team and began working on a web application for building web applications. However, he had a change of heart during the summer and decided not to continue running a company. Instead, he chose to transform a part of the project into an open-source initiative. He then shifted his attention to developing a new Lisp dialect called Arc. Eventually, he presented his work at a Lisp conference and shared a postscript file of the talk online, which generated considerable interest.</b>

# Using GPT List Index

In [21]:
from llama_index import SummaryIndex, SimpleDirectoryReader
from IPython.display import Markdown, display

In [22]:

# build summary index
documents = SimpleDirectoryReader("data").load_data()
index = SummaryIndex.from_documents(documents)

In [23]:
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do after his time at Y Combinator?")

In [24]:
display(Markdown(f"<b>{response}</b>"))


<b>The context information does not provide any information about the author's time at Y Combinator or what they did after it.</b>

In [25]:
print(
    "Embedding Tokens: ",
    token_counter.total_embedding_token_count,
    "\n",
    "LLM Prompt Tokens: ",
    token_counter.prompt_llm_token_count,
    "\n",
    "LLM Completion Tokens: ",
    token_counter.completion_llm_token_count,
    "\n",
    "Total LLM Token Count: ",
    token_counter.total_llm_token_count,
    "\n",
)

Embedding Tokens:  0 
 LLM Prompt Tokens:  74947 
 LLM Completion Tokens:  4225 
 Total LLM Token Count:  79172 

