# Lesson 3: Sentence Window Retrieval

In [2]:
import warnings
import utils
import os
import openai

# Suppress all warnings for the rest of the notebook session.
warnings.filterwarnings("ignore")


# Read the OpenAI API key through the local `utils` helper instead of
# hard-coding it in the notebook.
openai.api_key = utils.get_openai_api_key()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [4]:
from llama_index import SimpleDirectoryReader

# Read the e-book PDF; `load_data()` returns a list of `Document` objects
# (41 of them for this file, as the next cell prints).
documents = SimpleDirectoryReader(
    input_files=["./eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()

In [5]:
# Quick sanity check of what the reader returned: container type, number of
# documents, the element type, and the first document itself.
print(type(documents), "\n")
print(len(documents), "\n")
print(type(documents[0]))
print(documents[0])

<class 'list'> 

41 

<class 'llama_index.schema.Document'>
Doc ID: b0b79326-ab8f-43a8-9ddb-6985530d6cde
Text: PAGE 1Founder, DeepLearning.AICollected Insights from Andrew Ng
How to  Build Your Career in AIA Simple Guide


In [6]:
from llama_index import Document

# Merge the text of every loaded document into one `Document`, separating the
# individual pieces with a blank line.
document = Document(text="\n\n".join(doc.text for doc in documents))

## Sentence-window retrieval setup

In [7]:
from llama_index.node_parser import SentenceWindowNodeParser

# Create the sentence-window node parser.
# Each parsed node stores its surrounding context under the "window" metadata
# key and its own sentence under the "original_text" metadata key.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# Tiny three-sentence example to see how the parser splits text into nodes.
text = "hello. how are you? I am fine!  "

nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [12]:
# Each sentence from the text above is split into its own node.
print([x.text for x in nodes])

# Loop through each node within the nodes list and print the text of each node.
for node in nodes:
    print(node.text)

['hello. ', 'how are you? ', 'I am fine!  ']
hello. 
how are you? 
I am fine!  


In [13]:
# Show the context window stored on each of the three sentence nodes.
for window_idx in (0, 1, 2):
    print(nodes[window_idx].metadata["window"])

hello.  how are you?  I am fine!  
hello.  how are you?  I am fine!  
hello.  how are you?  I am fine!  


In [19]:
# Longer, multi-paragraph sample used to show how windows span neighbouring
# sentences (and paragraph breaks).
text = """
Exploring varied color schemes and patterns in your rooms infuses them with life and personality. Get in touch if you need expert guidance on where to begin. Your understanding of basic color theory and pattern use will help you craft visually captivating, harmonious spaces that are true to your style.

There is no definitive blueprint for design. The most enchanting spaces are the ones that bring you comfort and joy. Therefore, feel free to experiment, bend the rules, and design a home that is distinctly yours.

May these guidelines inspire you to delve into the world of color and pattern in your home. Enjoy the process of decorating! Remember, if you need further inspiration or tips, you can sign up for my newsletter. Happy designing!
"""

In [20]:
# Re-parse the longer sample text into sentence nodes.
nodes = node_parser.get_nodes_from_documents([Document(text=text)])

# First pass: the bare sentence text of every node.
for node in nodes:
    print(node.text)

# Second pass: the same text, labelled with the node's position in the list.
for i, node in enumerate(nodes):
    print(f"node {i} = {node.text}")


Exploring varied color schemes and patterns in your rooms infuses them with life and personality. 
Get in touch if you need expert guidance on where to begin. 
Your understanding of basic color theory and pattern use will help you craft visually captivating, harmonious spaces that are true to your style.


There is no definitive blueprint for design. 
The most enchanting spaces are the ones that bring you comfort and joy. 
Therefore, feel free to experiment, bend the rules, and design a home that is distinctly yours.


May these guidelines inspire you to delve into the world of color and pattern in your home. 
Enjoy the process of decorating! 
Remember, if you need further inspiration or tips, you can sign up for my newsletter. 
Happy designing!

node 0 = 
Exploring varied color schemes and patterns in your rooms infuses them with life and personality. 
node 1 = Get in touch if you need expert guidance on where to begin. 
node 2 = Your understanding of basic color theory and pattern u

In [24]:
# Print the context window stored on every node, labelled with its position.
for i, node in enumerate(nodes):
    print(f"window {i} = {node.metadata['window']}")

window 0 = 
Exploring varied color schemes and patterns in your rooms infuses them with life and personality.  Get in touch if you need expert guidance on where to begin.  Your understanding of basic color theory and pattern use will help you craft visually captivating, harmonious spaces that are true to your style.


window 1 = 
Exploring varied color schemes and patterns in your rooms infuses them with life and personality.  Get in touch if you need expert guidance on where to begin.  Your understanding of basic color theory and pattern use will help you craft visually captivating, harmonious spaces that are true to your style.

 There is no definitive blueprint for design. 
window 2 = 
Exploring varied color schemes and patterns in your rooms infuses them with life and personality.  Get in touch if you need expert guidance on where to begin.  Your understanding of basic color theory and pattern use will help you craft visually captivating, harmonious spaces that are true to your sty

### Building the index

In [25]:
# create the LLM
from llama_index.llms import OpenAI

# LLM that the service context (and therefore the query engine) will use.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [30]:
# Create the Service Context
from llama_index import ServiceContext

# Bundle the LLM, the embedding model and the sentence-window node parser.
# The "local:" embedding model is downloaded on first use (see the progress
# bars in this cell's output).
sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-large-en-v1.5",
    node_parser=node_parser,
)

config.json: 100%|██████████| 779/779 [00:00<00:00, 3.13MB/s]
model.safetensors: 100%|██████████| 1.34G/1.34G [00:27<00:00, 48.5MB/s]
tokenizer_config.json: 100%|██████████| 366/366 [00:00<00:00, 2.61MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 22.8MB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 21.1MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 973kB/s]


In [34]:
# Create the VectorStore
from llama_index import VectorStoreIndex

# Build the index from the merged document using the sentence-window service
# context. Nothing is persisted here; the index only lives in this kernel.
sentence_index = VectorStoreIndex.from_documents(
    [document], service_context=sentence_context
)

In [35]:
# Optional: reuse a persisted copy of the index between runs.
# If "./sentence_index" does not exist yet, build the index and persist it
# there; otherwise load the previously persisted index from that directory.
# NOTE: loading reuses whatever was persisted, so the directory should have
# been written with the same embedding model that `sentence_context` uses now.
# Delete "./sentence_index" after changing `embed_model` to force a rebuild.

import os
from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage


if not os.path.exists("./sentence_index"):
    # If the index doesn't exist yet, create it and save it for later runs
    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )

    sentence_index.storage_context.persist(persist_dir="./sentence_index")
else:
    # If the index already exists, load it from storage
    sentence_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./sentence_index"),
        service_context=sentence_context,
    )

### Building the postprocessor

In [36]:
# This will pull the context window out for each selected node.
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Postprocessor keyed on the "window" metadata entry: after it runs, a node's
# text is the stored window (compare the before/after prints further down).
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

In [37]:
from llama_index.schema import NodeWithScore
from copy import deepcopy

# Wrap every node with a dummy score so the postprocessor can consume them.
scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
# Keep deep copies so the original sentence text can still be inspected after
# postprocessing (presumably the postprocessor rewrites node text in place;
# TODO confirm).
nodes_old = [deepcopy(n) for n in nodes]

In [44]:
# Inspect the wrapped nodes; the repr shows the "window" and "original_text"
# metadata stored on each one.
scored_nodes

[NodeWithScore(node=TextNode(id_='18105f2a-60ae-4556-b0df-55d771051c34', embedding=None, metadata={'window': '\nExploring varied color schemes and patterns in your rooms infuses them with life and personality.  Get in touch if you need expert guidance on where to begin.  Your understanding of basic color theory and pattern use will help you craft visually captivating, harmonious spaces that are true to your style.\n\n', 'original_text': '\nExploring varied color schemes and patterns in your rooms infuses them with life and personality. '}, excluded_embed_metadata_keys=['window', 'original_text'], excluded_llm_metadata_keys=['window', 'original_text'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0e7e2058-699b-4ddf-8f3e-bb5dd2feafab', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='a72f3f78499c07bca9853273f1f4359e7c83c28614918f61c6e85288aaf9e5ea'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='4a5317fa-7de4-47d5-a89d-495b7f731c27', node_type=

In [38]:
# Text of the second node from the deep-copied list: a single sentence.
nodes_old[1].text

'Get in touch if you need expert guidance on where to begin. '

In [39]:
# Run the metadata-replacement postprocessor over the scored nodes.
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

In [41]:
# The same node after postprocessing: its text is now the surrounding window.
print(replaced_nodes[1].text)


Exploring varied color schemes and patterns in your rooms infuses them with life and personality.  Get in touch if you need expert guidance on where to begin.  Your understanding of basic color theory and pattern use will help you craft visually captivating, harmonious spaces that are true to your style.

 There is no definitive blueprint for design. 


### Adding a reranker

In [45]:
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Reranker backed by the BAAI/bge-reranker-base model; `top_n=2` is the number
# of nodes requested back from it.
# Model card: https://huggingface.co/BAAI/bge-reranker-base
rerank = SentenceTransformerRerank(top_n=2, model="BAAI/bge-reranker-base")

In [46]:
# Exploring re-ranking
# This assumes the orig query is "I want a dog"
# And the original results are "This is a cat" which is RANKED HIGHER (0.6) than "This is a dog" (0.4)
# But the re-ranked results should have "This is a dog" ranked higher

from llama_index import QueryBundle
from llama_index.schema import TextNode, NodeWithScore

query = QueryBundle("I want a dog.")

# Two candidate nodes whose initial scores deliberately favour the wrong one.
scored_nodes = [
    NodeWithScore(node=TextNode(text=candidate), score=initial_score)
    for candidate, initial_score in [("This is a cat", 0.6), ("This is a dog", 0.4)]
]

# Let the reranker re-score the candidates against the query.
reranked_nodes = rerank.postprocess_nodes(scored_nodes, query_bundle=query)

print([(hit.text, hit.score) for hit in reranked_nodes])

[('This is a dog', 0.91827404), ('This is a cat', 0.0014040867)]


### Running the query engine

In [47]:
# NOTE: similarity_top_k (6) is deliberately higher than the reranker's
# top_n (2), so the reranker has several candidates to choose from.
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[postproc, rerank]
)

In [48]:
# Ask a question against the e-book through the sentence-window engine.
window_response = sentence_window_engine.query(
    "What are the keys to building a career in AI?"
)

In [49]:
from llama_index.response.notebook_utils import display_response

# Render the response in the notebook.
display_response(window_response)

**`Final Response:`** The keys to building a career in AI are learning foundational technical skills, working on projects, and finding a job, all of which is supported by being part of a community.

## Putting it all Together

In [50]:
import os
from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage


def build_sentence_window_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="sentence_index",
):
    """Return a sentence-window vector index, building or reloading it.

    When ``save_dir`` does not exist, the index is built from ``documents``
    and persisted to ``save_dir``. When it does exist, the persisted index is
    loaded instead and ``documents`` is not indexed again.

    Args:
        documents: Documents handed to ``VectorStoreIndex.from_documents``.
        llm: LLM placed in the service context.
        embed_model: Embedding model placed in the service context.
        sentence_window_size: Forwarded as ``window_size`` to
            ``SentenceWindowNodeParser.from_defaults``.
        save_dir: Directory used to persist the index and to load it from.

    Returns:
        The freshly built index, or the one loaded from ``save_dir``.
    """
    # The parser stores the context under "window" and the sentence itself
    # under "original_text" in each node's metadata.
    window_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=window_parser,
    )

    # An existing directory wins: reuse what was persisted there.
    if os.path.exists(save_dir):
        storage = StorageContext.from_defaults(persist_dir=save_dir)
        return load_index_from_storage(storage, service_context=service_context)

    # Nothing persisted yet: build the index, then save it for the next call.
    fresh_index = VectorStoreIndex.from_documents(
        documents, service_context=service_context
    )
    fresh_index.storage_context.persist(persist_dir=save_dir)
    return fresh_index


def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
    rerank_model="BAAI/bge-reranker-base",
):
    """Create a query engine that swaps in sentence windows and then reranks.

    Args:
        sentence_index: Index whose ``as_query_engine`` method is called.
        similarity_top_k: Forwarded to ``as_query_engine`` as the number of
            nodes to retrieve.
        rerank_top_n: Forwarded to ``SentenceTransformerRerank`` as ``top_n``.
            Keep it at or below ``similarity_top_k`` so the reranker has
            candidates to choose from.
        rerank_model: Model name handed to ``SentenceTransformerRerank``.
            Defaults to the model that was previously hard-coded here.

    Returns:
        The query engine returned by ``sentence_index.as_query_engine``.
    """
    # The metadata replacement is listed first so that the reranker comes
    # after it in the postprocessor list; the key must match the
    # "window" metadata key used when the index was built.
    window_replacer = MetadataReplacementPostProcessor(target_metadata_key="window")
    reranker = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)

    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[window_replacer, reranker],
    )

In [51]:
from llama_index.llms import OpenAI

# "./sentence_index" was persisted earlier in this notebook from a service
# context that used the bge-large embedding model. Because the directory
# already exists, the helper loads that index instead of rebuilding it, so the
# same embedding model has to be passed here. The helper's default (bge-small)
# would pair the stored vectors with a different query embedding model.
index = build_sentence_window_index(
    [document],
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-large-en-v1.5",
    save_dir="./sentence_index",
)

In [52]:
# Build the query engine on top of the index from the previous cell.
query_engine = get_sentence_window_query_engine(index, similarity_top_k=6)