In [1]:
%pip install llama-index

Collecting llama-index
  Using cached llama_index-0.9.39-py3-none-any.whl.metadata (8.3 kB)
Collecting SQLAlchemy>=1.4.49 (from SQLAlchemy[asyncio]>=1.4.49->llama-index)
  Using cached SQLAlchemy-2.0.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.6 (from llama-index)
  Using cached aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting dataclasses-json (from llama-index)
  Using cached dataclasses_json-0.6.3-py3-none-any.whl.metadata (25 kB)
Collecting deprecated>=1.2.9.3 (from llama-index)
  Using cached Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting openai>=1.1.0 (from llama-index)
  Using cached openai-1.10.0-py3-none-any.whl.metadata (18 kB)
Collecting typing-inspect>=0.8.0 (from llama-index)
  Using cached typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting aiosignal>=1.1.2 (from aiohttp<4.0.0,>=3.8.6->llama-index)
  Using cached aiosignal-

In [15]:
%pip install pypdf

Collecting pypdf
  Downloading pypdf-4.0.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.0.1-py3-none-any.whl (283 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.0/284.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.0.1
Note: you may need to restart the kernel to use updated packages.


In [20]:
import openai
from llama_index.llms import OpenAI
# LLM used for answer synthesis; a low temperature keeps answers close to the
# retrieved context.
# NOTE(review): no API key is passed here — presumably it is read from the
# environment; confirm before sharing the notebook.
llm = OpenAI(
    temperature=0.1,
    model="gpt-3.5-turbo",
)

In [3]:
import llama_index
from llama_index.vector_stores import ElasticsearchStore

In [18]:
# create documents
from llama_index import VectorStoreIndex, SimpleDirectoryReader, Document
# Point the reader at ../data/ (resolved relative to the notebook's working directory).
reader = SimpleDirectoryReader(input_dir="../data/")
# load_data() returns a list of Document objects; the recorded output shows PDF
# pages carrying page_label / file_name / file_path metadata.
docs = reader.load_data()
# NOTE(review): echoing `docs` writes every Document's full text into the cell
# output — check that nothing sensitive is committed with the notebook.
docs

Overwriting cache for 0 422


[Document(id_='c4a8f634-2835-43f2-ad63-5c7c30f6f282', embedding=None, metadata={'page_label': '1', 'file_name': 'Anti-Harassment and Anti-Discrimination Policy (Global)_Rev. March 2023_English.pdf', 'file_path': '../data/Anti-Harassment and Anti-Discrimination Policy (Global)_Rev. March 2023_English.pdf', 'file_type': 'application/pdf', 'file_size': 756149, 'creation_date': '2024-01-29', 'last_modified_date': '2024-01-29', 'last_accessed_date': '2024-01-29'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text=' \n1 Rev. March  2023  \nAnti- Harassment and Anti -Discrimination Policy  (Global)  \nPurpose  \nGraphic Packaging ’s Core Values include Respec t—for the personal dignity, rights, and diversity of every \nemployee.  The Company is committed to foster

In [19]:
# Concatenate the text of every loaded Document into one big Document,
# separated by blank lines. Only `doc.text` is carried over, so the per-file
# metadata (file_name, page_label, ...) is not present on `document`.
combined_text = "\n\n".join(doc.text for doc in docs)
document = Document(text=combined_text)

## Setup Vector Store

In [10]:
# Connect to the Elasticsearch instance on localhost; vectors are stored in the
# "test-llamaidx" index.
es_store = ElasticsearchStore(
    es_url="http://localhost:9200",
    index_name="test-llamaidx",
)

## Sentence Window Retrieval

https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/MetadataReplacementDemo.html

In [11]:
from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank


In [12]:
# build index of sentence windows from document(s).
def build_sentence_window_index(
    document,
    llm,
    vector_store,
    embed_model="local:BAAI/bge-small-en-v1.5",
    window_size=3,
):
    """Build a sentence-window ``VectorStoreIndex`` backed by ``vector_store``.

    Args:
        document: A single ``Document`` or a list/tuple of ``Document`` objects.
            Passing the original per-file documents (instead of one merged
            document) keeps their metadata on the indexed nodes.
        llm: LLM handed to the ``ServiceContext``.
        vector_store: Vector store that receives the embedded nodes.
        embed_model: Embedding model spec passed to ``ServiceContext``.
        window_size: Number of neighbouring sentences captured on each side of
            a sentence and stored under the ``"window"`` metadata key.

    Returns:
        The ``VectorStoreIndex`` created by ``VectorStoreIndex.from_documents``.
    """
    # Only list/tuple are treated as collections; anything else (including a
    # single Document) is wrapped so from_documents always receives a list.
    if isinstance(document, (list, tuple)):
        documents = list(document)
    else:
        documents = [document]

    # parse documents into sentence windows; the metadata keys must match the
    # ones read back at query time ("window" / "original_text")
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )

    # manage services needed to build the index
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )
    # manage the storage of index
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # create index using service context and storage context
    sentence_index = VectorStoreIndex.from_documents(
        documents,
        service_context=sentence_context,
        storage_context=storage_context,
    )

    return sentence_index



In [13]:
# Create a query engine from given sentence window index.
def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,  # nodes fetched from the vector store per query
    rerank_top_n=2,  # nodes kept after reranking
):
    """Return a query engine that retrieves, widens and reranks sentence nodes.

    Args:
        sentence_index: Index exposing ``as_query_engine`` (built with
            sentence-window nodes whose metadata has a ``"window"`` entry).
        similarity_top_k: How many nodes the retriever returns per query.
        rerank_top_n: How many of those nodes survive reranking.

    Returns:
        The engine produced by ``sentence_index.as_query_engine``.
    """
    # Step 1: swap each retrieved sentence for its surrounding "window" text.
    window_replacer = MetadataReplacementPostProcessor(target_metadata_key="window")
    # Step 2: rescore the retrieved nodes and keep the best `rerank_top_n`.
    reranker = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )

    # Order matters: postprocessors run in list order.
    node_postprocessors = [window_replacer, reranker]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=node_postprocessors,
    )

> Reranking uses a sentence-transformers cross-encoder model (`BAAI/bge-reranker-base`) to compute a new relevance score for each of the `similarity_top_k` nodes returned by the vector search. The nodes are then sorted by this new score and only the best `rerank_top_n` are kept. This can improve the relevance of the results by taking into account more complex semantic similarities that the initial embedding-based ranking might not capture.



In [40]:
# Embed the combined document as sentence windows and store it in Elasticsearch.
# NOTE(review): this writes to the vector store on every run — presumably
# re-running the cell adds the same nodes again; confirm and clear the index if so.
sentence_index = build_sentence_window_index(
    document=document,
    llm=llm,
    vector_store=es_store,
    embed_model="local:BAAI/bge-small-en-v1.5",
)

# Keep the 5 best reranked nodes (similarity_top_k stays at its default).
query_engine = get_sentence_window_query_engine(
    sentence_index=sentence_index,
    rerank_top_n=5,
)


### Prompt Templates

In [72]:
# Show the default QA and refine prompt templates used by the response synthesizer.
print("Prompt Templates:\n")
for prompt_key in (
    "response_synthesizer:text_qa_template",
    "response_synthesizer:refine_template",
):
    print("#" * 20)
    print(query_engine.get_prompts()[prompt_key].default_template.template)

Prompt Templates:

####################
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 
####################
The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


### Response

In [75]:
# Ask a sample support question; printing the response shows the answer text.
question = "There is no manager option in gospot account"
resp = query_engine.query(question)
print(resp)


If your company has not purchased Role-based Access as an addition to your GoSpotCheck Account, you will not see the option for the Manager role in your account. To find out more about setting up roles and access levels, please contact your designated GoSpotCheck Customer Success Manager.


In [79]:
# Compare the context window sent to the LLM with the single sentence that was
# actually embedded and matched, for the top-ranked source node.
top_node_metadata = resp.source_nodes[0].node.metadata
print(top_node_metadata["window"])
print("-" * 30)
print(top_node_metadata["original_text"])


Roles
Roles  allow you to determine the level of access you want each User to have.
 USER  - Access to the GoSpotCheck Mobile Application/myGSC only (default)
COMPANY ADMIN  - Full View and Edit access to both the GoSpotCheck Dashboard
& the Mobile Application
REVIEWER - FULL access to the GoSpotCheck Mobile Application & SCOPED,
CUSTOMIZED access to view/edit within the GoSpotCheck Dashboard.  For more
information on Role Based Access, click HERE . 
 A word on the Manager Role:  You will only see the role of Manager as a potential option
if your company has purchased Role-based Access as an addition to your GoSpotCheck
Account.  To find out more about setting up Roles, please contact your designated GSC
Customer Success Manager.
 When finished inputting all necessary information, click the blue " Create User " button in
the bottom righthand corner of the page. 
 
------------------------------
A word on the Manager Role:  You will only see the role of Manager as a potential option
if 

### Source documents

In [86]:
# Print every source node behind the answer, each followed by a separator line.
# The recorded output shows that node.text here is the window text.
for source_node in resp.source_nodes:
    print(source_node.node.text, "-" * 20, sep="\n")

Roles
Roles  allow you to determine the level of access you want each User to have.
 USER  - Access to the GoSpotCheck Mobile Application/myGSC only (default)
COMPANY ADMIN  - Full View and Edit access to both the GoSpotCheck Dashboard
& the Mobile Application
REVIEWER - FULL access to the GoSpotCheck Mobile Application & SCOPED,
CUSTOMIZED access to view/edit within the GoSpotCheck Dashboard.  For more
information on Role Based Access, click HERE . 
 A word on the Manager Role:  You will only see the role of Manager as a potential option
if your company has purchased Role-based Access as an addition to your GoSpotCheck
Account.  To find out more about setting up Roles, please contact your designated GSC
Customer Success Manager.
 When finished inputting all necessary information, click the blue " Create User " button in
the bottom righthand corner of the page. 
 
--------------------
The Company  is further committed to prohibiting retaliation against qualified employees who request a