In [1]:
!pip install -q llama-index llama-index-vector-stores-neo4jvector llama-index-llms-gemini llama-index-embeddings-gemini

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/150.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.7/150.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.1/679.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.0/189.0 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.vector_stores.neo4jvector import Neo4jVectorStore
from llama_index.llms.gemini import Gemini
from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.prompts.prompt_type import PromptType
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.response.pprint_utils import pprint_response
from google.colab import userdata

In [3]:
def get_page_numbers(nodes):
    """Collect the distinct page labels of the given nodes, in page order.

    Args:
        nodes: Iterable of objects exposing a ``metadata`` mapping, e.g. the
            ``source_nodes`` of a response.

    Returns:
        A list of the unique ``page_label`` values found. Purely numeric
        labels come first in ascending numeric order (so "2" precedes "10");
        any other labels follow, sorted as text. Nodes whose metadata has no
        ``page_label`` entry are skipped instead of raising ``KeyError``.
    """
    page_numbers = set()
    for node in nodes:
        label = (node.metadata or {}).get("page_label")
        if label is not None:
            page_numbers.add(label)

    def _sort_key(label):
        # Labels are typically strings; compare numeric ones by value so that
        # "10" does not sort before "2".
        text = str(label).strip()
        if text.isdecimal():
            return (0, int(text), text)
        return (1, 0, text)

    return sorted(page_numbers, key=_sort_key)

In [50]:
# The Google API key is read from Colab user data rather than hardcoded here.
api_key = userdata.get('GOOGLE_API_KEY')

# Register the Gemini embedding model and LLM on the shared LlamaIndex Settings
# object so later cells pick them up without passing them explicitly.
Settings.embed_model = GeminiEmbedding(model_name="models/text-embedding-004", api_key=api_key)
Settings.llm = Gemini(api_key=api_key, model="models/gemini-1.5-flash")

In [5]:
# Connection details for the Neo4j instance; secrets are read from Colab user data.
username = "neo4j"
password = userdata.get('NEO4J_PASSWORD')
url = userdata.get('NEO4J_URL')
# Must equal the output size of the embedding model configured on Settings
# (TODO confirm: 768 for models/text-embedding-004).
embed_dim = 768

# hybrid_search=True asks the store to combine vector and keyword search.
neo4j_vector = Neo4jVectorStore(
    username=username,
    password=password,
    url=url,
    embedding_dimension=embed_dim,
    hybrid_search=True,
)

In [6]:
!mkdir -p 'data/'
!wget 'https://content.dgft.gov.in/Website/NCSTC Booklet.pdf' -O 'data/NCSTC Booklet.pdf'

--2024-11-30 19:15:06--  https://content.dgft.gov.in/Website/NCSTC%20Booklet.pdf
Resolving content.dgft.gov.in (content.dgft.gov.in)... 3.169.137.12, 3.169.137.41, 3.169.137.119, ...
Connecting to content.dgft.gov.in (content.dgft.gov.in)|3.169.137.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17231175 (16M) [application/pdf]
Saving to: ‘data/NCSTC Booklet.pdf’


2024-11-30 19:15:09 (11.6 MB/s) - ‘data/NCSTC Booklet.pdf’ saved [17231175/17231175]



In [7]:
# Read everything under ./data into LlamaIndex documents.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

In [8]:
# Back the index with the Neo4j vector store, then parse and embed the documents.
# NOTE(review): re-running this cell presumably writes the same chunks to Neo4j
# again — verify before executing it more than once against the same database.
storage_context = StorageContext.from_defaults(vector_store=neo4j_vector)
index = VectorStoreIndex.from_documents(
    documents=documents,
    show_progress=True,
    storage_context=storage_context,
)

Parsing nodes:   0%|          | 0/42 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/42 [00:00<?, ?it/s]



# Retriever Query Engine

In [None]:
# Retriever that pulls the top matching nodes from the vector index per query.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=5,  # Adjust based on how many nodes you want to retrieve
)

# Custom QA prompt: answer only from the retrieved context and spell out any
# dependency found there instead of merely pointing at it.
DEFAULT_TEXT_QA_PROMPT_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and no prior knowledge, "
    "answer the query.\n"
    "Please include all dependencies and details in the answer.\n"
    # The trailing space keeps this sentence from being glued to the next
    # literal ("completely.Do not ...") by implicit string concatenation.
    "If there are dependencies that exist in the provided context, include that in detail, completely. "
    "Do not just reference it.\n"
    "Query: {query_str}\n"
    "Answer: "
)
DEFAULT_TEXT_QA_PROMPT = PromptTemplate(
    DEFAULT_TEXT_QA_PROMPT_TMPL, prompt_type=PromptType.QUESTION_ANSWER
)

# Configure response synthesizer
response_synthesizer = get_response_synthesizer(
    response_mode="compact",
    text_qa_template=DEFAULT_TEXT_QA_PROMPT,
)

# Initialize query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [None]:
# Run a sample question through the retriever query engine and keep the
# result in `response` for the display cell that follows.
prompt = "List the requirements for export of spare parts under SCOMET under stock and sale."
response = query_engine.query(prompt)



In [None]:
# Pretty-print the current `response`; show_source=True is passed to include its sources.
pprint_response(response, show_source=True)

Final Response: The requirements for exporting spare parts under
SCOMET under the Stock and Sale policy are the same as those for
exporting main items/equipment under Stock and Sale, plus an
additional requirement:  **Requirements from Section 5 (Export for
Stock and Sale purpose):**  *   Application in the prescribed proforma
(ANF-10B) *   Documentary proof regarding the corporate relationship
between the Indian exporter and stockist. *   End-use/End-user
Certificate from the stockist entity abroad in Appendix 10J(iii). *
List of countries (in the EUC) to which the items imported from India
would be exported by the stockist.   **Additional Requirement from
Section 6 (Export of spare parts under SCOMET under Stock and Sale):**
*   The applicant needs to indicate the requirement of spare parts
after judicious and reasonable assessment with justification thereof.
This means a clear explanation of why the spare parts are needed,
demonstrating a reasonable and well-considered need.   In su

# Chat Engine

In [87]:
def check_for_references(text):
  """Ask the LLM which bare references the text points to without explaining.

  The configured ``Settings.llm`` is prompted to list items such as an
  Appendix, Para or Document that the text only refers the reader to.

  Args:
    text: Generated answer to inspect.

  Returns:
    A list of unique, lower-cased references in the order the LLM reported
    them, or ``None`` when the LLM reports none.
  """
  prompt = """
  Provided is a generated output for a user's query by checking the available context.
  Check the output if it contains references to a specific thing only without explaining it in detail.
  These include scenarios and cases where the output asks the user specifically to refer to another area such as Appendix, Para, Document etc to get some more info.
  If it contains any, return them one per line, else return None.
  Skip abstract references.

  Generated Output:
  {text}
  """
  raw_output = Settings.llm.complete(prompt=prompt.format(text=text)).text

  references = []
  for line in raw_output.splitlines():
    # LLMs often decorate list items; drop bullet markers and stray whitespace.
    reference = line.strip().lstrip("-*•").strip().lower()
    # Ignore blank lines and the "None" sentinel in any casing / with a period,
    # so neither ends up being treated as a reference.
    if not reference or reference.rstrip(".") == "none":
      continue
    # De-duplicate while keeping a deterministic, first-seen order.
    if reference not in references:
      references.append(reference)

  return references or None

def refine_response(text):
  """Expand bare references in a generated answer by querying the index for each.

  For every reference reported by ``check_for_references`` the LLM first writes
  a focused question; each question is then sent to a context chat engine whose
  system prompt carries the current text, and the reply becomes the new text.

  Relies on the notebook globals ``index`` and ``memory``.
  NOTE(review): ``memory`` is the same buffer object given to ``chat_engine``,
  so these follow-up exchanges share its history — confirm that is intended.

  Args:
    text: Generated answer to refine.

  Returns:
    A ``(text, source_nodes)`` tuple. ``text`` is the refined answer and
    ``source_nodes`` holds one entry per follow-up question, each entry being
    the ``source_nodes`` of that question's response. When there is nothing
    to refine, the input text and an empty list are returned.
  """
  references = check_for_references(text)
  # check_for_references returns None when no reference is found; iterating
  # over None would raise TypeError, so bail out early.
  if not references:
    return text, []

  # Loop-invariant template, defined once instead of on every iteration.
  refine_prompt = """
    Provided are a generated output for a user's query and a reference to a specific thing the output contains.
    Understand the generated output and check where the reference is mentioned.
    Based on that, create a prompt, that would be used to query for the reference.
    The reference needs to be emphasized and should be the focus point, so as to improve querying for it.
    Return the prompt in plain text question format.

    Generated Output:
    {text}

    Reference:
    {reference}

    Query Prompt:
    """
  # All questions are generated from the original text before any refinement.
  prompts = [
    Settings.llm.complete(prompt=refine_prompt.format(text=text, reference=reference)).text.strip()
    for reference in references
  ]

  source_nodes = []

  for prompt in prompts:
    # The engine is rebuilt each round because its system prompt embeds the
    # latest version of the text.
    refine_chat_engine = index.as_chat_engine(
        chat_mode="context",
        memory=memory,
        system_prompt=(
            "You are a chatbot designed to refine the following text, if possible,"
            " using the provided context. If not, append the detailed explanation to"
            # Newline added so the instruction is not glued to the "Text:" label.
            " the provided text.\n"
            f"Text:\n{text}"
        ),
    )
    response = refine_chat_engine.chat(prompt)
    source_nodes.append(response.source_nodes)
    text = response.response

  return text, source_nodes

In [10]:
# Conversation buffer handed to the chat engine below (token_limit=4096).
memory = ChatMemoryBuffer.from_defaults(token_limit=4096)

# Context-mode chat engine over the index, instructed to spell out
# dependencies in full whenever the retrieved context contains them.
chat_engine = index.as_chat_engine(
    system_prompt=(
        "You are a chatbot, able to have normal interactions, as well as use"
        " the context provided to return complete, in detail answers with dependencies completely listed"
        " and explained, rather than just being referenced. If the dependencies are not available in the context,"
        " just reference them."
    ),
    memory=memory,
    chat_mode="context",
)

In [11]:
# Ask the chat engine the same sample question; note this rebinds the
# notebook-level `prompt` and `response` names.
prompt = "List the requirements for export of spare parts under SCOMET under stock and sale."
response = chat_engine.chat(prompt)



In [68]:
# Positive example: list the references detected in the current response text.
check_for_references(response.response)

['appendix 10j(i)',
 'anf-10b',
 'appendix 10j(iii)',
 'appendix 10j(ii)',
 'para 10.10 of hbp 2023']

In [67]:
# Null example: text with nothing to look up should yield None (cell shows True).
# `is None` is the idiomatic identity check for the None singleton.
check_for_references("""
Requirements for tea:
1. cup
2. tea
""") is None

True

In [88]:
# NOTE: refine_response returns a (refined_text, source_nodes) tuple, so despite
# its name `refined_prompts` holds that tuple rather than a list of prompts.
refined_prompts = refine_response(response.response)



In [13]:
# Pretty-print `response` with show_source=True. This displays the `response`
# object itself, not the tuple returned by refine_response.
pprint_response(response, show_source=True)

Final Response: To export spare parts under SCOMET as part of a stock
and sale arrangement, the following requirements must be met:  All
requirements listed for bulk export of SCOMET items for stock and sale
(point 5)  must be fulfilled.  These include:  * **Same
Country/Entities:** The foreign buyer, consignee (and any
intermediaries), end-user, and end-use must be identical to those for
which export authorization was previously issued to the applicant
exporter after proper consultation/verification. * **Copy of Original
SCOMET Authorization:**  A copy of the original SCOMET authorization
is needed. * **Purchase Order Copy:** A copy of the purchase order is
required. * **End-user Certificate:** An end-user certificate in the
prescribed format (Appendix 10j(i), 10j(ii), 10j(iii) as applicable)
is necessary. * **Technical Specification of the Item:**  Technical
specifications of the item must be provided. * **Para 10.10 of HBP
2023:** Adherence to Para 10.10 of HBP 2023 is mandatory. *


In [68]:
# Show which pages the response's source nodes came from, as a comma-separated list.
print(
    "Relevant Page Numbers:",
    ', '.join(str(page) for page in get_page_numbers(response.source_nodes)),
)

Page Numbers: 18, 19, 25, 27, 28
