# Constructing a Knowledge Graph from Text Documents

In [1]:
import os

# common data processing
import json
import textwrap

from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
# from langchain_google_genai import GoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv

### Use a HuggingFace endpoint if you don't have an OpenAI key or another paid API key
```python
from langchain_community.llms import HuggingFaceEndpoint

ENDPOINT_URL = "<YOUR_ENDPOINT_URL_HERE>"
llm = HuggingFaceEndpoint(
    endpoint_url=ENDPOINT_URL,
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 50,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)
```
Learn more in the [official LangChain documentation](https://python.langchain.com/v0.1/docs/integrations/chat/huggingface/#huggingfaceendpoint)

**OR**
Use a local LLM served by `Ollama`. We use the `ChatOpenAI` class because [Ollama is compatible with the OpenAI SDK](https://ollama.com/blog/openai-compatibility).

In [2]:
# Run python-dotenv's loader first, then read the connection settings and API
# key from the process environment. A variable that is not set yields None.
load_dotenv()

NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [3]:
# Chat model: a local Ollama server reached through its OpenAI-compatible endpoint.
# Google Generative AI alternatives that were tried before:
#   GoogleGenerativeAI(model="models/text-bison-001", temperature=0)
#   GoogleGenerativeAI(model="models/gemini-1.5-pro-latest", temperature=0.0)
llm = ChatOpenAI(
    model="llama3.1",
    temperature=0,
    base_url="http://localhost:11434/v1",
    api_key="ollama",  # required by the client, but unused
)

In [4]:
# Embedding model used to vectorise chunk text.
GOOGLE_EMBEDDING_MODEL = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Notebook-wide names for the vector index and the node/properties it covers.
VECTOR_INDEX_NAME = "form_10k_chunks"
VECTOR_NODE_LABEL = "Chunk"
VECTOR_SOURCE_PROPERTIES = ["text"]
VECTOR_EMBEDDING_PROPERTY = "textEmbedding"

### Take a look at a Form 10-K json file

- Publicly traded companies are required to file a Form 10-K each year with the Securities and Exchange Commission (SEC)
- You can search these filings using the SEC's [EDGAR database](https://www.sec.gov/edgar/search/)
- For the next few lessons, you'll work with a single 10-K form for a company called [NetApp](https://www.netapp.com/)

In [5]:
first_file_name = "./data/form10k/0000950170-23-027948.json"

# Read the filing inside a context manager so the file handle is closed
# deterministically, and decode it explicitly as UTF-8 rather than relying on
# the platform's default encoding.
with open(first_file_name, encoding="utf-8") as form10k_file:
    first_file_as_object = json.load(form10k_file)
print("Type of object: ", type(first_file_as_object))

Type of object:  <class 'dict'>


In [6]:
# Tabulate every top-level key of the filing next to the Python type of its value.
print("key\t|\tValue Type")
print("_" * 40)
for key, value in first_file_as_object.items():
    print(f"{key}\t|\t{type(value)}")

key	|	Value Type
________________________________________
item1	|	<class 'str'>
item1a	|	<class 'str'>
item7	|	<class 'str'>
item7a	|	<class 'str'>
cik	|	<class 'str'>
cusip6	|	<class 'str'>
cusip	|	<class 'list'>
names	|	<class 'list'>
source	|	<class 'str'>


In [7]:
# Pull out the "item1" section and preview its first 1000 characters.
item1_text = first_file_as_object["item1"]
item1_text[:1000]

'>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provi

### Split Form 10-K sections into chunks
- Set up text splitter using LangChain
- For now, split only the text from the "item 1" section 

In [8]:
# Splitter shared by the rest of the notebook: chunk_size 2000 and
# chunk_overlap 200, both measured with the built-in len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [9]:
# Chunk the "item1" text, then report how many chunks came out and show the
# first 68 characters of the first one.
item1_text_chunks = text_splitter.split_text(item1_text)
print("Number of chunks for item1: ", len(item1_text_chunks))
first_chunk_preview = item1_text_chunks[0][:68]
print("Sample of chunk::::::::::::: \n", first_chunk_preview)

Number of chunks for item1:  254
Sample of chunk::::::::::::: 
 >Item 1.  
Business


Overview


NetApp, Inc. (NetApp, we, us or the


### Helper function to chunk the sections
- Set up helper function to chunk all sections of the Form 10-K
- Let's limit the number of chunks in each section to 20 to speed things up for the demo

In [10]:
def split_form10k_data_from_file(file: str, max_chunks_per_section: int = 20) -> list:
    """Split the sections of one Form 10-K JSON file into chunk records.

    The sections 'item1', 'item1a', 'item7' and 'item7a' are each split with the
    module-level ``text_splitter``; every resulting chunk becomes a dict that
    carries the chunk text plus metadata copied from the file.

    Args:
        file: Path to the Form 10-K JSON file. The form id is the file's base
            name without its extension.
        max_chunks_per_section: Maximum number of chunks kept per section
            (default 20, to keep the demo fast). Pass ``None`` to keep all.

    Returns:
        A list of dicts with the keys ``text``, ``f10kItem``, ``chunkSeqId``,
        ``formId``, ``chunkId``, ``names``, ``cik``, ``cusip6`` and ``source``.
        ``chunkSeqId`` restarts at 0 for every section.

    Raises:
        KeyError: If a section or one of the metadata keys is missing from the file.
    """
    # Derive the form id once (it does not change per chunk). os.path copes with
    # a bare file name and with extension-less names, where searching for '/'
    # and '.' by hand raised ValueError or produced an empty id.
    form_id = os.path.splitext(os.path.basename(file))[0]

    # Context manager closes the file handle as soon as the JSON is parsed.
    with open(file, encoding="utf-8") as form10k_file:
        file_as_object = json.load(form10k_file)

    # accumulate chunk records across all sections
    chunks_with_metadata = []
    for item in ['item1', 'item1a', 'item7', 'item7a']:
        print(f"Processing {item} from {file}")
        item_text = file_as_object[item]
        item_text_chunks = text_splitter.split_text(item_text)

        # keep only the first `max_chunks_per_section` chunks (None keeps all)
        section_chunks = item_text_chunks[:max_chunks_per_section]
        for chunk_seq_id, chunk in enumerate(section_chunks):
            chunks_with_metadata.append({
                "text": chunk,
                # metadata from looping..
                "f10kItem": item,
                "chunkSeqId": chunk_seq_id,
                # constructed metadata....
                "formId": f"{form_id}",
                "chunkId": f"{form_id}--{item}-chunk{chunk_seq_id:04d}",
                # metadata from file...
                "names": file_as_object["names"],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
        print(f"\tSplit into {len(section_chunks)} chunks")
    return chunks_with_metadata

In [11]:
# Chunk every section of the first filing, then display the first record
# (chunk text plus its metadata).
first_file_chunks = split_form10k_data_from_file(first_file_name)
first_file_chunks[0]

Processing item1 from ./data/form10k/0000950170-23-027948.json
	Split into 20 chunks
Processing item1a from ./data/form10k/0000950170-23-027948.json
	Split into 1 chunks
Processing item7 from ./data/form10k/0000950170-23-027948.json
	Split into 1 chunks
Processing item7a from ./data/form10k/0000950170-23-027948.json
	Split into 1 chunks


{'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved clou

### Setup connection to graph instance

In [12]:
# Connect to Neo4j through LangChain using the settings read from the environment.
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE
)

#### Create graph nodes using text chunks

In [None]:
# Cypher template: MERGE a `Chunk` node keyed by `chunkId`. The remaining
# properties are written only when the node is newly created (ON CREATE SET),
# so re-running it against an existing chunk leaves that node unchanged.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.names = $chunkParam.names,
        mergedChunk.formId = $chunkParam.formId, 
        mergedChunk.cik = $chunkParam.cik, 
        mergedChunk.cusip6 = $chunkParam.cusip6, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.f10kItem = $chunkParam.f10kItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

# Try the template on a single record: the first chunk of the first file.
graph.query(merge_chunk_node_query,
            params={"chunkParam": first_file_chunks[0]})

In [14]:
# Create a uniqueness constraint on `Chunk.chunkId` to avoid duplicate chunks.
# `IF NOT EXISTS` makes the statement safe to re-run.
graph.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")

# List the indexes currently defined in the database.
graph.query("SHOW INDEXES")

[{'id': 7,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 8, 22, 14, 48, 21, 288000000, tzinfo=<UTC>),
  'readCount': 7},
 {'id': 1,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 8, 22, 14, 48, 12, 854000000, tzinfo=<UTC>),
  'readCount': 537},
 {'id': 2,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  're

#### Create node for all chunks
- Should create 23 nodes because the text-splitting helper caps each section at 20 chunks (20 from item1, plus 1 each from item1a, item7 and item7a)

In [15]:
# MERGE every chunk record of the first file into the graph.
# `node_count` counts the MERGE statements issued; because the query uses
# MERGE, a chunk that already exists is matched rather than created again.
node_count = 0
for chunk in first_file_chunks:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    graph.query(merge_chunk_node_query, params={'chunkParam': chunk})
    node_count += 1
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0000
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0001
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0002
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0003
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0004
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0005
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0006
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0007
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0008
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0009
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0010
Creating `:Chunk` node for chunk ID chunk ID 0000950170-23-027948--item1-chunk0011
Crea

In [16]:
# Count the total number of nodes in the database (all labels, not only `Chunk`).
graph.query("""
MATCH (n)
RETURN count(n) as nodeCount
""")

[{'nodeCount': 182}]

#### Create a vector index

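Use a Neo4j query to configure the vector dimensions and the similarity metric of the index. `vector.dimensions` must match the output size of the embedding model you use, so adjust the value below if your model produces vectors of a different length: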
```python
graph.query("""
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")
```
To see the indexes in the Neo4j graph database:
```python
graph.query("SHOW INDEXES")
```

In [17]:
# Build (or attach to) the vector index through LangChain's Neo4j integration.
# The connection settings and index names come from the constants defined at
# the top of the notebook, so this store targets the same database as `graph`
# and stays in sync with VECTOR_* if those are ever changed.
vector_index = Neo4jVector.from_existing_graph(
    GOOGLE_EMBEDDING_MODEL,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE or "neo4j",  # fall back to the previous hard-coded name
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=VECTOR_SOURCE_PROPERTIES,
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)

# retriever built on top of the vector index
retriever = vector_index.as_retriever()

# show indexes
graph.query("SHOW INDEXES")

[{'id': 7,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 8, 22, 14, 48, 21, 288000000, tzinfo=<UTC>),
  'readCount': 7},
 {'id': 1,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 8, 22, 14, 57, 15, 305000000, tzinfo=<UTC>),
  'readCount': 549},
 {'id': 2,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  're

In [18]:
# Refresh the cached schema so it reflects the current state of the database
graph.refresh_schema()
# Display the schema of the Neo4j database
print(graph.schema)

Node properties:
Movie {title: STRING, taglineEmbedding: LIST, tagline: STRING, released: INTEGER}
Person {born: INTEGER, name: STRING}
Chunk {text: STRING, textEmbedding: LIST, source: STRING, f10kItem: STRING, chunkSeqId: INTEGER, cik: STRING, cusip6: STRING, chunkId: STRING, names: LIST, formId: STRING}
Form {cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING}
Relationship properties:
SECTION {f10kItem: STRING}
The relationships:
(:Person)-[:WORKS_WITH]->(:Person)
(:Chunk)-[:NEXT]->(:Chunk)
(:Chunk)-[:PART_OF]->(:Form)
(:Form)-[:SECTION]->(:Chunk)


#### Perform Similarity Search

In [19]:
# Run a similarity search for the question against the vector index and
# print only the first result.
search_result = vector_index.similarity_search(
    'In a single sentence, tell me about Netapp.'
)
print(search_result[0])

page_content='
text: •
Flexibility and consistency: NetApp makes moving data and applications between environments seamless through a common storage foundation across on-premises and multicloud environments.


•
Cyber resilience: NetApp unifies monitoring, data protection, security, governance, and compliance for total cyber resilience - with consistency and automation across environments. 


•
Continuous operations: NetApp uses AI-driven automation for continuous optimization to service applications and store stateless and stateful applications at the lowest possible costs.


•
Sustainability: NetApp has industry-leading tools to audit consumption, locate waste, and set guardrails to stop overprovisioning.


Product, Solutions and Services Portfolio
 


NetApp's portfolio of cloud services and storage infrastructure is powered by intelligent data management software. Our operations are organized into two segments: Hybrid Cloud and Public Cloud.


 


Hybrid Cloud


Hybrid Cloud 
offer

#### Set up a retriever chain for question answering


In [20]:
# Question-answering chain: context comes from `retriever`, answers from `llm`.
# verbose=True produces the "Entering new ... chain" trace seen in the outputs.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [21]:
def prettychain(question: str, width: int = 60) -> None:
    """Ask the notebook-level ``chain`` a question and pretty-print its answer.

    Args:
        question: The question passed to the chain under the ``"question"`` key.
        width: Maximum line width used when wrapping the answer (default 60).

    Returns:
        None. The wrapped ``answer`` field of the chain response is printed to
        stdout; nothing is returned, so calling this as the last expression of
        a cell does not echo the answer a second time.
    """
    response = chain.invoke({"question": question}, return_only_outputs=True)
    print(textwrap.fill(response['answer'], width))

### Ask a question

In [22]:
# First question for the retrieval chain.
question = "What is Netapp's primary business?"
prettychain(question)



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
It appears that the provided text is a snippet from a
company's (NetApp) SEC filing, specifically an index of
content related to their product and solutions portfolio.
The final answer would be:  **There is no specific numerical
answer to this question**, as it seems to be a descriptive
passage about NetApp's products and services.


In [23]:
# Factual lookup: the answer appears in the Item 1 overview text shown earlier.
prettychain("Where is Netapp headquartered?")



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
It appears that the provided text is a part of NetApp's
annual report to the Securities and Exchange Commission
(SEC). The content seems to be focused on various aspects of
NetApp's business, including:  1. **Data Management**:
Describing ONTAP as a storage operating system with features
like data protection against cyber-attacks, built-in data
transport features, and storage efficiency capabilities. 2.
**Human Capital**: Highlighting NetApp's commitment to
attracting and retaining top talent through a culture-fit
approach, diversity, inclusion, and belonging initiatives,
benefits, wellbeing, and engagement programs. 3. **Product,
Solutions, and Services Portfolio**: Outlining NetApp's
portfolio of cloud services and storage infrastructure
powered by intelligent data management software, including
hybrid cloud, public cloud, and related offerings.  To
answer your question directly: The final answer

In [24]:
# Same topic, with an explicit instruction to constrain the answer length.
prettychain("""
    Tell me about Netapp. 
    Limit your answer to a single sentence.
""")



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
It appears that you've provided a detailed report from
NetApp's SEC filing, highlighting their governance,
compliance, and sustainability efforts. Here are the key
points:  **Governance and Compliance**  * NetApp has
implemented AI-driven automation for continuous optimization
to service applications and store stateless and stateful
applications at the lowest possible costs. * The company
uses industry-leading tools to audit consumption, locate
waste, and set guardrails to stop overprovisioning.
**Product and Solutions Portfolio**  * NetApp's portfolio
includes cloud services and storage infrastructure powered
by intelligent data management software. * The Hybrid Cloud
segment offers a range of storage management and
infrastructure solutions that help customers transition
their traditional data centers into modern data centers with
the power of the cloud.  **Human Capital**  * NetApp
prioritizes at

In [25]:
# Out-of-scope probe: the only filing loaded in this notebook is NetApp's,
# so this shows how the chain behaves when asked about another company.
prettychain("""
    Tell me about Apple. 
    Limit your answer to a single sentence.
""")



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
It seems like you've provided a mix of text from different
sources, including a speech by a president ( likely Joe
Biden), and a report from NetApp's SEC filing.  To answer
your original question, "A unity agenda for the nation. We
can do this." is a quote from a speech, but I couldn't
identify the specific source or context. If you'd like to
provide more information about the speech or the topic
you're interested in discussing, I'd be happy to help!  As
for the NetApp report, it appears to be discussing their
cloud storage and data services offerings, as well as their
human capital management practices.  If you have any
specific questions or topics related to these sources, feel
free to ask!


# Adding Relationships to the SEC Knowledge Graph

### Create a Form 10-K node
- Create a node to represent the entire Form 10-K
- Populate with metadata taken from a single chunk of the form

In [26]:
# Take one arbitrary `Chunk` node (LIMIT 1) and project the form-level
# metadata stored on it as a map named `formInfo`.
cypher = """
  MATCH (anyChunk:Chunk) 
  WITH anyChunk LIMIT 1
  RETURN anyChunk { .names, .source, .formId, .cik, .cusip6 } as formInfo
"""
form_info_list = graph.query(cypher)

form_info_list


[{'formInfo': {'cik': '1002047',
   'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm',
   'formId': '0000950170-23-027948',
   'names': ['Netapp Inc', 'NETAPP INC'],
   'cusip6': '64110D'}}]

In [27]:
# Unwrap the single result row into a plain metadata dict.
form_info = form_info_list[0]['formInfo']
form_info

{'cik': '1002047',
 'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm',
 'formId': '0000950170-23-027948',
 'names': ['Netapp Inc', 'NETAPP INC'],
 'cusip6': '64110D'}

In [28]:
# Create the Form node from the metadata in `form_info`.
# All properties are listed under a single `ON CREATE SET`: only the first SET
# is part of ON CREATE, so additional standalone `SET` clauses would run on
# every MERGE (match or create) instead of only when the node is created.
cypher = """
    MERGE (f:Form {formId: $formInfoParam.formId })
      ON CREATE SET
        f.names = $formInfoParam.names,
        f.source = $formInfoParam.source,
        f.cik = $formInfoParam.cik,
        f.cusip6 = $formInfoParam.cusip6
"""

graph.query(cypher, params={'formInfoParam': form_info})

[]

In [29]:
# Sanity check: count the `Form` nodes now in the graph.
graph.query("MATCH (f:Form) RETURN count(f) as formCount")

[{'formCount': 1}]

### Create a linked list of Chunk nodes for each section
- Start by identifying chunks from the same section

In [30]:
# List up to 10 chunks that belong to the given form, in no particular order,
# projecting only their identifying fields.
cypher = """
  MATCH (from_same_form:Chunk)
    WHERE from_same_form.formId = $formIdParam
  RETURN from_same_form {.formId, .f10kItem, .chunkId, .chunkSeqId } as chunkInfo
    LIMIT 10
"""

graph.query(cypher, params={'formIdParam': form_info['formId']})

[{'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0001',
   'chunkSeqId': 1}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0002',
   'chunkSeqId': 2}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0003',
   'chunkSeqId': 3}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0004',
   'chunkSeqId': 4}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0005',
   'chunkSeqId': 5}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '00009

In [31]:
# Order the chunks of the form by sequence id.
# Note: chunkSeqId restarts at 0 in every section, so ordering by it alone
# interleaves chunks from different sections (visible in the output below).
cypher = """
MATCH (from_same_form:Chunk)
    WHERE from_same_form.formId = $formIdParam
RETURN from_same_form {.formId, .f10kItem, .chunkId, .chunkSeqId } as chunkInfo
    ORDER BY from_same_form.chunkSeqId ASC
    LIMIT 10
"""

graph.query(cypher, params={"formIdParam": form_info['formId']})

[{'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item7a',
   'chunkId': '0000950170-23-027948--item7a-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item7',
   'chunkId': '0000950170-23-027948--item7-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1a',
   'chunkId': '0000950170-23-027948--item1a-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0001',
   'chunkSeqId': 1}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0002',
   'chunkSeqId': 2}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0

In [32]:
# Limit chunks to just the "Item 1" section, then order them by ascending sequence id
cypher = """
MATCH (from_same_section:Chunk)
WHERE from_same_section.formId = $formIdParam
  AND from_same_section.f10kItem = $f10kItemParam // NEW!
RETURN from_same_section {.formId, .f10kItem, .chunkId, .chunkSeqId }
ORDER BY from_same_section.chunkSeqId ASC
LIMIT 10
"""

graph.query(cypher, params={"formIdParam": form_info["formId"],
                           "f10kItemParam": "item1"})

[{'from_same_section': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0000',
   'chunkSeqId': 0}},
 {'from_same_section': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0001',
   'chunkSeqId': 1}},
 {'from_same_section': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0002',
   'chunkSeqId': 2}},
 {'from_same_section': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0003',
   'chunkSeqId': 3}},
 {'from_same_section': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0004',
   'chunkSeqId': 4}},
 {'from_same_section': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948--item1-chunk0005',
   'chunkSeqId': 5}},
 {'from_same_section': {'formId': '0000950170-

In [33]:
# Collect the ordered chunks of the section into a single list, so the query
# returns one row holding the list instead of one row per chunk.
cypher = """
MATCH (from_same_section:Chunk)
WHERE from_same_section.formId = $formIdParam
  AND from_same_section.f10kItem = $f10kItemParam
WITH from_same_section {.formId, .f10kItem, .chunkId, .chunkSeqId }
ORDER BY from_same_section.chunkSeqId ASC
LIMIT 10
RETURN collect(from_same_section) // NEW!
"""


graph.query(cypher, params={"formIdParam": form_info["formId"],
                            "f10kItemParam": "item1"})

[{'collect(from_same_section)': [{'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948--item1-chunk0000',
    'chunkSeqId': 0},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948--item1-chunk0001',
    'chunkSeqId': 1},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948--item1-chunk0002',
    'chunkSeqId': 2},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948--item1-chunk0003',
    'chunkSeqId': 3},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948--item1-chunk0004',
    'chunkSeqId': 4},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948--item1-chunk0005',
    'chunkSeqId': 5},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948--item1-chunk0006',
    '

### Add a NEXT relationship between subsequent chunks
- Use the `apoc.nodes.link` function from Neo4j to link ordered list of `Chunk` nodes with a `NEXT` relationship
- Do this for just the "Item 1" section to start

In [34]:
# Collect the section's chunks in sequence order (no LIMIT this time) and hand
# the list to `apoc.nodes.link` with the relationship type "NEXT".
# `avoidDuplicates: true` is passed so that re-running the cell should not add
# a second NEXT relationship between the same pair of chunks.
cypher = """
MATCH (from_same_section:Chunk)
WHERE from_same_section.formId = $formIdParam
  AND from_same_section.f10kItem = $f10kItemParam
WITH from_same_section 
ORDER BY from_same_section.chunkSeqId ASC
WITH collect(from_same_section) as section_chunk_list
CALL apoc.nodes.link(
    section_chunk_list,
    "NEXT",
    {avoidDuplicates: true}
) // NEW !
RETURN size(section_chunk_list)
"""

# Only the "item1" section is linked here.
graph.query(cypher, params={"formIdParam": form_info["formId"],
                            "f10kItemParam": "item1"})

[{'size(section_chunk_list)': 20}]

In [35]:
# Refresh the cached graph schema and print it to inspect the relationship types.
graph.refresh_schema()
print(graph.schema)

Node properties:
Movie {title: STRING, taglineEmbedding: LIST, tagline: STRING, released: INTEGER}
Person {born: INTEGER, name: STRING}
Chunk {text: STRING, textEmbedding: LIST, source: STRING, f10kItem: STRING, chunkSeqId: INTEGER, cik: STRING, cusip6: STRING, chunkId: STRING, names: LIST, formId: STRING}
Form {cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING}
Relationship properties:
SECTION {f10kItem: STRING}
The relationships:
(:Person)-[:WORKS_WITH]->(:Person)
(:Chunk)-[:NEXT]->(:Chunk)
(:Chunk)-[:PART_OF]->(:Form)
(:Form)-[:SECTION]->(:Chunk)


### Connect chunks to their parent form with a `PART_OF` relationship

In [36]:
# Pair every chunk with the form that has the same `formId` and MERGE a
# PART_OF relationship from the chunk to the form (MERGE avoids duplicates).
cypher = """
MATCH (c:Chunk), (f:Form)
    WHERE c.formId = f.formId
MERGE (c)-[newRelationship:PART_OF]->(f)
    RETURN count(newRelationship)
"""

graph.query(cypher)

[{'count(newRelationship)': 23}]

### Create a SECTION relationship on first chunk of each section

In [37]:
# For each chunk with chunkSeqId 0 (the first chunk of a section), MERGE a
# SECTION relationship from its form; the relationship records which section
# it opens in its `f10kItem` property.
cypher = """
MATCH (first:Chunk), (f:Form)
WHERE first.formId = f.formId
    AND first.chunkSeqId = 0
WITH first, f
    MERGE (f)-[r:SECTION {f10kItem: first.f10kItem}]->(first)
RETURN count(r)
"""

graph.query(cypher)

[{'count(r)': 4}]

### Example cypher queries
- Return the first chunk of the Item 1 section

In [38]:
# Follow the SECTION relationship of the given form and section to reach the
# section's first chunk; return its id and text.
cypher = """
MATCH (f:Form)-[r:SECTION]->(first:Chunk)
    WHERE f.formId = $formIdParam
        AND r.f10kItem = $f10kItemParam
RETURN first.chunkId as chunkId, first.text as text
"""

# [0] takes the first result row; it raises IndexError if nothing matched.
first_chunk_info = graph.query(cypher, params={
    "formIdParam": form_info["formId"],
    "f10kItemParam": "item1"
})[0]

first_chunk_info

{'chunkId': '0000950170-23-027948--item1-chunk0000',
 'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastruc

#### Get second Chunk of `Item 1` section

In [39]:
# Return the second chunk of the Item 1 section by following one NEXT
# relationship from the first chunk.
cypher = """
  MATCH (first:Chunk)-[:NEXT]->(nextChunk:Chunk)
    WHERE first.chunkId = $chunkIdParam
  RETURN nextChunk.chunkId as chunkId, nextChunk.text as text
"""

# [0] takes the first result row; it raises IndexError if nothing matched.
next_chunk_info = graph.query(cypher, params={
    'chunkIdParam': first_chunk_info['chunkId']
})[0]

next_chunk_info

{'chunkId': '0000950170-23-027948--item1-chunk0001',
 'text': "•\nFlexibility and consistency: NetApp makes moving data and applications between environments seamless through a common storage foundation across on-premises and multicloud environments.\n\n\n•\nCyber resilience: NetApp unifies monitoring, data protection, security, governance, and compliance for total cyber resilience - with consistency and automation across environments. \n\n\n•\nContinuous operations: NetApp uses AI-driven automation for continuous optimization to service applications and store stateless and stateful applications at the lowest possible costs.\n\n\n•\nSustainability: NetApp has industry-leading tools to audit consumption, locate waste, and set guardrails to stop overprovisioning.\n\n\nProduct, Solutions and Services Portfolio\n \n\n\nNetApp's portfolio of cloud services and storage infrastructure is powered by intelligent data management software. Our operations are organized into two segments: Hybrid Cl

#### Return a window of three chunks

In [40]:
# Window of three consecutive chunks, anchored on the MIDDLE chunk (c2):
# c1 is its predecessor and c3 its successor along NEXT.
cypher = """
    MATCH (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk) 
        WHERE c2.chunkId = $chunkIdParam
    RETURN c1.chunkId, c2.chunkId, c3.chunkId
    """

graph.query(cypher, params={'chunkIdParam': next_chunk_info['chunkId']}
)

[{'c1.chunkId': '0000950170-23-027948--item1-chunk0000',
  'c2.chunkId': '0000950170-23-027948--item1-chunk0001',
  'c3.chunkId': '0000950170-23-027948--item1-chunk0002'}]

### Information is stored in the structure of a graph
- Matched patterns of nodes and relationships in a graph are called **paths**
- The length of a path is equal to the number of relationships in the path
- Paths can be captured as variables and used elsewhere in queries

In [41]:
# Capture the whole matched pattern as a path variable (`window`) and report
# how many relationships it contains. The anchor here is c1, the start of the path.
cypher = """
    MATCH window = (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk) 
        WHERE c1.chunkId = $chunkIdParam
    RETURN length(window) as windowPathLength
    """

graph.query(
    cypher,
    params=dict(chunkIdParam=next_chunk_info['chunkId']),
)

[{'windowPathLength': 2}]

### Finding variable length windows
- A pattern match will fail if the relationship doesn't exist in the graph
- For example, the first chunk in a section has no preceding chunk, so the next query won't return anything

In [42]:
# Fixed-length pattern: c2 must have both a preceding and a following chunk.
cypher = """
    MATCH window=(c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk) 
        WHERE c2.chunkId = $chunkIdParam
    RETURN nodes(window) as chunkList
    """
# Anchor on the chunk ID of the first chunk this time.
graph.query(
    cypher,
    params=dict(chunkIdParam=first_chunk_info['chunkId']),
)


[]

In [43]:
# Relax the pattern: `*0..1` lets each NEXT hop match zero or one relationship,
# so a chunk without a neighbour on one side can still be matched.
cypher = """
  MATCH window=
      (:Chunk)-[:NEXT*0..1]->(c:Chunk)-[:NEXT*0..1]->(:Chunk) 
    WHERE c.chunkId = $chunkIdParam
  RETURN length(window)
  """
# Anchor on the chunk ID of the first chunk again.
graph.query(
    cypher,
    params=dict(chunkIdParam=first_chunk_info['chunkId']),
)


[{'length(window)': 0}, {'length(window)': 1}]

#### Retrieve only the longest path

In [44]:
# Several windows can match the same anchor chunk; sort them by path length
# and keep just the longest one.
cypher = """
MATCH window=(c1:Chunk)-[:NEXT*0..1]->(c2:Chunk)-[:NEXT*0..1]->(c3:Chunk)
    WHERE c2.chunkId = $chunkIdParam
with window as longestChunkWindow
    ORDER BY length(window) DESC LIMIT 1
RETURN length(longestChunkWindow)
"""

graph.query(
    cypher,
    params=dict(chunkIdParam=first_chunk_info["chunkId"]),
)

[{'length(longestChunkWindow)': 1}]

### Customize the results of the similarity search using Cypher
- Extend the vector store definition to accept a Cypher query
- The Cypher query takes the results of the vector similarity search and then modifies them in some way
- Start with a simple query that just returns some extra text along with the search results

In [45]:
# Custom retrieval query that post-processes the vector search results.
# It reads the `node` and `score` variables, prefixes each node's text with a
# fixed sentence, and returns three columns: text, score and metadata
# (a map projection of the node's source, chunkSeqId and chunkId properties).
retrieval_query_extra_text = """
WITH node, score, "Andreas knows Cypher. " as extraText
RETURN extraText + "\n" + node.text as text,
    score,
    node {.source, .chunkSeqId, .chunkId} AS metadata
"""

#### Set up the vector store to use the query, then instantiate a retriever and Question-Answer chain in LangChain


In [46]:
# Vector store over the existing index whose hits are post-processed by the
# custom retrieval query. No connection arguments are passed here, so they are
# presumably picked up from the NEO4J_* environment variables (TODO confirm).
vector_store_extra_text = Neo4jVector.from_existing_index(
    index_name=VECTOR_INDEX_NAME,
    embedding=GOOGLE_EMBEDDING_MODEL,
    retrieval_query=retrieval_query_extra_text,  # the custom Cypher post-processing step
)

# Retriever wrapping the vector store
retriever_extra_text = vector_store_extra_text.as_retriever()

# Question & Answer chain that answers from whatever the retriever returns
chain_extra_text = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=retriever_extra_text,
    chain_type="stuff",
    verbose=True,
)

In [47]:
# Optional debugging aid (left disabled): uncomment to inspect the raw documents,
# scores and metadata that the extra-text vector store returns for a query,
# without going through the Question & Answer chain.
# retrieved_list = vector_store_extra_text.similarity_search_with_score("What is NetApp?")
# for doc, score in retrieved_list:
#     print(doc.page_content)
#     print("Metadata: ", doc.metadata)
#     print("_"*12)

### Ask a question

In [48]:
# Probe whether the sentence injected by the retrieval query reaches the LLM.
chain_extra_text.invoke(
    dict(question="What topics does Andreas know about?"),
    return_only_outputs=True,
)



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m


{'answer': "It seems like you're looking for a specific answer, but the provided text doesn't contain a question that leads to a single final answer. The content appears to be an excerpt from a company's annual report or SEC filing, discussing various aspects of their business, including environmental policies, intellectual property, manufacturing and supply chain, and research and development.\n\nIf you could provide more context or specify what you're looking for (e.g., a specific piece of information, a summary, etc.), I'd be happy to help!",
 'sources': ''}

- Note, the LLM hallucinates here, using the information in the retrieved text as well as the extra text.
- Modify the prompt to try and get a more accurate answer

In [49]:
# Same probe with a narrower wording of the question.
chain_extra_text.invoke(
    dict(question="What single topic does Andreas know about?"),
    return_only_outputs=True,
)



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m


{'answer': "I don't know.\n\n", 'sources': ''}

In [50]:
# Ask about the filing itself rather than about the injected sentence.
chain_extra_text.invoke(
    dict(question="In a single sentence, tell me about Netapp's business."),
    return_only_outputs=True,
)



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m


{'answer': "It seems like you're looking for a final answer, but there isn't a specific question provided. However, I can summarize the key points from the text:\n\n**Company Overview**\n\n* NetApp is a company that provides cloud services and storage infrastructure.\n* Their operations are organized into two segments: Hybrid Cloud and Public Cloud.\n\n**Product and Services Portfolio**\n\n* NetApp's portfolio includes cloud services and storage infrastructure powered by intelligent data management software.\n* The Hybrid Cloud segment offers storage management and infrastructure solutions to help customers transition from traditional data centers to modern, cloud-enabled data centers.\n\n**Partnerships and Customer Base**\n\n* NetApp has a diversified customer base across various industries, including energy, finance, government, technology, and more.\n* They partner with leading cloud providers and other industry partners to support their customers' success.\n\n**Competitive Differen

#### Change the retrieval query
**Note**: We need to recreate the vector store, the retriever, and the chain every time we change the Cypher query.

In [51]:
# Edit the extra text in the query below, then re-run this whole cell so the
# vector store, retriever and chain are all rebuilt around the new query.
retrieval_query_extra_text = """
WITH node, score, "Andreas knows Cypher. " as extraText
RETURN extraText + "\n" + node.text as text,
    score,
    node {.source} AS metadata
"""

vector_store_extra_text = Neo4jVector.from_existing_index(
    index_name=VECTOR_INDEX_NAME,
    embedding=GOOGLE_EMBEDDING_MODEL,
    retrieval_query=retrieval_query_extra_text,  # the modified Cypher defined just above
    text_node_property=VECTOR_SOURCE_PROPERTIES,
)

# Retriever wrapping the rebuilt vector store
retriever_extra_text = vector_store_extra_text.as_retriever()

# Question & Answer chain on top of the new retriever
chain_extra_text = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=retriever_extra_text,
    chain_type="stuff",
)

#### Windowless retriever
- Create a regular vector store that retrieves a single node per match

In [52]:
# Baseline ("windowless") setup: a plain vector store with no custom retrieval
# query, so every search hit is a single Chunk node.
neo4j_vector_store = Neo4jVector.from_existing_index(
    GOOGLE_EMBEDDING_MODEL,
    # Use the database configured in the environment (see the config cell)
    # instead of a hard-coded name, like the other vector stores in this notebook.
    database=NEO4J_DATABASE,
    index_name=VECTOR_INDEX_NAME,
    node_label="Chunk",
    embedding_node_property="textEmbedding"
)

# Create a retriever from the vector store
windowless_retriever = neo4j_vector_store.as_retriever()

# Create a chatbot Question & Answer chain from the retriever
windowless_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm, 
    chain_type="stuff", 
    retriever=windowless_retriever, 
    verbose=True
)

# Smoke-test the chain with an explicit question. The question is spelled out
# here so the cell does not depend on a `question` variable left over from
# another cell (the comparison section assigns its own `question` later on).
windowless_chain.invoke(
    {"question": "What is Netapp's primary business?"},
)



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m


{'question': "What is Netapp's primary business?",
 'answer': "It appears that the provided text is a disclosure document from the Securities and Exchange Commission (SEC) related to NetApp, Inc. The content seems to be an overview of NetApp's portfolio, products, and services, particularly in the areas of cloud storage, data management, and cybersecurity.\n\nBased on this information, I will attempt to provide a concise answer to your question:\n\n**What are the key benefits and features of NetApp's offerings?**\n\nThe key benefits and features of NetApp's offerings include:\n\n1. **Flexibility and consistency**: Seamless movement of data and applications between environments.\n2. **Cyber resilience**: Unified monitoring, data protection, security, governance, and compliance for total cyber resilience.\n3. **Continuous operations**: AI-driven automation for continuous optimization to service applications at the lowest possible costs.\n4. **Sustainability**: Industry-leading tools to a

#### Define window retrieval to get consecutive chunks

In [53]:
# Window retrieval: for every node returned by the vector search, expand to
# the longest available window of neighbouring chunks (previous, hit, next)
# and return the window's texts joined into one document.
#
# The longest window is selected *per hit* by sorting the windows and taking
# head(collect(...)) grouped by node/score. A plain `ORDER BY ... LIMIT 1`
# at this point would truncate the whole result to a single row, i.e. only
# one arbitrary hit would survive and the remaining search results would be
# silently dropped. The final ORDER BY restores the similarity ranking.
retrieval_query_window = """
MATCH window=
    (:Chunk)-[:NEXT*0..1]->(node)-[:NEXT*0..1]->(:Chunk)
WITH node, score, window
  ORDER BY length(window) DESC
WITH node, score, head(collect(window)) AS longestWindow
WITH node, score, [chunk IN nodes(longestWindow) | chunk.text] AS textList
RETURN apoc.text.join(textList, " \n ") AS text,
    score,
    node {.source} AS metadata
  ORDER BY score DESC
"""

vector_store_window = Neo4jVector.from_existing_index(
    embedding=GOOGLE_EMBEDDING_MODEL,
    index_name=VECTOR_INDEX_NAME,
    text_node_property=VECTOR_SOURCE_PROPERTIES,
    retrieval_query=retrieval_query_window,
)

# Create a retriever from the vector store
retriever_window = vector_store_window.as_retriever()

# Create a chatbot Question & Answer chain from the retriever
chain_window = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever_window,
    verbose=True,
)

### Compare the Chains

In [54]:
# Prompt shared by the chain comparison.
question = (
    "In a single sentence, "
    "tell me about Netapp's business."
)

In [55]:
# Run the windowless chain and print its answer wrapped for readability.
answer = windowless_chain.invoke(dict(question=question), return_only_outputs=True)
print(textwrap.fill(answer["answer"]))



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
It appears that you've provided a large block of text from NetApp's
SEC filing. I'll do my best to summarize the key points and answer
your question.  **Summary:**  NetApp is a company that provides data
management software, storage infrastructure, and cloud services to
businesses and service providers. Their products and services are
designed to help customers manage their data across on-premises and
multicloud environments. NetApp has a strong focus on innovation,
partnerships, and customer experience.  **Key Points:**  1. **Product
Portfolio:** NetApp offers a range of storage management and
infrastructure solutions, including cloud-connected all-flash, hybrid-
flash, and object storage systems. 2. **Hybrid Cloud:** NetApp's
Hybrid Cloud segment provides a portfolio of storage management and
infrastructure solutions that help customers transition from
traditional data centers to modern data cent

In [56]:
# Run the window-based chain on the same question and print its wrapped answer.
answer = chain_window.invoke(dict(question=question))
print(textwrap.fill(answer["answer"]))



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
It appears that you have provided a passage from a speech by an
American president, likely George W. Bush, and then a block of text
about NetApp's business.  To answer your question, the final answer is
not related to Michael Jackson, as mentioned in the first part of the
passage.  However, if I had to extract a key takeaway from the second
part of the passage, it would be that NetApp provides a range of
services and solutions for cloud operations, data management, and
storage, including strategic consulting, professional services,
managed services, and support.


In [57]:
# Invoke the window-based chain again, leaving the raw result dict as the cell output.
chain_window.invoke(dict(question=question))



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m


{'question': "In a single sentence, tell me about Netapp's business.",
 'answer': 'It appears that you\'ve provided a passage from a company\'s annual report (SEC filing) for NetApp, and not a response to the initial question about Michael Jackson.\n\nTo answer your original question:\n\nThe text you provided is an excerpt from a speech or a message, likely given by a president of the United States. The tone and language suggest that it might be a State of the Union address or a similar occasion where the president addresses the nation on important issues.\n\nIf I had to identify the president based on this passage, I would take a guess:\n\nThe text mentions "our moment of responsibility," "our test of resolve and conscience," and "the hour" - phrases that evoke a sense of national importance and urgency. The tone is also somewhat inspirational and patriotic.\n\nBased on these characteristics, my educated guess would be that the president who delivered this speech might be George W. Bu