In [2]:
from llama_index.core import ( 
    VectorStoreIndex,
    SimpleDirectoryReader
)
from llama_index.core.settings import Settings
from llama_index.llms.bedrock import Bedrock
from llama_index.embeddings.bedrock import BedrockEmbedding

# ------------------------------------------------------------------------
# LlamaIndex - Amazon Bedrock

llm = Bedrock(model = "amazon.titan-text-express-v1")
embed_model = BedrockEmbedding(model = "amazon.titan-embed-g1-text-02")

Settings.llm = llm
Settings.embed_model = embed_model

# documents = SimpleDirectoryReader('files').load_data()
# index = VectorStoreIndex.from_documents(documents)
# query_engine = index.as_query_engine()


# # Perform a query on the documents
# response = query_engine.query("summarize each document in a few sentences")
# print(response)

## Documents

In [11]:
from llama_index.core import Document
text = "The quick brown fox jumps over the lazy dog."
doc = Document(
    text=text,
    metadata={'author': 'John Doe','category': 'others'},
    id_='1'
)
print(doc)

Doc ID: 1
Text: The quick brown fox jumps over the lazy dog.


In [12]:
from llama_index.core import Document
text = "The quick brown fox jumps over the lazy dog."
doc = Document(
    text=text,
    metadata={'author': 'John Doe','category': 'others'},
    # id_='1'
)
print(doc)

Doc ID: a38b1de9-b0aa-46f4-a13e-97bfe039f774
Text: The quick brown fox jumps over the lazy dog.


In [13]:
doc.as_related_node_info

<bound method BaseNode.as_related_node_info of Document(id_='a38b1de9-b0aa-46f4-a13e-97bfe039f774', embedding=None, metadata={'author': 'John Doe', 'category': 'others'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='The quick brown fox jumps over the lazy dog.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')>

In [14]:
from llama_index.readers.wikipedia import WikipediaReader

loader = WikipediaReader()
documents = loader.load_data(
    pages=['Pythagorean theorem','General relativity']
)
print(f"loaded {len(documents)} documents")

loaded 2 documents


In [10]:
len(documents)

2

## Manually creating the Node objects

In [15]:
from llama_index.core import Document
from llama_index.core.schema import TextNode

doc = Document(text="This is a sample document text")
n1 = TextNode(text=doc.text[0:16], doc_id=doc.id_)
n2 = TextNode(text=doc.text[17:30], doc_id=doc.id_)
print(n1)
print(n2)

Node ID: b3124368-a303-4cb6-aea5-21975741ad8b
Text: This is a sample
Node ID: cad3d291-f310-45c8-93ec-6898bdadc0c0
Text: document text


In [17]:
print(doc)

Doc ID: 5840b6db-8b13-438a-b0b0-c40a17c7ff4f
Text: This is a sample document text


In [18]:
n1

TextNode(id_='b3124368-a303-4cb6-aea5-21975741ad8b', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='This is a sample', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [20]:
n1.id_

'b3124368-a303-4cb6-aea5-21975741ad8b'

In [21]:
n1.dict()

{'id_': 'b3124368-a303-4cb6-aea5-21975741ad8b',
 'embedding': None,
 'metadata': {},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'text': 'This is a sample',
 'mimetype': 'text/plain',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n',
 'class_name': 'TextNode'}

In [19]:
doc

Document(id_='5840b6db-8b13-438a-b0b0-c40a17c7ff4f', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='This is a sample document text', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

## Automatically extracting Nodes from Documents using splitters


In [24]:
from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter

doc = Document(
    text=(
    "This is sentence 1. This is sentence 2. "
    "Sentence 3 here."
    ),
    metadata={"author": "John Smith"}
)

splitter = TokenTextSplitter(
    chunk_size=12,
    chunk_overlap=0,
    separator=" "
)

nodes = splitter.get_nodes_from_documents([doc])

for node in nodes:
    print(node.text)
    print(node.metadata)

Metadata length (6) is close to chunk size (12). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
This is sentence 1.
{'author': 'John Smith'}
This is sentence 2.
{'author': 'John Smith'}
Sentence 3 here.
{'author': 'John Smith'}


## Nodes don’t like to be alone – they crave relationships


In [31]:
from llama_index.core.schema import TextNode
from llama_index.core.schema import NodeRelationship
from llama_index.core.schema import RelatedNodeInfo


doc = Document(text="First sentence. Second Sentence")
n1 = TextNode(text="First sentence", node_id=doc.doc_id)
n2 = TextNode(text="Second sentence", node_id=doc.doc_id)

n1.relationships[NodeRelationship.NEXT] = n2.node_id
n2.relationships[NodeRelationship.PREVIOUS] = n1.node_id

print(n1.relationships)
print(n2.relationships)


{<NodeRelationship.NEXT: '3'>: '825a6a65-b5d7-45ab-ad94-7d6854b0e6f4'}
{<NodeRelationship.PREVIOUS: '2'>: 'c3783017-89bf-4210-ad97-cdca85fc68ef'}


In [33]:
n1.node_id

'c3783017-89bf-4210-ad97-cdca85fc68ef'

In [34]:
n2.node_id

'825a6a65-b5d7-45ab-ad94-7d6854b0e6f4'

In [37]:
n1.dict()

{'id_': 'c3783017-89bf-4210-ad97-cdca85fc68ef',
 'embedding': None,
 'metadata': {},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {<NodeRelationship.NEXT: '3'>: '825a6a65-b5d7-45ab-ad94-7d6854b0e6f4'},
 'text': 'First sentence',
 'mimetype': 'text/plain',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n',
 'class_name': 'TextNode'}

## Indexes