In [1]:
import os
from dotenv import load_dotenv, find_dotenv


In [2]:
load_dotenv('.env')

True

In [3]:
OPENAI_KEY = os.environ['OPENAI_API_KEY']

In [4]:
%%capture
!pip install tree-sitter==0.21.3
!pip install tree-sitter_languages

In [5]:
import warnings
warnings.filterwarnings("ignore")

### Creating Nodes Manually

In [6]:
from llama_index.core import Document
from llama_index.core.schema import TextNode, NodeRelationship

doc = Document(text="This is a simple document text. This is another document.")

In [7]:
doc.metadata

{}

In [8]:
doc.doc_id

'3b6e3acf-52f4-4c08-9bcb-2dfc557d9841'

In [9]:
doc

Document(id_='3b6e3acf-52f4-4c08-9bcb-2dfc557d9841', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='This is a simple document text', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}')

In [10]:
doc.to_dict()

{'id_': '3b6e3acf-52f4-4c08-9bcb-2dfc557d9841',
 'embedding': None,
 'metadata': {},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': {'embeddings': None,
  'text': 'This is a simple document text',
  'path': None,
  'url': None,
  'mimetype': None},
 'image_resource': None,
 'audio_resource': None,
 'video_resource': None,
 'text_template': '{metadata_str}\n\n{content}',
 'class_name': 'Document',
 'text': 'This is a simple document text'}

In [11]:
doc.metadata = {'report_name': "Competetion analysis report May 2024",'department': "Marketing",'author':'Siva'}

In [12]:
doc.to_dict()

{'id_': '3b6e3acf-52f4-4c08-9bcb-2dfc557d9841',
 'embedding': None,
 'metadata': {'report_name': 'Competetion analysis report May 2024',
  'department': 'Marketing',
  'author': 'Siva'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': {'embeddings': None,
  'text': 'This is a simple document text',
  'path': None,
  'url': None,
  'mimetype': None},
 'image_resource': None,
 'audio_resource': None,
 'video_resource': None,
 'text_template': '{metadata_str}\n\n{content}',
 'class_name': 'Document',
 'text': 'This is a simple document text'}

#### Lets createa two nodes manually from doc 

In [14]:
n1 = TextNode(text="first sentence", node_id=doc.doc_id)
n2 = TextNode(text="second sentence", node_id=doc.doc_id)

In [15]:
n1.text

'first sentence'

In [16]:
n1.dict()

{'id_': 'b23bbc32-9a86-414e-b151-199621b3c1de',
 'embedding': None,
 'metadata': {},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text': 'first sentence',
 'mimetype': 'text/plain',
 'start_char_idx': None,
 'end_char_idx': None,
 'metadata_seperator': '\n',
 'text_template': '{metadata_str}\n\n{content}',
 'class_name': 'TextNode'}

In [20]:
# specifying the rel between nodes
n1.relationships[NodeRelationship.NEXT] = n2.node_id
n2.relationships[NodeRelationship.PREVIOUS] = n1.node_id

In [21]:
print(n1.relationships)
print(n2.relationships)

{<NodeRelationship.NEXT: '3'>: '8fc339ee-9caf-4f97-94ec-a4b54e9a122d'}
{<NodeRelationship.PREVIOUS: '2'>: 'b23bbc32-9a86-414e-b151-199621b3c1de'}


### File-Based Node Parsers
#### HTML Parser

In [7]:
import requests
from llama_index.core.node_parser import HTMLNodeParser

In [8]:
# ok lets fetch an HTML form
url = "https://docs.llamaindex.ai/en/stable"

# send a get request to the url
response = requests.get(url)
print(response)

<Response [200]>


In [9]:
# check if request was successful
if response.status_code == 200:
    # extract the html content from the response
    html_doc = response.text

    #create a document object with the HTML content
    html_document = Document(id_=url, text=html_doc)

    # Initialize the HTMLNodeParser with optional list of tags
    parser = HTMLNodeParser()

    # Parse nodes from HTML Document
    nodes = parser.get_nodes_from_documents([html_document])
else:
    # Print an error message
    print("Failes to fetch the content")

In [10]:
len(nodes)

48

In [11]:
type(nodes)

list

In [14]:
nodes[1].dict()

{'id_': '05ecd76e-8d64-4dc8-941e-5df0ed320f96',
 'embedding': None,
 'metadata': {'tag': 'h1'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {'1': {'node_id': 'https://docs.llamaindex.ai/en/stable',
   'node_type': '4',
   'metadata': {},
   'hash': '822c075f98cb5c260dd7775c7884c52f37b8faba5a79d458f9f777fb63a17977',
   'class_name': 'RelatedNodeInfo'},
  '2': {'node_id': 'bb8aa1ba-de15-45e7-bbe3-d95e2f4a7764',
   'node_type': '1',
   'metadata': {'tag': 'li'},
   'hash': '8210dc70bd4f0aaf80b768534c5160b1c458969dc070e13a6f26ef6011a3bd63',
   'class_name': 'RelatedNodeInfo'},
  '3': {'node_id': '0d533cc9-2c0f-4a2f-844e-c15700425b52',
   'node_type': '1',
   'metadata': {'tag': 'p'},
   'hash': '627729ad274386073ab3208436c5cb22815cbb175995ea0e6a581d7c5b1d3cf3',
   'class_name': 'RelatedNodeInfo'}},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text': 'Welcome to LlamaIndex 🦙 !\n#',
 'mimetype': 'text/plain',
 'start_cha

In [15]:
# lets customize our tags
my_tags = ["p","span"]
html_parser = HTMLNodeParser(tags=my_tags)
nodes = html_parser.get_nodes_from_documents([html_document])

In [16]:
print('<span> elements:')
for node in nodes:
    if node.metadata['tag']=='span':
        print(node.text)

<span> elements:
LlamaIndex
LlamaIndex
Home


High-Level Concepts
Installation and Setup
How to read these docs
Starter Examples

Discover LlamaIndex Video Series
Frequently Asked Questions (FAQ)
Starter Tools

Learn

Use Cases

Examples

Component Guides

Advanced Topics

API Reference

Open-Source Community

LlamaCloud

Introduction
What are agents?
What are workflows?
What is context augmentation?
LlamaIndex is the framework for Context-Augmented LLM Applications
Use cases
👨‍👩‍👧‍👦 Who is LlamaIndex for?
Getting Started
30 second quickstart
LlamaCloud
Community
Getting the library
Contributing
LlamaIndex Ecosystem
Introduction
Use cases
Getting started
LlamaCloud
Community
Related projects

from
llama_index.core
import
VectorStoreIndex
,
SimpleDirectoryReader
documents
=
SimpleDirectoryReader
(
"data"
)
.
load_data
()
index
=
VectorStoreIndex
.
from_documents
(
documents
)
query_engine
=
index
.
as_query_engine
()
response
=
query_engine
.
query
(
"Some question about the data should

In [17]:
print('<p> elements:')
for node in nodes:
    if node.metadata['tag'] == 'p':
        print(node.text)

<p> elements:
LlamaIndex is the leading framework for building LLM-powered agents over your data with
LLMs
and
workflows
.
What is context augmentation? What are agents and workflows? How does LlamaIndex help build them?
What kind of apps can you build with LlamaIndex? Who should use it?
Get started in Python or TypeScript in just 5 lines of code!
Managed services for LlamaIndex including
LlamaParse
, the world's best document parser.
Get help and meet collaborators on Discord, Twitter, LinkedIn, and learn how to contribute to the project.
Check out our library of connectors, readers, and other integrations at
LlamaHub
as well as demos and starter apps like
create-llama
.
Agents
are LLM-powered knowledge assistants that use tools to perform tasks like research, data extraction, and more. Agents range from simple question-answering to being able to sense, decide and take actions in order to complete tasks.
LlamaIndex provides a framework for building agents including the ability to use 