In [2]:
import os

In [3]:
from dotenv import load_dotenv

In [4]:
# Load environment variables (the OPENAI_API_KEY read below) from a .env file.
# The file location can be overridden with the DOTENV_PATH environment variable so the
# notebook is not tied to one machine; the original absolute path remains the fallback.
load_dotenv(os.environ.get('DOTENV_PATH', '/home/santhosh/Projects/courses/Pinnacle/.env'))

True

In [5]:
# Read the OpenAI key from the process environment; indexing os.environ raises KeyError if it is not set.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# File Loader

In [6]:
from llama_index.core import SimpleDirectoryReader

Source data: https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt (saved locally as `./data/paul_graham_essay.txt`)

In [7]:
# Load the single essay file. With filename_as_id=True the resulting document id is the
# file path (see the `id_` / `doc_id` values inspected in the following cells).
documents = SimpleDirectoryReader(
    input_files=["./data/paul_graham_essay.txt"],
    filename_as_id=True,
).load_data(show_progress=True)

Loading files: 100%|██████████| 1/1 [00:00<00:00, 125.68file/s]


In [8]:
type(documents)

list

In [9]:
len(documents)

1

In [10]:
documents[0].to_dict().keys()

dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator', 'class_name'])

In [11]:
documents[0].id_

'data/paul_graham_essay.txt'

In [12]:
documents[0].doc_id

'data/paul_graham_essay.txt'

In [13]:
documents[0].node_id

'data/paul_graham_essay.txt'

In [14]:
documents[0].hash

'17d7c59760a41d9abe07ffe931fd7ee08790e081aff1789642dc0351e954585f'

In [15]:
documents[0].metadata

{'file_path': 'data/paul_graham_essay.txt',
 'file_name': 'paul_graham_essay.txt',
 'file_type': 'text/plain',
 'file_size': 75042,
 'creation_date': '2024-04-16',
 'last_modified_date': '2024-04-15'}

In [16]:
# Tag the document with an extra 'author' metadata entry (mutates the metadata dict in place)
documents[0].metadata['author'] = 'paul_graham'

In [17]:
documents[0].metadata

{'file_path': 'data/paul_graham_essay.txt',
 'file_name': 'paul_graham_essay.txt',
 'file_type': 'text/plain',
 'file_size': 75042,
 'creation_date': '2024-04-16',
 'last_modified_date': '2024-04-15',
 'author': 'paul_graham'}

In [18]:
documents[0].excluded_llm_metadata_keys

['file_name',
 'file_type',
 'file_size',
 'creation_date',
 'last_modified_date',
 'last_accessed_date']

In [19]:
documents[0].text

'\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran. You had to type programs on punch cards, then s

# Ingestion Pipeline

In [21]:
import tiktoken

In [22]:
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

In [23]:
from llama_index.core import VectorStoreIndex, StorageContext, SummaryIndex

In [24]:
from llama_index.core.node_parser import SentenceSplitter, SentenceWindowNodeParser
from llama_index.core.ingestion import IngestionPipeline

In [25]:
from llama_index.embeddings.openai import OpenAIEmbedding

In [26]:
from llama_index.core.storage.docstore import SimpleDocumentStore

In [27]:
from llama_index.vector_stores.chroma import ChromaVectorStore

In [28]:
import chromadb

In [29]:
# Handler that tallies token usage; it is attached to the embedding model through a
# CallbackManager in the pipeline cells further down.
# Tokens are counted with the tiktoken encoding for "gpt-3.5-turbo".
# NOTE(review): the embeddings use 'text-embedding-3-small' — presumably the same encoding; confirm.
# NOTE(review): verbose=True is presumably what prints the "Embedding Token Usage" line seen in later output — confirm.
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode,
    verbose=True
)

## 0 1 — empty docstore, existing (persisted) vector store

In [28]:
token_counter.reset_counts()

In [29]:
token_counter.total_embedding_token_count

0

In [30]:
doc_store = SimpleDocumentStore()

In [31]:
doc_store.docs

{}

In [32]:
chroma_client = chromadb.PersistentClient(path="./vector_storage/chroma_db_0_1")

In [33]:
chroma_client.count_collections()

1

In [34]:
chroma_client.list_collections()

[Collection(name=paul_essay)]

In [35]:
chroma_collection = chroma_client.get_or_create_collection("paul_essay")

In [36]:
chroma_client.list_collections()

[Collection(name=paul_essay)]

In [37]:
chroma_collection.count()

94

In [38]:
chroma_collection.get()

{'ids': ['016edf4b-89b8-47ae-b1fd-33e730c8e238',
  '09de7288-9746-4894-9127-69cef87b6b4a',
  '1445b4ed-883b-4dcd-b47e-d1c2864a53fa',
  '151a812e-ae7b-4f7d-ba13-5fb894c38d29',
  '2076e23c-2459-40fc-b0ee-1ee3c6cdcf21',
  '290636bf-acf3-4790-950d-1f957d82fd99',
  '2eab3f8b-b17b-4ad5-8f82-df6fb3cb7833',
  '2f2e1648-706c-4313-8232-6e217c5f009b',
  '316b2fdf-a61d-422a-a243-ee2aa2fd6ea2',
  '361a9731-07c1-4e22-b949-74c93f323501',
  '37488244-b283-4b06-bc03-c5b9f20e2c88',
  '3977af3c-5b6f-45c4-8fc7-1c7205d1c056',
  '3b8e8c72-5590-4bce-abea-5fffd8b5ecd0',
  '43fccb20-9ec5-480b-8891-45575edc3c6c',
  '45fbfa16-1118-49c5-b64b-bda0ea9d752d',
  '461818fb-d4f2-45f2-b7e7-7dab7f002979',
  '46f733b1-1ddc-4f55-8342-9919ab6bf189',
  '49e50234-6004-4136-a714-cfafe82b949f',
  '4e43e615-43d9-47dd-bf79-b59aaf011000',
  '52f71dba-528b-4bfc-8944-bef4cad954d7',
  '56d8dc44-76f2-454a-a776-c038b1d75fdb',
  '5b25c8a8-6262-4fb0-b12d-bc2fed658508',
  '5b30e02a-7513-4943-b8c1-e39aa94b81cf',
  '5de68a6b-bf74-455c-a1e6-

In [39]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [40]:
vector_store.to_dict()

{'stores_text': True,
 'is_embedding_query': True,
 'flat_metadata': True,
 'collection_name': None,
 'host': None,
 'port': None,
 'ssl': False,
 'headers': None,
 'persist_dir': None,
 'collection_kwargs': {},
 'class_name': 'ChromaVectorStore'}

In [41]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [42]:
documents[0].get_doc_id()

'data/paul_graham_essay.txt'

In [43]:
# Build the ingestion pipeline: split documents into chunks, then embed each chunk.
# The embedding step reports to `token_counter` through its callback manager.
# The pipeline is given both the docstore and the Chroma-backed vector store.
pipeline = IngestionPipeline(
    transformations = [SentenceSplitter(chunk_size=512, chunk_overlap=128),
                       OpenAIEmbedding(model_name='text-embedding-3-small', callback_manager=CallbackManager([token_counter]))],
    docstore=doc_store,
    vector_store=vector_store
)

In [44]:
# List the id of every document currently held in the docstore
for doc_id in doc_store.docs:
    print(doc_id)

In [45]:
# Run the pipeline on the loaded documents and keep the resulting nodes.
# The docstore is empty in this section, and the output below shows all 47 chunks being embedded.
# NOTE(review): the collection count of 94 (= 2 x 47) shown earlier in this section suggests previous
# runs against this persisted collection already added duplicate chunks — confirm.
# NOTE(review): num_workers=-1 — confirm how IngestionPipeline.run treats non-positive worker counts.
nodes = pipeline.run(documents=documents, show_progress=True, num_workers=-1)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/47 [00:00<?, ?it/s]

Embedding Token Usage: 22449


In [46]:
token_counter.total_embedding_token_count

22449

## 1 0 — existing docstore, empty (ephemeral) vector store

In [28]:
token_counter.reset_counts()

In [29]:
token_counter.total_embedding_token_count

0

In [49]:
# Create a new in-memory docstore.
# NOTE(review): the populated `doc_store.docs` output shown in the next cell cannot come from a brand-new
# store; it presumably reflects the store loaded with from_persist_path at the end of this section.
# The execution counts are out of order here — confirm the intended cell order.
doc_store = SimpleDocumentStore()

In [31]:
doc_store.docs

{'data/paul_graham_essay.txt': Document(id_='data/paul_graham_essay.txt', embedding=None, metadata={'file_path': 'data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-04-16', 'last_modified_date': '2024-04-15', 'author': 'paul_graham'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first p

In [32]:
chroma_client = chromadb.EphemeralClient()

In [33]:
chroma_client.count_collections()

0

In [34]:
chroma_client.list_collections()

[]

In [35]:
chroma_collection = chroma_client.get_or_create_collection("paul_essay")

In [36]:
chroma_client.list_collections()

[Collection(name=paul_essay)]

In [37]:
chroma_collection.count()

0

In [38]:
chroma_collection.get()

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [39]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [40]:
vector_store.to_dict()

{'stores_text': True,
 'is_embedding_query': True,
 'flat_metadata': True,
 'collection_name': None,
 'host': None,
 'port': None,
 'ssl': False,
 'headers': None,
 'persist_dir': None,
 'collection_kwargs': {},
 'class_name': 'ChromaVectorStore'}

In [41]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [42]:
documents[0].get_doc_id()

'data/paul_graham_essay.txt'

In [43]:
# Build the ingestion pipeline: split documents into chunks, then embed each chunk.
# The embedding step reports to `token_counter` through its callback manager.
# In this section the docstore already holds the essay and the vector store is an empty ephemeral collection.
pipeline = IngestionPipeline(
    transformations = [SentenceSplitter(chunk_size=512, chunk_overlap=128),
                       OpenAIEmbedding(model_name='text-embedding-3-small', callback_manager=CallbackManager([token_counter]))],
    docstore=doc_store,
    vector_store=vector_store
)

In [44]:
# List the id of every document currently held in the docstore
for doc_id in doc_store.docs:
    print(doc_id)

data/paul_graham_essay.txt


In [45]:
# Run ingestion with a docstore that already contains this document.
# The output below shows nothing being parsed or embedded (0 items), i.e. the document was skipped.
# NOTE(review): num_workers=-1 — confirm how IngestionPipeline.run treats non-positive worker counts.
nodes = pipeline.run(documents=documents, show_progress=True, num_workers=-1)

Parsing nodes: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

In [46]:
token_counter.total_embedding_token_count

0

In [66]:
doc_store.persist('./document_storage/doc_store_1_0.json')

In [30]:
doc_store = SimpleDocumentStore.from_persist_path('./document_storage/doc_store_1_0.json')

## 1 1 — existing docstore, existing (persisted) vector store

In [28]:
token_counter.reset_counts()

In [29]:
token_counter.total_embedding_token_count

0

In [30]:
doc_store = SimpleDocumentStore()

In [32]:
# NOTE(review): this load is superseded by the next cell, which reassigns `doc_store` from
# './document_storage/doc_store_1_1.json'. The directory here ('./document_storage_1/') also differs
# from the './document_storage/' directory used by the other persist/load cells.
# Looks like a leftover — confirm before removing.
doc_store = SimpleDocumentStore.from_persist_path('./document_storage_1/doc_store.json')

In [30]:
doc_store = SimpleDocumentStore.from_persist_path('./document_storage/doc_store_1_1.json')

In [31]:
# List the id of every document currently held in the docstore
for doc_id in doc_store.docs:
    print(doc_id)

data/paul_graham_essay.txt


In [32]:
doc_store.docs

{'data/paul_graham_essay.txt': Document(id_='data/paul_graham_essay.txt', embedding=None, metadata={'file_path': 'data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-04-16', 'last_modified_date': '2024-04-15', 'author': 'paul_graham'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first p

In [33]:
chroma_client = chromadb.PersistentClient(path="./vector_storage/chroma_db_1_1")

In [34]:
chroma_client.count_collections()

1

In [35]:
chroma_client.list_collections()

[Collection(name=paul_essay)]

In [36]:
chroma_collection = chroma_client.get_or_create_collection("paul_essay")

In [37]:
chroma_client.list_collections()

[Collection(name=paul_essay)]

In [38]:
chroma_collection.count()

47

In [39]:
chroma_collection.get()

{'ids': ['0b81b28c-92ea-4172-883e-da44c865b303',
  '0bdd71dc-959a-4f59-ae6d-7cbd55c85803',
  '0d693a88-9406-4990-be69-ade984c67c71',
  '1e29886c-1ac0-41f6-acfe-8d2b256cca6c',
  '20c4c4d4-37ff-4bb7-9c5b-0e60f0364b96',
  '21e29b81-0236-4b57-b685-1fb6a1eb93a0',
  '23de20a8-0e99-4745-aceb-e7fe740f63cc',
  '28ba1494-9346-42c7-b1bc-69b91b542076',
  '2b4ea005-3fa5-4ef6-aee2-b71d898cdb9e',
  '2e00ac0a-f5e2-48c6-88c5-b288f6d223d2',
  '2fff7c06-11c1-4d37-9433-d22c1b88ff9b',
  '37668277-ce82-44cd-bf65-9a096fe59256',
  '38ff4af2-af98-44ce-a1cb-548adb20200a',
  '47b22513-3c12-4af7-aaba-5adf073e5c7e',
  '50a9e186-97c8-42ce-8feb-44b9020a235c',
  '56d3c444-8eed-467b-94b4-2d934dcd0a77',
  '5f938b10-f728-4345-98df-78986aa3c053',
  '65e8c6ae-38eb-44b2-83f2-b0bf674c6d69',
  '695d31a7-69ea-4a2e-96d7-960d093544a6',
  '6eaaf4be-3139-4330-a3ba-a223d698b2f9',
  '6f8ff6c2-00d0-4ff6-8019-5fb10dc110ac',
  '707cab42-52db-4d0a-9205-4017fc16d0bd',
  '7151dce4-3cb9-41b3-9a15-9316f97b31a7',
  '7812dad8-9d4a-43cd-ab03-

In [40]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [41]:
vector_store.to_dict()

{'stores_text': True,
 'is_embedding_query': True,
 'flat_metadata': True,
 'collection_name': None,
 'host': None,
 'port': None,
 'ssl': False,
 'headers': None,
 'persist_dir': None,
 'collection_kwargs': {},
 'class_name': 'ChromaVectorStore'}

In [42]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [43]:
documents[0].get_doc_id()

'data/paul_graham_essay.txt'

In [47]:
# Build the ingestion pipeline: split documents into chunks, then embed each chunk.
# The embedding step reports to `token_counter` through its callback manager.
# NOTE(review): chunk_overlap is 64 here, while the other pipeline cells in this notebook use 128 —
# confirm whether the difference is intentional.
pipeline = IngestionPipeline(
    transformations = [SentenceSplitter(chunk_size=512, chunk_overlap=64),
                       OpenAIEmbedding(model_name='text-embedding-3-small', callback_manager=CallbackManager([token_counter]))],
    docstore=doc_store,
    vector_store=vector_store
)

In [48]:
# List the id of every document currently held in the docstore
for doc_id in doc_store.docs:
    print(doc_id)

data/paul_graham_essay.txt


In [49]:
# Run ingestion with both the docstore and the persisted vector store already populated.
# The output below shows nothing being parsed or embedded (0 items), even though the splitter
# settings in the pipeline cell above differ from the earlier sections.
# NOTE(review): num_workers=-1 — confirm how IngestionPipeline.run treats non-positive worker counts.
nodes = pipeline.run(documents=documents, show_progress=True, num_workers=-1)

Parsing nodes: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

In [49]:
# NOTE(review): the 22449 shown below equals the count from the "0 1" run, yet the run just above
# embedded nothing; this looks like stale output from an earlier execution — re-run to confirm.
token_counter.total_embedding_token_count

22449

In [50]:
doc_store.persist('./document_storage/doc_store_1_1.json')

In [30]:
# NOTE(review): this reloads the "1 0" docstore file although the cell above persisted to
# './document_storage/doc_store_1_1.json' — confirm which file is intended here.
doc_store = SimpleDocumentStore.from_persist_path('./document_storage/doc_store_1_0.json')

In [32]:
chroma_client = chromadb.PersistentClient(path="./vector_storage/chroma_db_1_1")

In [33]:
chroma_client.count_collections()

1

In [34]:
chroma_client.list_collections()

[Collection(name=paul_essay)]

In [35]:
chroma_collection = chroma_client.get_or_create_collection("paul_essay")

In [36]:
chroma_client.list_collections()

[Collection(name=paul_essay)]

In [37]:
chroma_collection.count()

47

In [38]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [39]:
# Build the ingestion pipeline: split documents into chunks, then embed each chunk.
# The embedding step reports to `token_counter` through its callback manager.
# Uses the docstore reloaded from disk and the persisted Chroma collection opened above.
pipeline = IngestionPipeline(
    transformations = [SentenceSplitter(chunk_size=512, chunk_overlap=128),
                       OpenAIEmbedding(model_name='text-embedding-3-small', callback_manager=CallbackManager([token_counter]))],
    docstore=doc_store,
    vector_store=vector_store
)

In [40]:
nodes = pipeline.run(documents=documents, show_progress=True, num_workers=-1)

Parsing nodes: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

In [41]:
chroma_collection.count()

47

In [42]:
token_counter.total_embedding_token_count

0

In [49]:
len(pipeline.docstore.docs)

1

In [50]:
len(nodes)

47

In [75]:
# For each stored document, show its id followed by the field names of its dict form
for doc_id, stored_doc in doc_store.docs.items():
    print(doc_id)
    field_names = stored_doc.to_dict().keys()
    print(field_names)

data/paul_graham_essay.txt
dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator', 'class_name'])


In [34]:
# For each stored document, show its id followed by the field names of its dict form
# (same inspection as the previous cell)
for doc_id, stored_doc in doc_store.docs.items():
    print(doc_id)
    field_names = stored_doc.to_dict().keys()
    print(field_names)

data/paul_graham_essay.txt
dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator', 'class_name'])


In [101]:
pipeline.docstore.get_all_document_hashes()

{'48f3b379e825cf288ae03eb7b6746d9f6703c698ab5b0ec7b83426fa8725d5dc': 'data/paul_graham_essay.txt'}

In [77]:
nodes[0].to_dict().keys()

dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator', 'class_name'])

In [86]:
pipeline.docstore.get_document_hash('data/paul_graham_essay.txt')

'48f3b379e825cf288ae03eb7b6746d9f6703c698ab5b0ec7b83426fa8725d5dc'

In [87]:
pipeline.docstore.docs.keys()

dict_keys(['data/paul_graham_essay.txt'])

In [88]:
pipeline.docstore.docs.get('data/paul_graham_essay.txt').to_dict()

{'id_': 'data/paul_graham_essay.txt',
 'embedding': None,
 'metadata': {'file_path': 'data/paul_graham_essay.txt',
  'file_name': 'paul_graham_essay.txt',
  'file_type': 'text/plain',
  'file_size': 75042,
  'creation_date': '2024-04-16',
  'last_modified_date': '2024-04-15',
  'author': 'paul_graham'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'text': '\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them 

In [89]:
nodes[0].ref_doc_id

'data/paul_graham_essay.txt'

In [90]:
nodes[0].node_info

{'start': 2, 'end': 1983}

In [48]:
# Show the text of the last node. `nodes` must already exist (it is assigned by `pipeline.run(...)`).
# NOTE(review): the NameError recorded below presumably comes from running this cell on a kernel
# where `nodes` had not been defined yet — re-run after the pipeline cell.
nodes[-1].get_content()

NameError: name 'nodes' is not defined

In [92]:
len(pipeline.docstore.docs)

1

In [93]:
len(nodes)

47

In [94]:
nodes[1].get_content()

"The only other option was to do things that didn't rely on any input, like calculate approximations of pi, but I didn't know enough math to do anything interesting of that type. So I'm not surprised I can't remember any programs I wrote, because they can't have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn't. On a machine without time-sharing, this was a social as well as a technical error, as the data center manager's expression made clear.\n\nWith microcomputers, everything changed. Now you could have a computer sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punch cards and then stopping. [1]\n\nThe first of my friends to get a microcomputer built it himself. It was sold as a kit by Heathkit. I remember vividly how impressed and envious I felt watching him sitting in front of it, typing programs right into the 

In [56]:
chroma_collection.get()['ids']

['0091e075-c23f-4979-b79d-21483a6b7212',
 '0106ef9d-686d-47a7-8e04-3e298e48151c',
 '055cdb12-dd2d-46c6-a5f5-8768e52fd9b6',
 '0a5b4eee-fd64-4c74-8b8e-c75b6745db9a',
 '0fa2cc4b-2fa1-4592-b33e-f7a7bda49b2e',
 '10679646-c4bc-40bb-9de0-93f2b1ba3c2e',
 '13cac8a6-7971-4cea-9489-63da7cf9d79b',
 '1a9568e2-0e0f-4097-b9aa-ea814f1689c1',
 '23e85be3-af94-44ea-b86e-df809d0882c4',
 '25f7bd0c-d5e7-4e9a-a9d6-c3d42824317b',
 '28f42e56-c3d9-4b63-8dc3-620236e8e9ba',
 '3c577370-430e-4b6d-99b9-474070c98a57',
 '3ef96a4e-1870-4135-b9c0-a6ff629125c6',
 '4398bedf-b0be-4de7-9879-aa01f897667e',
 '498388e9-e7b4-49e4-9ab6-095ce7d00730',
 '4ae8daa2-acc2-4fec-9268-7e7552cb39b9',
 '4c33982e-658e-4fa1-837a-dcde299febc6',
 '4eebcd19-103c-405d-a5a7-0573adb6d70e',
 '5bd35c7b-f5a7-4cee-900c-89ae75fba0c8',
 '6455b6fd-c446-4897-a263-c4f1999542f7',
 '6d56d7ff-05ac-4b13-bbce-49b60a140a5f',
 '6f54d15a-0fab-4b89-a250-28565d4b4a68',
 '7547e4ea-a415-4409-b9ee-5b186798f9d9',
 '763befbc-d858-482f-9f2a-23c8af31e43d',
 '7b14de8a-8478-