In [1]:
import os

In [2]:
from dotenv import load_dotenv

In [3]:
# Load API keys from a .env file. DOTENV_PATH (if set) overrides the location so
# the notebook is not tied to one machine's home directory; when it is unset the
# original path is used, so existing setups keep working.
load_dotenv(os.environ.get('DOTENV_PATH', '/home/santhosh/Projects/courses/Pinnacle/.env'))

True

In [4]:
# Read the OpenAI key from the environment; indexing os.environ raises KeyError if it is missing.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [5]:
# Read the Hugging Face token from the environment (stored under HUGGINGFACE_API_KEY); KeyError if missing.
HF_TOKEN = os.environ['HUGGINGFACE_API_KEY']

# File Loader

In [6]:
from llama_index.core import SimpleDirectoryReader

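Sample data: download `paul_graham_essay.txt` from https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt and save it as `./data/paul_graham_essay.txt` before running the loader cells below.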

In [7]:
# Load every file found under ./data (two files in the recorded run).
documents = (
    SimpleDirectoryReader(input_dir="./data", filename_as_id=True)
    .load_data(show_progress=True)
)

Loading files: 100%|██████████| 2/2 [00:00<00:00, 136.57file/s]


In [8]:
# Load only the Paul Graham essay. This rebinds `documents`, replacing the
# result of any earlier load.
documents = (
    SimpleDirectoryReader(
        input_files=["./data/paul_graham_essay.txt"],
        filename_as_id=True,
    )
    .load_data(show_progress=True)
)

Loading files: 100%|██████████| 1/1 [00:00<00:00, 1129.02file/s]


# Ingestion Pipeline

In [None]:
# Consolidated setup for the ingestion-pipeline section.
import nest_asyncio

# nest_asyncio patches asyncio so an event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a
# loop — confirm which call in this section requires it.
nest_asyncio.apply()

import tiktoken

from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

from llama_index.core import MockEmbedding
from llama_index.core.llms import MockLLM

# SemanticSplitterNodeParser is included because one of the mock_pipeline
# variants in this notebook uses it; without it this cell alone would not be
# enough to run that variant.
from llama_index.core.node_parser import (
    SentenceSplitter,
    SentenceWindowNodeParser,
    SemanticSplitterNodeParser,
    HierarchicalNodeParser,
)

from llama_index.core.ingestion import IngestionPipeline

from llama_index.core.extractors import TitleExtractor, SummaryExtractor

In [38]:
import nest_asyncio

# nest_asyncio patches asyncio so an event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a
# loop — confirm which call in this notebook requires it.
nest_asyncio.apply()


In [19]:
import tiktoken

In [20]:
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

In [21]:
from llama_index.core import MockEmbedding
from llama_index.core.llms import MockLLM

In [22]:
from llama_index.core.node_parser import SentenceSplitter, SentenceWindowNodeParser, SemanticSplitterNodeParser, HierarchicalNodeParser

In [23]:
from llama_index.core.ingestion import IngestionPipeline

In [24]:
from llama_index.core.extractors import TitleExtractor, SummaryExtractor

In [25]:
from llama_index.embeddings.openai import OpenAIEmbedding

In [26]:
from llama_index.core.storage.docstore import SimpleDocumentStore

In [27]:
from llama_index.core import VectorStoreIndex

In [52]:
# Shared token-counting callback handler: tokenizes with the tiktoken encoding
# for "gpt-3.5-turbo"; verbose=True makes it print a usage line per event.
token_counter = TokenCountingHandler(verbose=True, tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode)

In [53]:
# Clear the handler's counters before measuring a pipeline run.
token_counter.reset_counts()

In [54]:
# Sanity check: the embedding counter reads 0 before any run.
token_counter.total_embedding_token_count

0

In [55]:
# Variant 1: SentenceSplitter (chunk_size=512, chunk_overlap=64), then title
# extraction with a mock LLM, then mock embeddings. Both mock components report
# to token_counter through their own CallbackManager.
mock_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=64),
        TitleExtractor(
            llm=MockLLM(callback_manager=CallbackManager([token_counter])),
        ),
        MockEmbedding(
            embed_dim=1536,
            callback_manager=CallbackManager([token_counter]),
        ),
    ],
)

In [25]:
# Variant 2: sentence-window parsing (window_size=3), then mock embeddings.
# Rebinds mock_pipeline, replacing whichever variant was defined before.
mock_pipeline = IngestionPipeline(
    transformations=[
        SentenceWindowNodeParser.from_defaults(window_size=3),
        MockEmbedding(
            embed_dim=1536,
            callback_manager=CallbackManager([token_counter]),
        ),
    ],
)

In [30]:
mock_pipeline = IngestionPipeline(
    transformations = [SemanticSplitterNodeParser(breakpoint_percentile_threshold=95, 
                                                  embed_model=MockEmbedding(embed_dim=1536, 
                                                                            callback_manager=CallbackManager([token_counter]))),
                      MockEmbedding(embed_dim=1536, callback_manager=CallbackManager([token_counter]))])

In [33]:
mock_pipeline = IngestionPipeline(
    transformations = [HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128]),
                      MockEmbedding(embed_dim=1536, callback_manager=CallbackManager([token_counter]))])

In [32]:
# Show the configuration of the pipeline's first transformation.
# NOTE(review): the recorded output reports a SentenceSplitter with
# chunk_overlap=128, but the SentenceSplitter-based mock_pipeline in this
# notebook uses chunk_overlap=64 — the output looks stale; re-run to refresh.
mock_pipeline.transformations[0].dict()

{'include_metadata': True,
 'include_prev_next_rel': True,
 'chunk_size': 512,
 'chunk_overlap': 128,
 'separator': ' ',
 'paragraph_separator': '\n\n\n',
 'secondary_chunking_regex': '[^,.;。？！]+[,.;。？！]?',
 'class_name': 'SentenceSplitter'}

In [56]:
# Run the mock pipeline over the loaded documents and keep the resulting nodes.
# `num_workers=-1` was removed: IngestionPipeline.run only starts a worker pool
# when num_workers > 1, so -1 was not an "all cores" switch and the run was
# already in-process. Staying in-process also matters for this measurement,
# since token_counter lives in this kernel process.
nodes = mock_pipeline.run(documents=documents, show_progress=True)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

LLM Prompt Token Usage: 490
LLM Completion Token Usage: 490
LLM Prompt Token Usage: 490
LLM Completion Token Usage: 490
LLM Prompt Token Usage: 500
LLM Completion Token Usage: 500
LLM Prompt Token Usage: 500
LLM Completion Token Usage: 500
LLM Prompt Token Usage: 508
LLM Completion Token Usage: 508
LLM Prompt Token Usage: 508
LLM Completion Token Usage: 508
LLM Prompt Token Usage: 487
LLM Completion Token Usage: 487
LLM Prompt Token Usage: 487
LLM Completion Token Usage: 487
LLM Prompt Token Usage: 503
LLM Completion Token Usage: 503
LLM Prompt Token Usage: 503
LLM Completion Token Usage: 503


100%|██████████| 5/5 [00:00<00:00, 348.85it/s]

LLM Prompt Token Usage: 2509
LLM Completion Token Usage: 2509
LLM Prompt Token Usage: 2509
LLM Completion Token Usage: 2509





Generating embeddings:   0%|          | 0/40 [00:00<?, ?it/s]

Embedding Token Usage: 30036
Embedding Token Usage: 30009
Embedding Token Usage: 29976
Embedding Token Usage: 29741


In [57]:
# Embedding tokens accumulated by token_counter; the recorded value equals the
# sum of the "Embedding Token Usage" lines printed by the run above.
token_counter.total_embedding_token_count

119762

In [58]:
# LLM tokens accumulated by token_counter (from the mock LLM in the title-extraction step).
token_counter.total_llm_token_count

19988

In [32]:
# Embedding-token reading from a separate run.
# NOTE(review): the execution count suggests this followed a different
# mock_pipeline variant — confirm which one and label the result.
token_counter.total_embedding_token_count

126890

In [35]:
# Embedding-token reading from a separate run.
# NOTE(review): the execution count suggests this followed a different
# mock_pipeline variant — confirm which one and label the result.
token_counter.total_embedding_token_count

186505

In [23]:
# Real pipeline: SentenceSplitter (chunk_size=512, chunk_overlap=128) followed by
# OpenAI 'text-embedding-3-small' embeddings, with an in-memory document store.
# The embedding model reports to token_counter.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=128),
        OpenAIEmbedding(
            model_name='text-embedding-3-small',
            callback_manager=CallbackManager([token_counter]),
        ),
    ],
    docstore=SimpleDocumentStore(),
)

In [24]:
# Run the OpenAI-backed pipeline over the loaded documents and keep the nodes.
# `num_workers=-1` was removed: IngestionPipeline.run only starts a worker pool
# when num_workers > 1, so -1 was not an "all cores" switch and the run was
# already in-process. Staying in-process also matters for this measurement,
# since token_counter lives in this kernel process.
nodes = pipeline.run(documents=documents, show_progress=True)

Docstore strategy set to upserts, but no vector store. Switching to duplicates_only strategy.


Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/50 [00:00<?, ?it/s]

Embedding Token Usage: 23673


In [25]:
# Embedding tokens accumulated by token_counter.
# NOTE(review): the recorded total is exactly twice the single usage line
# printed by the run above — the counter may not have been reset beforehand; confirm.
token_counter.total_embedding_token_count

47346

In [14]:
from llama_index.core import MockEmbedding
from llama_index.core.llms import MockLLM

In [15]:
import tiktoken

In [16]:
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

In [17]:
# Fresh token-counting handler for the index-building estimate below: tokenizes
# with the tiktoken encoding for "gpt-3.5-turbo" and prints usage per event.
token_counter = TokenCountingHandler(verbose=True, tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode)

In [18]:
# Clear the handler's counters before measuring index construction.
token_counter.reset_counts()

In [19]:
# Sanity check: the embedding counter reads 0 before the index is built.
token_counter.total_embedding_token_count

0

In [23]:
# Build a vector index with mock embedding and LLM components so no API calls
# are made. token_counter is attached to the embed model, the LLM and the
# index-level callback manager.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=MockEmbedding(
        embed_dim=1536,
        callback_manager=CallbackManager([token_counter]),
    ),
    llm=MockLLM(callback_manager=CallbackManager([token_counter])),
    callback_manager=CallbackManager([token_counter]),
)

Embedding Token Usage: 9014
Embedding Token Usage: 7793
Embedding Token Usage: 8338
Embedding Token Usage: 8068
Embedding Token Usage: 7999
Embedding Token Usage: 8031
Embedding Token Usage: 8256
Embedding Token Usage: 8234
Embedding Token Usage: 8173
Embedding Token Usage: 8432
Embedding Token Usage: 8466
Embedding Token Usage: 7694
Embedding Token Usage: 7872
Embedding Token Usage: 8549
Embedding Token Usage: 8030
Embedding Token Usage: 8309
Embedding Token Usage: 8227
Embedding Token Usage: 8830
Embedding Token Usage: 8593
Embedding Token Usage: 8771
Embedding Token Usage: 8518
Embedding Token Usage: 8242
Embedding Token Usage: 8354
Embedding Token Usage: 8565
Embedding Token Usage: 8062
Embedding Token Usage: 8699
Embedding Token Usage: 8927
Embedding Token Usage: 8140
Embedding Token Usage: 5541


In [24]:
# Embedding tokens counted while building the index; the recorded value equals
# the sum of the "Embedding Token Usage" lines printed above.
token_counter.total_embedding_token_count

238727

In [25]:
# LLM tokens counted while building the index (0 in the recorded run).
token_counter.total_llm_token_count

0