## Paul Graham essay indexing

In [1]:
from indexing.components.loaders import ChunkParser, get_paul_graham_documents
from indexing.components.data_stores import ElasticSearchVectorStoreModule

# Name of the index that the essay nodes are written to further down.
INDEX_NAME = "paul_graham"

# Load the source documents from data/paul_graham, then split them into nodes
# with the project's ChunkParser.
documents = get_paul_graham_documents("data/paul_graham")
chunk_parser = ChunkParser(documents)
nodes = chunk_parser.base_parse()

In [7]:
# How many documents were loaded, and how many nodes they were parsed into.
document_count, node_count = len(documents), len(nodes)
document_count, node_count

(1, 83)

In [11]:
# Compare the field names exposed by the first document and the first node.
first_document_fields = documents[0].dict().keys()
first_node_fields = nodes[0].dict().keys()
first_document_fields, first_node_fields

(dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator', 'class_name']),
 dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator', 'class_name']))

In [None]:
# Write the parsed essay nodes to the index named by INDEX_NAME.
# NOTE: INDEX_NAME is rebound to "papers" in the survey-paper section below,
# so run this cell before that rebinding takes place.
essay_store = ElasticSearchVectorStoreModule(index_name=INDEX_NAME)
essay_store.add_nodes(nodes)

## Survey paper indexing

In [12]:
from enum import Enum
from pprint import pp


class PaperPathEnum(str, Enum):
    """Local file paths of the survey-paper PDFs used in this notebook.

    The ``str`` mix-in makes every member an instance of ``str`` that compares
    equal to its path. Note that ``str(member)`` still yields the member name
    (e.g. ``"PaperPathEnum.rag"``); use ``member.value`` for the plain path.
    """
    # Path of the "rag_survey" PDF.
    rag = "data/papers/rag_survey.pdf"
    # Path of the "eval_survey" PDF.
    eval = "data/papers/eval_survey.pdf"


# Rebinds INDEX_NAME (set to "paul_graham" in the first section) to the index
# name used for the survey papers.
INDEX_NAME = "papers"

In [13]:
from llama_index.readers.smart_pdf_loader import SmartPDFLoader

# Endpoint of the llmsherpa parsing service that SmartPDFLoader is configured with.
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
# NOTE(review): pdf_url is not referenced by the loading code below, which reads
# the local files listed in PaperPathEnum; kept in case other cells rely on it.
pdf_url = "https://arxiv.org/abs/2312.10997"

pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)

# Map each paper (enum member) to the list of Documents parsed from its PDF.
# The plain path string (`.value`) is passed to the loader instead of the enum
# member: str() of a (str, Enum) member is "PaperPathEnum.rag", not the path,
# so any str()/formatting inside the loader would otherwise yield a bogus path.
paper_documents = {
    paper: pdf_loader.load_data(pdf_path_or_url=paper.value)
    for paper in PaperPathEnum
}

In [16]:
# Number of Document objects loaded for the RAG survey paper.
rag_documents = paper_documents[PaperPathEnum.rag]
len(rag_documents)

146

In [17]:
# Number of Document objects loaded for the evaluation survey paper.
eval_documents = paper_documents[PaperPathEnum.eval]
len(eval_documents)

160

In [18]:
# Pretty-print every field of the first Document loaded for the RAG survey.
first_rag_document = paper_documents[PaperPathEnum.rag][0]
pp(first_rag_document.dict())

{'id_': '972e47b1-1d72-435a-bffa-158757028fd5',
 'embedding': None,
 'metadata': {'chunk_type': 'para'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'text': 'Retrieval-Augmented Generation for Large Language Models: A Survey\n'
         'Yunfan Gaoa, Yun Xiongb, Xinyu Gaob, Kangxiang Jiab, Jinliu Panb, '
         'Yuxi Bic, Yi Daia, Jiawei Suna, Meng Wangc, and Haofen Wang a,c',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n',
 'class_name': 'Document'}


In [20]:
# Pretty-print every field of the first Document loaded for the evaluation survey.
first_eval_document = paper_documents[PaperPathEnum.eval][0]
pp(first_eval_document.dict())

{'id_': 'ab34c5d6-ac0b-46d1-8870-78d073cfe91e',
 'embedding': None,
 'metadata': {'chunk_type': 'para'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'text': '\n'
         'Evaluation of Retrieval-Augmented Generation: A Survey Hao Yu1,2, '
         'Aoran Gan3, Kai Zhang3, Shiwei Tong1†, Qi Liu3, and Zhaofeng Liu1',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n',
 'class_name': 'Document'}


In [23]:
# Distinct `chunk_type` metadata values found among the RAG survey documents.
{
    document.dict()["metadata"]["chunk_type"]
    for document in paper_documents[PaperPathEnum.rag]
}

{'list_item', 'para', 'table'}

In [24]:
# nest_asyncio.apply() patches asyncio so that event loops may be nested; a
# Jupyter kernel already runs one. Presumably needed by the async work done in
# the indexing cell below — TODO confirm.
import nest_asyncio
nest_asyncio.apply()

In [25]:
from indexing.components.data_stores import get_bedrock_li_embedding_model, get_openai_li_embedding_model, ElasticSearchVectorStoreModule
from inferences.components.models import get_openai_model
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.extractors import TitleExtractor, QuestionsAnsweredExtractor, SummaryExtractor

# Global llama_index defaults for the LLM and the embedding model.
Settings.llm = get_openai_model()
Settings.embed_model = get_openai_li_embedding_model()

# No extra transformations are applied by default. To enrich the nodes with
# LLM-generated metadata, enable the extractors below.
transformations = []
# summary_extractor = SummaryExtractor()
# title_extractor = TitleExtractor(nodes=10)
# qa_extractor = QuestionsAnsweredExtractor(questions=2)
# transformations = [summary_extractor, title_extractor, qa_extractor]

# Build the store once, with the OpenAI embedding model, and use that same
# instance for every paper. Previously this configured store was created but
# never used: a second store, constructed without `embedding`, did the indexing.
# NOTE(review): if the "papers" index already holds vectors written by the old
# code path, confirm they came from the same embedding model before mixing.
vector_store_module = ElasticSearchVectorStoreModule(
    index_name=INDEX_NAME,
    embedding=get_openai_li_embedding_model(),
)

# A dedicated loop name keeps the `documents` variable from the Paul Graham
# section intact (the old loop variable overwrote it).
for paper_docs in paper_documents.values():
    vector_store_module.add_documents(paper_docs, transformations=transformations)

Parsing nodes:   0%|          | 0/146 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/160 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/160 [00:00<?, ?it/s]