From 4d5f30c3b18807d55de4e954e54acbc2efd95176 Mon Sep 17 00:00:00 2001 From: Felipe Aros Date: Fri, 26 Apr 2024 17:36:13 -0400 Subject: [PATCH] Fix: Bug in ingestion, removed UnstructuredReader and the ability to ingest html --- .gitignore | 2 +- .../ingest.py | 27 ++-- .../requirements.txt | 145 ++++++++++++++++-- 3 files changed, 145 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index 1314cae..f71d542 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ *.env -*requirements_test.txt +*requirements_plain.txt *__pycache__ **/storage/ *.gitattributes diff --git a/2.Pinecone - HybridRetriever - Adv.Ingestion/ingest.py b/2.Pinecone - HybridRetriever - Adv.Ingestion/ingest.py index 41bdc8d..e431f0d 100644 --- a/2.Pinecone - HybridRetriever - Adv.Ingestion/ingest.py +++ b/2.Pinecone - HybridRetriever - Adv.Ingestion/ingest.py @@ -1,14 +1,13 @@ import os import openai -import asyncio import argparse from dotenv import load_dotenv -from pinecone import Pinecone, PodSpec +from pinecone import Pinecone, ServerlessSpec +from llama_parse import LlamaParse from llama_index.core import SimpleDirectoryReader from llama_index.llms.openai import OpenAI -from llama_index.readers.file import UnstructuredReader from llama_index.core.ingestion import IngestionPipeline from llama_index.core.node_parser import SentenceSplitter from llama_index.embeddings.openai import OpenAIEmbedding @@ -19,7 +18,6 @@ # SummaryExtractor, # KeywordExtractor, ) -from llama_parse import LlamaParse load_dotenv() openai.api_key = os.environ.get("OPENAI_API_KEY") @@ -43,13 +41,13 @@ def get_pinecone_vector_store(pinecone_index): return vector_store -def create_pinecone_pod(pc, index_name): - print("Creating pinecone pod") +def create_pinecone_serverless_index(pc, index_name): + print("Creating pinecone serverless index") pc.create_index( name=index_name, dimension=3072, metric="dotproduct", - spec=PodSpec(environment="gcp-starter"), + spec=ServerlessSpec(cloud="aws", region="us-east-1"), ) @@ -60,8 +58,6 @@ def get_documents(input_dir): file_extractor = { ".pdf": llama_parser, - ".html": UnstructuredReader(), - ".txt": UnstructuredReader(), } print("Reading directory") director_reader = SimpleDirectoryReader( @@ -86,15 +82,11 @@ def run_pipeline(documents, vector_store, llm, num_workers): ], vector_store=vector_store, ) - for doc in documents: # Small patch to remove last_accessed_date from metadata - k = vars(doc) - del k["metadata"]["last_accessed_date"] pipeline.run(documents=documents, show_progress=True, num_workers=num_workers) -async def main(): - print("Starting ingestion") - input_dir = "./data/source_files/" +def main(): + input_dir = "./data/" index_name = "rag-index" num_cores = os.cpu_count() num_workers = min(4, num_cores) @@ -107,7 +99,7 @@ async def main(): ) args = parser.parse_args() if args.gen: - create_pinecone_pod(pc, index_name) + create_pinecone_serverless_index(pc, index_name) llm = OpenAI(temperature=0.1, model=MODEL, max_tokens=1024) pinecone_index = get_pinecone_index(pc, index_name) vector_store = get_pinecone_vector_store(pinecone_index) @@ -117,4 +109,5 @@ async def main(): if __name__ == "__main__": - asyncio.run(main()) + print("Starting ingestion") + main() diff --git a/2.Pinecone - HybridRetriever - Adv.Ingestion/requirements.txt b/2.Pinecone - HybridRetriever - Adv.Ingestion/requirements.txt index 0ef637d..c84bf73 100644 --- a/2.Pinecone - HybridRetriever - Adv.Ingestion/requirements.txt +++ b/2.Pinecone - HybridRetriever - Adv.Ingestion/requirements.txt @@ -1,11 +1,134 @@ -openai -chainlit -llama-hub -llama-index -llama-parse -unstructured -pinecone-client -llama-index-core # Feb 21 2024 -llama-index-llms-openai # Feb 21 2024 -llama-index-embeddings-openai -llama-index-vector-stores-pinecone \ No newline at end of file +aiofiles==23.2.1 +aiohttp==3.9.5 +aiosignal==1.3.1 +annotated-types==0.6.0 +anyio==3.7.1 +asyncer==0.0.2 +attrs==23.2.0 +backoff==2.2.1 +beautifulsoup4==4.12.3 +bidict==0.23.1 +certifi==2024.2.2 +chainlit==1.0.505 +chardet==5.2.0 +charset-normalizer==3.3.2 +chevron==0.14.0 +click==8.1.7 +dataclasses-json==0.5.14 +dataclasses-json-speakeasy==0.5.11 +Deprecated==1.2.14 +dirtyjson==1.0.8 +distro==1.9.0 +emoji==2.11.1 +fastapi==0.110.2 +fastapi-socketio==0.0.10 +filelock==3.13.4 +filetype==1.2.0 +frozenlist==1.4.1 +fsspec==2024.3.1 +googleapis-common-protos==1.63.0 +greenlet==3.0.3 +grpcio==1.62.2 +h11==0.14.0 +html2text==2024.2.26 +httpcore==1.0.5 +httpx==0.27.0 +huggingface-hub==0.22.2 +idna==3.7 +importlib-metadata==7.0.0 +joblib==1.4.0 +jsonpath-python==1.0.6 +langdetect==1.0.9 +Lazify==0.4.0 +literalai==0.0.507 +llama-hub==0.0.79.post1 +llama-index==0.10.32 +llama-index-agent-openai==0.2.3 +llama-index-cli==0.1.12 +llama-index-core==0.10.32 +llama-index-embeddings-openai==0.1.9 +llama-index-indices-managed-llama-cloud==0.1.6 +llama-index-legacy==0.9.48 +llama-index-llms-openai==0.1.16 +llama-index-multi-modal-llms-openai==0.1.5 +llama-index-program-openai==0.1.6 +llama-index-question-gen-openai==0.1.3 +llama-index-readers-file==0.1.19 +llama-index-readers-llama-parse==0.1.4 +llama-index-vector-stores-pinecone==0.1.6 +llama-parse==0.4.2 +llamaindex-py-client==0.1.19 +lxml==5.2.1 +marshmallow==3.21.1 +multidict==6.0.5 +mypy-extensions==1.0.0 +nest-asyncio==1.6.0 +networkx==3.3 +nltk==3.8.1 +numpy==1.26.4 +openai==1.23.6 +opentelemetry-api==1.24.0 +opentelemetry-exporter-otlp==1.24.0 +opentelemetry-exporter-otlp-proto-common==1.24.0 +opentelemetry-exporter-otlp-proto-grpc==1.24.0 +opentelemetry-exporter-otlp-proto-http==1.24.0 +opentelemetry-instrumentation==0.45b0 +opentelemetry-proto==1.24.0 +opentelemetry-sdk==1.24.0 +opentelemetry-semantic-conventions==0.45b0 +packaging==23.2 +pandas==2.2.2 +pillow==10.3.0 +pinecone-client==3.2.2 +protobuf==4.25.3 +psutil==5.9.8 +pyaml==23.12.0 +pydantic==2.7.1 +pydantic_core==2.18.2 +PyJWT==2.8.0 +pypdf==4.2.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-engineio==4.9.0 +python-graphql-client==0.4.3 +python-iso639==2024.2.7 +python-magic==0.4.27 +python-multipart==0.0.9 +python-socketio==5.11.2 +pytz==2024.1 +PyYAML==6.0.1 +rapidfuzz==3.8.1 +regex==2024.4.16 +requests==2.31.0 +retrying==1.3.4 +safetensors==0.4.3 +simple-websocket==1.0.0 +six==1.16.0 +sniffio==1.3.1 +soupsieve==2.5 +SQLAlchemy==2.0.29 +starlette==0.37.2 +striprtf==0.0.26 +syncer==2.0.3 +tabulate==0.9.0 +tenacity==8.2.3 +tiktoken==0.6.0 +tokenizer==3.4.3 +tokenizers==0.19.1 +tomli==2.0.1 +tqdm==4.66.2 +transformers==4.40.1 +typing-inspect==0.9.0 +typing_extensions==4.11.0 +tzdata==2024.1 +unstructured==0.13.4 +unstructured-client==0.18.0 +uptrace==1.24.0 +urllib3==2.2.1 +uvicorn==0.25.0 +watchfiles==0.20.0 +websockets==12.0 +wrapt==1.16.0 +wsproto==1.2.0 +yarl==1.9.4 +zipp==3.18.1