Skip to content

Commit

Permalink
Merge pull request #11 from felipearosr/feature/ServerlessPinecone
Browse files Browse the repository at this point in the history
Fix: Bug in ingestion, removed UnstructuredReader and the ability to read .html and .txt files
  • Loading branch information
felipearosr committed Apr 26, 2024
2 parents 90da17c + 4d5f30c commit 93ccc57
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
*.env
*requirements_test.txt
*requirements_plain.txt
*__pycache__
**/storage/
*.gitattributes
Expand Down
27 changes: 10 additions & 17 deletions 2.Pinecone - HybridRetriever - Adv.Ingestion/ingest.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import os
import openai
import asyncio
import argparse

from dotenv import load_dotenv
from pinecone import Pinecone, PodSpec
from pinecone import Pinecone, ServerlessSpec

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.readers.file import UnstructuredReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
Expand All @@ -19,7 +18,6 @@
# SummaryExtractor,
# KeywordExtractor,
)
from llama_parse import LlamaParse

load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
Expand All @@ -43,13 +41,13 @@ def get_pinecone_vector_store(pinecone_index):
return vector_store


def create_pinecone_pod(pc, index_name):
print("Creating pinecone pod")
def create_pinecone_serverless_index(pc, index_name):
print("Creating pinecone serverless index")
pc.create_index(
name=index_name,
dimension=3072,
metric="dotproduct",
spec=PodSpec(environment="gcp-starter"),
spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)


Expand All @@ -60,8 +58,6 @@ def get_documents(input_dir):

file_extractor = {
".pdf": llama_parser,
".html": UnstructuredReader(),
".txt": UnstructuredReader(),
}
print("Reading directory")
director_reader = SimpleDirectoryReader(
Expand All @@ -86,15 +82,11 @@ def run_pipeline(documents, vector_store, llm, num_workers):
],
vector_store=vector_store,
)
for doc in documents: # Small patch to remove last_accessed_date from metadata
k = vars(doc)
del k["metadata"]["last_accessed_date"]
pipeline.run(documents=documents, show_progress=True, num_workers=num_workers)


async def main():
print("Starting ingestion")
input_dir = "./data/source_files/"
def main():
input_dir = "./data/"
index_name = "rag-index"
num_cores = os.cpu_count()
num_workers = min(4, num_cores)
Expand All @@ -107,7 +99,7 @@ async def main():
)
args = parser.parse_args()
if args.gen:
create_pinecone_pod(pc, index_name)
create_pinecone_serverless_index(pc, index_name)
llm = OpenAI(temperature=0.1, model=MODEL, max_tokens=1024)
pinecone_index = get_pinecone_index(pc, index_name)
vector_store = get_pinecone_vector_store(pinecone_index)
Expand All @@ -117,4 +109,5 @@ async def main():


if __name__ == "__main__":
asyncio.run(main())
print("Starting ingestion")
main()
145 changes: 134 additions & 11 deletions 2.Pinecone - HybridRetriever - Adv.Ingestion/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,134 @@
openai
chainlit
llama-hub
llama-index
llama-parse
unstructured
pinecone-client
llama-index-core # Feb 21 2024
llama-index-llms-openai # Feb 21 2024
llama-index-embeddings-openai
llama-index-vector-stores-pinecone
aiofiles==23.2.1
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.6.0
anyio==3.7.1
asyncer==0.0.2
attrs==23.2.0
backoff==2.2.1
beautifulsoup4==4.12.3
bidict==0.23.1
certifi==2024.2.2
chainlit==1.0.505
chardet==5.2.0
charset-normalizer==3.3.2
chevron==0.14.0
click==8.1.7
dataclasses-json==0.5.14
dataclasses-json-speakeasy==0.5.11
Deprecated==1.2.14
dirtyjson==1.0.8
distro==1.9.0
emoji==2.11.1
fastapi==0.110.2
fastapi-socketio==0.0.10
filelock==3.13.4
filetype==1.2.0
frozenlist==1.4.1
fsspec==2024.3.1
googleapis-common-protos==1.63.0
greenlet==3.0.3
grpcio==1.62.2
h11==0.14.0
html2text==2024.2.26
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.22.2
idna==3.7
importlib-metadata==7.0.0
joblib==1.4.0
jsonpath-python==1.0.6
langdetect==1.0.9
Lazify==0.4.0
literalai==0.0.507
llama-hub==0.0.79.post1
llama-index==0.10.32
llama-index-agent-openai==0.2.3
llama-index-cli==0.1.12
llama-index-core==0.10.32
llama-index-embeddings-openai==0.1.9
llama-index-indices-managed-llama-cloud==0.1.6
llama-index-legacy==0.9.48
llama-index-llms-openai==0.1.16
llama-index-multi-modal-llms-openai==0.1.5
llama-index-program-openai==0.1.6
llama-index-question-gen-openai==0.1.3
llama-index-readers-file==0.1.19
llama-index-readers-llama-parse==0.1.4
llama-index-vector-stores-pinecone==0.1.6
llama-parse==0.4.2
llamaindex-py-client==0.1.19
lxml==5.2.1
marshmallow==3.21.1
multidict==6.0.5
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.3
nltk==3.8.1
numpy==1.26.4
openai==1.23.6
opentelemetry-api==1.24.0
opentelemetry-exporter-otlp==1.24.0
opentelemetry-exporter-otlp-proto-common==1.24.0
opentelemetry-exporter-otlp-proto-grpc==1.24.0
opentelemetry-exporter-otlp-proto-http==1.24.0
opentelemetry-instrumentation==0.45b0
opentelemetry-proto==1.24.0
opentelemetry-sdk==1.24.0
opentelemetry-semantic-conventions==0.45b0
packaging==23.2
pandas==2.2.2
pillow==10.3.0
pinecone-client==3.2.2
protobuf==4.25.3
psutil==5.9.8
pyaml==23.12.0
pydantic==2.7.1
pydantic_core==2.18.2
PyJWT==2.8.0
pypdf==4.2.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-engineio==4.9.0
python-graphql-client==0.4.3
python-iso639==2024.2.7
python-magic==0.4.27
python-multipart==0.0.9
python-socketio==5.11.2
pytz==2024.1
PyYAML==6.0.1
rapidfuzz==3.8.1
regex==2024.4.16
requests==2.31.0
retrying==1.3.4
safetensors==0.4.3
simple-websocket==1.0.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.29
starlette==0.37.2
striprtf==0.0.26
syncer==2.0.3
tabulate==0.9.0
tenacity==8.2.3
tiktoken==0.6.0
tokenizer==3.4.3
tokenizers==0.19.1
tomli==2.0.1
tqdm==4.66.2
transformers==4.40.1
typing-inspect==0.9.0
typing_extensions==4.11.0
tzdata==2024.1
unstructured==0.13.4
unstructured-client==0.18.0
uptrace==1.24.0
urllib3==2.2.1
uvicorn==0.25.0
watchfiles==0.20.0
websockets==12.0
wrapt==1.16.0
wsproto==1.2.0
yarl==1.9.4
zipp==3.18.1

0 comments on commit 93ccc57

Please sign in to comment.