In [12]:
! pip install pinecone-client==2.2.4 # restart kernel after running this cell

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


# Process The Data

### Load the needed libraries

In [1]:
from domino_data.vectordb import DominoPineconeConfiguration
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Pinecone

import csv
import os
import random
import pinecone
import sys

from mlflow.deployments import get_deploy_client
import os

client = get_deploy_client(os.environ['DOMINO_MLFLOW_DEPLOYMENTS'])

  from tqdm.autonotebook import tqdm
* 'schema_extra' has been renamed to 'json_schema_extra'


### Set variables

In [2]:
texts = []
metadata = []
chunk_size=768
chunk_overlap=0
strip_whitespace = True
separators=["\n\n", "\n", ".", " ", ""]
PINECONE_ENV="gcp-starter"

In [3]:
# Load the embedding model
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embedding_model_name = "BAAI/bge-small-en"
os.environ['SENTENCE_TRANSFORMERS_HOME'] = './model_cache/'
embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model_name,
                                      model_kwargs=model_kwargs,
                                      encode_kwargs=encode_kwargs
                                     )

In [4]:

# Load the document that you need to parse, please change the location to where the pdf resides

# Load 1 PDF file
loader = PyPDFLoader("/mnt/code/data/apple-10K-20230930.pdf")
# or load an entire folder
# loader = PyPDFDirectoryLoader("/mnt/data/RAG/")
data = loader.load_and_split(RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap,
    strip_whitespace=strip_whitespace,
    add_start_index = True,))

In [5]:
print(f"There are {len(data)} chunks in the document")

There are 419 chunks in the document


In [6]:
# Pick a sample page
print(data[random.randint(0, len(data))])

page_content='paper progra m for general corporate purposes, including dividends and share repurchases. As of September 30, 2023 and September 24, 2022, the Company\nhad $6.0 billion and $10.0 billion of commercial paper outstanding, respectively , with maturities generally less than nine months. The weighted-average interest\nrate of the Company’ s commercial paper was 5.28% and 2.31% as of September 30, 2023 and September 24, 2022, respectively . The following table provides a\nsummary of cash flows associated with the issuance and maturities of commercial paper for 2023, 2022 and 2021 (in millions):\n2023 2022 2021\nMaturities 90 days or less:\nProceeds from/(Repayments of) commercial paper , net $ (1,333) $ 5,264 $ (357)\nMaturities greater than 90 days:' metadata={'source': '/mnt/code/data/apple-10K-20230930.pdf', 'page': 45, 'start_index': 1422}


In [None]:
print(data[0][20])

In [None]:
# Split the data into pages
metadatas = []
texts = []
for row in data:
  metadatas.append(row.metadata)
  texts.append(row.page_content)
print(len(metadatas),len(texts))

In [7]:
datasource_name = "mrag-fin-docs-ja"
conf = DominoPineconeConfiguration(datasource=datasource_name)
# The pinecone API key should be provided when creating the Domino Data Source and persisted securely.
# This api_key variable here is only used for satisfying the native pinecone python client initialization where
# api_key is a mandatory non-empty field.
api_key = os.environ.get("DOMINO_VECTOR_DB_METADATA", datasource_name)

pinecone.init(
    api_key=api_key,
    environment=PINECONE_ENV,
    openapi_config=conf)

In [8]:
# Previously created index
index_name = "mrag-fin-docs"
index = pinecone.Index(index_name)

In [14]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.00419,
 'namespaces': {'': {'vector_count': 419}},
 'total_vector_count': 419}

In [10]:
docsearch = Pinecone.from_texts([d.page_content for d in data], embeddings.embed_query, index_name=index_name)



In [None]:
vector_store = Pinecone.from_documents(
    data,
    embeddings,
    index_name=index_name
)

In [12]:
# Ask your query
query = "What is the expected effective tax rate for Apple in FY23?"
# Get the closest matches to create some context and information for the answer
docs = docsearch.similarity_search(query)

In [13]:
print(docs)

[Document(page_content='Apple Inc. | 2023 Form 10-K | 13'), Document(page_content='Apple Inc. | 2023 Form 10-K | 40'), Document(page_content='A reconciliation of the Company’ s segment operating income to the Consolidated Statements of Operations for 2023, 2022 and 2021 is as follows (in millions):\n2023 2022 2021\nSegment operating income $ 150,888 $ 152,895 $ 137,006 \nResearch and development expense (29,915) (26,251) (21,914)\nOther corporate expenses, net (6,672) (7,207) (6,143)\nTotal operating income $ 114,301 $ 119,437 $ 108,949 \n(1)Includes corporate marketing expenses, certain share-based compensation expenses, various nonrecurring charges, and other separately managed general\nand administrative costs.(1)\nApple Inc. | 2023 Form 10-K | 47'), Document(page_content='As of September 30, 2023, the balance of the deemed repatriation tax payable imposed by the U.S. Tax Cuts and Jobs Act of 2017 (the “Act”) was $22.0 billion,\nwith $6.5 billion expected to be paid within 12 months