# Process The Data

### Load the needed libraries

In [1]:
from domino_data.vectordb import DominoPineconeConfiguration
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.embeddings import MlflowEmbeddings
from langchain.vectorstores import Pinecone


import csv
import os
import random
import pinecone
import sys

from mlflow.deployments import get_deploy_client
import os

client = get_deploy_client(os.environ['DOMINO_MLFLOW_DEPLOYMENTS'])

  from tqdm.autonotebook import tqdm
* 'schema_extra' has been renamed to 'json_schema_extra'


### Set variables

In [2]:
texts = []
metadata = []
chunk_size=1000
chunk_overlap=200
strip_whitespace = True
separators=["\n\n", "\n", ".", " ", ""]
PINECONE_ENV="domino"

In [3]:
embed = MlflowEmbeddings(
    target_uri=os.environ["DOMINO_MLFLOW_DEPLOYMENTS"],
    endpoint="embedding-ada-002ja2",
)

In [5]:

# Load the document that you need to parse, please change the location to where the pdf resides

# Load 1 PDF file
loader = PyPDFLoader("/mnt/code/data/apple-10K-20230930.pdf")
# or load an entire folder
# loader = PyPDFDirectoryLoader("/mnt/data/RAG/")
data = loader.load_and_split(RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap,
    strip_whitespace=strip_whitespace,
    add_start_index = True,))

In [6]:
print(f"There are {len(data)} chunks in the document")

There are 361 chunks in the document


In [7]:
# Pick a sample page for verification
print(data[random.randint(0, len(data))])

page_content='notes representing the 0.000% Notes due 2025 and 0.500% Notes due 2031.8-K 4.1 11/15/19\n4.23 Officer’s Certificate of the Registrant, dated as of May 11, 2020, including forms of global notes\nrepresenting the 0.750% Notes due 2023, 1.125% Notes due 2025, 1.650% Notes due 2030 and\n2.650% Notes due 2050.8-K 4.1 5/11/20\n4.24 Officer’s Certificate of the Registrant, dated as of August 20, 2020, including forms of global notes\nrepresenting the 0.550% Notes due 2025, 1.25% Notes due 2030, 2.400% Notes due 2050 and\n2.550% Notes due 2060.8-K 4.1 8/20/20\n4.25 Officer’s Certificate of the Registrant, dated as of February 8, 2021, including forms of global notes\nrepresenting the 0.700% Notes due 2026, 1.200% Notes due 2028, 1.650% Notes due\n2031, 2.375% Notes due 2041, 2.650% Notes due 2051 and 2.800% Notes due 2061.8-K 4.1 2/8/21\n4.26 Officer’s Certificate of the Registrant, dated as of August 5, 2021, including forms of global notes' metadata={'source': '/mnt/code/data/a

create an index on pinecone with name: mrag-fin-docs, 1536

In [8]:
datasource_name = "mrag-fin-docs-ja"
conf = DominoPineconeConfiguration(datasource=datasource_name)
# The pinecone API key should be provided when creating the Domino Data Source and persisted securely.
# This api_key variable here is only used for satisfying the native pinecone python client initialization where
# api_key is a mandatory non-empty field.
api_key = os.environ.get("DOMINO_VECTOR_DB_METADATA", datasource_name)

pinecone.init(
    api_key=api_key,
    environment=PINECONE_ENV,
    openapi_config=conf)

In [9]:
# Previously created index
index_name = "mrag-fin-docs"
index = pinecone.Index(index_name)

In [10]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [11]:
docsearch = Pinecone.from_texts([d.page_content for d in data], embed.embed_query, index_name=index_name)



In [12]:
# Ask your query
query = "How did the Americas do in net sales in FY23?" #"What is the expected effective tax rate for Apple in FY23?"
# Get the closest matches to create some context and information for the answer
docs = docsearch.similarity_search(query)

In [13]:
print(docs)

[Document(page_content='income before provision for income taxes for 2023, 2022 and 2021, is as follows (dollars in millions):\n2023 2022 2021\nComputed expected tax $ 23,885 $ 25,012 $ 22,933 \nState taxes, net of federal ef fect 1,124 1,518 1,151 \nEarnings of foreign subsidiaries (5,744) (4,366) (4,715)\nResearch and development credit, net (1,212) (1,153) (1,033)\nExcess tax benefits from equity awards (1,120) (1,871) (2,137)\nForeign-derived intangible income deduction — (296) (1,372)\nOther (192) 456 (300)\nProvision for income taxes $ 16,741 $ 19,300 $ 14,527 \nEffective tax rate 14.7 % 16.2 % 13.3 %\nApple Inc. | 2023 Form 10-K | 40'), Document(page_content='effective tax rate. In accounting for some of the uncertain tax positions, Apple Inc. uses significant judgment in the\ninterpretation and application of complex domestic and international tax laws.\nAuditing management’ s evaluation of whether an uncertain tax position is more likely than not to be sustained and\nthe measu