# Process The Data 

Before you can use Retrieval Augmented Generation (RAG) to answer questions about your documents, the documents must be preprocessed (loaded and split into chunks), embedded, and upserted into the vector database.

## Load the needed libraries

In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.vectorstores.pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from mlflow.deployments import get_deploy_client
from langchain_community.embeddings import MlflowEmbeddings
from domino_data.vectordb import domino_pinecone3x_init_params, domino_pinecone3x_index_params

from pinecone import Pinecone

import os
import random
import warnings
# Ignore all warnings raised after this point (this also hides deprecation notices).
warnings.filterwarnings('ignore')

* 'schema_extra' has been renamed to 'json_schema_extra'


### Set variables

In [2]:
# --- Chunking parameters (consumed by the text splitter) ---
chunk_size = 1000
chunk_overlap = 200
strip_whitespace = True
separators = ["\n\n", "\n", ".", " ", ""]

# --- Pinecone ---
PINECONE_ENV = "domino"

# --- Empty containers; `texts` is filled once the document has been chunked ---
texts, metadata = [], []

### Load and chunk the PDF document 

In [3]:
# Source document: a single PDF file.
# To ingest an entire folder instead, swap in a directory loader
# (PyPDFDirectoryLoader would need to be imported first):
#   loader = PyPDFDirectoryLoader("/mnt/code/data/")
loader = PyPDFLoader("/mnt/code/data/apple-10K-20230930.pdf")

# Build the splitter from the chunking variables set earlier.
# add_start_index=True stores a `start_index` entry in each chunk's metadata.
# NOTE(review): `separators` is defined with the other variables but is not
# passed to the splitter here - confirm whether that is intentional.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    strip_whitespace=strip_whitespace,
    add_start_index=True,
)

# Load the PDF and split it in one step; `data` is the resulting list of documents.
data = loader.load_and_split(splitter)

### Separate the text content from the metadata so that `from_texts()` can upsert the records into Pinecone

In [18]:
# Keep only the text body of every chunk; the metadata is left behind.
texts = [chunk.page_content for chunk in data]

### Check the number of pages in the document

In [4]:
# `data` is what load_and_split returned, so this count is presumably the number
# of chunks produced by the splitter rather than raw PDF pages - TODO confirm wording.
summary_line = f"There are {len(data)} pages in the document"
print(summary_line)

There are 361 pages in the document


### Examine a random sample page

In [5]:
# Show one randomly selected chunk.
# random.randint(a, b) includes both endpoints, so randint(0, len(data)) could
# return len(data) and raise IndexError; random.choice always picks a valid element.
print(random.choice(data))

page_content='Deferred revenue 8,061 7,912 \nCommercial paper 5,985 9,982 \nTerm debt 9,822 11,128 \nTotal current liabilities 145,308 153,982 \nNon-current liabilities:\nTerm debt 95,281 98,959 \nOther non-current liabilities 49,848 49,142 \nTotal non-current liabilities 145,129 148,101 \nTotal liabilities 290,437 302,083 \nCommitments and contingencies\nShareholders’  equity:\nCommon stock and additional paid-in capital, $0.00001 par value: 50,400,000 shares authorized; 15,550,061\nand 15,943,425 shares issued and outstanding, respectively 73,812 64,849 \nAccumulated deficit (214) (3,068)\nAccumulated other comprehensive loss (11,452) (11,109)\nTotal shareholders’  equity 62,146 50,672 \nTotal liabilities and shareholders’  equity $ 352,583 $ 352,755 \nSee accompanying Notes to Consolidated Financial Statements.\nApple Inc. | 2023 Form 10-K | 30' metadata={'source': '/mnt/code/data/apple-10K-20230930.pdf', 'page': 32, 'start_index': 819}


### Create embeddings to embed queries using Domino AI Gateway Endpoint in LangChain

In [10]:
# Deployment target is read from the environment; the endpoint name is fixed here.
gateway_uri = os.environ["DOMINO_MLFLOW_DEPLOYMENTS"]
embedding_endpoint = "embedding-ada-002ja2"

# Embedding client used for both the upsert and the query below.
embeddings = MlflowEmbeddings(target_uri=gateway_uri, endpoint=embedding_endpoint)

### Initialize Pinecone vector store using a Domino-specific Environment

In [13]:
#Domino Vector Data Source name
datasource_name = "mrag-fin-docs-ja"

#Domino Vector Data Source Configuration 
pc = Pinecone(**domino_pinecone3x_init_params(datasource_name))
index = pc.Index(**domino_pinecone3x_index_params(datasource_name, index_name))

### Initialize previously created Pinecone Index

In [12]:
# Name of the existing Pinecone index to attach to.
index_name = "mrag-fin-docs"

# Vector store wrapper over that index, using the embedding client created earlier.
vectorstore = PineconeVectorStore(
    embedding=embeddings,
    index_name=index_name,
)

### Check index current stats as a simple checkpoint 

You'll see that the index has a `total_vector_count`. This shows the number of vectors currently present in the index.

In [14]:
# Fetch the index statistics and display them as the cell's output.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Embed and then upsert each chunk into the Pinecone database using the previously created index

In [19]:
# Embed every chunk text and write the vectors to the named index;
# the returned store is what the similarity search queries later.
docsearch = PineconeVectorStore.from_texts(
    texts,
    embedding=embeddings,
    index_name=index_name,
)

### Test with similarity search 

Run a test query through similarity search to retrieve the closest-matching chunks. These matches provide the context and information used to answer the question.

In [20]:
# Question to test retrieval with.
query = "How did the Americas do in net sales in FY23?"

# Retrieve the stored chunks most similar to the question.
docs = docsearch.similarity_search(query=query)

### Examine the similarity search results

In [21]:
# Dump the raw list of matched documents as plain text.
rendered_docs = str(docs)
print(rendered_docs)

[Document(page_content='Segment Operating Performance\nThe following table shows net sales by reportable segment for 2023, 2022 and 2021 (dollars in millions):\n2023 Change 2022 Change 2021\nNet sales by reportable segment:\nAmericas $ 162,560 (4)%$ 169,658 11 %$ 153,306 \nEurope 94,294 (1)% 95,118 7 % 89,307 \nGreater China 72,559 (2)% 74,200 9 % 68,366 \nJapan 24,257 (7)% 25,977 (9)% 28,482 \nRest of Asia Pacific 29,615 1 % 29,375 11 % 26,356 \nTotal net sales $ 383,285 (3)%$ 394,328 8 %$ 365,817 \nAmericas\nAmericas net sales decreased 4% or $7.1 billion  during 2023 compared to 2022 due to lower net sales of iPhone and Mac, partially offset by higher net sales of\nServices.\nEurope\nEurope net sales decreased 1% or $824 million during 2023 compared to 2022. The weakness in foreign currencies relative to the U.S. dollar accounted for\nmore than the entire year-over-year decrease in Europe net sales, which consisted primarily of lower net sales of Mac and Wearables, Home and Accessor