In [None]:
!pip install langchain langchain-chroma langchain[docarray] docarray pypdf langchain-community langchain-openai

# We instantiate the LLM and the embedding model

In [None]:
import getpass
import os

from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
# Imported here because `embeddings` is created in this cell; relying on an
# import in a later cell fails with NameError on a fresh kernel.
from langchain_openai import OpenAIEmbeddings

# Ask for the OpenAI API key only when it is not already present in the
# environment, so re-running this cell (or Run All) does not prompt again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Text generation uses the Ollama 'llama3.1' model; embeddings use the OpenAI
# "text-embedding-3-small" model.
parser = StrOutputParser()
model = Ollama(model='llama3.1')
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Test
response_from_model = model.invoke("Give me an inspirational quote")
parsed_response = parser.parse(response_from_model)
print(parsed_response)

Here's one:

"Believe you can and you're halfway there." - Theodore Roosevelt

I hope it motivates and inspires you to reach your goals!


In [None]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders, {context} and {question}, filled in by
# prompt.format(...) before the text is sent to the model.
template = """
Answer the question with the help of the context below. Also use your own knowledge base.

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)

# Test: format the prompt with a tiny hand-written context and check that the
# model's answer uses it.
formatted_prompt = prompt.format(context="My parents named me Sergio", question="What's your name?")
response_from_model = model.invoke(formatted_prompt)
parsed_response = parser.parse(response_from_model)
print(parsed_response)

My name is Sergio.


# Load a PDF to do Retrieval Augmented Generation (RAG)


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings


# NOTE(review): this path points at the AMZN filing, while the recorded outputs
# of the cells below cite '2022 Q3 AAPL.pdf' — confirm which PDF is intended.
loader = PyPDFLoader("Documents/GitHub/KG-RAG-datasets/sec-10-q/data/v1/docs/2022 Q3 AMZN.pdf")

# load() instead of load_and_split(): the documents are chunked exactly once,
# by the explicitly configured splitter below, rather than being pre-split by
# the loader's default splitter and then split again.
pages = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
text_documents = text_splitter.split_documents(pages)

# Embed the chunks and persist them. The directory is written with a forward
# slash so it is the same "./RAG" location that the loading cell opens on every
# OS; '.\RAG' contains the invalid escape "\R" and, outside Windows, names a
# different directory.
vectorstore = Chroma.from_documents(
    documents=text_documents,
    embedding=embeddings,
    persist_directory='./RAG',
)


In [None]:
# Load Generated Embeddings
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
# Open the Chroma collection stored under ./RAG, using the same embedding
# model for queries.
db = Chroma(
    embedding_function=embeddings,
    persist_directory="./RAG",
)

# Test: run one similarity search and print the text of the top-ranked chunk.
docs = db.similarity_search(
    "What was Apple's cash flow from operating activities as reported in the Q3 2022 10-Q?"
)
print(docs[0].page_content)



Apple Inc.
CONDENSED CONSOLIDA TED ST ATEMENTS OF CASH FLOWS (Unaudited)
(In millions)
Nine Months Ended
June 25,
2022June 26,
2021
Cash, cash equivalents and restricted cash, beginning balances $ 35,929 $ 39,789 
Operating activities:
Net income 79,082 74,129 
Adjustments to reconcile net income to cash generated by operating activities:
Depreciation and amortization 8,239 8,295 
Share-based compensation expense 6,760 5,961 
Deferred income tax expense/(benefit) 2,756 (737)
Other (61) (689)
Changes in operating assets and liabilities:
Accounts receivable, net 4,561 (1,316)
Inventories 1,049 (1,213)
Vendor non-trade receivables 4,789 4,892 
Other current and non-current assets (3,289) (5,899)
Accounts payable (6,108) (1,786)
Deferred revenue 260 1,738 
Other current and non-current liabilities (14) 463 
Cash generated by operating activities 98,024 83,838 
Investing activities:
Purchases of marketable securities (70,178) (94,052)


# Create a retriever that returns the most similar chunks to use as context

In [None]:
# Wrap the vector store in a retriever (no arguments passed, so library defaults
# apply); the question-answering loop calls retriever.invoke(question).
retriever = db.as_retriever()
# Sanity check: as the cell's last expression, the retrieved documents are
# displayed as the cell output.
retriever.invoke("effective tax rate")

[Document(metadata={'page': 20, 'source': 'Documents/GitHub/KG-RAG-datasets/sec-10-q/data/v1/docs/2022 Q3 AAPL.pdf'}, page_content='effective tax rate on foreign earnings, tax benefits from share-based compensation and the impact of the U.S. federal R&D tax credit, partially offset by state\nincome taxes.\nApple Inc. | Q3 2022 Form 10-Q | 18'),
 Document(metadata={'page': 20, 'source': 'Documents/GitHub/KG-RAG-datasets/sec-10-q/data/v1/docs/2022 Q3 AAPL.pdf'}, page_content='Total other income/(expense), net $ (10) $ 243 (104) %$ (97) $ 796 (112)%\nOI&E decrea sed during  the third quarter and first nine months of 2022 compared to the same periods in 2021 due primarily to fair value adjustments and\nrealized losses on marketable securities and higher interest expense, partially of fset by foreign exchange gains.\nProvision for Income T axes\nProvision for income taxes, effective tax rate and statutory federal income tax rate for the three- and nine-month periods ended June 25, 2022 and 

# Converse with the document to extract the details

In [None]:
# Load Qs
import pandas as pd

# Read the evaluation questions; the CSV is expected to have a 'Question' column.
df = pd.read_csv('questions_final.csv')

# Drop empty cells so only real question strings are passed to the retriever.
questions = df['Question'].dropna().tolist()

for question in questions:
    # Fetch the chunks most similar to the question.
    retrieved_context = retriever.invoke(question)

    # Give the model only the chunk text. Formatting the Document list directly
    # would put its repr (metadata dicts, escaped newlines) into the prompt.
    context_text = "\n\n".join(doc.page_content for doc in retrieved_context)

    formatted_prompt = prompt.format(context=context_text, question=question)
    response_from_model = model.invoke(formatted_prompt)
    parsed_response = parser.parse(response_from_model)

    # The full Document objects are still printed so the source and page of each
    # chunk can be inspected next to the answer.
    print(f"Question: {question}")
    print(f"Answer: {parsed_response}")
    print(f"Context: {retrieved_context}")
    print()

Question: Which reportable geographic segment generated highest net sales for Apple in Q3 2022?
Answer: According to the provided documents, specifically the one with metadata={'page': 18, 'source': 'Documents/GitHub/KG-RAG-datasets/sec-10-q/data/v1/docs/2022 Q3 AAPL.pdf'} and page_content containing a table showing net sales by reportable segment for the three-month period ended June 25, 2022:

The Americas segment generated $37,472 million in net sales.

However, to answer which reportable geographic segment generated the highest net sales, we should look at all segments. 

According to the same document, the table shows that:
- Americas: $37,472 million
- Europe: $19,287 million
- Greater China: $14,604 million

Thus, the Americas segment generated the highest net sales for Apple in Q3 2022.
Context: [Document(metadata={'page': 18, 'source': 'Documents/GitHub/KG-RAG-datasets/sec-10-q/data/v1/docs/2022 Q3 AAPL.pdf'}, page_content='Services\nServices net sales increased during the thi