In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter


# Load Documents

In [6]:
loader = PyPDFLoader("../data/FY2223-Q3-JFM-8-K-Final.pdf")

# Load the document by calling loader.load()
pages = loader.load()

print(len(pages))
print(pages[0].page_content[0:500])

print(pages[0].metadata)

15
News Release The Procter & Gamble Company
One P&G Plaza
Cincinnati, OH 45202
P&G ANNOUNCES FISCAL YEAR 2023  THIRD QUARTER  RESULTS
Net Sales +4%; Organic Sales +7%
Diluted EPS and Core EPS $1.37, each +3%
RAISES SALES GROWTH AND CASH RETURN GUIDANCE
MAINTAINS FISCAL YEAR EPS GROWTH GUIDANCE RANGE
 
CINCINNATI, April 21, 2023  - The Procter & Gamble Company (NYSE:PG) reported third 
quarter  fiscal year 2023 net sales of $20.1 billion, an increase of four percent versus the prior year. 
Organic 
{'source': '../data/FY2223-Q3-JFM-8-K-Final.pdf', 'page': 0}


# Chunk Documents

In [14]:
# Define the Text Splitter 
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

#Create a split of the document using the text splitter
splits = text_splitter.split_documents(pages)

In [16]:
len(splits)

27

In [17]:
splits[1]

Document(page_content='Third Quarter ($ billions, except EPS)\nGAAP 2023 2022 % Change Non-GAAP* 2023 2022 % Change\nNet Sales 20.1 19.4 4% Organic Sales n/a n/a 7%\nDiluted EPS 1.37 1.33 3% Core EPS 1.37 1.33 3%\n*Please refer to Exhibit 1 - Non-GAAP Measures for the definition and reconciliation of these measures to the related GAAP measures. \n“We delivered strong results in the third quarter of fiscal year 2023 in what continues to be a very \ndifficult cost and operating environment,” said Jon Moeller, Chairman of the Board, President and Chief \nExecutive Officer. “Our team’s strong execution of our strategies and our progress through three quarters \nenable us to raise our fiscal year outlook for sales growth and cash return to shareowners and maintain our \nguidance range for EPS growth despite continued cost and foreign exchange headwinds. We remain \ncommitted to our integrated strategies of a focused product portfolio of daily use categories where \nperformance drives brand 

# Embedding

In [18]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [19]:
embedding = OpenAIEmbeddings()

ValidationError: 1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(splits, embeddings)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)a8e1d/.gitattributes: 100%|███████████████| 1.18k/1.18k [00:00<00:00, 1.18MB/s]
Downloading (…)_Pooling/config.json: 100%|███████████████████| 190/190 [00:00<00:00, 1.57MB/s]
Downloading (…)b20bca8e1d/README.md: 100%|███████████████| 10.6k/10.6k [00:00<00:00, 3.09MB/s]
Downloading (…)0bca8e1d/config.json: 100%|███████████████████| 571/571 [00:00<00:00, 1.51MB/s]
Downloading (…)ce_transformers.json: 100%|████████████████████| 116/116 [00:00<00:00, 407kB/s]
Downloading (…)e1d/data_config.json: 100%|███████████████| 39.3k/39.3k [00:00<00:00, 74.1MB/s]
Downloading pytorch_model.bin: 100%|███████████████████████| 438M/438M [00:13<00:00, 33.2MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████████████| 53.0/53.0 [00:00<00:00, 298kB/s]
Downloading (…)cial_tokens_map.json: 100%|████████████████████| 239/239 [00:00<00:00, 272kB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|█████████████████| 466k/466k [00:00<00:00, 13.1MB/s]


AssertionError: Torch not compiled with CUDA enabled

# Vector Store

In [None]:
from langchain.vectorstores import Chroma

# Retrieval

In [None]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)