In [1]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Keep the notebook's own variable name first, but also accept OPENAI_API_KEY,
# the name the OpenAI SDK looks for, so either spelling in .env works.
OPEN_API_KEY = os.getenv("OPEN_API_KEY") or os.getenv("OPENAI_API_KEY")

# Fail here with an actionable message instead of passing None further down
# and hitting a less obvious error when a client is constructed.
if not OPEN_API_KEY:
    raise RuntimeError(
        "No OpenAI API key found: set OPEN_API_KEY (or OPENAI_API_KEY) in the "
        "environment or in a .env file before running this notebook."
    )

In [2]:
from openai import OpenAI
# Direct OpenAI SDK client authenticated with OPEN_API_KEY. None of the cells
# visible in this notebook reference `client` afterwards.
client = OpenAI(api_key=OPEN_API_KEY)

In [23]:
import requests

from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from IPython.display import display, Markdown

In [7]:
# Download location of the sample document (a Nike Form 10-K PDF kept in the
# LangChain repository's example data).
url = (
    "https://raw.githubusercontent.com/langchain-ai/langchain/master"
    "/docs/docs/example_data/nke-10k-2023.pdf"
)


In [5]:
# Keep downloaded data in a "data" folder under the directory the notebook is
# run from, rather than an absolute path that exists on only one machine.
folder_path = os.path.join(os.getcwd(), "data")
os.makedirs(folder_path, exist_ok=True)  # no error if the folder already exists

# Local destination for the downloaded PDF.
file_path = os.path.join(folder_path, "nke-10k-2023.pdf")

In [8]:
# Download the PDF. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() aborts on an HTTP error so that an
# error page is never written to disk under the .pdf name.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Write the raw response bytes to the local file.
with open(file_path, "wb") as f:
    f.write(response.content)

In [9]:
# Read the downloaded PDF with PyPDFLoader. `documents` holds whatever load()
# returns; later cells index it and read `.page_content` from its items.
loader = PyPDFLoader(file_path)
documents = loader.load()

In [None]:
# Sanity check: print the extracted text of the first loaded document.
print(documents[0].page_content)

In [11]:
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [12]:
# Split the loaded documents with the configured splitter; the resulting chunks
# are what gets embedded and indexed further down.
text_chunks = text_splitter.split_documents(documents)

In [17]:
# Render the first chunk's text as Markdown to eyeball the split result.
first_chunk_text = text_chunks[0].page_content
display(Markdown(first_chunk_text))


Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)

In [None]:
# Identifier of the embedding model handed to HuggingFaceEmbeddings.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [32]:
# Build the FAISS vector store from the text chunks using the embedding model.
vector_store = FAISS.from_documents(text_chunks, embeddings)

In [33]:
# Expose the vector store through its retriever interface (no arguments are
# passed, so library defaults apply); the RAG chain uses this for context.
retriever = vector_store.as_retriever()

In [36]:
from langchain.prompts import ChatPromptTemplate

In [35]:
template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use ten sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

In [37]:
# Turn the template string into a chat prompt object that the chain can
# fill with the question and the retrieved context.
prompt = ChatPromptTemplate.from_template(template)

In [38]:
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [42]:
# Chat model that writes the answer, authenticated with OPEN_API_KEY.
llm_model = ChatOpenAI(
    openai_api_key=OPEN_API_KEY,
    model_name="gpt-3.5-turbo",
)

# Final chain stage, applied to the model's output.
output_parser = StrOutputParser()

In [44]:
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents, each exposing ``page_content``.

    Returns:
        The ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Retrieval-augmented pipeline. The input question is sent to the retriever,
# whose documents are flattened to plain text by format_docs, and is also
# passed through unchanged as "question". Without the formatting step the
# {context} slot would receive the Python repr of the document list (metadata,
# wrappers and escaped newlines) rather than the text itself.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm_model
    | output_parser
)

In [None]:
# Run the full chain on a sample question. As the last expression in the cell,
# the returned value is shown as the cell's output.
rag_chain.invoke("What is the business strategy of Nike?")