# 02-rag.ipynb

In [None]:
from dotenv import load_dotenv

# Load the environment variables defined in the .env file
load_dotenv()

True

In [None]:
# %pip install -q pypdf

In [None]:
# 1. Document Load (PDF)
# Source document: a PDF file
# Path notation: `~` = home directory / `..` = parent folder / `.` = current folder
from langchain_community.document_loaders import PyPDFLoader
file_path = './nke-10k-2023.pdf'
# ^ location of the file to load
# Loader that converts the PDF
loader = PyPDFLoader(file_path)
# The loader converts the PDF into objects Python can work with
# (one PDF page -> one Document)
docs = loader.load()
print(len(docs)) # prints the page count of the original PDF

107


In [15]:
# 2. Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap control the size of each chunk and how much text
# neighbouring chunks share; add_start_index=True stores a `start_index`
# entry in each chunk's metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, chunk_overlap=200 , add_start_index = True
)

# Split the loaded documents into chunks
chunks = text_splitter.split_documents(docs)
print(len(chunks))


516


In [18]:
# 3. Embedding (turn text into numeric vectors)
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

print(len(chunks))  # total number of chunks
print(chunks[0])
v1 = embeddings.embed_query(chunks[0].page_content)  # chunk 1 -> vector
v2 = embeddings.embed_query(chunks[1].page_content)  # chunk 2 -> vector

# Show the vector dimension, then verify that both vectors have the same
# dimension. The previous form `print(len(v1)) == len(v2)` printed len(v1)
# and then compared print()'s return value (None) with len(v2), throwing the
# result away, so the intended check never actually ran.
print(len(v1))
assert len(v1) == len(v2), "embedding dimensions differ between chunks"
print(v1[:10])  # eyeball the first few vector components

516
page_content='Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock NKE New York Stock Exchange
(Title of each class) (Trading symbol) (Name of each exchange on which registered)' metadata={

## 저장 파트

In [21]:
# 4. Store the chunks in a vector store
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store, intended for testing/development
vector_store = InMemoryVectorStore(embeddings)

# Add the chunks split from the PDF to the vector store; keep the returned ids
ids = vector_store.add_documents(documents=chunks)

In [22]:
# Number of ids returned by add_documents
len(ids)

516

## 검색 파트

In [24]:
# Use the vector store as a retriever
retriever = vector_store.as_retriever(
    search_type = 'similarity',     # search method: similarity
    search_kwargs={'k':3}           # number of results: 3
)

# Run a search (the query string is in Korean)
retriever.invoke('나이키의 미국 영업점 개수?')

[Document(id='796832b4-c504-4a2b-ba9c-739c95e18c5f', metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 4, 'page_label': '5', 'start_index': 3125}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further informati

## PDF RAG를 Agent 에 통합

In [30]:
# Wrap the search step in a plain function so it can be used as a Tool

# Takes the search text (query) as its only argument
def search_vectorstore(query: str) -> str:
    """Retrieve info to help answer a query about Nike"""
    # NOTE(review): this function is handed to create_agent as a tool, so the
    # docstring above is presumably what the model sees as the tool
    # description - confirm before rewording it.
    # Query the vector store directly instead of going through the retriever
    # (only the 2 closest chunks are fetched).
    hits = vector_store.similarity_search(query, k=2)
    # Each chunk's text is followed by a blank line, including the last one.
    return ''.join(hit.page_content + '\n\n' for hit in hits)

In [31]:
from langchain.agents import create_agent
# System prompt (runtime text, kept in Korean). It tells the model that it can
# use a tool that searches the 2023 Nike 10-K report, to use it when needed to
# answer the user, and to answer like an economic-analysis expert.
prompt = """너는 2023 나이키 10k 보고서를 검색하는 도구를 다룰 수 있어.
사용자 질문에 답변하기 위해 필요하면 사용해. 경제분석 전문가처럼 답변해."""

# Build the agent with search_vectorstore as its only tool
agent = create_agent(
    model = "openai:gpt-4.1-mini",
    tools = [search_vectorstore],
    system_prompt = prompt
)

In [32]:
# User question (runtime text, kept in Korean): asks for the number of Nike
# stores and the average revenue per store.
content = "나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함."

# Send the question to the agent as a single user message
agent.invoke(
    {
        "messages": [
            {"role": "user", "content": content}
        ]
    }
)

{'messages': [HumanMessage(content='나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함.', additional_kwargs={}, response_metadata={}, id='8a124c27-5833-4157-a3d1-264478310a0a'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 31, 'prompt_tokens': 116, 'total_tokens': 147, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_a391f2cee0', 'id': 'chatcmpl-DCfyOsujTrWVezSgkl2ezkIlHonue', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c8e48-a73b-7d21-909b-41d42f7c527b-0', tool_calls=[{'name': 'search_vectorstore', 'args': {'query': '나이키 영업점 숫자, 각 영업점 평균 매출액'}, 'id': 'call_ujaVAIUsesvbjUgjaxwWXiDV', 'type': 'tool_call'}], invalid_tool_calls=[], usage_metada

In [34]:
# Call the tool function directly to inspect the text it returns for a query
print(search_vectorstore('나이키 영업점 개수'))

direct to consumer operations sell products through the following number of retail stores in the United States:
U.S. RETAIL STORES NUMBER
NIKE Brand factory stores 213 
NIKE Brand in-line stores (including employee-only stores) 74 
Converse stores (including factory stores) 82 
TOTAL 369 
In the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.
2023 FORM 10-K 2

Table of Contents
ITEM 1B. UNRESOLVED STAFF COMMENTS
None.
ITEM 2. PROPERTIES
The following is a summary of principal properties owned or leased by NIKE:
The NIKE World Campus, owned by NIKE and located near Beaverton, Oregon, USA, is an approximately 400-acre site consisting of over 40 buildings which, together
with adjacent leased properties, functions as our world headquarters and is occupied by approximately 11,400 employees engaged in management, research, design,
development, marketing, finance and other administrative functions serving nearly all of our s

## Web문서(HTML) RAG + Agent

In [39]:
# HTML contains a lot of content besides the document body, so it needs preprocessing.
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Preprocessing: strainer restricted to the listed CSS class names
bs4_strainer = bs4.SoupStrainer(class_= ('post-title', 'post-header', 'post-content'))
# NOTE: `loader` and `docs` are rebound here, replacing the PDF loader and
# PDF documents created earlier in the notebook.
loader = WebBaseLoader(
    web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
    bs_kwargs={'parse_only': bs4_strainer}, # pass the strainer to the parser
)
docs = loader.load()
# Number of loaded documents and character count of the first one
print(len(docs), len(docs[0].page_content))

1 43047


In [41]:
# Split
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

# Split the loaded web document into chunks
chunks = text_splitter.split_documents(docs)
print(len(chunks))

# Embedding
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

print(len(chunks))  # total number of chunks (same value as printed above)
v1 = embeddings.embed_query(chunks[0].page_content)  # chunk 1 -> vector
v2 = embeddings.embed_query(chunks[1].page_content)  # chunk 2 -> vector

# Show the vector dimension, then verify that both vectors have the same
# dimension. The previous form `print(len(v1)) == len(v2)` printed len(v1)
# and then compared print()'s return value (None) with len(v2), throwing the
# result away, so the intended check never actually ran.
print(len(v1))
assert len(v1) == len(v2), "embedding dimensions differ between chunks"
print(v1[:10])  # eyeball the first few vector components

# Store
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store, intended for testing/development.
# NOTE: this rebinds `vector_store`, replacing the store built from the PDF.
vector_store = InMemoryVectorStore(embeddings)

# Add the chunks split from the web document to the vector store
ids = vector_store.add_documents(documents=chunks)

# Use the vector store as a retriever
retriever = vector_store.as_retriever(
    search_type='similarity',    # search method: similarity
    search_kwargs={'k': 3},      # number of results: 3
)

# Run a search. This statement is not the last one in its cell, so a bare
# expression would be evaluated and silently dropped; keep the result in a
# name so it can be inspected afterwards.
retrieved_docs = retriever.invoke('Agent 구축 시 유의할 점?')

# Integrate with the Agent
# Wrap the search step in a plain function so it can be used as a Tool

# Takes the search text (query) as its only argument
def search_vectorstore(query: str) -> str:
    """Retrieve info to help answer a query about Agent"""
    # NOTE(review): this function is handed to create_agent as a tool, so the
    # docstring above is presumably what the model sees as the tool
    # description - confirm before rewording it.
    # Query the vector store directly instead of going through the retriever
    # (only the 2 closest chunks are fetched).
    hits = vector_store.similarity_search(query, k=2)
    # Each chunk's text is followed by a blank line, including the last one.
    return ''.join(hit.page_content + '\n\n' for hit in hits)

from langchain.agents import create_agent
# System prompt (runtime text, kept in Korean). It tells the model that it can
# use a tool that searches a document explaining how to build agents, to use
# it when needed to answer the user, and to answer like an AI expert.
prompt = """너는 에이전트를 어떻게 구축하면 좋을지에 대해 설명한 문서를 검색하는 도구를 다룰 수 있어.
사용자 질문에 답변하기 위해 필요하면 사용해. AI 분야 전문가처럼 답변해."""

# Build the agent with search_vectorstore as its only tool
agent = create_agent(
    model = "openai:gpt-4.1-mini",
    tools = [search_vectorstore],
    system_prompt = prompt
)

# User question (runtime text, kept in Korean): asks what to watch out for
# when building an agent.
content = "Agent 구축 시 유의할 점이 궁금함."

# Send the question to the agent as a single user message
agent.invoke(
    {
        "messages": [
            {"role": "user", "content": content}
        ]
    }
)

63
63
1536
[0.009648381732404232, 0.020312383770942688, 0.041626472026109695, -0.0041494304314255714, 0.00354423257522285, 0.0019164594123139977, -0.014427357353270054, 0.03675706684589386, -0.019032424315810204, 0.058878086507320404]


{'messages': [HumanMessage(content='Agent 구축 시 유의할 점이 궁금함.', additional_kwargs={}, response_metadata={}, id='09a45d1c-2f8b-4bc0-9c13-bab2e89ee0b1'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 110, 'total_tokens': 131, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_3732175f03', 'id': 'chatcmpl-DCgmlIa8QicZkn1FwgPYZPAuvyYZD', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c8e78-4da8-7ea1-bfe4-6cc31b988376-0', tool_calls=[{'name': 'search_vectorstore', 'args': {'query': 'Agent 구축 시 유의할 점'}, 'id': 'call_IumGlvfxq3oCUEVZTEYRcdPv', 'type': 'tool_call'}], invalid_tool_calls=[], usage_metadata={'input_token