In [3]:
%pip install -qU langchain_community beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Install/upgrade (-U) langchain-core and the Upstage integration, quietly (-q).
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases.
%pip install -qU langchain-core langchain-upstage

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Install the main langchain package (imported below for `hub` and the text splitter).
# NOTE(review): no version is pinned here.
%pip install langchain

In [19]:
# Install/upgrade (-U) the Pinecone vector-store integration and pinecone-notebooks, quietly (-q).
%pip install -qU langchain-pinecone pinecone-notebooks

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Install langchain-teddynote.
# NOTE(review): nothing from this package is imported in the cells visible here — confirm it is still needed.
%pip install langchain-teddynote

In [None]:
# Upgrade the Upstage integration.
# NOTE(review): an earlier install cell already runs `-qU ... langchain-upstage`; this looks redundant — confirm.
%pip install --upgrade langchain_upstage

1. 필요한 URL Load하기

In [1]:
import hashlib
import os

import bs4
from dotenv import load_dotenv
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_pinecone import PineconeVectorStore
from langchain_upstage import ChatUpstage
from langchain_upstage import UpstageEmbeddings

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Pages to fetch and index.
urls = [
    "https://cse.knu.ac.kr/bbs/board.php?bo_table=sub5_1&wr_id=28223",
    "https://cse.knu.ac.kr/bbs/board.php?bo_table=sub5_1&wr_id=28221",
]

# Parse only <div> elements whose id is "bo_v_con" or "bo_v_title"; the rest
# of each page is ignored. (Presumably the post body and title — confirm
# against the page markup.)
post_only = bs4.SoupStrainer("div", attrs={"id": ["bo_v_con", "bo_v_title"]})

# Accumulates the documents loaded from every URL.
all_docs = []
for url in urls:
    loader = WebBaseLoader(web_paths=(url,), bs_kwargs={"parse_only": post_only})
    docs = loader.load()
    print(f"URL: {url} - 문서의 수: {len(docs)}")
    all_docs.extend(docs)

URL: https://cse.knu.ac.kr/bbs/board.php?bo_table=sub5_1&wr_id=28223 - 문서의 수: 1
URL: https://cse.knu.ac.kr/bbs/board.php?bo_table=sub5_1&wr_id=28221 - 문서의 수: 1


2. Split하기

In [3]:
# Chunking parameters passed to the splitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split everything collected in `all_docs`, not only the `docs` left over from
# the last loop iteration.
splits = text_splitter.split_documents(all_docs)
len(splits)

6

3. Embedding하고 Vector DB에 저장하기

In [4]:
# Embedding model handed to the vector store when the chunks are indexed.
EMBEDDING_MODEL = "solar-embedding-1-large"
embeddings = UpstageEmbeddings(model=EMBEDDING_MODEL)

In [5]:
index_name = 'csechat'

# Fail fast with a clear message instead of letting the Pinecone client fail
# later with a less obvious authentication error.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to .env or the environment before indexing."
    )

# Deterministic vector ids derived from source URL, chunk position and chunk
# text. Re-running this cell then upserts onto the same ids instead of adding
# a fresh copy of every chunk to the index on each run.
chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('source', '')}\n{position}\n{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(splits)
]

# Embed the chunks and upsert them into the existing Pinecone index.
database = PineconeVectorStore.from_documents(
    splits, embeddings, index_name=index_name, ids=chunk_ids
)

4. Prompt

In [6]:
# Pin the hub prompt to the commit recorded in this notebook's saved output
# (lc_hub_commit_hash 50442af1...), so later edits to rlm/rag-prompt upstream
# cannot silently change what the chain sends to the model.
prompt = hub.pull("rlm/rag-prompt:50442af1")
prompt

  prompt = loads(json.dumps(prompt_object.manifest))


ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [7]:
# Chat model that generates the final answer; constructed with library defaults
# (no model name or sampling parameters are given here).
# NOTE(review): presumably picks up its API key from the environment loaded by load_dotenv() — confirm.
llm = ChatUpstage()

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by a blank
        line. An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

# Retriever view over the vector store, using its default search settings.
retriever = database.as_retriever()

# Fan the incoming question out into the two variables the prompt expects:
# "context" is the retrieved documents joined by format_docs, "question" is
# the input passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# question -> prompt inputs -> filled prompt -> chat model -> plain string
qa_chain = rag_inputs | prompt | llm | StrOutputParser()

In [13]:
# Question (Korean): "When is the TOPCIT exam?"
qa_chain.invoke("탑싯 언제 시험인가요?")

'탑싯(TOPCIT) 시험은 2024년 10월 12일(토) 09:30 ~ 12:00에 진행됩니다.'

In [14]:
# Question (Korean): "What happens if I register for TOPCIT but do not take it?"
qa_chain.invoke("TOPCIT 신청하고 안치면 어떻게 되나요?")

'TOPCIT 신청을 하고 응시하지 않으면 SW 마일리지가 10점 감점됩니다.'

In [15]:
# Question (Korean): "Is anything provided when I register for TOPCIT?"
qa_chain.invoke("TOPCIT 신청하면 해주는 것이 있나요?")

'TOPCIT에 신청하면 응시료 전액 지원과 SW마일리지 20점을 지급해줍니다.'

In [11]:
# Question (Korean): "Up to which round has TUTOR recruitment been completed so far?"
qa_chain.invoke("현재 TUTOR 모집은 몇 차까지 완료되었나요?")

'현재 2024학년도 2학기 학부생 TUTOR 모집은 5차 모집까지 완료되었습니다.'

In [17]:
# Question (Korean): "Who is Cho Hyun-jun?"
# NOTE(review): appears to be an out-of-scope probe — the recorded answer says
# the information is not in the provided context.
qa_chain.invoke("조현준은 누구인가요?")

'조현준이 누구인지에 대한 정보는 주어진 문맥에서 찾을 수 없습니다.'