In [128]:
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from blog_data import WebsiteDataCrawler

load_dotenv()


Retrieving documents from blog website


Fetching pages: 100%|##########| 138/138 [00:22<00:00,  6.25it/s]


In [None]:
blog_loader = WebsiteDataCrawler()
print("Retrieving documents from blog website")
docs = blog_loader.get_all_docs()


In [131]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=int(os.getenv("DOC_CHUNK_SIZE")), chunk_overlap=int(os.getenv("DOC_CHUNK_OVERLAP"))
)
split_doc = text_splitter.split_documents(docs)

In [132]:
print(len(split_doc))

13866


In [133]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")


In [134]:

docsearch = OpenSearchVectorSearch.from_documents(
    split_doc,
    embeddings,
    opensearch_url="https://localhost:9200",
    http_auth=("admin", "Open-search1!"),
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
    bulk_size=20000
)

In [135]:
retriever = docsearch.as_retriever()

In [78]:
from langchain.tools.retriever import create_retriever_tool

retriever_tool = create_retriever_tool(
    retriever,
    "retrieve_blog_posts",
    "Search and return information about blog posts by several companies such as Anthropic on current events and new technologies in LLM field.",
)

tools = [retriever_tool]

## AgenticRAG

In [79]:
from typing import Annotated, Sequence
from typing_extensions import TypedDict

from langchain_core.messages import BaseMessage

from langgraph.graph.message import add_messages


class AgentState(TypedDict):
    # The add_messages function defines how an update should be processed
    # Default is to replace. add_messages says "append"
    messages: Annotated[Sequence[BaseMessage], add_messages]

In [80]:
from typing import Annotated, Literal, Sequence
from typing_extensions import TypedDict

from langchain import hub
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

from pydantic import BaseModel, Field


from langgraph.prebuilt import tools_condition

### Edges


def grade_documents(state) -> Literal["generate", "rewrite"]:
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (messages): The current state

    Returns:
        str: A decision for whether the documents are relevant or not
    """

    print("---CHECK RELEVANCE---")

    # Data model
    class grade(BaseModel):
        """Binary score for relevance check."""

        binary_score: str = Field(description="Relevance score 'yes' or 'no'")

    # LLM
    model = ChatOpenAI(temperature=0, model="gpt-4o", streaming=True)

    # LLM with tool and validation
    llm_with_tool = model.with_structured_output(grade)

    # Prompt
    prompt = PromptTemplate(
        template="""You are a grader assessing relevance of a retrieved document to a user question. \n 
        Here is the retrieved document: \n\n {context} \n\n
        Here is the user question: {question} \n
        If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
        Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""",
        input_variables=["context", "question"],
    )

    # Chain
    chain = prompt | llm_with_tool

    messages = state["messages"]
    last_message = messages[-1]

    question = messages[0].content
    docs = last_message.content

    scored_result = chain.invoke({"question": question, "context": docs})

    score = scored_result.binary_score

    if score == "yes":
        print("---DECISION: DOCS RELEVANT---")
        return "generate"

    else:
        print("---DECISION: DOCS NOT RELEVANT---")
        print(score)
        return "rewrite"


### Nodes


def agent(state):
    """
    Invokes the agent model to generate a response based on the current state. Given
    the question, it will decide to retrieve using the retriever tool, or simply end.

    Args:
        state (messages): The current state

    Returns:
        dict: The updated state with the agent response appended to messages
    """
    print("---CALL AGENT---")
    messages = state["messages"]
    print(messages)
    model = ChatOpenAI(temperature=0, streaming=True, model="gpt-4-turbo")
    model = model.bind_tools(tools)
    response = model.invoke(messages)
    # We return a list, because this will get added to the existing list
    return {"messages": [response]}


def rewrite(state):
    """
    Transform the query to produce a better question.

    Args:
        state (messages): The current state

    Returns:
        dict: The updated state with re-phrased question
    """

    print("---TRANSFORM QUERY---")
    messages = state["messages"]
    question = messages[0].content

    msg = [
        HumanMessage(
            content=f""" \n 
    Look at the input and try to reason about the underlying semantic intent / meaning. \n 
    Here is the initial question:
    \n ------- \n
    {question} 
    \n ------- \n
    Formulate an improved question: """,
        )
    ]

    # Grader
    model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True)
    response = model.invoke(msg)
    return {"messages": [response]}


def generate(state):
    """
    Generate answer

    Args:
        state (messages): The current state

    Returns:
         dict: The updated state with re-phrased question
    """
    print("---GENERATE---")
    messages = state["messages"]
    question = messages[0].content
    last_message = messages[-1]

    docs = last_message.content

    # Prompt
    prompt = hub.pull("rlm/rag-prompt")

    # LLM
    llm = ChatOpenAI(model_name="gpt-4-turbo", temperature=0, streaming=True)

    # Post-processing
    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # Chain
    rag_chain = prompt | llm | StrOutputParser()

    # Run
    response = rag_chain.invoke({"context": docs, "question": question})
    return {"messages": [response]}


print("*" * 20 + "Prompt[rlm/rag-prompt]" + "*" * 20)
prompt = hub.pull("rlm/rag-prompt").pretty_print()  # Show what the prompt looks like

********************Prompt[rlm/rag-prompt]********************

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: [33;1m[1;3m{question}[0m 
Context: [33;1m[1;3m{context}[0m 
Answer:


In [81]:
from langgraph.graph import END, StateGraph, START
from langgraph.prebuilt import ToolNode

# Define a new graph
workflow = StateGraph(AgentState)

# Define the nodes we will cycle between
workflow.add_node("agent", agent)  # agent
retrieve = ToolNode([retriever_tool])
workflow.add_node("retrieve", retrieve)  # retrieval
workflow.add_node("rewrite", rewrite)  # Re-writing the question
workflow.add_node(
    "generate", generate
)  # Generating a response after we know the documents are relevant
# Call agent node to decide to retrieve or not
workflow.add_edge(START, "agent")

# Decide whether to retrieve
workflow.add_conditional_edges(
    "agent",
    # Assess agent decision
    tools_condition,
    {
        # Translate the condition outputs to nodes in our graph
        "tools": "retrieve",
        END: END,
    },
)

# Edges taken after the `action` node is called.
workflow.add_conditional_edges(
    "retrieve",
    # Assess agent decision
    grade_documents,
)
workflow.add_edge("generate", END)
workflow.add_edge("rewrite", "agent")

# Compile
graph = workflow.compile()

In [99]:
import pprint

inputs = {
    "messages": [
        ("user", "Anthropic에서 최근 LLM과 관련해서 어떤 연구를 하고 있는지 알려줘."),
    ]
}
for output in graph.stream(inputs):
    for key, value in output.items():
        pprint.pprint(f"Output from node '{key}':")
        pprint.pprint("---")
        pprint.pprint(value, indent=2, width=80, depth=None)
    pprint.pprint("\n---\n")

---CALL AGENT---
[HumanMessage(content='Anthropic에서 최근 LLM과 관련해서 어떤 연구를 하고 있는지 알려줘.', additional_kwargs={}, response_metadata={}, id='1a1a43b4-af07-4688-bcd4-9552cf55b160')]
"Output from node 'agent':"
'---'
{ 'messages': [ AIMessage(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_O7fyRUDsY46DIEVXQIdSeYiM', 'function': {'arguments': '{"query":"Anthropic recent research on LLM"}', 'name': 'retrieve_blog_posts'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls', 'model_name': 'gpt-4-turbo-2024-04-09', 'system_fingerprint': 'fp_81dd8129df'}, id='run-06916865-d041-4df2-b6ba-b22d52e758e6-0', tool_calls=[{'name': 'retrieve_blog_posts', 'args': {'query': 'Anthropic recent research on LLM'}, 'id': 'call_O7fyRUDsY46DIEVXQIdSeYiM', 'type': 'tool_call'}])]}
'\n---\n'
---CHECK RELEVANCE---
---DECISION: DOCS NOT RELEVANT---
no
"Output from node 'retrieve':"
'---'
{ 'messages': [ ToolMessage(content='<h2 id="21-lora의-아이디어">2.1. LoRA의 아이디어</h2>\n\n<p>미세조정

In [96]:
for sen in output['agent']['messages'][0].content.split('.'):
    print(sen)

NCsoft는 최근에 다음과 같은 기술들을 연구하고 발표했습니다:

1
 **사용자 정의 호출어 인식 기술 (UDKWS)**: 이 기술은 사용자가 자신만의 호출어를 설정할 수 있게 해주는 음성 인식 기술입니다
 이 기술은 디바이스 내부에서 동작하며, 빠른 응답 시간과 가벼운 구조를 목표로 하고 있습니다
 이를 통해 다양한 디바이스에서 제한 없이 사용할 수 있도록 개발되었습니다


이 기술들은 NCsoft가 사용자 경험을 향상시키고, 기술적 한계를 극복하기 위해 지속적으로 연구하고 개발하는 노력의 일환입니다



## CRAG (Corrective RAG)

In [101]:
local_llm = "llama3.2"
model_tested = "llama3.2"

In [109]:
### Retrieval Grader

from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
# from langchain_mistralai.chat_models import ChatMistralAI

# LLM
llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Prompt
prompt = PromptTemplate(
    template="""You are a teacher grading a quiz. You will be given: 
    1/ a QUESTION
    2/ A FACT provided by the student
    
    You are grading RELEVANCE RECALL:
    A score of 1 means that ANY of the statements in the FACT are relevant to the QUESTION. 
    A score of 0 means that NONE of the statements in the FACT are relevant to the QUESTION. 
    1 is the highest (best) score. 0 is the lowest score you can give. 
    
    Explain your reasoning in a step-by-step manner. Ensure your reasoning and conclusion are correct. 
    
    Avoid simply stating the correct answer at the outset.
    
    Question: {question} \n
    Fact: \n\n {documents} \n\n
    
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
    Provide the binary score as a JSON with a single key 'score' and no premable or explanation.
    """,
    input_variables=["question", "documents"],
)

retrieval_grader = prompt | llm | JsonOutputParser()
question = "네이버 최근 LLM 연구"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content
print(doc_txt)
print(retrieval_grader.invoke({"question": question, "documents": doc_txt}))

<p>안녕하세요!
NLP센터 금융언어이해팀 민중현 입니다.
NLP Tech 블로그를 통해 다시 한 번 이야기 나눌 수 있어서 기쁩니다.
오늘은 논문 소개가 아닌, 센터 내에서 진행한 연구를 소개하고자 합니다.</p>

<h3 id="지금은-바야흐로">지금은 바야흐로</h3>

<p>…트랜스포머 기반 LLM의 시대!</p>

<p align="center">
<img src="assets/img/post/195e5588d76145a5becc6052cdbf9cfa5092dcbe/llm1.png" />
<img src="assets/img/post/195e5588d76145a5becc6052cdbf9cfa5092dcbe/llm2.png" />
<img src="assets/img/post/195e5588d76145a5becc6052cdbf9cfa5092dcbe/llm3.png" />
</p>

<p>최근 출시된 LLM들, 그리고 특히 그 중 생성형 모델들은 참 놀랍습니다. 정량적으로 나타나는 번역, 요약등 여러 과제에서의 정량적 성능뿐만 아니라, 사람이 쓴 것과 비슷한 이메일, 게임 스토리, 이력서, 편지, 연설문도 쓸 수 있다고 하지요. 그 뿐만 아니라, 미국 대학수학능력검정시험 (SAT), 변호사 시험, 그리고 의사 면허 시험 등도 통과한다고 전해집니다. 그렇다면, 드디어! 인간과 같은 수준으로 언어를 습득/처리할 수 있는 컴퓨터 시스템 (human-like computational language processing system) 이 만들어 진 걸까요?</p>

<p>Short answer: No</p>
{'score': '1'}


In [110]:
docs

[Document(metadata={'source': 'https://ncsoft.github.io/ncresearch/c8416dcc21d8aad7f0ee65eaa47ad53854578b59', 'title': 'Large Language Model을 밀어서 잠금해제: Parameter-Efficient Fine-Tuning 2 | \u3000', 'description': 'Parameter-Efficient Fine-Tuning (PEFT) 방법들', 'language': 'en'}, page_content='<h2 id="21-lora의-아이디어">2.1. LoRA의 아이디어</h2>\n\n<p>미세조정은 LLM이 여태 학습한걸 뒤엎어버리는 대규모의 변화보다는 원하는 작업을 위해 LLM을 섬세하게 조정하는 과정을 묘사하는 단어입니다. LLM이 사전학습한 능력을 시작점으로 하기 때문에 같은 데이터양으로도 학습시켰을 때 훨씬 나은 성능을 기대할 수 있는거잖아요? LLM을 등에 업고 원하는 작업(downstream task)을 학습하는 일은 생각보다 어렵지 않은 일일지도 모릅니다.</p>\n\n<p><strong>그렇다면 미세조정을 하는데에 LLM이 포함한 수만큼 많은 파라미터를 학습하는건 너무 과한게 아닐까?</strong> 이 질문이 LoRA 방법론의 핵심에 있는 아이디어입니다.</p>\n\n<p>이 아이디어와 큰 행렬을 작은 두 개의 행렬곱으로 바꾸는 것은 어떻게 연결될까요? 아래의 예시를 봅시다.</p>\n\n<p><img src="/ncresearch/assets/img/post/c8416dcc21d8aad7f0ee65eaa47ad53854578b59/10_ezpz.png" alt="" /></p>\n\n<p>이 문제에 답을 제출할 때, 최대한 작은 행렬을 답으로 하고 싶다면 3번과 같이 열이 1개인 행렬을 선택할 수 있습니다. 따라서 이 문제를 푸는데 필요한 최소 열의 갯수 (n) 는 1개입니다. 5 x 5 크기의 행렬도 답안이 될 수 있지만 5 

In [111]:
### Generate

from langchain_core.output_parsers import StrOutputParser

# Prompt
prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks. 
    
    Use the following documents to answer the question. 
    
    If you don't know the answer, just say that you don't know. 
    
    Use three sentences maximum and keep the answer concise:
    Question: {question} 
    Documents: {documents} 
    Answer: 
    """,
    input_variables=["question", "documents"],
)

# LLM
llm = ChatOllama(model=local_llm, temperature=0)

# Chain
rag_chain = prompt | llm | StrOutputParser()

# Run
generation = rag_chain.invoke({"documents": docs, "question": question})
print(generation)

I don't know the latest research on LLM from Naver. 

The provided documents seem to be related to a research paper titled "Large Language Model을 밀어서 잠금해제: Parameter-Efficient Fine-Tuning 2" which discusses the LoRA (Low-Rank Adaptation) method for parameter-efficient fine-tuning of large language models.

LoRA is a method that reduces the number of parameters required for fine-tuning by using low-rank adaptations, which can lead to significant memory and computational savings.
