1. Wikipedia에서 검색 또는
2. DuckDuckGo에서 검색
2-1. 웹사이트의 텍스트를 스크랩하고 추출합니다.
3. 리서치 결과를 .txt 파일에 저장하기
4. "Research about the XZ backdoor"라는 쿼리로 에이전트를 실행합니다.
5. 에이전트는 Wikipedia 또는 DuckDuckGo에서 검색을 시도하고, DuckDuckGo에서 웹사이트를 찾으면 해당 웹사이트에 들어가서 콘텐츠를 추출해야 합니다.
6. 마지막으로 .txt 파일에 조사 내용을 저장하는 것으로 완료해야 합니다.

In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.tools import BaseTool, DuckDuckGoSearchResults
from pydantic import BaseModel, Field
from langchain.retrievers import WikipediaRetriever
from langchain.document_loaders import WebBaseLoader
from langchain.prompts import ChatPromptTemplate
from typing import Type
import json
import os

def search_wiki(topic):
    """Fetch Wikipedia documents related to ``topic``.

    Args:
        topic: Search term handed to the Wikipedia retriever.

    Returns:
        The documents returned by ``WikipediaRetriever.invoke`` for the
        topic, limited to the top four results.
    """
    # The retriever option is spelled ``top_k_results`` (plural); the
    # previous ``top_k_result`` is not a recognised option, so the limit of
    # four was never applied.
    retriever = WikipediaRetriever(top_k_results=4)
    return retriever.invoke(topic)

def search_duckduckgo(query, search=None):
    """Search DuckDuckGo and return the https URLs found in the result text.

    Args:
        query: Search query string.
        search: Optional object exposing ``invoke(query) -> str``. When
            omitted, a default ``DuckDuckGoSearchResults`` tool is created.

    Returns:
        A list of URL strings, in the order they appear in the result text.
        Only ``link: https://...`` entries are collected; the list is empty
        when none are present.
    """
    import re  # local import keeps this function self-contained

    if search is None:
        search = DuckDuckGoSearchResults()
    results = search.invoke(query)

    # A URL cannot contain whitespace, so each link ends at the first
    # whitespace or closing bracket. The former manual scan required a "]"
    # after every link and looped forever when one was missing.
    links = re.findall(r"link: (https://[^\s\]]+)", results)

    # When entries are comma-separated instead of bracketed, the separator
    # comma is captured with the URL; drop it.
    return [link.rstrip(",") for link in links]

def web_scrapping(urls, max_urls=2):
    """Load web pages and return their text as a single string.

    Args:
        urls: Pipe-separated URL string, e.g.
            ``"https://a.com|https://b.com"``.
        max_urls: Maximum number of URLs to load, taken from the front of
            the list. Defaults to 2.

    Returns:
        The page contents joined by a blank line, with newlines replaced by
        spaces. An empty string when ``urls`` contains no usable entry.
    """
    print("urls: ", urls)
    # Drop blank entries (e.g. from a trailing "|") and surrounding spaces so
    # they are not handed to the loader as URLs.
    urls_list = [url.strip() for url in urls.split("|") if url.strip()]
    print(urls_list)
    if not urls_list:
        return ""
    # Slicing already copes with lists shorter than the limit.
    loader = WebBaseLoader(urls_list[:max_urls])
    docs = loader.load()
    return "\n\n".join(
        doc.page_content.replace("\n", " ").replace("  ", " ") for doc in docs
    )


def save_content_to_txt(content, filename):
    """Write ``content`` to ``./.cache/agent/<filename>.txt`` as UTF-8.

    The target folder is created when missing and an existing file with the
    same name is overwritten.

    Args:
        content: Text to write.
        filename: File name without extension. Directory components are
            dropped, so ``"../notes"`` is saved as ``notes.txt`` inside the
            cache folder.
    """
    folder_dir = "./.cache/agent"
    os.makedirs(folder_dir, exist_ok=True)
    # The name comes from an agent tool call; keep only the final path
    # component so it cannot point outside the cache folder or into a
    # sub-directory that does not exist.
    safe_name = os.path.basename(filename)
    with open(f"{folder_dir}/{safe_name}.txt", "w", encoding="utf-8") as file:
        file.write(content)
        
# Argument schema for WikipediaResearchTool: a single string field, ``term``.
# Documented with comments rather than a class docstring so that no extra
# text ends up in the schema generated from this model.
class WikipediaResearchSchema(BaseModel):
    # Search term; WikipediaResearchTool._run forwards it to search_wiki().
    term: str = Field(
        description="""
        The term you will search for on Wikipedia.
        Enter only the word you're looking for.
        Example term: the XZ backdoor
        """
    )

# Agent tool that looks a term up on Wikipedia via search_wiki().
class WikipediaResearchTool(BaseTool):
    # Tool name and description as declared to the agent.
    name = "WikipediaResearchTool"
    description = """
    Use this tool to search for something's information on Wikipedia.
    It takes a term as an argument.
    It returns the content searched on Wikipedia.
    """
    # Arguments are described by WikipediaResearchSchema (one ``term`` field).
    args_schema: Type[WikipediaResearchSchema] = WikipediaResearchSchema

    def _run(self, term):
        """Return whatever search_wiki() yields for ``term``."""
        return search_wiki(term)

# Argument schema for DuckduckgoResearchTool: a single string field, ``query``.
# Documented with comments rather than a class docstring so that no extra
# text ends up in the schema generated from this model.
class DuckduckgoResearchSchema(BaseModel):
    # Search query; DuckduckgoResearchTool._run forwards it to
    # search_duckduckgo().
    query: str = Field(
        description="""
        The query you will search for.
        Example query: Research about the XZ backdoor
        """
    )

# Agent tool that runs a DuckDuckGo search via search_duckduckgo().
class DuckduckgoResearchTool(BaseTool):
    # Tool name and description as declared to the agent.
    name = "DuckduckgoResearchTool"
    description = """
    Use this tool to search for something's information.
    It takes a query as an argument.
    It receives a query, searches the website, and returns list of the website address(url).
    """
    # Arguments are described by DuckduckgoResearchSchema (one ``query`` field).
    args_schema: Type[DuckduckgoResearchSchema] = DuckduckgoResearchSchema

    def _run(self, query):
        """Return the list of URLs that search_duckduckgo() extracts for ``query``."""
        return search_duckduckgo(query)

# Argument schema for WebScrapingTool: a single string field, ``urls``.
# Documented with comments rather than a class docstring so that no extra
# text ends up in the schema generated from this model.
class WebScrapingSchema(BaseModel):
    # URLs packed into one string and separated by "|"; web_scrapping()
    # splits the string on that character.
    urls: str = Field(
        description="""
        url list to extracts web information.
        Example url list: 'https://en.wikipedia.org/wiki/URL|https://naver.com|https://daum.net'
        """
    )

# Agent tool that downloads pages and returns their text via web_scrapping().
class WebScrapingTool(BaseTool):
    # Tool name and description as declared to the agent.
    name = "WebScrapingTool"
    description = """
    This tool accesses the specified URL and extracts information from them.
    This tool receives URL list, scrapes the information from those URL, and returns web information.
    """
    # Arguments are described by WebScrapingSchema (one ``urls`` string field).
    args_schema: Type[WebScrapingSchema] = WebScrapingSchema

    def _run(self, urls):
        """Pass the pipe-separated ``urls`` string to web_scrapping() and return its text."""
        return web_scrapping(urls)


# Argument schema for SaveToTXTTool: the text to store and the file name to
# store it under. Documented with comments rather than a class docstring so
# that no extra text ends up in the schema generated from this model.
class SaveToTXTSchema(BaseModel):
    # Text written to the file by save_content_to_txt().
    content: str = Field(
        description="Content to be saved as a text file."
    )
    # File name without extension; save_content_to_txt() appends ".txt".
    # The description previously opened with four quote characters, which
    # left a stray '"' at the start of the text.
    filename: str = Field(
        description="""
        Filename to save the content. A single word that contains the content.
        """
    )

# Agent tool that stores collected text on disk via save_content_to_txt().
class SaveToTXTTool(BaseTool):
    # Tool name and description as declared to the agent.
    name = "SaveToTXTTool"
    description = """
    This tool receives content and a filename and saves them as a text file.
    """
    # Arguments are described by SaveToTXTSchema (``content`` and ``filename``).
    args_schema: Type[SaveToTXTSchema] = SaveToTXTSchema

    def _run(self, content, filename):
        """Write ``content`` to a text file named after ``filename``."""
        # NOTE(review): nothing is returned, so the tool result is None —
        # consider returning a short confirmation message for the agent.
        save_content_to_txt(content, filename)

# Chat model used by the agent: gpt-4o-mini with temperature 0.1.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
)

# Prompt made of a system message describing the research workflow and a
# human message that carries the request through the {request} placeholder.
template = ChatPromptTemplate.from_messages([
    ("system", """
You are a research expert. 
When you get a research request, search DuckDuckGo.
If the source lacks information, gather more from wikipedia.
With DuckDuckGo, obtain the website's URL list.
If you receive a list of URLs, you need to extract information from the websites through the URLs.
Save all collected content as a text file. Just one file.
Name the file with the key term.
"""), 
("human", "research request: {request}")
])

# OpenAI-functions agent wired to the four tools: Wikipedia search,
# DuckDuckGo search, web scraping and saving to a .txt file.
agent = initialize_agent(
    llm=llm,
    verbose=True,
    agent=AgentType.OPENAI_FUNCTIONS,
    tools = [
        WikipediaResearchTool(),
        DuckduckgoResearchTool(),
        WebScrapingTool(),
        SaveToTXTTool()
    ]
)

# Pipe the formatted prompt into the agent and run it for the XZ backdoor
# request.
# NOTE(review): the agent receives the whole formatted prompt value as its
# input (visible as 'input' in the recorded output) — confirm this is the
# intended way to pass the system message.
chain = template | agent
chain.invoke({"request":"Research about the XZ backdoor"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `DuckduckgoResearchTool` with `{'query': 'Research about the XZ backdoor'}`


[0m[33;1m[1;3m['https://www.wired.com/story/xz-backdoor-everything-you-need-to-know/', 'https://cybernews.com/editorial/xz-linux-backdoor-explained/', 'https://www.wired.com/story/jia-tan-xz-backdoor/', 'https://securitylabs.datadoghq.com/articles/xz-backdoor-cve-2024-3094/'][0m[32;1m[1;3m
Invoking: `WebScrapingTool` with `{'urls': 'https://www.wired.com/story/xz-backdoor-everything-you-need-to-know/|https://cybernews.com/editorial/xz-linux-backdoor-explained/|https://www.wired.com/story/jia-tan-xz-backdoor/|https://securitylabs.datadoghq.com/articles/xz-backdoor-cve-2024-3094/'}`


[0murls:  https://www.wired.com/story/xz-backdoor-everything-you-need-to-know/|https://cybernews.com/editorial/xz-linux-backdoor-explained/|https://www.wired.com/story/jia-tan-xz-backdoor/|https://securitylabs.datadoghq.com/articles/xz-backdoor-cve-2024

{'input': ChatPromptValue(messages=[SystemMessage(content="\nYou are a research expert. \nWhen you get a research request, search DuckDuckGo.\nIf the source lacks information, gather more from wikipedia.\nWith DuckDuckGo, obtain the website's URL list.\nIf you receive a list of URLs, you need to extract information from the websites through the URLs.\nSave all collected content as a text file. Just one file.\nName the file with the key term.\n"), HumanMessage(content='research request: Research about the XZ backdoor')]),
 'output': 'I have successfully gathered and saved the research on the XZ backdoor into a text file named "XZ_backdoor". If you need any further information or assistance, feel free to ask!'}

In [3]:
# Scratch check: joining a one-element list with "\n\n" inserts no separator.
a= ['a']

"\n\n".join(a)


'a'