In [None]:
# Build a research AI agent in a new Jupyter notebook and give it custom tools.
# The agent must be able to:
# - search Wikipedia
# - search DuckDuckGo
# - scrape a website and extract its text
# - save the research results to a .txt file
# Run the agent with the query "Research about the XZ backdoor": the agent should try searching Wikipedia or DuckDuckGo, and if it finds a website through DuckDuckGo, open that website, extract its content, and finish by saving the research to a .txt file.
from tabnanny import verbose
from langchain.agents import create_sql_agent, AgentType
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool, DuckDuckGoSearchResults
from langchain.tools.wikipedia.tool import WikipediaQueryRun
from langchain.utilities.wikipedia import WikipediaAPIWrapper

# from langchain_community.utilities import WikipediaAPIWrapper
from langchain.sql_database import SQLDatabase
from pydantic import BaseModel, Field
from typing import Type
from langchain.agents import initialize_agent, AgentType
from langchain.schema import SystemMessage, HumanMessage
import requests
from bs4 import BeautifulSoup
import re

from config import OPENAI_API_KEY

# Chat model used by the agent. The API key imported from config is passed
# explicitly so the model does not depend on the process environment alone.
llm = ChatOpenAI(
    temperature=0.1,
    openai_api_key=OPENAI_API_KEY,
)


def get_site_contents(url):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(success, text)`` tuple. On success ``text`` is the page text with
        the first ``<header>`` and ``<footer>`` elements removed. On failure
        ``success`` is ``False`` and ``text`` is an error message describing
        why the request failed.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Connection problems, timeouts, invalid URLs and HTTP error statuses
        # are reported to the caller instead of being raised.
        print(e)
        return False, f"Error fetching the page: {e}"

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove the page header/footer before extracting the text so they are
    # not part of the returned content.
    for tag_name in ("header", "footer"):
        tag = soup.find(tag_name)
        if tag is not None:
            tag.decompose()

    return True, str(soup.get_text())


# Argument schema used by the search tools in this file: one free-text query.
class SearchArgsSchema(BaseModel):
    # Search text supplied when a tool is invoked.
    query: str = Field(description="The query you want to search for")


class DuckDuckGoSearchTool(BaseTool):
    # Agent tool: searches DuckDuckGo and returns the text of the first result
    # page that can be scraped successfully.
    name = "DuckDuckGoSearchTool"
    description = """
    Use this tool to get results for a given query using the DuckDuckGo search engine.
    Enter a query as an argument.
    """
    args_schema: Type[SearchArgsSchema] = SearchArgsSchema

    def _run(self, query):
        """Search DuckDuckGo for ``query`` and scrape the first reachable hit.

        Args:
            query: Search text.

        Returns:
            The scraped text of the first result page that could be fetched.
            If no page could be scraped (no links found, or every fetch
            failed), the raw search results are returned so their snippets
            are not lost. If the search itself returns nothing, the string
            ``"No results found."`` is returned.
        """
        results = DuckDuckGoSearchResults().run(query)
        if not results:
            return "No results found."

        # The search output embeds each hit as "link: <url>"; pull the URLs out.
        links = re.findall(r"link:\s*(https?://[^\s,\]]+)", results)

        for link in links:
            print(f"link : {link}")
            success, contents = get_site_contents(link)
            if success:
                print(contents)
                return contents
            print("스크랩에 실패하였습니다.")

        # Nothing could be scraped: fall back to the search output itself
        # rather than an empty string or the last fetch error.
        return results


class WikipediaSearchTool(BaseTool):
    # Agent tool: looks the query up on Wikipedia and returns what it finds.
    name = "WikipediaSearchTool"
    description = """
    You can use this tool to get Wikipedia's search results for your query.
    Enter a query as an argument.
    """
    args_schema: Type[SearchArgsSchema] = SearchArgsSchema

    def _run(self, query):
        """Run a Wikipedia search for ``query``.

        Args:
            query: Search text.

        Returns:
            The output of the Wikipedia query tool for ``query``.
        """
        wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())

        result = wikipedia.run(query)
        print(result)
        # Return the result to the agent; printing alone leaves the tool
        # output as None.
        return result


# Build an OpenAI-functions agent that can call the two search tools.
agent = initialize_agent(
    llm=llm,
    verbose=True,
    agent=AgentType.OPENAI_FUNCTIONS,
    tools=[
        WikipediaSearchTool(),
        DuckDuckGoSearchTool(),
    ],
    agent_kwargs={
        "system_message": SystemMessage(
            content="""
            You are a very good search engine.

            You search for a specific query with the DuckDuckGo search engine and get results. 
            You enter the first website you find and scrape the content.

            If there are no results, it searches Wikipedia and gets results.
            If the wikipedia search results include your search term, grab its content.
            Never summarize the content of that page, just get the content verbatim.
            If wikipedia also returns no results, return a blank.
        """
        )
    },
)

query = "Research about the XZ backdoor"
# Pass the query as plain text under the agent's "input" key. Passing a
# HumanMessage object here would put the message's repr into the prompt
# instead of the query text.
result = agent.invoke({"input": query})
text = result["output"] if isinstance(result, dict) else str(result)

# Persist the research output only when the agent produced some text.
if text:
    with open("result.txt", "w", encoding="utf-8") as file:
        file.write(text)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `DuckDuckGoSearchTool` with `{'query': 'Research about the XZ backdoor'}`


[0m

  results = ddgs.text(


🫡
[snippet: Threat actors added malicious code (or a backdoor) to the XZ utility versions 5.6.0 and 5.6.1. The installed backdoor works by manipulating the sshd, a server process that facilitates secure internet connections using the SSH protocol. The sshd is responsible for user authentication, encryption, terminal connections, file transfers, and tunneling., title: The XZ Backdoor explained - Cybernews, link: https://cybernews.com/editorial/xz-linux-backdoor-explained/], [snippet: The following is an excerpt from our new module on the recent XZ Utils backdoor, CVE-2024-3094.. On Mar 29, 2024, at 12:00PM ET, Andres Freund posted on the Openwall mailing list about a backdoor he discovered in the XZ Utils package. The backdoor targeted the OpenSSH binary, allowing remote code execution on impacted machines. This backdoor was not located in the GitHub repository, but only ..., title: Behind Enemy Lines: Understanding the Threat of the XZ Backdoor - OffSec, link: https://www.offsec.com/bl