In [None]:
import logging
from typing import Optional

from crewai import Agent, Task, Crew
from crewai_tools import (
    SerperDevTool,
    ScrapeWebsiteTool,
)
from pydantic import BaseModel, Field

# Configure logging: request DEBUG level on the root logger. This is very
# verbose; switch to logging.INFO for quieter runs.
logging.basicConfig(level=logging.DEBUG)

# Load environment variables (API keys etc.) from a local .env file.
from dotenv import load_dotenv
# NOTE: a SERPER_API_KEY entry must be set up in the .env file.
# A key can be obtained at https://serper.dev/api-key
load_dotenv()

In [None]:
# Tools handed to the researcher agent: a Serper-backed search tool and a
# website scraping tool.
search_tool = SerperDevTool()
scrape_tool = ScrapeWebsiteTool()

# Agent that looks up an article on arxiv.org and reports the URL of its PDF.
# '{article_description}' is a placeholder that is filled from the inputs given
# at kickoff time.
# Adjacent string literals are joined by Python with no separator, so every
# backstory fragment except the last ends with an explicit space; otherwise the
# sentences run together in the prompt ("description.The article ...").
researcher_agent = Agent(
    role="Article researcher",
    goal="Search in arxiv.org for a concrete article whose name or description is '{article_description}' and find the URL of the PDF file of the article.",
    backstory="You are a researcher who is responsible for finding articles from arxiv.org on a specific topic given a name or description. "
              "The article name or description is: {article_description}. "
              "You search the web arxiv.org to find the information about the article. "
              "Find a url from arxiv.org to the article in PDF format and give it as output. "
              "If you find several articles, choose the one that is most relevant to the article name or description. "
              "If you cannot find the article, say you didn't find any related article.",
    allow_delegation=False,
    verbose=True,
    tools=[search_tool, scrape_tool],
)

# Structured output of the article search: a single required `url` field.
# (Documented with comments rather than a class docstring so the generated
# JSON schema stays exactly as before.)
class ArticleSearchOutput(BaseModel):
    # `default=...` (Ellipsis) marks the field as required.
    url: str = Field(
        default=...,
        description="URL of the PDF file of the article to download",
    )

# Task for the researcher agent: resolve the article name/description to the
# URL of its PDF on arxiv.org. The result is requested as JSON shaped like
# ArticleSearchOutput. '{article_description}' is filled from the kickoff inputs.
researcher_task = Task(
    description="Find in arxiv.org the URL of the PDF file of an article whose name or description is provided here '{article_description}'",
    expected_output="A dictionary with keys 'url' for a URL of the PDF file of the article to download.",
    output_json=ArticleSearchOutput,
    agent=researcher_agent,
)

In [None]:
# Optional: run only the search step on its own (disabled; uncomment to use).
# search_crew = Crew(
#     agents=[researcher_agent],
#     tasks=[researcher_task],
#     verbose=True,
# )
# result = search_crew.kickoff(inputs={'article_description': 'HippoRAG inspired long-term memory language models'})
# print(result)

In [None]:
from hackathon.tools import FileDownloadTool, sha256_filename_generator

# Create the file download tool, configured with the SHA-256 filename generator.
download_tool = FileDownloadTool(filename_generator=sha256_filename_generator)

# Create the downloader agent: it only has the download tool and may not
# delegate work to other agents.
# Adjacent string literals are joined by Python with no separator, so the first
# backstory fragment ends with an explicit space to keep the two sentences apart.
downloader_agent = Agent(
    role='File Downloader',
    goal='Download files from given URLs efficiently and safely to the given directory.',
    backstory='You are an expert in downloading files from the internet, ensuring the process is smooth and secure. '
              'You are provided a URL to download the file from and a directory name to save it.',
    verbose=True,
    allow_delegation=False,
    tools=[download_tool],
)

# Structured result of a download attempt.
# (Documented with comments rather than a class docstring so that no extra
# description is added to the generated JSON schema.)
class FileDownloadDetails(BaseModel):
    # Required: where the file was fetched from.
    url: str = Field(..., description="The original source URL from where the file was downloaded")
    # Defaults to None so that a failure report which omits the path still
    # validates; previously the field was Optional in type but required in
    # practice (declared with `...`), so a missing key raised a validation error.
    filepath: Optional[str] = Field(None, description="If the download was success, the filepath where the file was saved")
    # Required: outcome of the operation.
    status: str = Field(..., description="Status of the download operation, either Success or Failure")

# Task for the downloader agent: fetch the file and report the outcome as JSON
# shaped like FileDownloadDetails. '{directory}' is filled from the kickoff inputs.
# Adjacent string literals are joined by Python with no separator, so each
# expected_output fragment except the last ends with an explicit space;
# otherwise the sentences run together ("JSON.The status ...").
download_task = Task(
    description="Download a file from the URLs that are given. If an output directory is provided {directory}, save the file there.",
    agent=downloader_agent,
    #output_file='{filename}',
    output_json=FileDownloadDetails,
    expected_output='The file download operation details including source URL and filepath of the saved file will be provided in JSON. '
                    'The status will be either Success or Failure. '
                    'If the download fails, the filepath will be None and the status will be Failure. '
                    'Use the output details from the tools to complete the task.',
)


In [None]:
# Alternative design (disabled): a coordinator agent with delegation enabled.
# coordinator = Agent(
#     role="Coordinator",
#     goal="Coordinate the actions of the Article Researcher and FileDownloader to search in arxiv.org for a concrete article"
#          " whose name or description is '{article_description}' and download it in PDF format to a directory if it is provided.",
#     backstory="You coordinate the action of other agents to look and download the article in PDF format. "
#               "The article name or description is: {article_description}. "
#               "You search the web arxiv.org to find the information about the article. "
#               "Search for a link to the article in PDF format. "
#               "If you find several articles, choose the one that is most relevant to the topic. "
#               "Download the article in PDF format and save it to the given directory if provided. "
#               "If you cannot find the article, say you didn't find any related article",
#     allow_delegation=True,
#     verbose=True,
# )
#
# coordinator_task = Task(
#     description="Coordinate the actions of Article Researcher and File downloader to search in arxiv.org for articles and download them in PDF format.",
#     agent=coordinator,
#     #output_file='{filename}',
#     output_json=FileDownloadDetails,
#     expected_output='The file download operation details including source URL and filepath of the saved file will be provided in JSON. '
#                     'The status will be either Success or Failure. '
#                     'If the download fails, the filepath will be None and the status will be Failure. '
#                     'Use the output details from the tools to complete the task.',
# )
#


In [None]:
# Assemble the crew from both agents; the tasks are listed in
# search -> download order.
search_and_download_crew = Crew(
    agents=[
        researcher_agent,
        downloader_agent,
    ],
    tasks=[
        researcher_task,
        download_task,
    ],
    verbose=True,
)

# Kick off the run. The input keys supply the values for the
# {article_description} and {directory} placeholders used in the prompts.
result = search_and_download_crew.kickoff(
    inputs={
        'article_description': 'HippoRAG inspired long-term memory language models',
        'directory': './downloads',
    },
)
print(result)