In [None]:
import add_packages
import yaml, re, os, json, requests
from toolkit.langchain import document_loaders, text_splitters, documents
from crewai import Agent, Task
from pprint import pprint

# Load the crewAI YAML settings used by the cells below.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(
  f"{add_packages.APP_PATH}/my_configs/crew_ai.yaml", encoding="utf-8"
) as f:
  configs_crewai = yaml.safe_load(f)


# Browser

In [None]:

def process_scraped_content(
  text: str,
  min_words: int = 4,
  phrases_to_remove: tuple = (
      "Sign in to your CNN account",
      "Ad was repetitive to ads I've seen previously",
      "Content moved around while ad loaded",
  ),
) -> str:
  """
  Clean raw scraped page text down to its substantive lines.

  Steps, in order:
    1. Collapse each newline plus the whitespace that follows it into a
       single newline (this also removes blank lines and indentation).
    2. Collapse runs of spaces into one space.
    3. Drop lines with fewer than ``min_words`` whitespace-separated words.
    4. Delete words starting with "Ä" from the remaining lines.
    5. Drop lines containing any of ``phrases_to_remove``.

  Args:
    text: Raw text to clean.
    min_words: Minimum number of words a line needs to be kept.
    phrases_to_remove: Substrings that disqualify a line. A single string is
      treated as one phrase.

  Returns:
    The surviving lines joined with "\\n" (an empty string if none survive).
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(phrases_to_remove, str):
    phrases_to_remove = (phrases_to_remove,)

  # "\s" also matches "\n", so this single pass already collapses consecutive
  # newlines; no separate newline-deduplication pass is needed.
  text = re.sub(r'\n\s*', '\n', text)

  # Replace multiple spaces with a single space
  text = re.sub(r' +', ' ', text)

  kept_lines = []
  for line in text.split('\n'):
    # The word count is taken before the "Ä" words are stripped.
    if len(line.split()) < min_words:
      continue

    # Remove words starting with "Ä"; the spaces around them are left as is.
    line = re.sub(r'\bÄ\w*\b', '', line)

    if any(phrase in line for phrase in phrases_to_remove):
      continue

    kept_lines.append(line)

  return '\n'.join(kept_lines)

## scrape_and_summarize_website

In [None]:
url = "https://edition.cnn.com/2024/04/03/politics/biden-netanyahu-tension-analysis/index.html"

# Load the page, take the text of the first loaded document, clean it, and
# wrap the cleaned text in a one-element Document list for the splitter.
raw_page_text = document_loaders.WebBaseLoader(url).load()[0].page_content
doc = [documents.Document(process_scraped_content(raw_page_text))]

# Split the cleaned article into overlapping chunks.
text_splitter = text_splitters.RecursiveCharacterTextSplitter(
  chunk_overlap=300,
  chunk_size=2000,
)
docs = text_splitter.split_documents(doc)

In [None]:
# Drill down: tools -> browser -> scrape_and_summarize_website
configs_tool_browser = configs_crewai["tools"]["browser"]
configs_tool_scrape_and_summarize_website = configs_tool_browser["scrape_and_summarize_website"]

# Pull the agent and task sections of the tool config into their own names.
(
  configs_tool_scrape_and_summarize_website_agent,
  configs_tool_scrape_and_summarize_website_task,
) = (
  configs_tool_scrape_and_summarize_website[section]
  for section in ("agent", "task")
)


In [None]:
# Build the agent from the YAML agent section, with delegation switched off.
agent = Agent(
  allow_delegation=False,
  **configs_tool_scrape_and_summarize_website_agent,
)

In [None]:
# The prompt template and the expected output are the same for every chunk,
# so they are read from the task config once rather than on each iteration.
description_template = configs_tool_scrape_and_summarize_website_task["description"]
expected_output = configs_tool_scrape_and_summarize_website_task["expected_output"]

summaries = []

# The loop variable is `chunk` rather than `doc`, so the `doc` list built in
# the scraping cell is not silently rebound to a single chunk.
for chunk in docs:
  task = Task(
    agent=agent,
    # Substitute the chunk text for the %CHUNK% placeholder in the template.
    description=description_template.replace("%CHUNK%", chunk.page_content),
    expected_output=expected_output,
  )
  summaries.append(task.execute())

## search_internet

In [16]:
def search_internet(
  query: str,
  top_result_to_return: int = 4,
  timeout: float = 30.0,
) -> str:
  """
  Useful to search the internet about a given topic and return relevant
  results.

  Sends ``query`` to the Serper search endpoint and formats the first
  ``top_result_to_return`` entries of the response's "organic" list.

  Args:
    query: Search phrase, sent as the "q" field of the JSON payload.
    top_result_to_return: Maximum number of organic results to format.
    timeout: Seconds passed to ``requests`` as the request timeout, so a
      stalled connection cannot hang the caller indefinitely.

  Returns:
    One "Title / Link / Snippet" section per result, each followed by a
    dashed separator line, joined with newlines. Results missing any of the
    three fields are skipped. If the response body is not JSON, is not a
    JSON object, or has no "organic" key, an apology message is returned.

  Raises:
    KeyError: If the SERPER_API_KEY environment variable is not set.
    requests.exceptions.RequestException: On connection errors or timeout.
  """
  url_serper = "https://google.serper.dev/search"
  not_found_message = (
    "Sorry, I couldn't find anything about that, there could be an "
    "error with you serper api key.")

  payload = json.dumps({"q": query})
  headers = {
    "X-API-KEY": os.environ["SERPER_API_KEY"],
    "content-type": "application/json",
  }

  response = requests.post(
    url_serper, headers=headers, data=payload, timeout=timeout)

  # Parse the body once. JSON decoding errors are ValueError subclasses, so a
  # non-JSON reply (e.g. an HTML error page) yields the apology message
  # instead of a traceback.
  try:
    body = response.json()
  except ValueError:
    return not_found_message

  if not isinstance(body, dict) or "organic" not in body:
    return not_found_message

  result_str = []
  for result in body["organic"][:top_result_to_return]:
    try:
      result_str.append("\n".join([
        f"Title: {result['title']}", f"Link: {result['link']}",
        f"Snippet: {result['snippet']}", f"\n{'-'*10}"
      ]))
    except KeyError:
      # Best effort: skip results that lack a title, link or snippet.
      continue

  return "\n".join(result_str)

In [19]:
# Try the search helper with a sample query and print the formatted results.
result = search_internet("apple", top_result_to_return=4)
print(result)

Title: Apple
Link: https://www.apple.com/
Snippet: Apple Footer · 1. · Available in the U.S. on apple.com, in the Apple Store app, and at Apple Stores. · To access and use all Apple Card features and products ...

----------
Title: Apple Inc. - Wikipedia
Link: https://en.wikipedia.org/wiki/Apple_Inc.
Snippet: Apple Inc. (formerly Apple Computer, Inc.) is an American multinational corporation and technology company headquartered in Cupertino, California, ...

----------
Title: Apple - YouTube
Link: https://www.youtube.com/user/apple
Snippet: Welcome to the official Apple YouTube channel. Here you'll find news about product launches, tutorials, and other great content. apple.comand 1 more link.

----------
Title: @apple • Instagram photos and videos
Link: https://www.instagram.com/apple/
Snippet: 33M Followers, 9 Following, 1183 Posts - See Instagram photos and videos from @apple.

----------


# Test

In [None]:
import add_packages
from toolkit.crewai import tools

# Run the packaged browser tool on the same article URL; the cell's last
# expression displays whatever the tool returns.
tools_browser = tools.ToolsBrowser()

url = "https://edition.cnn.com/2024/04/03/politics/biden-netanyahu-tension-analysis/index.html"

tools_browser.scrape_and_summarize_website(url)