# Lesson 3: Agentic Search


In [None]:
from dotenv import load_dotenv
import os
import sys
import json, re
import pprint
import boto3
from botocore.client import Config
import warnings
warnings.filterwarnings("ignore")
import logging

# Import local modules.
# The parent of the current working directory is put on sys.path before the
# import below; presumably that is where the project-level `utils` package
# lives (TODO confirm against the repository layout).
# os.path.abspath('') resolves to the current working directory.
dir_current = os.path.abspath('')
dir_parent = os.path.dirname(dir_current)
# Only append when missing, so re-running the cell does not add duplicates.
if dir_parent not in sys.path:
    sys.path.append(dir_parent)
from utils import utils

# Set basic configs
logger = utils.set_logger()
pp = utils.set_pretty_printer()

# Set main parameters
tavily_api_key_name = "TAVILY_API_KEY"
aws_region = "us-east-1"

# Set bedrock configs.
# Defined exactly once and shared by both clients below: 120 second connect
# and read timeouts, with botocore retries disabled (max_attempts=0).
bedrock_config = Config(
    connect_timeout=120, read_timeout=120, retries={"max_attempts": 0}
)

# Create a bedrock runtime client
bedrock_rt = boto3.client(
    "bedrock-runtime",
    region_name=aws_region,
    config=bedrock_config
)

# Create a bedrock client to check available models
bedrock = boto3.client(
    "bedrock",
    region_name=aws_region,
    config=bedrock_config
)

# Retrieve API KEY from env variables or secrets manager and export it as the
# TAVILY_API_KEY environment variable.
# The name is bound up front so that a failed lookup leaves it as None; later
# cells then see an explicit "no key" value instead of hitting a NameError.
tavily_ai_api_key = None
try:
    tavily_ai_api_key = utils.get_from_secretstore_or_env(tavily_api_key_name, aws_region)
    # The environment variable name is intentionally the literal, independent
    # of the name the secret is stored under.
    os.environ["TAVILY_API_KEY"] = tavily_ai_api_key
except ValueError as ve:
    # Best effort: report the problem and let the notebook continue.
    logger.error(
        "Could not retrieve the TAVILY API KEY, neither from the os environment variables, nor from AWS Secrets manager!"
    )
    logger.error(ve)

In [None]:
from tavily import TavilyClient

# Build the Tavily client from the API key held in `tavily_ai_api_key`.
client = TavilyClient(api_key=tavily_ai_api_key)
# run search
# include_answer=True is presumably what makes the response carry the
# "answer" field read below (TODO confirm against the Tavily API docs).
result = client.search("What is in Nvidia's new Blackwell GPU?", include_answer=True)

# print the answer
# This is a bare expression: it relies on the notebook displaying the value
# of the last expression in a cell.
result["answer"]

## Regular search


In [None]:
# choose location (try to change to your own city!)

city = "San Francisco"

# Query text reused by both the regular (DuckDuckGo) search and the Tavily
# search further down. The quoted "weather.com" line presumably steers the
# results toward that site (assumption, not verified here).
query = f"""
    what is the current weather in {city}?
    Should I travel there today?
    "weather.com"
"""

> Note: `search` was modified to return hard-coded fallback results if an exception occurs. High volumes of student traffic sometimes cause rate-limit exceptions.


In [None]:
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
import re

# Module-level DuckDuckGo search client; `search` below calls `ddg.text` on it.
ddg = DDGS()


def search(query, max_results=6):
    """Return the result URLs DuckDuckGo finds for ``query``.

    Args:
        query: Search string passed to ``ddg.text``.
        max_results: Maximum number of results requested from DuckDuckGo.

    Returns:
        A list holding the ``href`` of every result. If the lookup raises,
        a fixed list of two weather.com URLs is returned instead so that the
        rest of the notebook can still run.
    """
    try:
        hits = ddg.text(query, max_results=max_results)
        return [hit["href"] for hit in hits]
    except Exception as exc:
        # Deliberately broad: this is a best-effort lookup for a lesson, and
        # any failure should fall back to canned results. The exception is
        # reported so the fallback is not mistaken for live search results.
        print(
            "returning previous results due to exception reaching ddg. "
            f"({type(exc).__name__}: {exc})"
        )
        return [  # cover case where DDG rate limits due to high deeplearning.ai volume
            "https://weather.com/weather/today/l/USCA0987:1:US",
            "https://weather.com/weather/hourbyhour/l/54f9d8baac32496f6b5497b4bf7a277c3e2e6cc5625de69680e6169e7e38e9a8",
        ]


# Run the search for `query` and print each returned URL on its own line.
for i in search(query):
    print(i)

In [None]:
def scrape_weather_info(url, timeout=10):
    """Download ``url`` and return its parsed HTML.

    Args:
        url: Address of the page to fetch. A falsy value (``None``, ``""``)
            short-circuits before any network access.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` document on success. On failure a plain message
        string is returned instead: "Weather information could not be found."
        when no URL was given, and "Failed to retrieve the webpage." when the
        request fails or the response status is not 200. Callers must check
        for a ``str`` before using the result as a parsed document.
    """
    if not url:
        return "Weather information could not be found."

    # fetch data
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Without an explicit timeout, requests can wait indefinitely on an
        # unresponsive host.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported the same way as a bad
        # status code.
        return "Failed to retrieve the webpage."
    if response.status_code != 200:
        return "Failed to retrieve the webpage."

    # parse result
    return BeautifulSoup(response.text, "html.parser")

> Note: This produces a long output. After looking at it briefly, you may want to right-click the cell and clear its output so you don't have to scroll past it.


In [None]:
# use DuckDuckGo to find websites and take the first result
url = search(query)[0]

# scrape first website
# NOTE(review): scrape_weather_info returns a plain message string when the
# fetch fails; in that case `soup.body` below raises AttributeError.
soup = scrape_weather_info(url)

print(f"Website: {url}\n\n")
print(str(soup.body)[:50000])  # limit long outputs

In [None]:
# Extract the text of every heading and paragraph, join the pieces, and
# collapse each run of whitespace (including the newline separators) into a
# single space.
weather_data = re.sub(
    r"\s+",
    " ",
    "\n".join(
        element.get_text(" ", strip=True)
        for element in soup.find_all(["h1", "h2", "h3", "p"])
    ),
)

print(f"Website: {url}\n\n")
print(weather_data)

## Agentic Search


In [None]:
# run search
# Same `query` as the regular search above, this time sent through the Tavily
# client and limited to a single result.
result = client.search(query, max_results=1)

# print first result
# Only the "content" field of the first result is kept; the next cell parses it.
data = result["results"][0]["content"]

print(data)

In [None]:
import ast
import json
from pygments import highlight, lexers, formatters


def _parse_loose_json(text):
    """Parse ``text`` as JSON, tolerating Python-literal style quoting.

    Strategies are tried in order:

    1. strict JSON;
    2. a Python literal (single-quoted strings, ``True``/``False``/``None``),
       accepted only when it yields a dict or a list;
    3. the blanket single-to-double quote replacement, kept last because it
       corrupts apostrophes inside values.

    Raises:
        json.JSONDecodeError: if none of the strategies can parse ``text``.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        # literal_eval evaluates literals only; it never runs arbitrary code.
        candidate = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        candidate = None
    if isinstance(candidate, (dict, list)):
        return candidate
    return json.loads(text.replace("'", '"'))


# parse JSON
parsed_json = _parse_loose_json(data)

# pretty print JSON with syntax highlighting
formatted_json = json.dumps(parsed_json, indent=4)
colorful_json = highlight(
    formatted_json, lexers.JsonLexer(), formatters.TerminalFormatter()
)

print(colorful_json)