In [None]:
from dotenv import load_dotenv
from openai import AsyncOpenAI
from agents import Agent, Runner, trace, function_tool, OpenAIChatCompletionsModel, input_guardrail, GuardrailFunctionOutput, WebSearchTool
from openai.types.responses import ResponseTextDeltaEvent
from typing import Dict, List, Optional
from pydantic import BaseModel, HttpUrl
import sendgrid
import os
from sendgrid.helpers.mail import Mail, Email, To, Content
import asyncio
from scraper import fetch_website_contents, fetch_website_links
from IPython.core.display import Markdown

In [None]:
# Load settings (e.g. GOOGLE_API_KEY) from a local .env file; override=True lets
# values from .env replace variables already present in the environment.
load_dotenv(override=True)

In [None]:
# Gemini is reached through its OpenAI-compatible endpoint, so the regular
# AsyncOpenAI client is reused with a custom base_url and the Google API key.
google_api_key = os.getenv('GOOGLE_API_KEY')  # None when the variable is not set
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
gemini_client = AsyncOpenAI(base_url=GEMINI_BASE_URL, api_key=google_api_key)
# Model wrappers handed to Agent(model=...). Only the flash model is referenced
# by the agents defined in the cells visible here.
gemini_2_5_pro_model = OpenAIChatCompletionsModel(model="gemini-2.5-pro", openai_client=gemini_client)
gemini_2_5_flash_model = OpenAIChatCompletionsModel(model="gemini-2.5-flash", openai_client=gemini_client)

In [None]:
# Landing page of the company to profile; the scraping cells below start from this URL.
COMPANY_URL = "https://cbtw.tech/"

In [None]:
# PageSummarizer agent
# NOTE: summarize() currently returns the fetched page content directly and
# does not invoke this agent.

# System prompt describing the summarisation task and the expected output format.
page_summarizer_instruction ="""
You are a customer service content analyzer. Your task is to create concise summaries of content to support customer service teams.

INPUT: Website content (HTML or text) from a company website

OUTPUT: A brief, structured markdown summary (max 500 words)
"""
# Agent that applies the prompt above using the Gemini 2.5 Flash model wrapper.
page_summarizer_agent = Agent(
        name="PageSummarizer Agent",
        instructions=page_summarizer_instruction,
        model=gemini_2_5_flash_model
)

In [None]:
async def summarize(url):
    """
    Fetch the content of a web page and return it unchanged.

    Despite the name, no LLM summarisation happens here: the call to
    ``page_summarizer_agent`` is disabled, so the caller receives whatever
    ``fetch_website_contents`` returns for ``url``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The value returned by ``fetch_website_contents(url)``.
    """
    # fetch_website_contents is called without await (i.e. it is used as a
    # blocking function). Running it in a worker thread keeps the event loop
    # free, so callers that fan out with asyncio.gather really overlap fetches.
    return await asyncio.to_thread(fetch_website_contents, url)

# Example usage:
# web_content = await summarize(COMPANY_URL)
# print(web_content)

In [None]:
# Get relevant links Agent
# Structured-output models: the link-filtering agent is configured with
# output_type=RelevantLinks, so its final output is parsed into these classes.
class Link(BaseModel):
    # Short label for the page's purpose (the prompt suggests e.g. "about page")
    type: str
    # Address of the page, validated by pydantic as an HTTP(S) URL
    url: HttpUrl

class RelevantLinks(BaseModel):
    # Links the agent kept; the prompt asks for an empty list when none are relevant
    links: List[Link]

# System prompt for the link-filtering agent: describes which kinds of links to
# keep, which to drop, and the JSON shape ({"links": [{"type", "url"}]}) to return.
get_relevant_links_system_prompt = """
You are an AI customer service content analyzer.

**Goal:** From a provided list of webpage links, identify and extract the links that are most relevant for inclusion in customer-facing content about the company.

**Relevant links typically include:**

* About or Company information pages
* Careers, Jobs, or Recruitment pages
* Contact or Support pages (if applicable)
* Corporate responsibility or leadership information (optional if found)
* All products this company supported

**Instructions:**

1. Review the provided list of links.
2. Select only those that directly relate to the company's identity, values, or opportunities.
3. Exclude irrelevant links such as facebook, email, instagram, marketing campaigns, or unrelated resources ...
4. Return your answer strictly in JSON format following this structure:

```json
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
```

**Notes:**

* The `type` field should clearly describe the purpose of the page (e.g., `about page`, `careers page`, `contact page`).
* The `url` field must contain the full URL as provided.
* If no relevant links are found, return an empty list: `{ "links": [] }`.
"""

def generate_relevant_filter_user_prompt(base_url):
    """
    Build the user prompt for the link-filtering agent.

    The prompt consists of a fixed preamble naming ``base_url`` followed by the
    links returned by ``fetch_website_links(base_url)``, one per line.

    Args:
        base_url: Page whose links should be listed in the prompt.

    Returns:
        The complete prompt text.
    """
    # The literal's own 3-space indentation is part of the text sent to the
    # model, so it is kept exactly as written.
    preamble = f"""
   Here is the list of links on the website {base_url} -
   Please decide which of these are relevant web links for relevant for inclusion in customer-facing content about the company, 
   respond with the full https URL in JSON format.
   Do not include Terms of Service, Privacy, email links, facebook, instagram, youtube, twitter

   Links (some might be relative links):
   """
    link_lines = "\n".join(fetch_website_links(base_url))
    return preamble + link_lines

# Link-filtering agent: applies the system prompt above and returns its answer
# as a RelevantLinks object (structured output), using the Gemini 2.5 Flash model.
get_relevant_links_agent = Agent(
        name="Agent: get relevant links",
        instructions=get_relevant_links_system_prompt,
        output_type=RelevantLinks,
        model=gemini_2_5_flash_model
)

In [None]:
# Preview the user prompt (preamble + link list from fetch_website_links) that
# would be sent to the link-filtering agent for the company landing page.
generate_relevant_filter_user_prompt(COMPANY_URL)

In [None]:
async def get_relevant_links(url):
    """
    Ask the link-filtering agent which links found on ``url`` are worth keeping.

    Args:
        url: Page whose links should be filtered.

    Returns:
        The agent run's ``final_output`` (the agent is configured with
        ``output_type=RelevantLinks``).
    """
    print(f"AGENT: Link filtering agent is running {url}")
    prompt = generate_relevant_filter_user_prompt(url)

    # Group the agent run under a named trace
    with trace("Link filtering agent"):
        run_result = await Runner.run(get_relevant_links_agent, prompt)
    return run_result.final_output

In [None]:
# Run the link-filtering agent once against the company landing page.
relevant_link = await get_relevant_links(COMPANY_URL)

In [None]:
# Inspect the links the agent selected.
relevant_link.links

In [None]:
def write_markdown(content: str, filename: str = "output.md"):
    """
    Write markdown content to a file inside the local ``profiles`` directory.

    Overwrites the file if it exists and creates it otherwise. The ``profiles``
    directory (and any sub-directory contained in ``filename``) is created on
    demand, so the call also works on a fresh checkout.

    Args:
        content: Markdown text to write
        filename: Output filename, relative to ``profiles/`` (default: output.md)

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    path = f"profiles/{filename}"

    # open(..., 'w') creates the file but not its parent directory
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

In [None]:
def append_to_file(filename, text):
    """
    Append text to a file inside the local ``profiles`` directory.

    The file is created if it doesn't exist, together with ``profiles`` and any
    sub-directory contained in ``filename``.

    Args:
        filename: File name, relative to ``profiles/``
        text: Text content to append

    Returns:
        True if successful, False otherwise (the error is printed, not raised)
    """
    # The directory must be derived from the path that is actually opened;
    # deriving it from the bare filename leaves "profiles" uncreated.
    path = f"profiles/{filename}"
    try:
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # Append to file (creates file if it doesn't exist)
        with open(path, 'a', encoding='utf-8') as f:
            f.write(text)
        return True
    except Exception as e:
        # Best-effort by design: report the failure and let the caller continue
        print(f"Error writing to file '{filename}': {e}")
        return False

In [None]:
async def run_summarize_in_batches(urls, batch_size=10, delay=60):
    """
    Call ``summarize`` on every entry of ``urls``, one batch at a time.

    Each batch is scheduled as asyncio tasks and awaited with ``asyncio.gather``;
    between two batches the coroutine sleeps for ``delay`` seconds.

    Args:
        urls: Sequence of items passed one by one to ``summarize``.
        batch_size: Number of items per batch (default: 10).
        delay: Seconds to wait between consecutive batches (default: 60).

    Returns:
        List of ``summarize`` results, in the same order as ``urls``.
    """
    total = len(urls)
    collected = []

    for batch_number, start in enumerate(range(0, total, batch_size), start=1):
        chunk = urls[start:start + batch_size]
        print(f"\n🚀 Running batch {batch_number} ({len(chunk)} URLs)...")

        # Schedule the whole chunk at once, then wait for every task to finish
        pending = [asyncio.create_task(summarize(item)) for item in chunk]
        collected += await asyncio.gather(*pending)

        # Pause only when another batch follows
        more_to_come = start + batch_size < total
        if more_to_come:
            print(f"⏳ Waiting {delay}s before next batch...\n")
            await asyncio.sleep(delay)

    print("\n✅ All batches completed.")
    return collected

In [None]:
async def scrape_recursive(current_url, depth, max_depth, visited_urls, filename, type="Infotrack"):
    """
    Recursively scrapes a URL and its related URLs up to max_depth.

    The content of each page is appended to ``filename`` (via ``append_to_file``)
    under a heading that records the link type, depth level and URL.

    Args:
        current_url: URL to scrape (a str or any URL object convertible with str())
        depth: Current depth level
        max_depth: Maximum depth to scrape
        visited_urls: List of already visited URLs (as strings); updated in place
        filename: File name passed to ``append_to_file`` for the collected content
        type: Label of the link that led to this page, written into the heading
            (the name shadows the builtin but is kept for existing callers)

    Returns:
        None
    """
    # The root URL is a plain str, while links coming back from the filtering
    # agent carry pydantic HttpUrl values (see Link.url), which are not
    # guaranteed to compare equal to strings. Normalise so the visited check
    # compares like with like.
    current_url = str(current_url)

    if depth > max_depth:
        print(f"scrape_recursive exist with depth: {depth}, max: {max_depth}, ")
        return

    if current_url in visited_urls:
        print(f"scrape_recursive exist with current_url: {current_url} has already scraped")
        return
    print(f"scrape_recursive url: {current_url} with depth level: {depth}")
    visited_urls.append(current_url)

    # Extract content from current URL
    content = await summarize(current_url)
    append_to_file(filename, f"## Type {type} Level {depth}: {current_url}\n\n{content}\n\n")

    # No pause between pages; add an asyncio.sleep here if the model endpoint
    # rate-limits (e.g. on a free account).

    # Only extract related URLs if not at max depth
    if depth < max_depth:
        related_urls = await get_relevant_links(current_url)
        # Log the full selection first, then descend into each link in order
        for link in related_urls.links:
            print(f"Type: {link.type}, URL: {link.url}")
        for related_url in related_urls.links:
            await scrape_recursive(related_url.url, depth + 1, max_depth, visited_urls, filename, related_url.type)

In [None]:
async def scrape_website_with_depth(url, max_depth=2, filename: str = "output.md"):
    """
    Entry point for a depth-limited crawl starting at ``url``.

    Delegates to ``scrape_recursive`` with a fresh visited list, starting at
    depth level 1 and labelling the start page "Root Page".

    Args:
        url: Starting URL to scrape
        max_depth: Maximum depth level (default: 2)
        filename: File name forwarded to ``scrape_recursive`` (default: output.md)

    Returns:
        None; the scraped content is handed to ``scrape_recursive`` for writing.
    """
    start_depth = 1
    await scrape_recursive(url, start_depth, max_depth, [], filename, "Root Page")

In [None]:
# Crawl up to 3 levels deep from the company landing page; page content is
# appended to profiles/CBTW.txt (append_to_file prefixes "profiles/").
await scrape_website_with_depth(COMPANY_URL, 3, "CBTW.txt")

In [None]:
async def fetch_page_and_summurize(url, company_name):
    """
    Build a one-level company profile: the landing page plus every relevant link.

    The landing page is fetched with ``summarize``, its relevant links are chosen
    by ``get_relevant_links``, and each linked page is fetched in batches of 8.
    The combined markdown is written to ``profiles/<company_name>.md`` through
    ``write_markdown`` and also returned.

    Args:
        url: Landing page of the company.
        company_name: Used as the output file name (``<company_name>.md``).

    Returns:
        The assembled markdown text.
    """
    print(f"start fetch {url}")
    contents = await summarize(url)
    relevant_links = await get_relevant_links(url)

    # summarize() expects a URL, not a Link model: unwrap the url field and turn
    # it into a plain string before handing it to the batch runner.
    links = relevant_links.links
    link_urls = [str(link.url) for link in links]
    results = await run_summarize_in_batches(link_urls, batch_size=8, delay=60)

    sections = [f"## Landing page:\n\n{contents}\n## Relevant Information:"]
    for link, link_url, summary in zip(links, link_urls, results):
        # Head each section with the link's label and address, not the model repr
        sections.append(f"\n\n### Link: {link.type} - {link_url}\n{summary}")
    summarized_contents = "".join(sections)

    write_markdown(summarized_contents, f"{company_name}.md")
    return summarized_contents

# Build the profile for the company landing page and print the resulting markdown.
# NOTE(review): the profile is saved as profiles/Infotrack.md although COMPANY_URL
# points at cbtw.tech — confirm that "Infotrack" is the intended company name.
print(await fetch_page_and_summurize(COMPANY_URL, "Infotrack"))