## AI Setup

In [1]:
import google.generativeai as genai
import os
from dotenv import load_dotenv
import time

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)
GEMINI_MODEL = "gemini-1.5-flash"

In [3]:
def generate_response(prompt):
    '''
    Send user query to Gemini API and return response
    '''
    max_retries = 3
    for attempt in range(max_retries):
        try:
            model = genai.GenerativeModel(GEMINI_MODEL)
            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
                time.sleep(1)  
                continue
            else:
                return f"Error after {max_retries} attempts: {str(e)}"

# 1. Initial Scan of Website

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [5]:
def fetch_html(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        print(f"🔗 Fetching HTML from: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        print("✅ HTML fetch successful.")
        return response.text
    except Exception as e:
        print(f"[ERROR] Failed to fetch HTML from {url}: {e}")
        return None

def extract_links(html, base_url):
    soup = BeautifulSoup(html, "html.parser")
    raw_links = soup.find_all("a", href=True)
    links = [urljoin(base_url, tag['href']) for tag in raw_links]
    print(f"🔍 Found {len(links)} total links.")
    print(links)
    return links

def build_prompt(links, homepage_text=""):
    link_list = "\n".join(links)  
    return f"""
You are an AI assistant helping extract key competitor page links from a homepage.

Here is a list of links found on the homepage:
{link_list}

Based on common naming conventions, identify the most likely URLs for:
- pricing
- blog
- release notes or changelog
- Twitter
- LinkedIn
- Play Store
- App Store

Respond ONLY in this JSON format:
{{
  "pricing_url": "...",
  "blog_url": "...",
  "release_notes_url": "...",
  "linkedin_url": "...",
  "twitter_url": "...",
  "playstore_url": "...",
  "appstore_url": "..."
}}

If any link is missing, set the value to an empty string.
"""

In [6]:
def scan_website(url):
    html = fetch_html(url)
    if not html:
        return

    links = extract_links(html, url)

    print("\n📨 Sending links to Gemini...")
    prompt = build_prompt(links)
    ai_response = generate_response(prompt)

    print("\n🧠 Gemini returned:")
    print(ai_response)

In [7]:
generate_response("Hello, how are you?")  

"I'm doing well, thank you for asking! How are you today?\n"

In [8]:
scan_website("https://www.notion.com")

🔗 Fetching HTML from: https://www.notion.com
✅ HTML fetch successful.
🔍 Found 83 total links.
['https://www.notion.com/product', 'https://www.notion.com/product/ai', 'https://www.notion.com/product/docs', 'https://www.notion.com/product/wikis', 'https://www.notion.com/product/projects', 'https://www.notion.com/product/enterprise-search', 'https://www.notion.com/product/ai-meeting-notes', 'https://www.notion.com/product/forms', 'https://www.notion.com/product/sites', 'https://www.notion.com/templates', 'https://www.notion.com/integrations', 'https://www.notion.com/web-clipper', 'https://www.notion.com/download', 'https://www.notion.com/product/mail', 'https://www.notion.com/product/calendar', 'https://www.notion.com/product/ai', 'https://www.notion.com/enterprise', 'https://www.notion.com/pricing', 'https://www.notion.com/product/notion-for-product-development', 'https://www.notion.com/product/notion-for-design', 'https://www.notion.com/templates/category/marketing', 'https://www.notion

# 2. Snapshots

In [13]:
URL_info = {"pricing_url": "https://www.notion.com/pricing",
            "blog_url": "https://www.notion.com/blog",
            "release_notes_url": "https://www.notion.com/releases",
            "linkedin_url": "https://www.linkedin.com/company/notionhq",
            "twitter_url": "https://x.com/NotionHQ",
            "playstore_url": "https://play.google.com/store/apps/details?id=notion.id",
            "appstore_url": "https://apps.apple.com/app/notion/id1232780281"
            }

In [14]:
import pandas as pd

In [15]:
def scrape_urls_to_df(url_info: dict) -> pd.DataFrame:
    data = []

    headers = {'User-Agent': 'Mozilla/5.0'}

    for label, url in url_info.items():
        if not url:
            print(f"⚠️ Skipping {label} as URL is empty.")
            content = ""
        else:
            try:
                print(f"🔗 Fetching {label} from {url}")
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                content = response.text
                print(f"✅ {label} fetched successfully.")
            except Exception as e:
                print(f"❌ Failed to fetch {label} from {url}: {e}")
                content = ""

        data.append({
            "label": label,
            "url": url,
            "content": content
        })

    df = pd.DataFrame(data)
    return df

In [17]:
scrape_df = scrape_urls_to_df(URL_info)
print(scrape_df)

🔗 Fetching pricing_url from https://www.notion.com/pricing
✅ pricing_url fetched successfully.
🔗 Fetching blog_url from https://www.notion.com/blog
✅ blog_url fetched successfully.
🔗 Fetching release_notes_url from https://www.notion.com/releases
✅ release_notes_url fetched successfully.
🔗 Fetching linkedin_url from https://www.linkedin.com/company/notionhq
✅ linkedin_url fetched successfully.
🔗 Fetching twitter_url from https://x.com/NotionHQ
❌ Failed to fetch twitter_url from https://x.com/NotionHQ: 400 Client Error: Bad Request for url: https://x.com/NotionHQ
🔗 Fetching playstore_url from https://play.google.com/store/apps/details?id=notion.id
✅ playstore_url fetched successfully.
🔗 Fetching appstore_url from https://apps.apple.com/app/notion/id1232780281
✅ appstore_url fetched successfully.
               label                                                url  \
0        pricing_url                     https://www.notion.com/pricing   
1           blog_url                        

In [21]:
scrape_df.iloc[0]

label                                            pricing_url
url                           https://www.notion.com/pricing
content    <!DOCTYPE html><html lang="en-us"><head><meta ...
Name: 0, dtype: object