# Website Analyzer

- Detect if the website supports multiple languages
- Detect if bot detection is enabled to provide proper crawling configuration
- See if text mode can be used to improve crawling speed
- Check if a sitemap is available and if it is updated regularly

### Test URLs

- https://eur-lex.europa.eu/homepage.html
-

In [2]:
from urllib.parse import urlparse

# Page the analysis starts from.
target_url = "https://eur-lex.europa.eu/homepage.html"
# Output file is named "<host>.json". It is derived from target_url so the two
# cannot drift apart when a different site is analyzed.
target_file = f"{urlparse(target_url).netloc}.json"

### Output Result

In [7]:
from pydantic import BaseModel, Field
from typing import Optional, List
class SiteMetadata(BaseModel):
    """Record of what has been found out about a target website.

    Every field is optional and defaults to ``None``.
    """

    # NOTE: there must be no trailing comma after Field(...). With a trailing
    # comma the right-hand side becomes a one-element tuple, so the field
    # default would be ``(FieldInfo,)`` instead of ``None``.
    home_page_url: Optional[str] = Field(default=None, description="The URL of the home page")
    language_codes: Optional[List[str]] = Field(default=None, description="List of language codes supported by the website")
    bot_detection_enabled: Optional[bool] = Field(default=None, description="Whether bot detection is enabled")
    text_mode_enabled: Optional[bool] = Field(default=None, description="Whether text mode is enabled for crawling")
    has_sitemap: Optional[bool] = Field(default=None, description="Indicate if a sitemap is available for URL discovery")

In [None]:
# Create the record with no arguments, so every field starts at its declared default.
target_site = SiteMetadata()

In [None]:
# Deep crawling example: Explore a website dynamically
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.models import CrawlResultContainer
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
import json
# Configure a 2-level, breadth-first deep crawl for the arun() call below
config = CrawlerRunConfig(
    deep_crawl_strategy=BFSDeepCrawlStrategy(
        max_depth=2,           # Crawl 2 levels deep
        include_external=False, # Stay within domain
        max_pages=50           # Limit for efficiency
    ),
    verbose=True
)

# NOTE: top-level `async with` / `await` is not valid in a plain Python script;
# this cell relies on an environment with top-level await (e.g. a notebook cell).
async with AsyncWebCrawler() as crawler:
    # Start crawling and follow links dynamically
    results: CrawlResultContainer = await crawler.arun(target_url, config=config)
    print(f"Discovered and crawled {len(results)} pages")
    # Preview only the first three results; 'depth' falls back to 0 when the
    # key is missing from a result's metadata
    for result in results[:3]:
        print(f"Found: {result.url} at depth {result.metadata.get('depth', 0)}")
    # Save data for later reuse: one model_dump() dict per result, as a JSON list
    with open("website_data.json", "w", encoding="utf-8") as f:
        f.write(json.dumps([ r.model_dump() for r in results]))


In [6]:
# URL seeding example: Analyze all documentation
from crawl4ai import AsyncUrlSeeder, SeedingConfig

seeder = AsyncUrlSeeder()
config = SeedingConfig(
    source="sitemap",
    extract_head=True,
    max_urls=100,
    verbose=True,
)

# Get ALL documentation URLs instantly
sitemap_urls = await seeder.urls(target_url, config)
# 1000+ URLs discovered in seconds!
if len(sitemap_urls) > 0:
    print(f"Discovered {len(sitemap_urls)} URLs from sitemap")
else:
    print("No sitemap found. Fallback to use common crawl")
    sitemap_urls = await seeder.urls(target_url, config=SeedingConfig(
    source="cc",
    extract_head=True,
    max_urls=100,
    verbose=True,
))
    print(f"Discovered {len(sitemap_urls)} URLs from common crawl")

No sitemap found. Fallback to use common crawl
Discovered 100 URLs from common crawl


In [14]:
import json
# Persist the discovered URL records next to the main output file,
# pretty-printed with a 4-space indent.
meta_path = target_file + ".meta.json"
with open(meta_path, "w", encoding="utf-8") as meta_file:
    serialized = json.dumps(sitemap_urls, indent=4)
    meta_file.write(serialized)

In [1]:

import re
from collections import Counter
from urllib.parse import urlparse

def detect_multi_language(sitemap_urls, threshold=0.1):
    """
    Detect if a website is multi-language based on sitemap URLs.

    A URL counts as language-tagged when its path or host contains a
    two-letter lowercase segment such as ``/en/``, ``/en-us/`` or ``.fr.``.
    The check is purely syntactic: any such segment matches, whether or not
    it is a real language code. At most one indicator is counted per URL.

    :param sitemap_urls: Sized collection of URLs. Each entry is either a URL
        string or a dict holding the URL under the ``"url"`` key. Entries of
        any other shape, and URLs that cannot be parsed, count towards the
        total but never as language-tagged.
    :param threshold: Minimum proportion of URLs that should have language indicators
    :return: Tuple (is_multi_language, detected_languages), where
        detected_languages lists the distinct codes in first-seen order.
        An empty input yields ``(False, [])``.
    """
    # Compiled once up front instead of being re-looked-up for every URL.
    language_patterns = [
        re.compile(r'/([a-z]{2})/'),  # matches /en/, /fr/, etc.
        re.compile(r'/([a-z]{2})-[a-z]{2}/'),  # matches /en-us/, /fr-ca/, etc.
        re.compile(r'\.([a-z]{2})\.'),  # matches .en., .fr., etc. in subdomains
    ]

    total_urls = len(sitemap_urls)
    if not total_urls:
        # Nothing to analyze; also avoids dividing by zero below.
        return False, []

    language_counts = Counter()

    for entry in sitemap_urls:
        # Accept both plain URL strings and {"url": ...} records.
        url = entry.get("url") if isinstance(entry, dict) else entry
        if not isinstance(url, str):
            continue

        try:
            parsed_url = urlparse(url)
        except ValueError:
            # A single malformed URL should not abort the whole analysis.
            continue

        for pattern in language_patterns:
            match = pattern.search(parsed_url.path) or pattern.search(parsed_url.netloc)
            if match:
                # Count only the first indicator so that each URL contributes
                # at most 1 to the proportion compared against the threshold.
                language_counts[match.group(1)] += 1
                break

    # Determine if it's multi-language
    total_language_urls = sum(language_counts.values())
    is_multi_language = (total_language_urls / total_urls) > threshold

    # Get the list of detected languages
    detected_languages = list(language_counts)

    return is_multi_language, detected_languages

# Run the detection on sitemap_urls and report the outcome
is_multi_language, detected_languages = detect_multi_language(sitemap_urls)

print(f"Is multi-language site: {is_multi_language}")
print(f"Detected languages: {', '.join(detected_languages)}")

# Store the codes on target_site only when the site qualified as
# multi-language; otherwise set the field to None
if is_multi_language:
    target_site.language_codes = detected_languages
else:
    target_site.language_codes = None

NameError: name 'sitemap_urls' is not defined