## HTML Layout structure parsing with crawl4ai

Main implementation of the HTML Layout structure parsing with crawl4ai.

In [20]:
import lxml
from lxml.html.clean import Cleaner
from collections import defaultdict
from pydantic import BaseModel, ConfigDict
from typing import List, Optional, Dict, Any

# Define the Pydantic model for our analysis results
class ElementAnalysis(BaseModel):
    """
    Represents the analysis results for a single HTML element.
    """
    model_config = ConfigDict(arbitrary_types_allowed=True)

    text_count: int
    text_ratio: float
    tag_name: str
    css_selector: str
    xpath: str
    _element: 'lxml.html.HtmlElement'

    @property
    def element(self) -> 'lxml.html.HtmlElement':
        """
        Returns the original lxml.html.HtmlElement object.
        """
        return self._element

class HtmlLayoutAnalyzer:
    """
    Analyzes an HTML document to identify elements with the most text content.
    Performs a cleaning step to remove non-text-related elements before analysis.
    Returns a list of ElementAnalysis Pydantic models.
    """

    def __init__(self, html_content: str):
        """
        Initializes the analyzer with raw HTML content, cleaning it first.

        Args:
            html_content (str): The raw HTML content as a string.
        """
        try:
            raw_tree = lxml.html.fromstring(html_content)
            self.tree = self._clean_html(raw_tree)
        except lxml.etree.ParserError as e:
            raise ValueError(f"Failed to parse HTML content: {e}")
        self.element_data = {}

    def _clean_html(self, tree):
        """
        Removes non-text-related elements and their content from the HTML tree.
        This includes 'form', 'style', and 'script' tags.

        Args:
            tree: The lxml HTML tree to clean.

        Returns:
            The cleaned lxml HTML tree.
        """
        cleaner = Cleaner(
            page_structure=False,
        )
        return cleaner.clean_html(tree)

    def _calculate_text_content(self, element) -> int:
        """
        Recursively calculates the total text content (including descendants)
        for a given HTML element.

        Args:
            element: An lxml.html element.

        Returns:
            int: The total length of text content within the element and its descendants.
        """
        text_length = 0
        if element.text:
            text_length += len(element.text.strip())
        for child in element:
            text_length += self._calculate_text_content(child)
            if child.tail:
                text_length += len(child.tail.strip())
        return text_length

    def _get_element_css_selector(self, element) -> str:
        """
        Generates a full CSS selector path for an lxml element.
        """
        if element is None:
            return ""

        path_parts = []
        current = element

        while current is not None and current.tag != 'document':
            if current.tag in [lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction, lxml.etree.Entity]:
                current = current.getparent()
                continue

            tag_name = current.tag
            path_segment = tag_name

            element_id = current.get('id')
            element_classes = current.get('class')

            if element_id:
                path_segment = f"{tag_name}#{element_id}"
            elif element_classes:
                classes = element_classes.strip().split()
                if classes:
                    class_selectors = ".".join(classes)
                    path_segment = f"{tag_name}.{class_selectors}"

            parent = current.getparent()
            if parent is not None:
                index = 1
                for sibling in parent:
                    if sibling is current:
                        break
                    if sibling.tag == tag_name and \
                       sibling.tag not in [lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction, lxml.etree.Entity]:
                        index += 1

                siblings_with_same_tag = [s for s in parent
                                          if s.tag == tag_name and
                                          s.tag not in [lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction, lxml.etree.Entity]]

                if len(siblings_with_same_tag) > 1 and not element_id:
                    path_segment = f"{path_segment}:nth-of-type({index})"

            path_parts.insert(0, path_segment)
            current = current.getparent()

        return " > ".join(path_parts)

    def _get_element_xpath(self, element) -> str:
        """
        Generates a full, absolute XPath for an lxml element.
        """
        if element is None:
            return ""

        path_parts = []
        current = element

        while current is not None and current.tag != 'document':
            if current.tag in [lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction, lxml.etree.Entity]:
                current = current.getparent()
                continue

            tag_name = current.tag
            predicate = ""

            element_id = current.get('id')
            element_classes = current.get('class')

            if element_id:
                predicate = f"[@id='{element_id}']"
            elif element_classes:
                classes = element_classes.strip().split()
                if classes:
                    class_predicates = [f"contains(concat(' ', @class, ' '), ' {cls} ')" for cls in classes]
                    predicate = f"[{' and '.join(class_predicates)}]"

            if not predicate:
                parent = current.getparent()
                if parent is not None:
                    index = 1
                    for sibling in parent:
                        if sibling is current:
                            break
                        if sibling.tag == tag_name and \
                           sibling.tag not in [lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction, lxml.etree.Entity]:
                            index += 1

                    if index > 1 or len([s for s in parent if s.tag == tag_name]) > 1:
                        predicate = f"[{index}]"

            path_parts.insert(0, f"{tag_name}{predicate}")
            current = current.getparent()

        if not path_parts:
            return "/" + element.tag

        full_path = "/" + "/".join(path_parts)
        if full_path.startswith('/html') or full_path.startswith('/*'):
             return full_path

        return "/" + "/".join(path_parts)


    def analyze_text_density(self, top_n: int = 3) -> List[ElementAnalysis]:
        """
        Traverses the HTML tree, calculates text density for each element,
        and returns the top N elements with the most text as Pydantic models.

        Args:
            top_n (int): The number of top elements to return. Defaults to 3.

        Returns:
            List[ElementAnalysis]: A list of Pydantic models, sorted by text_count.
        """
        self.element_data = {}

        body_element = self.tree.find('.//body')
        if body_element is not None:
            total_body_text_count = self._calculate_text_content(body_element)
        else:
            total_body_text_count = 0

        for element in self.tree.xpath('//*'):
            if isinstance(element.tag, str):
                text_count = self._calculate_text_content(element)

                text_ratio = 0.0
                if total_body_text_count > 0:
                    text_ratio = (text_count / total_body_text_count) * 100

                css_selector = self._get_element_css_selector(element)
                xpath = self._get_element_xpath(element)

                if css_selector or xpath:
                    self.element_data[css_selector] = {
                        "text_count": text_count,
                        "text_ratio": round(text_ratio, 2),
                        "tag_name": element.tag,
                        "css_selector": css_selector,
                        "xpath": xpath,
                        "_element": element
                    }

        sorted_elements = sorted(
            self.element_data.values(),
            key=lambda item: item["text_count"],
            reverse=True
        )

        result = []
        for data in sorted_elements[:top_n]:
            result.append(ElementAnalysis(**data))

        return result


    def determine_page_type(self) -> str:
        """
        Heuristically determines if the page is a 'detail' page or a 'listing' page
        based on the text distribution of the most text-dense elements.

        Returns:
            str: "detail" if the page has one highly dominant text block,
                 "listing" if text is distributed more evenly, or "unknown".
        """
        # Get the top 5 most text-dense elements to analyze their distribution
        top_elements = self.analyze_text_density(top_n=5)

        # Handle pages with very little content
        if not top_elements or top_elements[0].text_count < 100:  # A minimum threshold
            return "unknown"

        # Heuristic 1: Is the top element's text ratio very high?
        # A detail page often has a single element with > 50% of the body's text.
        if top_elements[0].text_ratio > 50.0:
            return "detail"

        # Heuristic 2: How does the top element compare to the next one?
        # On a listing page, the top few elements should have a similar amount of text.
        # Check if the top element is not overwhelmingly larger than the second one.
        if len(top_elements) > 1:
            ratio_of_top_to_second = top_elements[0].text_count / top_elements[1].text_count

            # If the top element is only slightly larger than the second, it's likely a listing.
            # A value like < 2.0 (i.e., top element is less than 2x larger than the second)
            if ratio_of_top_to_second < 2.0:
                 return "listing"

        # If neither of the above heuristics match, we return 'unknown'
        return "unknown"



def find_most_relevant_selectors(all_results: List[List[ElementAnalysis]], min_occurrence: int = 5) -> List[Dict[str, Any]]:
    """
    Analyzes a list of lists of ElementAnalysis models to find the most relevant
    CSS selectors based on average text_ratio and frequency.

    Args:
        all_results (List[List[ElementAnalysis]]): A list where each item is a list
                                                   of ElementAnalysis models from one page.
        min_occurrence (int): The minimum number of pages a selector must appear in
                              to be considered relevant.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, each containing a selector, its
                              average text_ratio, and its count, sorted by average ratio.
    """
    selector_dict = defaultdict(lambda: {'ratios': [], 'count': 0, 'tag_name': '', 'example_xpath': ''})

    for page_results in all_results:
        # Keep track of which selectors we've seen on this page to avoid duplicates
        seen_selectors_on_page = set()
        for element_analysis in page_results:
            selector = element_analysis.css_selector
            if selector not in seen_selectors_on_page:
                selector_dict[selector]['ratios'].append(element_analysis.text_ratio)
                selector_dict[selector]['count'] += 1
                selector_dict[selector]['tag_name'] = element_analysis.tag_name
                selector_dict[selector]['example_xpath'] = element_analysis.xpath
                seen_selectors_on_page.add(selector)

    # Calculate average ratio and filter based on min_occurrence
    aggregated_results = []
    for selector, data in selector_dict.items():
        if data['count'] >= min_occurrence:
            average_ratio = sum(data['ratios']) / len(data['ratios'])
            aggregated_results.append({
                'css_selector': selector,
                'tag_name': data['tag_name'],
                'average_text_ratio': round(average_ratio, 2),
                'occurrence_count': data['count'],
                'example_xpath': data['example_xpath']
            })

    # Sort by average text ratio in descending order
    aggregated_results.sort(key=lambda x: x['average_text_ratio'], reverse=True)

    return aggregated_results



In [21]:
detail_page_html = """
<!DOCTYPE html>
<html><body>
    <main id="main-content">
        <h1>Article Title</h1>
        <article class="main-article">
            <p>This is the start of a very long article. It has many paragraphs
            and a lot of descriptive text that makes up the bulk of the page's
            content. This is the key content area.</p>
            <p>More content here to make the page text-heavy.</p>
            <p>Even more content to dominate the text distribution.</p>
        </article>
    </main>
    <div class="sidebar">
        <p>A small amount of text here.</p>
    </div>
</body></html>
"""

listing_page_html = """
<!DOCTYPE html>
<html><body>
    <h1>Search Results</h1>
    <div class="results-list">
        <div class="result-item">
            <h2>Result 1</h2>
            <p>Short snippet of text for result 1.</p>
        </div>
        <div class="result-item">
            <h2>Result 2</h2>
            <p>Short snippet of text for result 2.</p>
        </div>
        <div class="result-item">
            <h2>Result 3</h2>
            <p>Short snippet of text for result 3.</p>
        </div>
        <div class="footer">
            <p>Some text in the footer.</p>
        </div>
    </div>
</body></html>
"""

analyzer_detail = HtmlLayoutAnalyzer(detail_page_html)
page_type_detail = analyzer_detail.determine_page_type()
print(f"Detail Page HTML Type: {page_type_detail}")

analyzer_listing = HtmlLayoutAnalyzer(listing_page_html)
page_type_listing = analyzer_listing.determine_page_type()
print(f"Listing Page HTML Type: {page_type_listing}")

# Example for an "unknown" page
empty_page_html = """
<!DOCTYPE html>
<html><body>
    <img src="logo.png">
    <a href="#">Link</a>
</body></html>
"""
analyzer_empty = HtmlLayoutAnalyzer(empty_page_html)
page_type_empty = analyzer_empty.determine_page_type()
print(f"Empty Page HTML Type: {page_type_empty}")

Detail Page HTML Type: detail
Listing Page HTML Type: detail
Empty Page HTML Type: unknown


In [17]:
# Let's simulate a list of results from 3 different HTML samples
# We'll use our analyzer to get the data for each sample.

html_sample_1 = """
<!DOCTYPE html>
<html><body>
    <main id="main-content">
        <section class="intro">
            <p>Text for the first sample, intro section.</p>
            <p>More text here.</p>
        </section>
    </main>
    <div class="sidebar">
        <p>Sidebar text, not very long.</p>
    </div>
</body></html>
"""

html_sample_2 = """
<!DOCTYPE html>
<html><body>
    <main id="main-content">
        <section class="intro">
            <p>This is the text for the second sample. It's quite a bit longer.</p>
            <p>This section is very text-dense and a key part of the page.</p>
            <p>Another paragraph to make it even more content-rich.</p>
        </section>
    </main>
    <div class="sidebar">
        <p>Short sidebar text.</p>
    </div>
</body></html>
"""

html_sample_3 = """
<!DOCTYPE html>
<html><body>
    <article id="main-content">
        <section class="intro">
            <p>Sample 3 content, very similar to the others.</p>
        </section>
    </article>
    <div class="footer">
        <p>Footer text.</p>
    </div>
</body></html>
"""

# Process each sample using the analyzer
analyzer_1 = HtmlLayoutAnalyzer(html_sample_1)
results_1 = analyzer_1.analyze_text_density(top_n=5)

analyzer_2 = HtmlLayoutAnalyzer(html_sample_2)
results_2 = analyzer_2.analyze_text_density(top_n=5)

analyzer_3 = HtmlLayoutAnalyzer(html_sample_3)
results_3 = analyzer_3.analyze_text_density(top_n=5)

# Create the list of lists of ElementAnalysis models
all_page_results = [results_1, results_2, results_3]

# Find the most relevant selectors with a minimum occurrence of 2 out of 3 samples
relevant_selectors = find_most_relevant_selectors(all_page_results, min_occurrence=2)

print("Most relevant CSS selectors based on text ratio across the dataset:")
print("-" * 60)
for selector_data in relevant_selectors:
    print(f"Selector: {selector_data['css_selector']}")
    print(f"  Tag Name: {selector_data['tag_name']}")
    print(f"  Average Text Ratio: {selector_data['average_text_ratio']}%")
    print(f"  Appeared in {selector_data['occurrence_count']} out of {len(all_page_results)} samples")
    print(f"  Example XPath: {selector_data['example_xpath']}")
    print("-" * 60)

Most relevant CSS selectors based on text ratio across the dataset:
------------------------------------------------------------
Selector: html
  Tag Name: html
  Average Text Ratio: 100.0%
  Appeared in 3 out of 3 samples
  Example XPath: /html
------------------------------------------------------------
Selector: html > body
  Tag Name: body
  Average Text Ratio: 100.0%
  Appeared in 3 out of 3 samples
  Example XPath: /html/body
------------------------------------------------------------
Selector: html > body > section.intro
  Tag Name: section
  Average Text Ratio: 78.44%
  Appeared in 2 out of 3 samples
  Example XPath: /html/body/section[contains(concat(' ', @class, ' '), ' intro ')]
------------------------------------------------------------
Selector: html > body > section.intro > p:nth-of-type(1)
  Tag Name: p
  Average Text Ratio: 40.9%
  Appeared in 2 out of 3 samples
  Example XPath: /html/body/section[contains(concat(' ', @class, ' '), ' intro ')]/p[1]
-------------------