In [6]:
from bs4 import BeautifulSoup
import os

def element_signature(elem, include_text=True, include_position=False):
    """
    Create a normalized signature string for an element.
    
    Args:
        elem: BeautifulSoup element
        include_text: Whether to include text content in signature
        include_position: Whether to include element position (for structural comparison)
    """
    tag = elem.name
    
    # Handle attributes more safely and consistently
    attrs = []
    for k, v in sorted(elem.attrs.items()):
        # Handle different attribute value types
        if isinstance(v, list):
            v = ' '.join(v)  # For class attributes, etc.
        attrs.append(f'{k}={repr(str(v))}')
    
    attr_str = ' '.join(attrs)
    
    if include_text:
        # Better text normalization - handles multiple whitespaces
        text = ' '.join(elem.get_text(strip=True).split())
        signature = f"<{tag} {attr_str}>{text}</{tag}>"
    else:
        signature = f"<{tag} {attr_str}/>"
    
    if include_position:
        # Add position info for structural comparison
        parent_tags = [p.name for p in elem.parents if p.name]
        position = '->'.join(reversed(parent_tags[-3:]))  # Last 3 levels
        signature = f"{position}::{signature}"
    
    return signature

def extract_signatures_from_file(filepath, **kwargs):
    """Load HTML from file and extract element signatures."""
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")
    
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            html = file.read()
    except UnicodeDecodeError:
        # Try with different encoding
        with open(filepath, 'r', encoding='latin-1') as file:
            html = file.read()
    
    soup = BeautifulSoup(html, 'html.parser')
    
    # Filter out script and style tags by default
    for script in soup(["script", "style"]):
        script.decompose()
    
    return set(element_signature(el, **kwargs) for el in soup.find_all(True))

def compare_html_files(file1, file2, **signature_kwargs):
    """
    Compare two HTML files and return differences.
    
    Args:
        file1, file2: File paths to compare
        **signature_kwargs: Arguments passed to element_signature function
    """
    try:
        sigs1 = extract_signatures_from_file(file1, **signature_kwargs)
        sigs2 = extract_signatures_from_file(file2, **signature_kwargs)
        
        new_elements = sigs2 - sigs1
        removed_elements = sigs1 - sigs2
        common_elements = sigs1 & sigs2
        
        return {
            'new': new_elements,
            'removed': removed_elements,
            'common': common_elements,
            'total_file1': len(sigs1),
            'total_file2': len(sigs2)
        }
    except Exception as e:
        print(f"Error comparing files: {e}")
        return None

def print_comparison_results(results, file1, file2):
    """Print formatted comparison results."""
    if not results:
        return
    
    print(f"\n📊 Comparison Summary:")
    print(f"   {file1}: {results['total_file1']} elements")
    print(f"   {file2}: {results['total_file2']} elements")
    print(f"   Common: {len(results['common'])} elements")
    
    if results['new']:
        print(f"\n🆕 New elements in {file2} ({len(results['new'])} total):")
        for el in sorted(results['new']):
            print(f"   {el}")
    else:
        print(f"\n✅ No new elements in {file2}")
    
    if results['removed']:
        print(f"\n❌ Removed elements from {file1} ({len(results['removed'])} total):")
        for el in sorted(results['removed']):
            print(f"   {el}")
    else:
        print(f"\n✅ No elements removed from {file1}")

# Example usage
if __name__ == "__main__":
    # File paths
    file1 = 'html (5).html'
    file2 = 'html (6).html'
    
    # Basic comparison (your original approach, improved)
    print("=== BASIC COMPARISON ===")
    results = compare_html_files(file1, file2)
    if results:
        print_comparison_results(results, file1, file2)
    
    # Structure-only comparison (ignoring text content)
    print("\n=== STRUCTURE-ONLY COMPARISON ===")
    results_structure = compare_html_files(file1, file2, include_text=False)
    if results_structure:
        print_comparison_results(results_structure, file1, file2)
    
    # Positional comparison (includes element hierarchy)
    print("\n=== POSITIONAL COMPARISON ===")
    results_position = compare_html_files(file1, file2, include_position=True)
    if results_position:
        print_comparison_results(results_position, file1, file2)

=== BASIC COMPARISON ===

📊 Comparison Summary:
   html (5).html: 473 elements
   html (6).html: 487 elements
   Common: 464 elements

🆕 New elements in html (6).html (23 total):
   <button aria-checked='false' class='toolbarButton labeled' data-l10n-id='pdfjs-cursor-hand-tool-button' disabled='' id='cursorHandTool' role='radio' tabindex='0' title='Enable Hand Tool' type='button'>Hand Tool</button>
   <button aria-checked='true' class='toolbarButton labeled toggled' data-l10n-id='pdfjs-cursor-text-select-tool-button' disabled='' id='cursorSelectTool' role='radio' tabindex='0' title='Enable Text Selection Tool' type='button'>Text Selection Tool</button>
   <button aria-controls='editorStampParamsToolbar' aria-expanded='true' aria-haspopup='true' class='toolbarButton toggled' data-l10n-id='pdfjs-editor-stamp-button' id='editorStampButton' role='radio' tabindex='0' title='Add or edit images' type='button'>Add or edit images</button>
   <button aria-selected='false' data-color='#FFFF98' da

In [4]:
from bs4 import BeautifulSoup

def element_signature(elem):
    """Create a normalized signature string for an element."""
    tag = elem.name
    attrs = sorted(f'{k}="{v}"' for k, v in elem.attrs.items())
    text = elem.get_text(strip=True)
    return f"<{tag} {' '.join(attrs)}>{text}</{tag}>"

def extract_signatures_from_file(filepath):
    """Load HTML from file and extract element signatures."""
    with open(filepath, 'r', encoding='utf-8') as file:
        html = file.read()
    soup = BeautifulSoup(html, 'html.parser')
    return set(element_signature(el) for el in soup.find_all(True))  # True gets all tags

def compare_html_files(file1, file2):
    sigs1 = extract_signatures_from_file(file1)
    sigs2 = extract_signatures_from_file(file2)

    new_elements = sigs2 - sigs1
    removed_elements = sigs1 - sigs2

    return new_elements, removed_elements

# File paths
file1 = 'html (5).html'
file2 = 'html (6).html'

new, removed = compare_html_files(file1, file2)

print("🆕 New elements in htm2.html:")
for el in new:
    print(el)

print("\n❌ Removed elements (present in htm1.html, missing in htm2.html):")
for el in removed:
    print(el)



🆕 New elements in htm2.html:
<div class="['editToolbar']" role="toolbar">Alt textAdded test</div>
<button aria-controls="editorStampParamsToolbar" aria-expanded="true" aria-haspopup="true" class="['toolbarButton', 'toggled']" data-l10n-id="pdfjs-editor-stamp-button" id="editorStampButton" role="radio" tabindex="0" title="Add or edit images" type="button">Add or edit images</button>
<div aria-label="Page ⁨1⁩" class="['page']" data-l10n-args="{"page":1}" data-l10n-id="pdfjs-page-landmark" data-loaded="true" data-page-number="1" role="region" style="width: round(down, var(--total-scale-factor) * 612px, var(--scale-round-x)); height: round(down, var(--total-scale-factor) * 792px, var(--scale-round-y)); --scale-round-x: 2px; --scale-round-y: 2px;">Trace-based Just-in-Time Type Specialization for DynamicLanguagesAndreas Gal∗+, Brendan Eich∗, Mike Shaver∗, David Anderson∗, David Mandelin∗,Mohammad R. Haghighat$, Blake Kaplan∗, Graydon Hoare∗, Boris Zbarsky∗, Jason Orendorff∗,Jesse Ruderman∗, 