In [None]:

# Install required packages
!pip install aiohttp opencv-python pillow pyarrow scikit-learn scipy matplotlib seaborn
import asyncio
import aiohttp
import numpy as np
import cv2
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re
import json
import hashlib
import io
import os
import random
from datetime import datetime
from urllib.parse import urljoin, urlparse
from collections import defaultdict
import time
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# For Fourier analysis
from scipy.fft import fft2, fftshift
from skimage import filters, transform
from sklearn.metrics.pairwise import cosine_similarity

# Use notebook-internal class definitions only
USE_EXTERNAL_CLASSES = False

print("All imports successful - using notebook-internal classes")

In [None]:
class LightningParquetProcessor:
    """Parquet loading helpers for the website list (4000+ websites).

    Both helpers are static; the class only groups them under one name.
    """

    @staticmethod
    def load_parquet_fast(file_path: str, sample_size: Optional[int] = None) -> pd.DataFrame:
        """Read a parquet file through PyArrow and optionally down-sample it.

        Args:
            file_path: Path handed to ``pyarrow.parquet.read_table``.
            sample_size: When truthy and smaller than the row count, that many
                rows are drawn with ``random_state=42`` so repeated runs select
                the same rows. ``None`` or ``0`` keeps every row.

        Returns:
            The loaded (and possibly sampled) DataFrame.
        """
        print(f"Loading parquet: {file_path}")
        start_time = time.time()

        # Imported here so the rest of the class stays usable without pyarrow
        import pyarrow.parquet as pq
        df = pq.read_table(file_path).to_pandas()
        total_rows = len(df)

        # Sample if requested
        if sample_size and total_rows > sample_size:
            df = df.sample(n=sample_size, random_state=42)
            print(f" Sampled {sample_size} from {total_rows} total websites")

        elapsed = time.time() - start_time
        print(f" Loaded {len(df)} websites in {elapsed:.2f}s")

        return df

    @staticmethod
    def get_website_column(df: pd.DataFrame) -> str:
        """Pick the column most likely to hold website addresses.

        Lookup order:
            1. a column named exactly ``website``, ``url``, ``domain``, ``site``
               or ``link`` (checked in that priority order);
            2. the same names compared case-insensitively (``URL``, ``Website``);
            3. the first column whose lower-cased name contains ``web``, ``url``
               or ``domain``;
            4. the first column of the frame.

        Column labels are compared through ``str()`` so non-string labels
        (for example integer positions) do not break the search.

        Raises:
            IndexError: If ``df`` has no columns at all.
        """
        preferred_names = ['website', 'url', 'domain', 'site', 'link']
        name_hints = ['web', 'url', 'domain']

        # 1. Exact, case-sensitive hit
        for wanted in preferred_names:
            if wanted in df.columns:
                return wanted

        # Lower-cased view of every label, computed once for the remaining passes
        lowered = [(col, str(col).lower()) for col in df.columns]

        # 2. Exact hit ignoring case, still honouring the priority order
        for wanted in preferred_names:
            for col, name in lowered:
                if name == wanted:
                    return col

        # 3. Substring hit
        for col, name in lowered:
            if any(hint in name for hint in name_hints):
                return col

        # 4. Default to first column
        return df.columns[0]

In [None]:
class EnhancedAPILogoExtractor:
    """Enhanced logo extraction with massive API pool + DNS discovery for 98%+ success rate"""
    
    def __init__(self):
        self.session = None
        # MEGA-EXPANDED API pool - 49 services across 8 tiers including DNS discovery
        self.logo_apis = [
            # Tier 1: Premium/Fast APIs (Highest quality, fastest)
            {
                'name': 'Clearbit',
                'url': 'https://logo.clearbit.com/{domain}',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 1
            },
            {
                'name': 'LogoAPI',
                'url': 'https://api.logo.dev/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 1
            },
            {
                'name': 'BrandAPI',
                'url': 'https://logo.api.brand.io/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 1
            },
            {
                'name': 'Brandfetch',
                'url': 'https://api.brandfetch.io/v2/brands/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 1
            },
            {
                'name': 'LogoGrab',
                'url': 'https://api.logograb.com/v1/logo/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 1
            },
            
            # Tier 2: Google & Microsoft Services (Very reliable)
            {
                'name': 'Google Favicon',
                'url': 'https://www.google.com/s2/favicons',
                'params': {'domain': '{domain}', 'sz': '128'},
                'headers': {},
                'timeout': 2,
                'tier': 2
            },
            {
                'name': 'Google Favicon HD',
                'url': 'https://www.google.com/s2/favicons',
                'params': {'domain': '{domain}', 'sz': '256'},
                'headers': {},
                'timeout': 3,
                'tier': 2
            },
            {
                'name': 'Google Favicon XL',
                'url': 'https://www.google.com/s2/favicons',
                'params': {'domain': '{domain}', 'sz': '512'},
                'headers': {},
                'timeout': 3,
                'tier': 2
            },
            {
                'name': 'Microsoft Bing',
                'url': 'https://www.bing.com/th',
                'params': {'id': 'OIP.{domain}', 'w': '128', 'h': '128', 'c': '7', 'r': '0', 'o': '5'},
                'headers': {},
                'timeout': 4,
                'tier': 2
            },
            {
                'name': 'DuckDuckGo Favicon',
                'url': 'https://icons.duckduckgo.com/ip3/{domain}.ico',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 2
            },
            
            # Tier 3: Alternative Favicon Services & CDNs
            {
                'name': 'Favicon.io',
                'url': 'https://favicons.githubusercontent.com/{domain}',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 3
            },
            {
                'name': 'Icons8',
                'url': 'https://img.icons8.com/color/128/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 3
            },
            {
                'name': 'Favicon Kit',
                'url': 'https://www.faviconkit.com/{domain}/128',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 3
            },
            {
                'name': 'Favicon Grabber',
                'url': 'https://favicongrabber.com/api/grab/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 3
            },
            {
                'name': 'GetFavicon',
                'url': 'https://getfavicon.appspot.com/{domain}',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 3
            },
            {
                'name': 'Besticon',
                'url': 'https://besticon-demo.herokuapp.com/icon',
                'params': {'url': 'https://{domain}', 'size': '128'},
                'headers': {},
                'timeout': 4,
                'tier': 3
            },
            {
                'name': 'Iconscout',
                'url': 'https://cdn.iconscout.com/icon/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 3
            },
            
            # Tier 4: Social Media & Directory APIs
            {
                'name': 'Wikipedia',
                'url': 'https://en.wikipedia.org/api/rest_v1/page/summary/{domain}',
                'params': {},
                'headers': {},
                'timeout': 5,
                'tier': 4
            },
            {
                'name': 'Wikidata',
                'url': 'https://www.wikidata.org/w/api.php',
                'params': {'action': 'wbsearchentities', 'search': '{domain}', 'format': 'json', 'language': 'en'},
                'headers': {},
                'timeout': 5,
                'tier': 4
            },
            {
                'name': 'Company Logo DB',
                'url': 'https://logo.clearbitjs.com/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 4
            },
            {
                'name': 'LogoTyp',
                'url': 'https://logotyp.us/logo/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 4
            },
            {
                'name': 'OpenCorporates',
                'url': 'https://api.opencorporates.com/companies/search',
                'params': {'q': '{domain}', 'format': 'json'},
                'headers': {},
                'timeout': 5,
                'tier': 4
            },
            
            # Tier 5: Web Archive & Metadata
            {
                'name': 'Internet Archive',
                'url': 'https://web.archive.org/cdx/search/cdx',
                'params': {'url': '{domain}/favicon.ico', 'output': 'json', 'limit': '1'},
                'headers': {},
                'timeout': 6,
                'tier': 5
            },
            {
                'name': 'Archive Today',
                'url': 'https://archive.today/timemap/json/{domain}',
                'params': {},
                'headers': {},
                'timeout': 6,
                'tier': 5
            },
            {
                'name': 'Logo Garden',
                'url': 'https://www.logoground.com/api/logo/{domain}',
                'params': {},
                'headers': {},
                'timeout': 5,
                'tier': 5
            },
            
            # Tier 6: Direct Website Scraping (High success fallback)
            {
                'name': 'Direct Favicon',
                'url': 'https://{domain}/favicon.ico',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Apple Touch Icon',
                'url': 'https://{domain}/apple-touch-icon.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Apple Touch Icon 152',
                'url': 'https://{domain}/apple-touch-icon-152x152.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Apple Touch Icon 180',
                'url': 'https://{domain}/apple-touch-icon-180x180.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Android Chrome 192',
                'url': 'https://{domain}/android-chrome-192x192.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Android Chrome 512',
                'url': 'https://{domain}/android-chrome-512x512.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Site Logo PNG',
                'url': 'https://{domain}/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Site Logo SVG',
                'url': 'https://{domain}/logo.svg',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Assets Logo',
                'url': 'https://{domain}/assets/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Images Logo',
                'url': 'https://{domain}/images/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Static Logo',
                'url': 'https://{domain}/static/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Brand Logo',
                'url': 'https://{domain}/brand/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            
            # Tier 7: Alternative domains and variations  
            {
                'name': 'WWW Favicon',
                'url': 'https://www.{domain}/favicon.ico',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 7
            },
            {
                'name': 'WWW Logo',
                'url': 'https://www.{domain}/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 7
            },
            {
                'name': 'CDN Logo',
                'url': 'https://cdn.{domain}/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 7
            },
            {
                'name': 'Media Logo',
                'url': 'https://media.{domain}/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 7
            },
            
            # Tier 8: DNS & WHOIS-Based Logo Discovery 
            {
                'name': 'DNS-over-HTTPS Logo TXT',
                'url': 'https://cloudflare-dns.com/dns-query',
                'params': {'name': 'logo.{domain}', 'type': 'TXT', 'ct': 'application/dns-json'},
                'headers': {'accept': 'application/dns-json'},
                'timeout': 5,
                'tier': 8,
                'dns_query': True
            },
            {
                'name': 'DNS-over-HTTPS Brand TXT',
                'url': 'https://cloudflare-dns.com/dns-query',
                'params': {'name': 'brand.{domain}', 'type': 'TXT', 'ct': 'application/dns-json'},
                'headers': {'accept': 'application/dns-json'},
                'timeout': 5,
                'tier': 8,
                'dns_query': True
            },
            {
                'name': 'DNS-over-HTTPS Assets TXT',
                'url': 'https://cloudflare-dns.com/dns-query',
                'params': {'name': 'assets.{domain}', 'type': 'TXT', 'ct': 'application/dns-json'},
                'headers': {'accept': 'application/dns-json'},
                'timeout': 5,
                'tier': 8,
                'dns_query': True
            },
            {
                'name': 'Google DNS Logo TXT',
                'url': 'https://dns.google/resolve',
                'params': {'name': 'logo.{domain}', 'type': 'TXT'},
                'headers': {},
                'timeout': 5,
                'tier': 8,
                'dns_query': True
            },
            {
                'name': 'WHOIS Brand API',
                'url': 'https://www.whoisxmlapi.com/whoisserver/WhoisService',
                'params': {'domainName': '{domain}', 'outputFormat': 'JSON', 'apiKey': 'demo'},
                'headers': {},
                'timeout': 6,
                'tier': 8,
                'whois_query': True
            },
            {
                'name': 'Domain Tools Logo',
                'url': 'https://api.domaintools.com/v1/{domain}/hosting-history',
                'params': {'format': 'json'},
                'headers': {},
                'timeout': 6,
                'tier': 8,
                'domain_meta': True
            },
            {
                'name': 'SecurityTrails DNS',
                'url': 'https://api.securitytrails.com/v1/domain/{domain}/subdomains',
                'params': {},
                'headers': {'APIKEY': 'demo'},
                'timeout': 6,
                'tier': 8,
                'subdomain_scan': True
            },
            {
                'name': 'VirusTotal Domain',
                'url': 'https://www.virustotal.com/vtapi/v2/domain/report',
                'params': {'domain': '{domain}', 'apikey': 'demo'},
                'headers': {},
                'timeout': 6,
                'tier': 8,
                'domain_intel': True
            }
        ]
    
    async def __aenter__(self):
        timeout = aiohttp.ClientTimeout(total=20)  # Increased timeout for more APIs
        connector = aiohttp.TCPConnector(limit=400, limit_per_host=150)  # Higher limits
        self.session = aiohttp.ClientSession(
            timeout=timeout,
            connector=connector,
            headers={'User-Agent': 'LogoMatcher/3.0 Ultra-Enhanced'}
        )
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()
    
    def clean_domain(self, website: str) -> str:
        """Extract clean domain from website URL"""
        if website.startswith(('http://', 'https://')):
            from urllib.parse import urlparse
            parsed = urlparse(website)
            domain = parsed.netloc
            # Remove www. prefix for cleaner API calls
            if domain.startswith('www.'):
                domain = domain[4:]
            return domain
        return website
    
    async def try_api_service(self, api_config: dict, domain: str) -> Optional[Dict]:
        """Try a single API service for logo"""
        try:
            # Format URL
            if '{domain}' in api_config['url']:
                url = api_config['url'].format(domain=domain)
            else:
                url = api_config['url']
            
            # Format params
            params = {}
            for key, value in api_config.get('params', {}).items():
                if '{domain}' in str(value):
                    params[key] = value.format(domain=domain)
                else:
                    params[key] = value
            
            # Make request
            timeout = aiohttp.ClientTimeout(total=api_config['timeout'])
            async with self.session.get(
                url, 
                params=params,
                headers=api_config.get('headers', {}),
                timeout=timeout,
                allow_redirects=True  # Follow redirects for better coverage
            ) as response:
                
                if response.status == 200:
                    content_type = response.headers.get('content-type', '')
                    
                    # Handle different response types
                    if 'image' in content_type:
                        content = await response.read()
                        if len(content) > 200:  # Lowered threshold for more logos
                            return {
                                'data': content,
                                'url': str(response.url),
                                'content_type': content_type,
                                'size': len(content)
                            }
                    
                    elif 'json' in content_type or api_config.get('dns_query') or api_config.get('whois_query'):
                        # Handle JSON responses (Wikipedia, Wikidata, DNS, WHOIS, etc.)
                        json_data = await response.json()
                        logo_url = self.extract_logo_from_json(json_data, api_config['name'])
                        if logo_url:
                            # Download the actual logo
                            logo_result = await self.download_logo_from_url(logo_url)
                            if logo_result:
                                return logo_result
                
        except Exception as e:
            # Silent fail for speed - but we can uncomment for debugging
            # print(f"API {api_config['name']} failed for {domain}: {e}")
            pass
        
        return None
    
    def extract_logo_from_json(self, json_data: dict, api_name: str) -> Optional[str]:
        """Extract logo URL from JSON API responses"""
        try:
            if api_name == 'Wikipedia':
                if 'thumbnail' in json_data and 'source' in json_data['thumbnail']:
                    return json_data['thumbnail']['source']
                elif 'originalimage' in json_data and 'source' in json_data['originalimage']:
                    return json_data['originalimage']['source']
            
            elif api_name == 'Wikidata':
                if 'search' in json_data and json_data['search']:
                    for item in json_data['search']:
                        if 'display' in item and 'label' in item['display']:
                            # This would need additional API calls to get the actual logo
                            pass
            
            elif api_name == 'Favicon Grabber':
                if 'icons' in json_data and json_data['icons']:
                    # Return the largest icon
                    largest_icon = max(json_data['icons'], key=lambda x: x.get('sizes', '0x0').split('x')[0])
                    return largest_icon.get('src')
            
            elif api_name == 'OpenCorporates':
                if 'results' in json_data and json_data['results']:
                    for company in json_data['results']['companies']:
                        if 'company' in company and 'registry_url' in company['company']:
                            # Additional processing could extract logos from company pages
                            pass
            
            # DNS-based Logo Discovery
            elif 'DNS Logo TXT' in api_name or 'DNS Brand TXT' in api_name or 'DNS Assets TXT' in api_name:
                # Parse DNS TXT records for logo URLs
                if 'Answer' in json_data:
                    for record in json_data['Answer']:
                        if record.get('type') == 16:  # TXT record
                            txt_data = record.get('data', '')
                            # Look for logo URLs in TXT records
                            logo_url = self.extract_logo_url_from_txt(txt_data)
                            if logo_url:
                                return logo_url
                elif 'answer' in json_data:  # Google DNS format
                    for record in json_data['answer']:
                        if record.get('type') == 16:
                            txt_data = record.get('data', '')
                            logo_url = self.extract_logo_url_from_txt(txt_data)
                            if logo_url:
                                return logo_url
            
            elif api_name == 'WHOIS Brand API':
                # Extract logo info from WHOIS data
                whois_data = json_data.get('WhoisRecord', {})
                registrant = whois_data.get('registrant', {})
                if 'organization' in registrant:
                    # Could cross-reference with other APIs
                    pass
            
            elif api_name == 'SecurityTrails DNS':
                # Look for logo-related subdomains
                if 'subdomains' in json_data:
                    for subdomain in json_data['subdomains']:
                        if any(keyword in subdomain.lower() for keyword in ['logo', 'brand', 'assets', 'cdn', 'static']):
                            # Try common logo paths on these subdomains
                            potential_url = f"https://{subdomain}.{json_data.get('domain', '')}/logo.png"
                            return potential_url
                        
        except Exception:
            pass
        
        return None
    
    def extract_logo_url_from_txt(self, txt_data: str) -> Optional[str]:
        """Extract logo URL from DNS TXT record data"""
        import re
        
        # Common TXT record patterns for logo URLs
        patterns = [
            r'logo[_-]?url[=:]\s*([^\s"\']+)',  # logo_url=https://...
            r'brand[_-]?logo[=:]\s*([^\s"\']+)',  # brand_logo=https://...
            r'icon[_-]?url[=:]\s*([^\s"\']+)',   # icon_url=https://...
            r'(https?://[^\s"\']+\.(?:png|jpg|jpeg|svg|gif|webp))',  # Direct URL patterns
            r'assets[=:]\s*([^\s"\']+)',  # assets=https://cdn.../logo.png
        ]
        
        for pattern in patterns:
            match = re.search(pattern, txt_data, re.IGNORECASE)
            if match:
                url = match.group(1)
                if url.startswith(('http://', 'https://')):
                    return url
        
        return None
    
    async def download_logo_from_url(self, logo_url: str) -> Optional[Dict]:
        """Download logo from extracted URL"""
        try:
            timeout = aiohttp.ClientTimeout(total=5)
            async with self.session.get(logo_url, timeout=timeout, allow_redirects=True) as response:
                if response.status == 200:
                    content_type = response.headers.get('content-type', '')
                    if 'image' in content_type:
                        content = await response.read()
                        if len(content) > 200:
                            return {
                                'data': content,
                                'url': logo_url,
                                'content_type': content_type,
                                'size': len(content)
                            }
        except Exception:
            pass
        return None
    
    async def extract_logo_tiered(self, website: str, max_tier: int = 8) -> Dict:
        """Extract logo using expanded tiered API approach for 97%+ success"""
        domain = self.clean_domain(website)
        
        result = {
            'website': website,
            'domain': domain,
            'logo_found': False,
            'logo_url': None,
            'logo_data': None,
            'method': 'ultra_enhanced_api',
            'api_service': None,
            'tier_used': None,
            'attempts': 0,
            'error': None
        }
        
        # Try APIs by tier for maximum efficiency
        for tier in range(1, max_tier + 1):
            tier_apis = [api for api in self.logo_apis if api.get('tier') == tier]
            
            # Try all APIs in current tier concurrently
            if tier_apis:
                tasks = [self.try_api_service(api_config, domain) for api_config in tier_apis]
                tier_results = await asyncio.gather(*tasks, return_exceptions=True)
                
                # Check for success in this tier
                for i, logo_result in enumerate(tier_results):
                    if isinstance(logo_result, dict) and logo_result:
                        result.update({
                            'logo_found': True,
                            'logo_url': logo_result['url'],
                            'logo_data': logo_result['data'],
                            'method': 'ultra_enhanced_api',
                            'api_service': tier_apis[i]['name'],
                            'tier_used': tier,
                            'attempts': result['attempts'] + len(tier_apis)
                        })
                        return result
                
                result['attempts'] += len(tier_apis)
                
                # Brief pause between tiers (less for early tiers)
                if tier <= 4:
                    await asyncio.sleep(0.1)
                else:
                    await asyncio.sleep(0.2)  # Longer pause for slower tiers
        
        result['error'] = f'All {result["attempts"]} APIs failed'
        return result
    
    async def extract_logo_exhaustive_retry(self, website: str, max_tier: int = 7) -> Dict:
        """
        EXHAUSTIVE RETRY: Try failed websites against ALL APIs in random order
        This maximizes success rate by trying different API combinations
        """
        domain = self.clean_domain(website)
        
        result = {
            'website': website,
            'domain': domain,
            'logo_found': False,
            'logo_url': None,
            'logo_data': None,
            'method': 'exhaustive_retry',
            'api_service': None,
            'tier_used': None,
            'attempts': 0,
            'error': None
        }
        
        # Get ALL APIs up to max_tier and shuffle them for random order
        import random
        all_apis = [api for api in self.logo_apis if api.get('tier', 1) <= max_tier]
        random.shuffle(all_apis)  # Random order for better coverage
        
        print(f"Exhaustive retry for {domain}: trying {len(all_apis)} APIs")
        
        # Try APIs in smaller chunks to be respectful
        chunk_size = 5
        for i in range(0, len(all_apis), chunk_size):
            chunk = all_apis[i:i + chunk_size]
            
            # Try chunk concurrently
            tasks = [self.try_api_service(api_config, domain) for api_config in chunk]
            chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
            
            # Check for success in this chunk
            for j, logo_result in enumerate(chunk_results):
                if isinstance(logo_result, dict) and logo_result:
                    result.update({
                        'logo_found': True,
                        'logo_url': logo_result['url'],
                        'logo_data': logo_result['data'],
                        'method': 'exhaustive_retry',
                        'api_service': chunk[j]['name'],
                        'tier_used': chunk[j]['tier'],
                        'attempts': result['attempts'] + len(chunk)
                    })
                    print(f"Retry success for {domain}: {chunk[j]['name']}")
                    return result
            
            result['attempts'] += len(chunk)
            
            # Brief pause between chunks
            await asyncio.sleep(0.1)
        
        result['error'] = f'Exhaustive retry failed: {result["attempts"]} APIs tried'
        return result
    
    async def batch_extract_logos_enhanced(self, websites: List[str], max_tier: int = 8) -> List[Dict]:
        """Extract logos for many websites in concurrent batches, then retry failures.

        Websites are processed in batches of 30 through ``extract_logo_tiered``.
        When at most 50 websites failed, each failure is retried once through
        ``extract_logo_exhaustive_retry``. Progress and a summary are printed.

        Args:
            websites: Website identifiers forwarded to the extraction coroutines.
            max_tier: Highest API tier to use (forwarded to the extractors).

        Returns:
            One result dict per input website, in input order. Entries built
            here for raised exceptions carry ``website``, ``logo_found=False``
            and ``error``. An empty input yields an empty list.
        """
        print(f"ULTRA-ENHANCED API extraction: {len(websites)} websites")
        print(f"Using {len([api for api in self.logo_apis if api.get('tier', 1) <= max_tier])} APIs across {max_tier} tiers")
        start_time = time.time()

        # Nothing to process: return early so the rate computations below
        # never divide by len(websites) == 0.
        if not websites:
            print("No websites supplied - nothing to extract")
            return []

        # Process websites in optimal batch size
        batch_size = 30  # Smaller batches for more APIs
        all_results = []
        total_batches = (len(websites) - 1) // batch_size + 1

        for i in range(0, len(websites), batch_size):
            batch = websites[i:i + batch_size]
            batch_num = i // batch_size + 1

            print(f"   Batch {batch_num}/{total_batches}: {len(batch)} websites")

            # Process batch concurrently; exceptions come back as values
            tasks = [self.extract_logo_tiered(website, max_tier) for website in batch]
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)

            # Keep dict results, turn anything else into a failure record
            for website, result in zip(batch, batch_results):
                if isinstance(result, dict):
                    all_results.append(result)
                else:
                    all_results.append({
                        'website': website,
                        'logo_found': False,
                        'error': f'Exception: {type(result).__name__}'
                    })

            # Show batch progress
            batch_successful = sum(1 for r in batch_results if isinstance(r, dict) and r.get('logo_found', False))
            print(f"Batch success: {batch_successful}/{len(batch)} ({batch_successful/len(batch)*100:.1f}%)")

            # Brief pause between batches
            await asyncio.sleep(0.3)

        # EXHAUSTIVE RETRY for failed websites.
        # Track failures by position so each retry result is written straight
        # back to its own slot (no rescan of all_results per retry).
        failed_indices = [idx for idx, r in enumerate(all_results) if not r['logo_found']]
        if failed_indices and len(failed_indices) <= 50:  # Only retry if not too many failures
            print(f"\nEXHAUSTIVE RETRY PHASE")
            print(f"Retrying {len(failed_indices)} failed websites with ALL APIs...")

            retry_tasks = [
                self.extract_logo_exhaustive_retry(all_results[idx]['website'], max_tier)
                for idx in failed_indices
            ]
            retry_results = await asyncio.gather(*retry_tasks, return_exceptions=True)

            # Update original results with retry successes
            retry_successes = 0
            for idx, retry_result in zip(failed_indices, retry_results):
                if isinstance(retry_result, dict) and retry_result.get('logo_found', False):
                    all_results[idx] = retry_result
                    retry_successes += 1

            if retry_successes > 0:
                print(f"Exhaustive retry recovered {retry_successes} additional logos!")
            else:
                print("No additional logos found in retry phase")

        elif len(failed_indices) > 50:
            print(f"\nSkipping exhaustive retry: {len(failed_indices)} failures (too many)")
            print("Consider increasing max_tier or checking network connectivity")

        elapsed = time.time() - start_time
        successful = sum(1 for r in all_results if r['logo_found'])
        success_rate = successful / len(websites) * 100
        # A coarse clock can report zero elapsed time; avoid dividing by it.
        throughput = len(websites) / elapsed if elapsed > 0 else float('inf')

        print(f"ULTRA-ENHANCED results: {successful}/{len(websites)} in {elapsed:.1f}s")
        print(f"SUCCESS RATE: {success_rate:.1f}%")
        print(f"Speed: {throughput:.1f} websites/second")

        # Show comprehensive breakdown
        tier_breakdown = defaultdict(int)
        api_breakdown = defaultdict(int)

        for result in all_results:
            if result['logo_found']:
                tier = result.get('tier_used', 'unknown')
                service = result.get('api_service', 'unknown')
                tier_breakdown[f"Tier {tier}"] += 1
                api_breakdown[service] += 1

        print("\nPERFORMANCE BREAKDOWN:")
        print("By Tier:")
        for tier, count in sorted(tier_breakdown.items()):
            percentage = count / successful * 100 if successful > 0 else 0
            print(f"   - {tier}: {count} logos ({percentage:.1f}%)")

        print("Top API Services:")
        for service, count in sorted(api_breakdown.items(), key=lambda x: x[1], reverse=True)[:8]:
            percentage = count / successful * 100 if successful > 0 else 0
            print(f"   - {service}: {count} ({percentage:.1f}%)")

        # Success rate assessment
        if success_rate >= 97:
            print(f"EXCELLENT! {success_rate:.1f}% SUCCESS RATE ACHIEVED!")
        elif success_rate >= 95:
            print(f"\nVERY GOOD! {success_rate:.1f}% success rate")
            print("Close to 97% target - consider adding tier 8 for remaining sites")
        elif success_rate >= 90:
            print(f"\nGOOD! {success_rate:.1f}% success rate")
            print("To reach 97%+: increase max_tier or add more API services")
        else:
            print(f"\n{success_rate:.1f}% success rate - needs improvement")
            print("Try max_tier=7 and check API service availability")

        return all_results

In [None]:

class UnionFind:
    """Disjoint-set forest used to group elements into clusters.

    Supports ``find`` with path compression and ``union`` by rank.
    """

    def __init__(self, elements):
        """Start with every element as the root of its own singleton set."""
        self.parent = {item: item for item in elements}
        self.rank = dict.fromkeys(elements, 0)

    def find(self, x):
        """Return the representative of x's set, compressing the path to it."""
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass: point every node on the walked path directly at the root.
        node = x
        while node != root:
            next_node = self.parent[node]
            self.parent[node] = root
            node = next_node

        return root

    def union(self, x, y):
        """Merge the sets containing x and y (no-op if already merged)."""
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return

        # Union by rank: the root with the larger rank stays the root.
        if self.rank[root_x] < self.rank[root_y]:
            keep, absorb = root_y, root_x
        else:
            keep, absorb = root_x, root_y

        self.parent[absorb] = keep
        if self.rank[keep] == self.rank[absorb]:
            self.rank[keep] += 1

    def get_clusters(self):
        """Return the sets that contain more than one element, as lists."""
        members_by_root = defaultdict(list)
        for item in self.parent:
            members_by_root[self.find(item)].append(item)

        multi_member = []
        for members in members_by_root.values():
            if len(members) > 1:
                multi_member.append(members)
        return multi_member


In [None]:
import io
import numpy as np
import cv2
from PIL import Image
from scipy.fft import fft2, fftshift
from sklearn.metrics.pairwise import cosine_similarity
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops
from skimage.filters import gabor
from scipy.stats import skew, kurtosis
from scipy.special import factorial
import cmath
import colorsys

class FourierLogoAnalyzer:
    """
    Enhanced logo analyzer with ALL 2025 research features:
    - Traditional: pHash, FFT, Fourier-Mellin, SIFT, ORB
    - Advanced: Hu/Zernike moments, LBP, GLCM, Gabor, saliency-weighted hashing
    - Color-aware: Per-channel Fourier-Mellin, enhanced color features
    """
    
    def __init__(self):
        self.similarity_threshold_phash = 6  # Hamming distance
        self.similarity_threshold_fft = 0.985  # Cosine similarity
        self.similarity_threshold_fmt = 0.995  # Fourier-Mellin
        
        # Advanced feature parameters
        self.zernike_max_order = 8
        self.lbp_radius = 3
        self.lbp_n_points = 8 * self.lbp_radius
        self.gabor_frequencies = [0.1, 0.3, 0.5]
        self.gabor_angles = [0, 45, 90, 135]
    
    def compute_phash(self, img: np.ndarray) -> str:
        """Compute a 64-bit perceptual hash from the low-frequency DCT block.

        Args:
            img: A 3-D image converted with ``COLOR_BGR2GRAY``, or a 2-D array
                that is used as the grayscale image directly.

        Returns:
            A 64-character string of ``'0'``/``'1'``: one bit per coefficient
            of the top-left 8x8 DCT block, set when the coefficient exceeds
            the block median.
        """
        # Accept already-grayscale input instead of raising inside cvtColor
        if len(img.shape) == 3:
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img

        # Resize to 32x32 for DCT
        resized = cv2.resize(gray, (32, 32))

        # Compute DCT (like 2D Fourier but with cosines)
        dct = cv2.dct(np.float32(resized))

        # Take top-left 8x8 (low frequencies)
        dct_low = dct[0:8, 0:8]

        # Compare with median to create binary hash
        bits = dct_low > np.median(dct_low)

        # Serialize row by row as a bit string (not hex)
        return ''.join('1' if bit else '0' for bit in bits.flatten())
    
    def hamming_distance(self, hash1: str, hash2: str) -> int:
        """Calculate the Hamming distance between two hashes.

        Positions are compared pairwise. If the hashes differ in length, every
        unmatched trailing position counts as a mismatch, so a truncated hash
        can never look closer than it is.

        Args:
            hash1: First hash (any sized sequence of comparable symbols).
            hash2: Second hash.

        Returns:
            Number of differing positions plus the length difference.
        """
        mismatches = sum(c1 != c2 for c1, c2 in zip(hash1, hash2))
        return mismatches + abs(len(hash1) - len(hash2))
    
    def color_distance(self, a, b):
        """Compute the Euclidean distance between two color vectors.

        Both inputs are converted to float64 before subtracting, so unsigned
        integer vectors (e.g. uint8 pixel values) cannot wrap around.

        Args:
            a: First color vector (sequence or array of numbers).
            b: Second color vector, broadcast-compatible with ``a``.

        Returns:
            The L2 norm of ``a - b`` as a float.

        Raises:
            TypeError: If either vector is ``None``.
        """
        if a is None or b is None:
            raise TypeError("color_distance requires two color vectors, got None")

        vec_a = np.asarray(a, dtype=np.float64)
        vec_b = np.asarray(b, dtype=np.float64)
        return float(np.linalg.norm(vec_a - vec_b))
    
    def compute_fft_features(self, img: np.ndarray) -> np.ndarray:
        """Compute low-frequency FFT features describing global shape.

        Args:
            img: A 3-D image converted with ``COLOR_BGR2GRAY``, or a 2-D array
                that is used as the grayscale image directly.

        Returns:
            A flat vector of 1024 values: the central 32x32 block of the
            log-magnitude spectrum of the 128x128 resized image, scaled to
            unit L2 norm.
        """
        # Accept already-grayscale input instead of raising inside cvtColor
        if len(img.shape) == 3:
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img
        gray = gray.astype(np.float32) / 255.0

        # Resize to square and standard size
        size = 128
        resized = cv2.resize(gray, (size, size))

        # 2D FFT with the zero frequency moved to the centre
        fft_shifted = fftshift(fft2(resized))

        # Log-magnitude; the small offset keeps log() finite at zero magnitude
        log_magnitude = np.log(np.abs(fft_shifted) + 1e-8)

        # Extract central low-frequency block (32x32)
        center = size // 2
        crop_size = 16
        low_freq = log_magnitude[
            center - crop_size:center + crop_size,
            center - crop_size:center + crop_size
        ]

        # Flatten and normalize
        features = low_freq.flatten()
        features = features / (np.linalg.norm(features) + 1e-8)

        return features
    
    def compute_fourier_mellin_signature(self, img: np.ndarray) -> np.ndarray:
        """Compute an angular (theta) signature of the FFT magnitude spectrum.

        The spectrum of the 128x128 resized image is sampled along 64 radial
        lines, at 32 linearly spaced radii each, and summed per angle. A
        rotation of the input therefore appears as a circular shift of the
        signature.

        Args:
            img: A 3-D image converted with ``COLOR_BGR2GRAY``, or a 2-D array
                that is used as the grayscale image directly.

        Returns:
            A 64-element float64 vector scaled to unit L2 norm.
        """
        # Accept already-grayscale input instead of raising inside cvtColor
        if len(img.shape) == 3:
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img
        gray = gray.astype(np.float32) / 255.0

        # Resize to square
        size = 128
        resized = cv2.resize(gray, (size, size))

        # Centred FFT magnitude
        magnitude = np.abs(fftshift(fft2(resized)))

        # Polar sampling grid (radii are linear, not logarithmic)
        center = size // 2
        theta_samples = 64
        radius_samples = 32
        thetas = np.linspace(0, 2 * np.pi, theta_samples, endpoint=False)
        radii = np.linspace(1, center - 1, radius_samples)

        # One row per angle, one column per radius; truncate to pixel indices
        xs = (center + np.outer(np.cos(thetas), radii)).astype(int)
        ys = (center + np.outer(np.sin(thetas), radii)).astype(int)

        # Samples falling outside the image contribute nothing
        inside = (xs >= 0) & (xs < size) & (ys >= 0) & (ys < size)
        sampled = np.where(
            inside,
            magnitude[np.clip(ys, 0, size - 1), np.clip(xs, 0, size - 1)],
            0.0,
        )

        # Sum over radius to get one value per angle
        theta_signature = sampled.sum(axis=1, dtype=np.float64)

        # Normalize
        theta_signature = theta_signature / (np.linalg.norm(theta_signature) + 1e-8)

        return theta_signature
    
    def compute_color_aware_fmt(self, img: np.ndarray) -> np.ndarray:
        """Per-channel Fourier-Mellin style signature.

        For each of the first three channels of the 128x128 resized image, the
        FFT magnitude is sampled on 16 log-spaced rings (32 angles per ring)
        and the peak of each ring is kept.

        Returns:
            A float32 vector of 48 values (3 channels x 16 rings); all zeros
            if any step raises.
        """
        try:
            # Expand single-plane input to three planes
            if len(img.shape) != 3:
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            bgr = cv2.resize(img, (128, 128))
            n_angles = 32
            per_channel = []

            for plane_idx in range(3):  # B, G, R channels
                plane = bgr[:, :, plane_idx].astype(np.float32) / 255.0

                # Centred FFT magnitude of this channel
                spectrum = np.abs(np.fft.fftshift(np.fft.fft2(plane)))
                n_rows, n_cols = spectrum.shape
                cx, cy = n_cols // 2, n_rows // 2

                # Log-polar sampling grid
                angles = np.linspace(0, 2 * np.pi, n_angles, endpoint=False)
                radii = np.logspace(0, np.log10(min(cx, cy) - 1), n_angles // 2)

                def sample(radius, angle):
                    """Spectrum value at the polar position, 0.0 if outside."""
                    col = int(cx + radius * np.cos(angle))
                    row = int(cy + radius * np.sin(angle))
                    if 0 <= col < n_cols and 0 <= row < n_rows:
                        return spectrum[row, col]
                    return 0.0

                # Peak over all angles for each ring
                ring_peaks = [
                    np.max([sample(radius, angle) for angle in angles])
                    for radius in radii
                ]
                per_channel.append(ring_peaks)

            return np.concatenate(per_channel).astype(np.float32)

        except Exception:
            return np.zeros(48, dtype=np.float32)  # 3 channels * 16 features each
    
    def compute_saliency_weighted_fft(self, img: np.ndarray) -> np.ndarray:
        """FFT features of the image after weighting pixels by a saliency map.

        Saliency combines each pixel's deviation from the mean of the blurred
        image (weight 0.7) with Canny edges (weight 0.3).

        Returns:
            A float32 vector of 1024 values (central 32x32 block of the
            log-magnitude spectrum); all zeros if any step raises.
        """
        try:
            # Work on a single-channel image
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img.copy()
            gray_f32 = gray.astype(np.float32)

            # Intensity saliency: distance from the mean of the blurred image
            blurred_mean = np.mean(cv2.GaussianBlur(gray, (5, 5), 0))
            intensity_saliency = np.abs(gray_f32 - blurred_mean)

            # Edge saliency in [0, 1]
            edge_weight = cv2.Canny(gray, 50, 150).astype(np.float32) / 255.0

            # Blend, rescale to a maximum of ~1, then smooth
            saliency = 0.7 * intensity_saliency + 0.3 * edge_weight * 255
            saliency = saliency / (np.max(saliency) + 1e-8)
            saliency = cv2.GaussianBlur(saliency, (3, 3), 0)

            # Weight the image, resize, and take the centred log-magnitude spectrum
            weighted = gray_f32 * saliency / 255.0
            spectrum = np.fft.fftshift(np.fft.fft2(cv2.resize(weighted, (128, 128))))
            log_magnitude = np.log(np.abs(spectrum) + 1.0)

            # Keep the central low-frequency window
            half = 16
            mid = log_magnitude.shape[0] // 2
            window = log_magnitude[mid - half:mid + half, mid - half:mid + half]

            return window.flatten().astype(np.float32)

        except Exception:
            return np.zeros(1024, dtype=np.float32)
    
    def compute_hu_moments(self, img: np.ndarray) -> np.ndarray:
        """Compute the 7 Hu invariant moments of the Otsu-binarized image.

        Each moment ``h`` is mapped to ``-sign(h) * log10(|h|)`` so the values
        share a comparable scale while keeping the sign of the moment. Zero
        moments map to 0.0.

        Args:
            img: A 3-D image converted with ``COLOR_BGR2GRAY``, or a 2-D
                grayscale image.

        Returns:
            A float32 vector of 7 values; all zeros if any step raises.
        """
        try:
            # Convert to grayscale and binary
            if len(img.shape) == 3:
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            else:
                gray = img.copy()

            # Otsu thresholding for clean binary image
            _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

            # Morphological cleaning: close small holes, then remove specks
            kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
            binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
            binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

            # Compute Hu moments
            hu_moments = cv2.HuMoments(cv2.moments(binary)).flatten()

            # Sign-preserving log transform. The previous version returned
            # -log10(|h|) for both signs, which discarded the sign entirely.
            hu_log = np.zeros(hu_moments.shape, dtype=np.float64)
            nonzero = hu_moments != 0
            hu_log[nonzero] = -np.sign(hu_moments[nonzero]) * np.log10(np.abs(hu_moments[nonzero]))

            return hu_log.astype(np.float32)

        except Exception:
            return np.zeros(7, dtype=np.float32)
    
    def compute_zernike_moments(self, img: np.ndarray, max_order: int = 8) -> np.ndarray:
        """Compute simplified Zernike-style angular moments of the binarized image.

        This is a simplification: for every valid (n, m) pair only the angular
        terms ``mean(f * cos(m*theta))`` and ``mean(f * sin(m*theta))`` over
        the inscribed unit disk are computed; no radial polynomial is applied.

        Args:
            img: A 3-D image converted with ``COLOR_BGR2GRAY``, or a 2-D
                grayscale image.
            max_order: Highest order requested; orders above 5 are not
                evaluated (performance cap).

        Returns:
            A float32 vector with two values (real, imaginary) per (n, m)
            pair: 42 values for the default ``max_order``. On error a zero
            vector of the same length is returned.
        """
        # Enumerate the (n, m) pairs up front so the error fallback has
        # exactly the same length as a successful result.
        order_limit = min(max_order + 1, 6)  # Limit for performance
        nm_pairs = [(n, m) for n in range(order_limit) for m in range(-n, n + 1, 2)]

        try:
            # Convert to grayscale and binary
            if len(img.shape) == 3:
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            else:
                gray = img.copy()

            _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            binary_float = binary.astype(np.float32) / 255.0

            # Resize for computation
            size = 128
            binary_resized = cv2.resize(binary_float, (size, size))

            height, width = binary_resized.shape
            center_x, center_y = width // 2, height // 2

            # Pixel offsets from the image centre
            rows, cols = np.ogrid[:height, :width]
            row_offset = rows - center_y
            col_offset = cols - center_x

            # Convert to polar coordinates
            rho = np.sqrt(row_offset**2 + col_offset**2)
            theta = np.arctan2(col_offset, row_offset)

            # Normalize rho by the inscribed radius. Normalizing by the corner
            # distance made rho <= 1 everywhere, so the disk mask was a no-op.
            rho = rho / min(center_x, center_y)

            # Pixels inside the unit disk
            unit_circle = (rho <= 1.0)
            disk_values = binary_resized[unit_circle]
            disk_theta = theta[unit_circle]

            zernike_moments = []
            for _n, m in nm_pairs:
                # Simplified Zernike computation (angular part only)
                zernike_moments.append(np.mean(disk_values * np.cos(m * disk_theta)))
                zernike_moments.append(np.mean(disk_values * np.sin(m * disk_theta)))

            return np.array(zernike_moments, dtype=np.float32)

        except Exception:
            return np.zeros(2 * len(nm_pairs), dtype=np.float32)
    
    def compute_texture_features(self, img: np.ndarray) -> Dict[str, float]:
        """Compute LBP, GLCM and Gabor texture statistics.

        Returns:
            A dict with 3 LBP keys, 4 GLCM keys and 12 Gabor keys
            (``gabor_0`` .. ``gabor_11``). If any step outside the individual
            Gabor filters raises, every key is set to 0.0.
        """
        texture_features = {}

        try:
            # Work on a single-channel image
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img.copy()

            # --- LBP: normalized histogram of uniform patterns ---
            n_bins = self.lbp_n_points + 2
            lbp_codes = local_binary_pattern(gray, self.lbp_n_points, self.lbp_radius, method='uniform')
            hist, _ = np.histogram(lbp_codes.ravel(), bins=n_bins, range=(0, n_bins), density=True)

            texture_features['lbp_uniformity'] = float(hist[:-1].sum())
            texture_features['lbp_entropy'] = float(-np.sum(hist * np.log2(hist + 1e-10)))
            texture_features['lbp_energy'] = float(np.sum(hist ** 2))

            # --- GLCM on a 64x64 copy (smaller for speed) ---
            glcm = graycomatrix(cv2.resize(gray, (64, 64)), distances=[1], angles=[0], levels=256,
                                symmetric=True, normed=True)
            for prop in ('contrast', 'correlation', 'energy', 'homogeneity'):
                texture_features[f'glcm_{prop}'] = float(graycoprops(glcm, prop)[0, 0])

            # --- Gabor: 2 frequencies x 2 orientations, 3 statistics each ---
            gray_norm = gray.astype(np.float32) / 255.0

            gabor_stats = []
            for freq in (0.1, 0.3):
                for angle in (0, np.pi/4):
                    try:
                        real_part, _ = gabor(gray_norm, frequency=freq, theta=angle)
                        gabor_stats += [
                            float(np.mean(real_part)),
                            float(np.std(real_part)),
                            float(np.mean(real_part ** 2)),
                        ]
                    except Exception:
                        # A failing filter contributes zeros; the others still count
                        gabor_stats += [0.0, 0.0, 0.0]

            texture_features.update(
                {f'gabor_{i}': value for i, value in enumerate(gabor_stats)}
            )

        except Exception:
            # Reset every expected key to zero on error
            for key in ('lbp_uniformity', 'lbp_entropy', 'lbp_energy',
                        'glcm_contrast', 'glcm_correlation', 'glcm_energy', 'glcm_homogeneity'):
                texture_features[key] = 0.0
            texture_features.update({f'gabor_{i}': 0.0 for i in range(12)})

        return texture_features
    
    def compute_enhanced_color_features(self, img: np.ndarray) -> Dict[str, float]:
        """Colour statistics in RGB and HSV plus three k-means dominant colours.

        The image is treated as BGR (it is converted with ``COLOR_BGR2HSV``),
        so the ``R``/``G``/``B`` statistics read channels 2/1/0 respectively.

        Args:
            img: 3-channel BGR image. A non-3-D input yields the all-zero
                feature set.

        Returns:
            A dict that always has the same 27 keys: mean/std/skewness per
            RGB channel, mean/std per HSV channel, and r/g/b/freq for the
            three dominant colours ordered by pixel share. All values are 0.0
            if any step raises.
        """
        # Complete zero-valued schema, used for grayscale input and on error
        defaults = {}
        for channel in ('R', 'G', 'B'):
            for stat in ('mean', 'std', 'skewness'):
                defaults[f'color_{channel}_{stat}'] = 0.0
        for channel in ('H', 'S', 'V'):
            for stat in ('mean', 'std'):
                defaults[f'color_hsv_{channel}_{stat}'] = 0.0
        for i in range(3):
            for c in ('r', 'g', 'b'):
                defaults[f'color_dominant_{i}_{c}'] = 0.0
            defaults[f'color_dominant_{i}_freq'] = 0.0

        if len(img.shape) != 3:
            return defaults

        try:
            color_features = dict(defaults)

            # Color moments per RGB channel; OpenCV stores channels as B, G, R
            for idx, channel in ((2, 'R'), (1, 'G'), (0, 'B')):
                pixels = img[:, :, idx].flatten().astype(np.float32) / 255.0
                color_features[f'color_{channel}_mean'] = float(np.mean(pixels))
                color_features[f'color_{channel}_std'] = float(np.std(pixels))
                # skew() is NaN for a constant channel; report 0.0 instead
                color_features[f'color_{channel}_skewness'] = float(np.nan_to_num(skew(pixels)))

            # HSV analysis. For 8-bit images OpenCV stores hue as 0..179,
            # saturation and value as 0..255.
            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            for idx, (channel, scale) in enumerate((('H', 180.0), ('S', 255.0), ('V', 255.0))):
                pixels = hsv[:, :, idx].flatten().astype(np.float32) / scale
                color_features[f'color_hsv_{channel}_mean'] = float(np.mean(pixels))
                color_features[f'color_hsv_{channel}_std'] = float(np.std(pixels))

            # Dominant colors (simplified k-means)
            pixels_bgr = img.reshape(-1, 3).astype(np.float32)
            criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 20, 1.0)
            _, labels, centers = cv2.kmeans(pixels_bgr, 3, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

            # Count pixels per cluster label. bincount keeps empty clusters at
            # zero, so the label index always lines up with `centers`.
            counts = np.bincount(labels.ravel(), minlength=3)
            by_share = np.argsort(counts)[::-1]

            for rank, label in enumerate(by_share):
                center = centers[label] / 255.0
                color_features[f'color_dominant_{rank}_r'] = float(center[2])
                color_features[f'color_dominant_{rank}_g'] = float(center[1])
                color_features[f'color_dominant_{rank}_b'] = float(center[0])
                color_features[f'color_dominant_{rank}_freq'] = float(counts[label] / len(labels))

            return color_features

        except Exception:
            # Return defaults on error
            return defaults
    
    def mean_color_features(self, img: Image.Image) -> Optional[List[float]]:
        """Compute a compact colour signature for clustering.

        The image is converted to RGB, resized to 256x256 and scaled to
        [0, 1]. Saturation and value are computed with the same formulas as
        ``colorsys.rgb_to_hsv`` but vectorized over the whole image. The mean
        hue is not part of the returned vector.

        Args:
            img: PIL image in any mode that ``convert("RGB")`` accepts.

        Returns:
            ``[r_mean, g_mean, b_mean, s_mean, v_mean]``, or ``None`` if any
            step raises.
        """
        try:
            im = img.convert("RGB").resize((256, 256), Image.BICUBIC)
            arr = np.asarray(im, dtype=np.float32) / 255.0

            # RGB means
            r_mean = float(arr[..., 0].mean())
            g_mean = float(arr[..., 1].mean())
            b_mean = float(arr[..., 2].mean())

            # HSV value is the channel maximum; saturation is the channel
            # range divided by the maximum (0 where the pixel is black).
            max_c = arr.max(axis=-1)
            min_c = arr.min(axis=-1)
            saturation = np.zeros_like(max_c)
            np.divide(max_c - min_c, max_c, out=saturation, where=max_c > 0)

            s_mean = float(saturation.mean())
            v_mean = float(max_c.mean())

            return [r_mean, g_mean, b_mean, s_mean, v_mean]

        except Exception:
            return None
    
    def compute_sift_features(self, img: np.ndarray) -> Dict:
        """Detect up to 100 SIFT keypoints and summarize their descriptors.

        Returns:
            On success: ``valid=True``, ``keypoint_count``, the raw
            ``descriptors`` and a 256-value ``signature`` (per-dimension mean
            followed by per-dimension std). With no descriptors, or on any
            error: ``valid=False`` and a zero ``signature`` of length 256.
        """
        invalid = {'valid': False, 'signature': np.zeros(256)}

        try:
            detector = cv2.SIFT_create(nfeatures=100)
            keypoints, descriptors = detector.detectAndCompute(img, None)

            if descriptors is None or len(descriptors) == 0:
                return invalid

            # At least one descriptor is present from here on
            signature = np.concatenate([
                np.mean(descriptors, axis=0),
                np.std(descriptors, axis=0),
            ])

            return {
                'valid': True,
                'keypoint_count': len(keypoints),
                'descriptors': descriptors,
                'signature': signature
            }

        except Exception:
            return invalid
    
    def compute_orb_features(self, img: np.ndarray) -> Dict:
        """Detect up to 50 ORB keypoints and summarize their descriptors.

        Returns:
            On success: ``valid=True``, ``keypoint_count``, the raw
            ``descriptors`` and a ``signature`` holding the per-dimension mean
            of the descriptors cast to float32. With no descriptors, or on
            any error: ``valid=False`` and a zero ``signature`` of length 32.
        """
        invalid = {'valid': False, 'signature': np.zeros(32)}

        try:
            detector = cv2.ORB_create(nfeatures=50)
            keypoints, descriptors = detector.detectAndCompute(img, None)

            if descriptors is None or len(descriptors) == 0:
                return invalid

            # At least one descriptor is present from here on
            signature = np.mean(descriptors.astype(np.float32), axis=0)

            return {
                'valid': True,
                'keypoint_count': len(keypoints),
                'descriptors': descriptors,
                'signature': signature
            }

        except Exception:
            return invalid
    
    def compare_fourier_mellin(self, sig1: np.ndarray, sig2: np.ndarray) -> float:
        """Compare two theta signatures with rotation invariance.

        A rotation appears as a circular shift of the signature, so the score
        is the maximum of the circular cross-correlation over all shifts. Two
        unit-norm signatures that differ only by a shift score 1.0. (A
        zero-padded linear correlation would drop the wrapped-around part and
        under-score rotated copies.)

        Args:
            sig1: First signature (1-D).
            sig2: Second signature; padded or truncated to ``len(sig1)``.

        Returns:
            Maximum circular cross-correlation, or 0.0 if either input is empty.
        """
        a = np.asarray(sig1, dtype=np.float64).ravel()
        b = np.asarray(sig2, dtype=np.float64).ravel()
        n = len(a)
        if n == 0 or len(b) == 0:
            return 0.0

        # Circular cross-correlation via the FFT correlation theorem
        cross_spectrum = np.fft.rfft(a, n=n) * np.conj(np.fft.rfft(b, n=n))
        correlation = np.fft.irfft(cross_spectrum, n=n)

        return float(np.max(correlation))
    
    def match_sift_features(self, desc1: np.ndarray, desc2: np.ndarray) -> float:
        """Score two SIFT descriptor sets with FLANN matching and a ratio test.

        Returns:
            Number of matches passing the 0.7 ratio test divided by the size
            of the smaller descriptor set; 0.0 for empty input or on error.
        """
        try:
            if len(desc1) == 0 or len(desc2) == 0:
                return 0.0

            # KD-tree index (FLANN algorithm id 1) with 5 trees, 50 checks
            FLANN_INDEX_KDTREE = 1
            matcher = cv2.FlannBasedMatcher(
                dict(algorithm=FLANN_INDEX_KDTREE, trees=5),
                dict(checks=50),
            )
            candidate_pairs = matcher.knnMatch(desc1, desc2, k=2)

            # Keep a match only when it is clearly better than the runner-up
            passed = 0
            for pair in candidate_pairs:
                if len(pair) != 2:
                    continue
                best, runner_up = pair
                if best.distance < 0.7 * runner_up.distance:
                    passed += 1

            smaller_set = min(len(desc1), len(desc2))
            return passed / max(smaller_set, 1)

        except Exception:
            return 0.0
    
    def match_orb_features(self, desc1: np.ndarray, desc2: np.ndarray) -> float:
        """Score two ORB descriptor sets with cross-checked Hamming matching.

        Returns:
            Number of matches with distance below 50 divided by the size of
            the smaller descriptor set; 0.0 for empty input or on error.
        """
        try:
            if len(desc1) == 0 or len(desc2) == 0:
                return 0.0

            matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)

            # Count mutual best matches that are close enough
            close_matches = sum(
                1 for match in matcher.match(desc1, desc2) if match.distance < 50
            )

            smaller_set = min(len(desc1), len(desc2))
            return close_matches / max(smaller_set, 1)

        except Exception:
            return 0.0
    
    def compute_deep_fused_hash(self, img: np.ndarray, hash_dim: int = 64) -> np.ndarray:
        """
        Deep hashing inspired compact binary codes from arXiv:1610.07231
        Fuses multiple visual cues into balanced binary representation
        """
        try:
            # Extract core features for fusion
            phash_bits = np.array([int(bit) for bit in self.compute_phash(img)], dtype=np.float32)
            fft_vec = self.compute_fft_features(img)
            fmt_sig = self.compute_fourier_mellin_signature(img)
            hu_moments = self.compute_hu_moments(img)
            
            # Color and texture features (simplified for speed)
            color_vec = []
            try:
                if len(img.shape) == 3:
                    # Simple color moments
                    for c in range(3):
                        channel = img[:, :, c].flatten().astype(np.float32) / 255.0
                        color_vec.extend([np.mean(channel), np.std(channel)])
                else:
                    color_vec = [0.5, 0.2] * 3  # Grayscale defaults
            except:
                color_vec = [0.5, 0.2] * 3
            
            # Multi-scale FFT (different crop sizes for deep-style multi-scale)
            try:
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img
                # Small scale (32x32)
                small_scale = cv2.resize(gray.astype(np.float32) / 255.0, (32, 32))
                fft_small = np.fft.fft2(small_scale)
                fft_small_mag = np.abs(np.fft.fftshift(fft_small))[12:20, 12:20].flatten()
                
                # Medium scale (64x64) 
                med_scale = cv2.resize(gray.astype(np.float32) / 255.0, (64, 64))
                fft_med = np.fft.fft2(med_scale)
                fft_med_mag = np.abs(np.fft.fftshift(fft_med))[28:36, 28:36].flatten()
                
                multi_scale_fft = np.concatenate([fft_small_mag, fft_med_mag])
            except:
                multi_scale_fft = np.zeros(128, dtype=np.float32)
            
            # Concatenate all features into fusion vector
            feature_parts = [
                phash_bits[:32],  # Limit pHash to 32 bits
                fft_vec[:64],     # Limit FFT features
                fmt_sig[:32],     # Limit FMT signature 
                hu_moments[:7],   # All Hu moments
                np.array(color_vec[:6], dtype=np.float32),  # Color moments
                multi_scale_fft[:32]  # Multi-scale texture
            ]
            
            # Ensure all parts are valid
            valid_parts = []
            for part in feature_parts:
                if part.size > 0:
                    # Normalize each feature type independently (deep hashing principle)
                    part_norm = (part - np.mean(part)) / (np.std(part) + 1e-8)
                    valid_parts.append(part_norm)
            
            if not valid_parts:
                return np.zeros(hash_dim, dtype=np.uint8)
            
            # Concatenate normalized features
            concat_features = np.concatenate(valid_parts)
            
            # Ensure we have enough features for projection
            if len(concat_features) < hash_dim:
                # Pad with zeros if needed
                padded = np.zeros(hash_dim)
                padded[:len(concat_features)] = concat_features
                concat_features = padded
            
            # Random orthogonal projection (simulates learned projection from deep hashing)
            # Use deterministic seed based on image content for consistency
            seed = int(np.sum(concat_features * 1000)) % 100000
            np.random.seed(seed % 1000)  # Limit seed range
            
            # Create orthogonal projection matrix
            proj_dim = min(hash_dim, len(concat_features))
            if len(concat_features) >= proj_dim:
                W = np.random.randn(len(concat_features), proj_dim)
                W, _ = np.linalg.qr(W)  # Orthogonalize (ensures bit independence)
                
                # Project and binarize
                projected = concat_features @ W
                
                # Bit balancing: center around 0 for even +/- distribution
                projected = projected - np.median(projected)
                
                # Binarize with sign function
                binary_hash = (projected > 0).astype(np.uint8)
            else:
                # Fallback for insufficient features
                binary_hash = (concat_features[:proj_dim] > np.median(concat_features[:proj_dim])).astype(np.uint8)
            
            # Ensure output is exactly hash_dim length
            if len(binary_hash) < hash_dim:
                padded_hash = np.zeros(hash_dim, dtype=np.uint8)
                padded_hash[:len(binary_hash)] = binary_hash
                return padded_hash
            else:
                return binary_hash[:hash_dim]
            
        except Exception as e:
            # Return random-like but deterministic hash on failure
            img_sum = np.sum(img.astype(np.float32)) if img is not None else 12345
            np.random.seed(int(img_sum) % 10000)
            return np.random.randint(0, 2, hash_dim, dtype=np.uint8)
    
    def compute_semantic_calibrated_similarity(self, features1: Dict, features2: Dict) -> float:
        """
        Fuse several per-feature similarity cues into one weighted score.

        Cues used when present in both feature dicts: 'deep_fused_hash' (Hamming),
        'phash' (Hamming via self.hamming_distance, scaled by 64 bits),
        'fft_features' and 'hu_moments' (cosine similarity, clipped at 0) and
        'color_vector' (self.color_distance, mapped to a similarity).

        Args:
            features1: Feature dict of the first logo.
            features2: Feature dict of the second logo.

        Returns:
            Weighted average of the cues that produced a similarity > 0, or 0.0
            when no cue contributed or any unexpected error occurred.
        """

        def _present(value) -> bool:
            # Explicit presence test. Plain truthiness raises ValueError for
            # multi-element numpy arrays, which previously zeroed the whole score.
            if value is None:
                return False
            try:
                return len(value) > 0
            except TypeError:
                # Unsized values (e.g. an integer hash, including 0) count as present
                return True

        try:
            # Individual similarity scores
            similarities = {}

            # 1. Deep fused hash similarity (Hamming distance)
            if 'deep_fused_hash' in features1 and 'deep_fused_hash' in features2:
                hash1, hash2 = features1['deep_fused_hash'], features2['deep_fused_hash']
                if (hash1 is not None and hash2 is not None
                        and len(hash1) == len(hash2) and len(hash1) > 0):
                    hamming_dist = np.sum(hash1 != hash2)
                    similarities['fused_hash'] = 1.0 - (hamming_dist / len(hash1))
                else:
                    # Missing, empty or mismatched hashes: cue is ignored by the fusion below
                    similarities['fused_hash'] = 0.0

            # 2. Traditional pHash (distance is scaled assuming a 64-bit hash)
            if _present(features1.get('phash')) and _present(features2.get('phash')):
                hamming_dist = self.hamming_distance(features1['phash'], features2['phash'])
                similarities['phash'] = 1.0 - (hamming_dist / 64.0)

            # 3. FFT similarity
            if features1.get('fft_features') is not None and features2.get('fft_features') is not None:
                try:
                    fft_sim = cosine_similarity(
                        features1['fft_features'].reshape(1, -1),
                        features2['fft_features'].reshape(1, -1)
                    )[0, 0]
                    similarities['fft'] = max(0, fft_sim)
                except Exception:
                    similarities['fft'] = 0.0

            # 4. Moment-based similarity
            if features1.get('hu_moments') is not None and features2.get('hu_moments') is not None:
                try:
                    hu_sim = cosine_similarity(
                        features1['hu_moments'].reshape(1, -1),
                        features2['hu_moments'].reshape(1, -1)
                    )[0, 0]
                    similarities['moments'] = max(0, hu_sim)
                except Exception:
                    similarities['moments'] = 0.0

            # 5. Color distance (inverted to similarity)
            if _present(features1.get('color_vector')) and _present(features2.get('color_vector')):
                color_dist = self.color_distance(features1['color_vector'], features2['color_vector'])
                similarities['color'] = max(0, 1.0 - color_dist / 2.0)

            # Hand-set fusion weights (they sum to 1.0 when every cue is available)
            fusion_weights = {
                'fused_hash': 0.35,  # Primary: compact multi-feature hash
                'phash': 0.25,       # Secondary: proven perceptual hash
                'fft': 0.20,         # Tertiary: global shape
                'moments': 0.15,     # Quaternary: geometric invariants
                'color': 0.05        # Minimal: color (handled in fused hash)
            }

            # Weighted similarity fusion.
            # NOTE: cues with similarity <= 0 are excluded from BOTH the numerator and
            # the denominator, so a cue reporting "no similarity" does not lower the score.
            weighted_sum = 0.0
            total_weight = 0.0

            for method, similarity in similarities.items():
                if method in fusion_weights and similarity > 0:
                    weight = fusion_weights[method]
                    weighted_sum += similarity * weight
                    total_weight += weight

            # Normalize by total weights used
            if total_weight > 0:
                calibrated_score = weighted_sum / total_weight
            else:
                calibrated_score = 0.0

            return float(calibrated_score)

        except Exception:
            # Best-effort scoring: any unexpected failure is reported as "not similar"
            return 0.0

    def compute_all_features(self, img: np.ndarray) -> Dict:
        """Run every feature extractor on one logo image and merge the results into a single dict."""
        # Extractors whose result is stored under exactly one key, in evaluation order
        keyed_extractors = (
            ('phash', self.compute_phash),
            ('fft_features', self.compute_fft_features),
            ('fmt_signature', self.compute_fourier_mellin_signature),
            ('color_aware_fmt', self.compute_color_aware_fmt),
            ('saliency_weighted_fft', self.compute_saliency_weighted_fft),
            ('hu_moments', self.compute_hu_moments),
            ('zernike_moments', self.compute_zernike_moments),
        )
        all_features = {key: extract(img) for key, extract in keyed_extractors}

        # Compact fused binary hash (64 bits)
        all_features['deep_fused_hash'] = self.compute_deep_fused_hash(img, hash_dim=64)

        # Extractors that return a dict of several entries: texture first, then color
        for grouped_extractor in (self.compute_texture_features,
                                  self.compute_enhanced_color_features):
            all_features.update(grouped_extractor(img))

        # Keypoint descriptors
        all_features['sift'] = self.compute_sift_features(img)
        all_features['orb'] = self.compute_orb_features(img)

        return all_features
    
    def are_similar(self, features1: Dict, features2: Dict) -> Tuple[bool, Dict]:
        """
        Decide whether two logos match by combining every available similarity cue.

        Args:
            features1: Feature dict of the first logo. Keys read here: 'deep_fused_hash'
                (optional), 'phash', 'fft_features', 'fmt_signature', 'color_aware_fmt',
                'saliency_weighted_fft', 'hu_moments', 'zernike_moments', and the
                'sift' / 'orb' sub-dicts ('valid', 'descriptors', 'signature').
            features2: Feature dict of the second logo, same layout.

        Returns:
            (is_similar, metrics): the overall verdict plus a dict holding every
            per-method score and flag, 'confidence_score' and 'overall_similar'.
        """

        def _guarded_cosine(vec1, vec2):
            # Empty or differently sized vectors cannot be compared; score them as
            # "no similarity" instead of letting cosine_similarity raise. The check
            # covers BOTH operands (previously only features1 was inspected).
            if vec1.size == 0 or vec2.size == 0 or vec1.size != vec2.size:
                return 0.0
            return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0, 0]

        # Deep hashing calibrated similarity (PRIMARY METHOD)
        calibrated_similarity = self.compute_semantic_calibrated_similarity(features1, features2)
        calibrated_similar = calibrated_similarity >= 0.75

        # Deep fused hash Hamming distance (defaults to the 64-bit maximum when unavailable)
        fused_hash_similar = False
        fused_hamming_distance = 64
        if features1.get('deep_fused_hash') is not None and features2.get('deep_fused_hash') is not None:
            hash1, hash2 = features1['deep_fused_hash'], features2['deep_fused_hash']
            if len(hash1) == len(hash2):
                fused_hamming_distance = np.sum(hash1 != hash2)
                # Similar when at most 25% of the bits differ
                fused_hash_similar = fused_hamming_distance <= (len(hash1) * 0.25)

        # Traditional pHash comparison (SECONDARY)
        phash_distance = self.hamming_distance(features1['phash'], features2['phash'])
        phash_similar = phash_distance <= self.similarity_threshold_phash

        # FFT features comparison
        fft_similarity = cosine_similarity(
            features1['fft_features'].reshape(1, -1),
            features2['fft_features'].reshape(1, -1)
        )[0, 0]
        fft_similar = fft_similarity >= self.similarity_threshold_fft

        # Fourier-Mellin comparison
        fmt_similarity = self.compare_fourier_mellin(
            features1['fmt_signature'],
            features2['fmt_signature']
        )
        fmt_similar = fmt_similarity >= self.similarity_threshold_fmt

        # Color-aware Fourier-Mellin
        color_fmt_similarity = _guarded_cosine(
            features1['color_aware_fmt'], features2['color_aware_fmt'])
        color_fmt_similar = color_fmt_similarity >= 0.85

        # Saliency-weighted FFT
        saliency_fft_similarity = _guarded_cosine(
            features1['saliency_weighted_fft'], features2['saliency_weighted_fft'])
        saliency_fft_similar = saliency_fft_similarity >= 0.80

        # Hu moments
        hu_similarity = _guarded_cosine(features1['hu_moments'], features2['hu_moments'])
        hu_similar = hu_similarity >= 0.75

        # Zernike moments
        zernike_similarity = _guarded_cosine(
            features1['zernike_moments'], features2['zernike_moments'])
        zernike_similar = zernike_similarity >= 0.70

        # SIFT keypoint matching: best of descriptor matching and signature cosine
        sift_similarity = 0.0
        sift_similar = False
        if features1['sift']['valid'] and features2['sift']['valid']:
            sift_similarity = self.match_sift_features(
                features1['sift']['descriptors'],
                features2['sift']['descriptors']
            )
            sift_sig1 = features1['sift']['signature']
            sift_sig2 = features2['sift']['signature']
            if len(sift_sig1) > 0 and len(sift_sig2) > 0:
                sift_similarity = max(sift_similarity, _guarded_cosine(sift_sig1, sift_sig2))
            sift_similar = sift_similarity >= 0.3

        # ORB keypoint matching: same scheme as SIFT with a lower threshold
        orb_similarity = 0.0
        orb_similar = False
        if features1['orb']['valid'] and features2['orb']['valid']:
            orb_similarity = self.match_orb_features(
                features1['orb']['descriptors'],
                features2['orb']['descriptors']
            )
            orb_sig1 = features1['orb']['signature']
            orb_sig2 = features2['orb']['signature']
            if len(orb_sig1) > 0 and len(orb_sig2) > 0:
                orb_similarity = max(orb_similarity, _guarded_cosine(orb_sig1, orb_sig2))
            orb_similar = orb_similarity >= 0.25

        # Six votes; confidence_score is the fraction of them that agree
        confidence_methods = [
            calibrated_similar,     # Primary: weighted fusion
            fused_hash_similar,     # Secondary: compact binary hash
            phash_similar,          # Traditional: perceptual hash
            fft_similar or fmt_similar,  # Shape: global structure
            hu_similar or zernike_similar,  # Geometry: invariant moments
            sift_similar or orb_similar     # Local: keypoint features
        ]
        confidence_score = sum(confidence_methods) / len(confidence_methods)

        # Tiered decision.
        # NOTE(review): the final fallback accepts the pair when ANY single method
        # passes its own threshold, so the earlier tiers only add acceptances.
        if calibrated_similarity >= 0.85:
            # High confidence from calibrated fusion
            is_similar = True
        elif confidence_score >= 0.5 and calibrated_similar:
            # Medium confidence with method agreement
            is_similar = True
        elif fused_hash_similar and (phash_similar or fft_similar):
            # Backup: compact hash + traditional method
            is_similar = True
        else:
            # Fallback to traditional multi-method OR
            is_similar = (phash_similar or fft_similar or fmt_similar or
                         color_fmt_similar or saliency_fft_similar or
                         hu_similar or zernike_similar or sift_similar or orb_similar)

        metrics = {
            # Deep hashing metrics
            'calibrated_similarity': calibrated_similarity,
            'calibrated_similar': calibrated_similar,
            'fused_hash_distance': fused_hamming_distance,
            'fused_hash_similar': fused_hash_similar,
            'confidence_score': confidence_score,

            # Traditional metrics
            'phash_distance': phash_distance,
            'phash_similar': phash_similar,
            'fft_similarity': fft_similarity,
            'fft_similar': fft_similar,
            'fmt_similarity': fmt_similarity,
            'fmt_similar': fmt_similar,
            'color_fmt_similarity': color_fmt_similarity,
            'color_fmt_similar': color_fmt_similar,
            'saliency_fft_similarity': saliency_fft_similarity,
            'saliency_fft_similar': saliency_fft_similar,
            'hu_similarity': hu_similarity,
            'hu_similar': hu_similar,
            'zernike_similarity': zernike_similarity,
            'zernike_similar': zernike_similar,
            'sift_similarity': sift_similarity,
            'sift_similar': sift_similar,
            'orb_similarity': orb_similarity,
            'orb_similar': orb_similar,
            'overall_similar': is_similar
        }

        return is_similar, metrics
    
    def preprocess_logo(self, logo_data: bytes) -> Optional[np.ndarray]:
        """Decode raw logo bytes into a 128x128 BGR array; return None if anything fails."""
        try:
            decoded = Image.open(io.BytesIO(logo_data))

            # Force three-channel RGB before resizing
            rgb_logo = decoded if decoded.mode == 'RGB' else decoded.convert('RGB')
            resized_logo = rgb_logo.resize((128, 128), Image.Resampling.LANCZOS)

            # PIL yields RGB; OpenCV routines expect BGR channel order
            return cv2.cvtColor(np.array(resized_logo), cv2.COLOR_RGB2BGR)
        except Exception:
            return None
    
    def analyze_logo_batch(self, logos: List[Dict]) -> List[Dict]:
        """
        Extract features for every logo in a batch.

        Args:
            logos: Logo dicts; each is expected to carry the raw image bytes under
                'logo_data'.

        Returns:
            One shallow copy per input dict, in input order, with a 'features' entry
            added. 'features' always contains 'valid' and 'color_vector'; failed
            logos additionally carry 'error' when an exception was raised.
        """
        print(f" Analyzing {len(logos)} logos with 2025 research features...")
        start_time = time.time()

        analyzed_logos = []
        successful_analysis = 0

        for i, logo in enumerate(logos):
            if i % 50 == 0 and i > 0:
                elapsed = time.time() - start_time
                # A coarse timer can report zero elapsed time; avoid dividing by it
                rate = i / elapsed if elapsed > 0 else 0
                eta = (len(logos) - i) / rate if rate > 0 else 0
                print(f"   Progress: {i}/{len(logos)} ({i/len(logos)*100:.1f}%) - ETA: {eta:.1f}s")

            try:
                img = self.preprocess_logo(logo['logo_data'])

                if img is not None:
                    # Extract ALL features including 2025 research enhancements
                    features = self.compute_all_features(img)

                    # Extract color features for clustering (best effort)
                    try:
                        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                        pil_img = Image.fromarray(img_rgb)
                        color_features = self.mean_color_features(pil_img)
                        features['color_vector'] = color_features
                    except Exception:
                        features['color_vector'] = None

                    features['valid'] = True
                    successful_analysis += 1
                else:
                    features = {'valid': False, 'color_vector': None}

                logo_with_features = logo.copy()
                logo_with_features['features'] = features
                analyzed_logos.append(logo_with_features)

            except Exception as e:
                # Keep the same keys as the other invalid-logo path so consumers can
                # read 'color_vector' without a KeyError
                logo_with_features = logo.copy()
                logo_with_features['features'] = {
                    'valid': False,
                    'color_vector': None,
                    'error': str(e)
                }
                analyzed_logos.append(logo_with_features)

        elapsed = time.time() - start_time
        print(f" Enhanced analysis completed: {successful_analysis}/{len(logos)} valid in {elapsed:.1f}s")
        print(f" Features per logo: Traditional + 2025 Research + Deep Hashing")
        print(f" Deep features: 64-bit fused hash + semantic calibration + multi-scale analysis")

        return analyzed_logos
    
    def find_similar_pairs(self, analyzed_logos: List[Dict], threshold: float = 0.7) -> List[Tuple[str, str, float]]:
        """
        Compare every pair of valid logos and collect the ones judged similar.

        Args:
            analyzed_logos: Logo dicts carrying 'website' and a 'features' dict with
                a 'valid' flag.
            threshold: Minimum composite score a similar pair needs to be reported.

        Returns:
            List of (website1, website2, composite_score) tuples. Comparisons that
            raise are skipped; their count is printed at the end.
        """
        print(f" Finding similar pairs with 2025 research methods (threshold: {threshold})...")

        valid_logos = [logo for logo in analyzed_logos if logo['features']['valid']]
        similar_pairs = []

        total_comparisons = len(valid_logos) * (len(valid_logos) - 1) // 2
        comparison_count = 0
        failed_comparisons = 0

        for i in range(len(valid_logos)):
            for j in range(i + 1, len(valid_logos)):
                comparison_count += 1

                if comparison_count % 1000 == 0:
                    progress = comparison_count / total_comparisons * 100
                    print(f"   Progress: {comparison_count}/{total_comparisons} ({progress:.1f}%)")

                try:
                    logo1, logo2 = valid_logos[i], valid_logos[j]
                    is_similar, metrics = self.are_similar(logo1['features'], logo2['features'])

                    if is_similar:
                        # Primary: use the calibrated similarity score
                        composite_score = metrics.get('calibrated_similarity', 0.0)

                        # Fallback: best individual method when the calibrated score is ~0
                        if composite_score < 0.1:
                            similarity_scores = [
                                1.0 - metrics['phash_distance'] / 64.0,
                                metrics['fft_similarity'],
                                metrics['fmt_similarity'],
                                metrics['color_fmt_similarity'],
                                metrics['saliency_fft_similarity'],
                                metrics['hu_similarity'],
                                metrics['zernike_similarity'],
                                metrics['sift_similarity'],
                                metrics['orb_similarity']
                            ]

                            # default=0.0: with no positive score, max() on an empty
                            # sequence would raise and be swallowed below
                            composite_score = max(
                                (s for s in similarity_scores if s > 0), default=0.0)

                        if composite_score >= threshold:
                            similar_pairs.append((
                                logo1['website'],
                                logo2['website'],
                                composite_score
                            ))

                except Exception:
                    # Best effort: one broken pair must not abort the whole scan,
                    # but keep count so systematic failures are visible
                    failed_comparisons += 1
                    continue

        if failed_comparisons:
            print(f" Skipped {failed_comparisons}/{total_comparisons} comparisons that raised errors")

        print(f" Similar pairs found: {len(similar_pairs)} using deep hashing + multi-method analysis")
        print(f" Deep hashing: Compact binary fusion + semantic calibration from arXiv:1610.07231")
        return similar_pairs

# Cell-level confirmation that the analyzer class above has been defined
print(
    " Enhanced FourierLogoAnalyzer Ready!",
    " Features: Traditional Fourier + Advanced research + Deep hashing",
    " Multi-method similarity detection with compact binary codes",
    sep="\n",
)

In [None]:
class LogoVisualizationPipeline:
    """Create visualizations for logo analysis results.

    Results are attached with ``load_results_from_memory``; the ``create_*``
    methods then render dashboards, save them as PNG files in the current
    directory and display them.
    """

    def __init__(self):
        self.results_loaded = False
        self.extraction_data = None
        self.analyzed_logos = None
        self.similar_pairs = None
        self.clusters = None

    def load_results_from_memory(self, extraction_data, analyzed_logos, similar_pairs, clusters):
        """Attach pipeline results held in memory (for notebook use).

        Args:
            extraction_data: Dict read via its 'websites' and 'successful_logos' keys.
            analyzed_logos: Logo dicts; each may carry a 'features' dict with a 'valid' flag.
            similar_pairs: Sequences of (website1, website2, score).
            clusters: Collection of sized clusters (only their lengths are used).
        """
        self.extraction_data = extraction_data
        self.analyzed_logos = analyzed_logos
        self.similar_pairs = similar_pairs
        self.clusters = clusters
        self.results_loaded = True
        print("Results loaded into visualizer")

    @staticmethod
    def _percent(part, whole):
        """Return part/whole as a percentage, or 0.0 when whole is zero."""
        return part / whole * 100 if whole else 0.0

    def _count_valid_features(self):
        """Count analyzed logos whose 'features' dict is flagged valid."""
        return sum(
            1 for logo in (self.analyzed_logos or [])
            if logo.get('features', {}).get('valid', False)
        )

    def _build_summary_text(self):
        """Build the text block shown in the 'Pipeline Statistics' panel."""
        total_websites = len(self.extraction_data['websites'])
        extracted = len(self.extraction_data['successful_logos'])
        valid_features = self._count_valid_features()
        clusters = self.clusters or []
        multi_site_clusters = sum(1 for c in clusters if len(c) > 1)
        largest_cluster = max((len(c) for c in clusters), default=0)

        # Rates are formatted from plain numbers; a conditional must not be placed
        # inside the format spec (that raises "Invalid format specifier").
        extraction_rate = self._percent(extracted, total_websites)
        feature_rate = self._percent(valid_features, extracted)

        summary_lines = [
            "Analysis Summary:",
            "        ",
            f"• Total Websites: {total_websites}",
            f"• Successful Extractions: {extracted}",
            f"• Valid Features: {valid_features}",
            f"• Similar Pairs Found: {len(self.similar_pairs or [])}",
            f"• Brand Clusters: {multi_site_clusters}",
            f"• Largest Cluster: {largest_cluster} logos",
            "",
            "Success Rates:",
            f"• Extraction: {extraction_rate:.1f}%",
            f"• Feature Analysis: {feature_rate:.1f}%",
        ]
        return "\n".join(summary_lines)

    def create_extraction_performance_chart(self):
        """Render the 2x2 extraction dashboard and save it as
        'extraction_performance_analysis.png'."""
        if not self.results_loaded:
            print(" No results loaded")
            return

        successful_logos = self.extraction_data['successful_logos']
        successful = len(successful_logos)
        total = len(self.extraction_data['websites'])
        failed = total - successful

        # One figure only: an extra plt.figure() here would show up as an empty plot
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Logo Extraction Performance Analysis', fontsize=16, fontweight='bold')

        # 1. Success Rate Pie Chart
        if total > 0:
            ax1.pie([successful, failed], labels=['Successful', 'Failed'],
                    colors=['#2E8B57', '#DC143C'], autopct='%1.1f%%', startangle=90)
        else:
            # A pie cannot be drawn from all-zero wedges
            ax1.text(0.5, 0.5, 'No websites processed',
                     ha='center', va='center', transform=ax1.transAxes, fontsize=12)
        ax1.set_title(f'Extraction Success Rate\n({successful}/{total} websites)')

        # 2. Tier Usage (if available)
        tier_counts = defaultdict(int)
        for logo in successful_logos:
            tier = logo.get('tier_used', 'Unknown')
            tier_counts[f"Tier {tier}"] += 1

        if tier_counts:
            ax2.bar(list(tier_counts.keys()), list(tier_counts.values()), color='#4682B4')
            ax2.set_title('Success by API Tier')
            ax2.set_ylabel('Number of Logos')
            plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45)

        # 3. Logo File Sizes Distribution
        file_sizes = [
            len(logo['logo_data'])
            for logo in successful_logos
            if 'logo_data' in logo and logo['logo_data']
        ]

        if file_sizes:
            ax3.hist(file_sizes, bins=20, color='#FF6347', alpha=0.7)
            ax3.set_title('Logo File Size Distribution')
            ax3.set_xlabel('File Size (bytes)')
            ax3.set_ylabel('Count')

        # 4. Feature Analysis Success
        if self.analyzed_logos:
            valid_features = self._count_valid_features()
            invalid_features = len(self.analyzed_logos) - valid_features

            ax4.bar(['Valid Features', 'Invalid Features'], [valid_features, invalid_features],
                   color=['#32CD32', '#FF4500'])
            ax4.set_title('Feature Extraction Success')
            ax4.set_ylabel('Number of Logos')

        plt.tight_layout()
        plt.savefig('extraction_performance_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        print("Extraction performance chart created")

    def create_similarity_analysis_chart(self):
        """Render the 2x2 similarity dashboard and save it as
        'similarity_analysis_visualization.png'."""
        if not self.results_loaded or not self.similar_pairs:
            print("No similarity data available")
            return

        # One figure only: an extra plt.figure() here would show up as an empty plot
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Logo Similarity Analysis Dashboard', fontsize=16, fontweight='bold')

        # Only pairs that actually carry a score can be plotted or ranked
        scored_pairs = [pair for pair in self.similar_pairs if len(pair) >= 3]
        similarity_scores = [pair[2] for pair in scored_pairs]

        # 1. Similarity Score Distribution
        if similarity_scores:
            mean_score = np.mean(similarity_scores)
            ax1.hist(similarity_scores, bins=20, color='#9370DB', alpha=0.7, edgecolor='black')
            ax1.axvline(mean_score, color='red', linestyle='--',
                       label=f'Mean: {mean_score:.3f}')
            ax1.set_title('Similarity Score Distribution')
            ax1.set_xlabel('Similarity Score')
            ax1.set_ylabel('Number of Pairs')
            ax1.legend()

        # 2. Cluster Size Distribution
        if self.clusters:
            cluster_sizes = [len(cluster) for cluster in self.clusters if len(cluster) > 1]

            if cluster_sizes:
                ax2.hist(cluster_sizes, bins=max(10, len(set(cluster_sizes))),
                        color='#20B2AA', alpha=0.7, edgecolor='black')
                ax2.set_title('Brand Cluster Size Distribution')
                ax2.set_xlabel('Cluster Size (websites)')
                ax2.set_ylabel('Number of Clusters')
            else:
                ax2.text(0.5, 0.5, 'No multi-website clusters found',
                        ha='center', va='center', transform=ax2.transAxes, fontsize=12)
                ax2.set_title('Cluster Analysis')

        # 3. Top Similar Pairs
        if scored_pairs:
            top_pairs = sorted(scored_pairs, key=lambda pair: pair[2], reverse=True)[:10]

            pair_labels = []
            scores = []

            for pair in top_pairs:
                website1 = pair[0].replace('https://', '').replace('http://', '').split('/')[0]
                website2 = pair[1].replace('https://', '').replace('http://', '').split('/')[0]

                # Shorten to the second-to-last dot-separated label for display
                domain1 = website1.split('.')[-2] if '.' in website1 else website1
                domain2 = website2.split('.')[-2] if '.' in website2 else website2

                pair_labels.append(f"{domain1}-{domain2}")
                scores.append(pair[2])

            bars = ax3.barh(range(len(scores)), scores,
                            color=plt.cm.viridis(np.linspace(0, 1, len(scores))))
            ax3.set_yticks(range(len(scores)))
            ax3.set_yticklabels(pair_labels)
            ax3.set_title('Top 10 Most Similar Logo Pairs')
            ax3.set_xlabel('Similarity Score')

            # Add value labels next to each bar
            for bar, score in zip(bars, scores):
                ax3.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
                        f'{score:.3f}', va='center', fontsize=9)

        # 4. Summary Statistics
        stats_text = self._build_summary_text()

        ax4.text(0.05, 0.95, stats_text, transform=ax4.transAxes, fontsize=11,
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7))
        ax4.set_xlim(0, 1)
        ax4.set_ylim(0, 1)
        ax4.axis('off')
        ax4.set_title('Pipeline Statistics')

        plt.tight_layout()
        plt.savefig('similarity_analysis_visualization.png', dpi=300, bbox_inches='tight')
        plt.show()

        print("Similarity analysis chart created")

    def create_all_visualizations(self):
        """Create every chart; a failure is reported and does not propagate."""
        print("\n CREATING COMPREHENSIVE VISUALIZATIONS")
        print("-" * 50)

        try:
            self.create_extraction_performance_chart()
            print()
            self.create_similarity_analysis_chart()

            print("\nAll visualizations completed!")
            print("Charts saved as PNG files in current directory")

        except Exception as e:
            # Visualizations are optional: report and let the pipeline continue
            print(f"Visualization error: {e}")
            print("Continuing without visualizations...")

# Color distance function for clustering
def color_distance(a, b):
    """Return the Euclidean distance between two color vectors.

    Missing data (None or an empty vector on either side) yields 0.0, so a logo
    without color information is never rejected on color grounds.

    Args:
        a: First color vector (sequence or array of numbers), or None.
        b: Second color vector, or None.

    Returns:
        The distance as a Python float.
    """
    # Single definition: a second `def color_distance` used to follow this one and
    # silently replaced it, dropping the missing-data handling.
    if a is None or b is None:
        return 0.0  # Treat as pass when missing data

    aa = np.asarray(a, dtype=np.float64)
    bb = np.asarray(b, dtype=np.float64)
    if aa.size == 0 or bb.size == 0:
        return 0.0  # Treat as pass when missing data

    return float(np.linalg.norm(aa - bb))

# Complete Enhanced Pipeline Function
def _cluster_similar_logos(valid_logos, similar_pairs, color_gate, color_threshold):
    """Union similar website pairs into clusters, optionally gated by color distance.

    Args:
        valid_logos: Logo dicts; each is read for ``'website'`` and, when the
            color gate is on, ``['features'].get('color_vector')``.
        similar_pairs: Iterable of ``(website1, website2, similarity)`` triples.
        color_gate: When true, a pair is not merged if both logos have a
            color vector and their ``color_distance`` exceeds the threshold.
        color_threshold: Maximum color distance allowed for a merge.

    Returns:
        tuple: ``(clusters, merges_blocked)`` where ``clusters`` is whatever
        ``UnionFind.get_clusters()`` returns and ``merges_blocked`` counts the
        pairs rejected by the color gate.
    """
    all_websites = [logo['website'] for logo in valid_logos]
    known_websites = set(all_websites)

    # Built once so each pair costs a dictionary lookup instead of a scan
    # over every logo. Later duplicates win, matching a full scan.
    color_by_website = {}
    if color_gate:
        color_by_website = {
            logo['website']: logo['features'].get('color_vector')
            for logo in valid_logos
        }

    uf = UnionFind(all_websites)
    merges_blocked = 0

    for website1, website2, _similarity in similar_pairs:
        # Pairs may mention logos that failed feature validation; skip those.
        if website1 not in known_websites or website2 not in known_websites:
            continue

        if color_gate:
            logo1_color = color_by_website.get(website1)
            logo2_color = color_by_website.get(website2)

            # Only gate when both colors are available; missing data passes.
            if logo1_color and logo2_color:
                if color_distance(logo1_color, logo2_color) > color_threshold:
                    merges_blocked += 1
                    continue  # Skip merge due to color difference

        uf.union(website1, website2)

    return uf.get_clusters(), merges_blocked


# Complete Enhanced Pipeline Function
async def run_enhanced_logo_analysis_pipeline(sample_size=None, max_tier=5, create_visuals=True,
                                            color_gate=True, color_threshold=0.20,
                                            parquet_path='logos.snappy.parquet',
                                            similarity_threshold=0.7):
    """
    Run the end-to-end logo analysis pipeline and print a progress report.

    Steps, in order:
      1. Load websites from a parquet file via ``LightningParquetProcessor``.
      2. Extract logos with ``EnhancedAPILogoExtractor.batch_extract_logos_enhanced``.
      3. Compute features with ``FourierLogoAnalyzer.analyze_logo_batch``.
      4. Find similar pairs with ``FourierLogoAnalyzer.find_similar_pairs``.
      5. Cluster websites with ``UnionFind``, optionally blocking merges whose
         color vectors are further apart than ``color_threshold``.
      6. Optionally draw charts with ``LogoVisualizationPipeline``.
      7. Print a summary.

    Args:
        sample_size: Forwarded to ``load_parquet_fast``; None loads everything.
        max_tier: Forwarded to the extractor as its tier limit.
        create_visuals: When true, generate the visualization charts.
        color_gate: When true, apply the color-distance check before merging.
        color_threshold: Maximum color distance allowed for a merge.
        parquet_path: Path of the parquet file listing the websites.
        similarity_threshold: Threshold passed to ``find_similar_pairs``.

    Returns:
        dict | None: None when fewer than two logos were extracted or fewer
        than two had valid features. Otherwise a dict with the keys
        ``extraction_data``, ``analyzed_logos``, ``valid_logos``,
        ``similar_pairs``, ``clusters``, ``multi_clusters``, ``success_rate``,
        ``total_time``, ``features_enhanced`` and ``color_gate_used``.
    """

    print(" ENHANCED LOGO ANALYSIS PIPELINE")
    print("=" * 70)
    print(" 49-service API pool targeting 98%+ extraction success")
    print(" Traditional + Advanced feature extraction (Moments, Texture, Color)")
    print(" Multi-method similarity detection (9 different approaches)")
    print(" Color-aware clustering with configurable gating")
    print()

    total_start_time = time.time()

    # Step 1: Load Data
    print(" STEP 1: DATA LOADING")
    print("-" * 30)

    df = LightningParquetProcessor.load_parquet_fast(
        parquet_path,
        sample_size=sample_size
    )

    website_col = LightningParquetProcessor.get_website_column(df)
    websites = df[website_col].dropna().tolist()

    print(f" Processing {len(websites)} websites")

    # Step 2: Enhanced Logo Extraction
    print(f"\n STEP 2: ENHANCED LOGO EXTRACTION (Max Tier: {max_tier})")
    print("-" * 55)

    async with EnhancedAPILogoExtractor() as extractor:
        logo_results = await extractor.batch_extract_logos_enhanced(websites, max_tier=max_tier)

    successful_logos = [r for r in logo_results if r['logo_found']]
    # Guard the division: an empty website list would otherwise raise
    # ZeroDivisionError before the "need at least 2 logos" message below.
    success_rate = len(successful_logos) / len(websites) * 100 if websites else 0.0

    # Built unconditionally because it is part of the return value even when
    # create_visuals is False.
    extraction_data = {
        'websites': websites,
        'logo_results': logo_results,
        'successful_logos': successful_logos
    }

    print(f" Logo extraction: {len(successful_logos)}/{len(websites)} ({success_rate:.1f}% success)")

    if len(successful_logos) < 2:
        print(" Need at least 2 logos for similarity analysis")
        return None

    # Step 3: Enhanced Feature Analysis with 2025 Research
    print("\n STEP 3: ENHANCED FEATURE ANALYSIS")
    print("-" * 40)

    analyzer = FourierLogoAnalyzer()
    analyzed_logos = analyzer.analyze_logo_batch(successful_logos)
    valid_logos = [logo for logo in analyzed_logos if logo['features']['valid']]

    print(f" Feature analysis: {len(valid_logos)}/{len(successful_logos)} logos with valid features")
    print(" Features extracted: Traditional Fourier + 2025 Research + Keypoints + Texture + Enhanced Color")

    if len(valid_logos) < 2:
        print(" Need at least 2 valid logos for similarity analysis")
        return None

    # Step 4: Enhanced Multi-Method Similarity Analysis
    print("\n STEP 4: MULTI-METHOD SIMILARITY ANALYSIS")
    print("-" * 45)

    similar_pairs = analyzer.find_similar_pairs(analyzed_logos, threshold=similarity_threshold)
    print(f" Similarity analysis: {len(similar_pairs)} similar pairs found using 9 detection methods")

    # Step 5: Color-Aware Union-Find Clustering
    print("\n STEP 5: COLOR-AWARE CLUSTERING")
    print("-" * 35)
    print(f"Color gate: {'ENABLED' if color_gate else 'DISABLED'}")
    if color_gate:
        print(f"Color threshold: {color_threshold}")

    if similar_pairs:
        clusters, merges_blocked = _cluster_similar_logos(
            valid_logos, similar_pairs, color_gate, color_threshold
        )
        multi_clusters = [cluster for cluster in clusters if len(cluster) > 1]

        print(f" Clustering: {len(multi_clusters)} brand clusters discovered")
        if color_gate and merges_blocked > 0:
            print(f" Color gate blocked {merges_blocked} merges (preserving color distinctions)")

        # Show top clusters
        if multi_clusters:
            sorted_clusters = sorted(multi_clusters, key=len, reverse=True)[:5]
            print(" Top brand clusters:")
            for i, cluster in enumerate(sorted_clusters, 1):
                # Strip the scheme and path, then use the first host label as a name.
                sample_domain = cluster[0].replace('https://', '').replace('http://', '').split('/')[0]
                brand_name = sample_domain.split('.')[0]
                print(f"   {i}. {brand_name}: {len(cluster)} similar logos")
    else:
        clusters = [[logo['website']] for logo in valid_logos]
        multi_clusters = []
        print("ℹ  No similar pairs found - each logo in separate cluster")

    # Step 6: Create Enhanced Visualizations
    if create_visuals:
        print("\n STEP 6: ENHANCED VISUALIZATION GENERATION")
        print("-" * 45)

        viz_pipeline = LogoVisualizationPipeline()

        # Load results into visualizer
        viz_pipeline.load_results_from_memory(
            extraction_data,
            analyzed_logos,
            similar_pairs,
            clusters
        )

        # Create all visualizations
        viz_pipeline.create_all_visualizations()

    # Step 7: Enhanced Summary Report
    total_elapsed = time.time() - total_start_time

    print("\n ENHANCED PIPELINE COMPLETE!")
    print("=" * 50)
    print(" ENHANCED RESULTS SUMMARY:")
    print(f"   • Websites processed: {len(websites)}")
    print(f"   • Logos extracted: {len(successful_logos)} ({success_rate:.1f}% success)")
    print(f"   • Valid feature analysis: {len(valid_logos)}")
    print(f"   • Similar pairs detected: {len(similar_pairs)}")
    print(f"   • Brand clusters found: {len(multi_clusters)}")
    print(f"   • Total processing time: {total_elapsed:.1f}s")
    print()
    print(" ENHANCED FEATURES USED:")
    print("    Traditional: pHash, FFT, Fourier-Mellin, SIFT, ORB")
    print("    2025 Research: Hu/Zernike moments, texture analysis")
    print("    Advanced: Color-aware Fourier-Mellin, saliency weighting")
    print("    Multi-method: 9 different similarity detection approaches")
    print("    Color-aware: Configurable clustering with color gating")

    if success_rate >= 97:
        print(f"\n EXCELLENT! {success_rate:.1f}% extraction success rate achieved!")
    elif success_rate >= 90:
        print(f"\n GOOD! {success_rate:.1f}% extraction success rate")
    else:
        print(f"\n {success_rate:.1f}% success rate - consider increasing max_tier")

    return {
        'extraction_data': extraction_data,
        'analyzed_logos': analyzed_logos,
        'valid_logos': valid_logos,
        'similar_pairs': similar_pairs,
        'clusters': clusters,
        'multi_clusters': multi_clusters,
        'success_rate': success_rate,
        'total_time': total_elapsed,
        'features_enhanced': True,
        'color_gate_used': color_gate
    }

# Announce that the pipeline cell has been defined and show how to call it.
# Each entry is printed on its own line; the empty string yields a blank line.
_PIPELINE_READY_LINES = (
    " PIPELINE READY!",
    " ALL 2025 research features integrated into main pipeline",
    " 49-service API pool for maximum extraction success",
    " Multi-method similarity detection with advanced features",
    " Color-aware clustering with configurable gating",
    "",
    " USAGE:",
    "results = await run_enhanced_logo_analysis_pipeline(",
    "    sample_size=50,        # Number of websites to process",
    "    max_tier=5,           # API tier limit (1-8)",
    "    create_visuals=True,  # Generate charts",
    "    color_gate=True,      # Enable color-aware clustering",
    "    color_threshold=0.20  # Color distance threshold",
    ")",
)

for _banner_line in _PIPELINE_READY_LINES:
    print(_banner_line)

In [None]:
# Run the pipeline on your parquet file
results = await run_enhanced_logo_analysis_pipeline(
    sample_size=50,        # Number of websites to process (start small)
    max_tier=5,           # Use first 5 API tiers (most reliable)
    create_visuals=True,  # Generate visualization plots
    enable_color_clustering=True  # Enable color-aware clustering
)

print(f" Analysis complete! Found {len(results['clusters'])} logo clusters")
print(f" Processed {results['total_processed']} websites")
print(f" Extraction success rate: {results['extraction_success_rate']:.1f}%")