In [None]:
import asyncio
import aiohttp
import numpy as np
import cv2
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re
import json
import hashlib
import io
import os
import random
from datetime import datetime
from urllib.parse import urljoin, urlparse
from collections import defaultdict
import time
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# For Fourier analysis
from scipy.fft import fft2, fftshift
from skimage import filters, transform
from sklearn.metrics.pairwise import cosine_similarity

# Use notebook-internal class definitions only.
# NOTE(review): nothing in the visible cells reads this flag - confirm that a
# later cell actually consumes it.
USE_EXTERNAL_CLASSES = False

# Reaching this line means every import above resolved without raising.
print("All imports successful - using notebook-internal classes")

In [None]:
class LightningParquetProcessor:
    """Helpers for loading a parquet file of websites and locating its URL column.

    Both helpers are static methods; the class only acts as a namespace.
    """

    @staticmethod
    def load_parquet_fast(file_path: str, sample_size: Optional[int] = None) -> pd.DataFrame:
        """Read a parquet file through PyArrow and optionally down-sample it.

        Args:
            file_path: Path of the parquet file to read.
            sample_size: When truthy and smaller than the number of rows, a
                random sample of this many rows (``random_state=42``) is
                returned instead of the full frame.

        Returns:
            The loaded (and possibly sampled) DataFrame.
        """
        print(f"Loading parquet: {file_path}")
        start_time = time.time()

        # Imported inside the function, so PyArrow is only required when a
        # file is actually loaded.
        import pyarrow.parquet as pq
        table = pq.read_table(file_path)
        df = table.to_pandas()

        # Sample if requested
        if sample_size and len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)
            print(f" Sampled {sample_size} from {len(table)} total websites")

        elapsed = time.time() - start_time
        print(f" Loaded {len(df)} websites in {elapsed:.2f}s")

        return df

    @staticmethod
    def get_website_column(df: pd.DataFrame) -> str:
        """Pick the column that most likely holds the website / URL values.

        Resolution order:
          1. an exact match against ``website``, ``url``, ``domain``, ``site``,
             ``link`` (in that priority order);
          2. the first column whose label contains ``web``, ``url`` or
             ``domain`` (case-insensitive);
          3. the first column of the frame.

        Args:
            df: Frame whose columns are inspected.

        Returns:
            The label of the selected column, exactly as stored in ``df.columns``.

        Raises:
            IndexError: If ``df`` has no columns at all.
        """
        website_cols = ['website', 'url', 'domain', 'site', 'link']
        for col in website_cols:
            if col in df.columns:
                return col

        # Check for columns containing 'web', 'url' or 'domain'.
        for col in df.columns:
            # Column labels are not guaranteed to be strings (e.g. integer
            # labels), so go through str() instead of calling .lower() on the
            # raw label, which would raise AttributeError.
            label = str(col).lower()
            if any(term in label for term in ('web', 'url', 'domain')):
                return col

        # Default to first column
        return df.columns[0]

In [None]:
class EnhancedAPILogoExtractor:
    """Enhanced logo extraction with massive API pool + DNS discovery for 98%+ success rate"""
    
    def __init__(self):
        self.session = None
        # MEGA-EXPANDED API pool - 49 services across 8 tiers including DNS discovery
        self.logo_apis = [
            # Tier 1: Premium/Fast APIs (Highest quality, fastest)
            {
                'name': 'Clearbit',
                'url': 'https://logo.clearbit.com/{domain}',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 1
            },
            {
                'name': 'LogoAPI',
                'url': 'https://api.logo.dev/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 1
            },
            {
                'name': 'BrandAPI',
                'url': 'https://logo.api.brand.io/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 1
            },
            {
                'name': 'Brandfetch',
                'url': 'https://api.brandfetch.io/v2/brands/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 1
            },
            {
                'name': 'LogoGrab',
                'url': 'https://api.logograb.com/v1/logo/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 1
            },
            
            # Tier 2: Google & Microsoft Services (Very reliable)
            {
                'name': 'Google Favicon',
                'url': 'https://www.google.com/s2/favicons',
                'params': {'domain': '{domain}', 'sz': '128'},
                'headers': {},
                'timeout': 2,
                'tier': 2
            },
            {
                'name': 'Google Favicon HD',
                'url': 'https://www.google.com/s2/favicons',
                'params': {'domain': '{domain}', 'sz': '256'},
                'headers': {},
                'timeout': 3,
                'tier': 2
            },
            {
                'name': 'Google Favicon XL',
                'url': 'https://www.google.com/s2/favicons',
                'params': {'domain': '{domain}', 'sz': '512'},
                'headers': {},
                'timeout': 3,
                'tier': 2
            },
            {
                'name': 'Microsoft Bing',
                'url': 'https://www.bing.com/th',
                'params': {'id': 'OIP.{domain}', 'w': '128', 'h': '128', 'c': '7', 'r': '0', 'o': '5'},
                'headers': {},
                'timeout': 4,
                'tier': 2
            },
            {
                'name': 'DuckDuckGo Favicon',
                'url': 'https://icons.duckduckgo.com/ip3/{domain}.ico',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 2
            },
            
            # Tier 3: Alternative Favicon Services & CDNs
            {
                'name': 'Favicon.io',
                'url': 'https://favicons.githubusercontent.com/{domain}',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 3
            },
            {
                'name': 'Icons8',
                'url': 'https://img.icons8.com/color/128/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 3
            },
            {
                'name': 'Favicon Kit',
                'url': 'https://www.faviconkit.com/{domain}/128',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 3
            },
            {
                'name': 'Favicon Grabber',
                'url': 'https://favicongrabber.com/api/grab/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 3
            },
            {
                'name': 'GetFavicon',
                'url': 'https://getfavicon.appspot.com/{domain}',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 3
            },
            {
                'name': 'Besticon',
                'url': 'https://besticon-demo.herokuapp.com/icon',
                'params': {'url': 'https://{domain}', 'size': '128'},
                'headers': {},
                'timeout': 4,
                'tier': 3
            },
            {
                'name': 'Iconscout',
                'url': 'https://cdn.iconscout.com/icon/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 3
            },
            
            # Tier 4: Social Media & Directory APIs
            {
                'name': 'Wikipedia',
                'url': 'https://en.wikipedia.org/api/rest_v1/page/summary/{domain}',
                'params': {},
                'headers': {},
                'timeout': 5,
                'tier': 4
            },
            {
                'name': 'Wikidata',
                'url': 'https://www.wikidata.org/w/api.php',
                'params': {'action': 'wbsearchentities', 'search': '{domain}', 'format': 'json', 'language': 'en'},
                'headers': {},
                'timeout': 5,
                'tier': 4
            },
            {
                'name': 'Company Logo DB',
                'url': 'https://logo.clearbitjs.com/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 4
            },
            {
                'name': 'LogoTyp',
                'url': 'https://logotyp.us/logo/{domain}',
                'params': {},
                'headers': {},
                'timeout': 4,
                'tier': 4
            },
            {
                'name': 'OpenCorporates',
                'url': 'https://api.opencorporates.com/companies/search',
                'params': {'q': '{domain}', 'format': 'json'},
                'headers': {},
                'timeout': 5,
                'tier': 4
            },
            
            # Tier 5: Web Archive & Metadata
            {
                'name': 'Internet Archive',
                'url': 'https://web.archive.org/cdx/search/cdx',
                'params': {'url': '{domain}/favicon.ico', 'output': 'json', 'limit': '1'},
                'headers': {},
                'timeout': 6,
                'tier': 5
            },
            {
                'name': 'Archive Today',
                'url': 'https://archive.today/timemap/json/{domain}',
                'params': {},
                'headers': {},
                'timeout': 6,
                'tier': 5
            },
            {
                'name': 'Logo Garden',
                'url': 'https://www.logoground.com/api/logo/{domain}',
                'params': {},
                'headers': {},
                'timeout': 5,
                'tier': 5
            },
            
            # Tier 6: Direct Website Scraping (High success fallback)
            {
                'name': 'Direct Favicon',
                'url': 'https://{domain}/favicon.ico',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Apple Touch Icon',
                'url': 'https://{domain}/apple-touch-icon.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Apple Touch Icon 152',
                'url': 'https://{domain}/apple-touch-icon-152x152.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Apple Touch Icon 180',
                'url': 'https://{domain}/apple-touch-icon-180x180.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Android Chrome 192',
                'url': 'https://{domain}/android-chrome-192x192.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Android Chrome 512',
                'url': 'https://{domain}/android-chrome-512x512.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Site Logo PNG',
                'url': 'https://{domain}/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Site Logo SVG',
                'url': 'https://{domain}/logo.svg',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Assets Logo',
                'url': 'https://{domain}/assets/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Images Logo',
                'url': 'https://{domain}/images/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Static Logo',
                'url': 'https://{domain}/static/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            {
                'name': 'Brand Logo',
                'url': 'https://{domain}/brand/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 6
            },
            
            # Tier 7: Alternative domains and variations  
            {
                'name': 'WWW Favicon',
                'url': 'https://www.{domain}/favicon.ico',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 7
            },
            {
                'name': 'WWW Logo',
                'url': 'https://www.{domain}/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 7
            },
            {
                'name': 'CDN Logo',
                'url': 'https://cdn.{domain}/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 7
            },
            {
                'name': 'Media Logo',
                'url': 'https://media.{domain}/logo.png',
                'params': {},
                'headers': {},
                'timeout': 3,
                'tier': 7
            },
            
            # Tier 8: DNS & WHOIS-Based Logo Discovery 
            {
                'name': 'DNS-over-HTTPS Logo TXT',
                'url': 'https://cloudflare-dns.com/dns-query',
                'params': {'name': 'logo.{domain}', 'type': 'TXT', 'ct': 'application/dns-json'},
                'headers': {'accept': 'application/dns-json'},
                'timeout': 5,
                'tier': 8,
                'dns_query': True
            },
            {
                'name': 'DNS-over-HTTPS Brand TXT',
                'url': 'https://cloudflare-dns.com/dns-query',
                'params': {'name': 'brand.{domain}', 'type': 'TXT', 'ct': 'application/dns-json'},
                'headers': {'accept': 'application/dns-json'},
                'timeout': 5,
                'tier': 8,
                'dns_query': True
            },
            {
                'name': 'DNS-over-HTTPS Assets TXT',
                'url': 'https://cloudflare-dns.com/dns-query',
                'params': {'name': 'assets.{domain}', 'type': 'TXT', 'ct': 'application/dns-json'},
                'headers': {'accept': 'application/dns-json'},
                'timeout': 5,
                'tier': 8,
                'dns_query': True
            },
            {
                'name': 'Google DNS Logo TXT',
                'url': 'https://dns.google/resolve',
                'params': {'name': 'logo.{domain}', 'type': 'TXT'},
                'headers': {},
                'timeout': 5,
                'tier': 8,
                'dns_query': True
            },
            {
                'name': 'WHOIS Brand API',
                'url': 'https://www.whoisxmlapi.com/whoisserver/WhoisService',
                'params': {'domainName': '{domain}', 'outputFormat': 'JSON', 'apiKey': 'demo'},
                'headers': {},
                'timeout': 6,
                'tier': 8,
                'whois_query': True
            },
            {
                'name': 'Domain Tools Logo',
                'url': 'https://api.domaintools.com/v1/{domain}/hosting-history',
                'params': {'format': 'json'},
                'headers': {},
                'timeout': 6,
                'tier': 8,
                'domain_meta': True
            },
            {
                'name': 'SecurityTrails DNS',
                'url': 'https://api.securitytrails.com/v1/domain/{domain}/subdomains',
                'params': {},
                'headers': {'APIKEY': 'demo'},
                'timeout': 6,
                'tier': 8,
                'subdomain_scan': True
            },
            {
                'name': 'VirusTotal Domain',
                'url': 'https://www.virustotal.com/vtapi/v2/domain/report',
                'params': {'domain': '{domain}', 'apikey': 'demo'},
                'headers': {},
                'timeout': 6,
                'tier': 8,
                'domain_intel': True
            }
        ]
    
    async def __aenter__(self):
        timeout = aiohttp.ClientTimeout(total=20)  # Increased timeout for more APIs
        connector = aiohttp.TCPConnector(limit=400, limit_per_host=150)  # Higher limits
        self.session = aiohttp.ClientSession(
            timeout=timeout,
            connector=connector,
            headers={'User-Agent': 'LogoMatcher/3.0 Ultra-Enhanced'}
        )
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()
    
    def clean_domain(self, website: str) -> str:
        """Extract clean domain from website URL"""
        if website.startswith(('http://', 'https://')):
            from urllib.parse import urlparse
            parsed = urlparse(website)
            domain = parsed.netloc
            # Remove www. prefix for cleaner API calls
            if domain.startswith('www.'):
                domain = domain[4:]
            return domain
        return website
    
    async def try_api_service(self, api_config: dict, domain: str) -> Optional[Dict]:
        """Try a single API service for logo"""
        try:
            # Format URL
            if '{domain}' in api_config['url']:
                url = api_config['url'].format(domain=domain)
            else:
                url = api_config['url']
            
            # Format params
            params = {}
            for key, value in api_config.get('params', {}).items():
                if '{domain}' in str(value):
                    params[key] = value.format(domain=domain)
                else:
                    params[key] = value
            
            # Make request
            timeout = aiohttp.ClientTimeout(total=api_config['timeout'])
            async with self.session.get(
                url, 
                params=params,
                headers=api_config.get('headers', {}),
                timeout=timeout,
                allow_redirects=True  # Follow redirects for better coverage
            ) as response:
                
                if response.status == 200:
                    content_type = response.headers.get('content-type', '')
                    
                    # Handle different response types
                    if 'image' in content_type:
                        content = await response.read()
                        if len(content) > 200:  # Lowered threshold for more logos
                            return {
                                'data': content,
                                'url': str(response.url),
                                'content_type': content_type,
                                'size': len(content)
                            }
                    
                    elif 'json' in content_type or api_config.get('dns_query') or api_config.get('whois_query'):
                        # Handle JSON responses (Wikipedia, Wikidata, DNS, WHOIS, etc.)
                        json_data = await response.json()
                        logo_url = self.extract_logo_from_json(json_data, api_config['name'])
                        if logo_url:
                            # Download the actual logo
                            logo_result = await self.download_logo_from_url(logo_url)
                            if logo_result:
                                return logo_result
                
        except Exception as e:
            # Silent fail for speed - but we can uncomment for debugging
            # print(f"API {api_config['name']} failed for {domain}: {e}")
            pass
        
        return None
    
    def extract_logo_from_json(self, json_data: dict, api_name: str) -> Optional[str]:
        """Extract logo URL from JSON API responses"""
        try:
            if api_name == 'Wikipedia':
                if 'thumbnail' in json_data and 'source' in json_data['thumbnail']:
                    return json_data['thumbnail']['source']
                elif 'originalimage' in json_data and 'source' in json_data['originalimage']:
                    return json_data['originalimage']['source']
            
            elif api_name == 'Wikidata':
                if 'search' in json_data and json_data['search']:
                    for item in json_data['search']:
                        if 'display' in item and 'label' in item['display']:
                            # This would need additional API calls to get the actual logo
                            pass
            
            elif api_name == 'Favicon Grabber':
                if 'icons' in json_data and json_data['icons']:
                    # Return the largest icon
                    largest_icon = max(json_data['icons'], key=lambda x: x.get('sizes', '0x0').split('x')[0])
                    return largest_icon.get('src')
            
            elif api_name == 'OpenCorporates':
                if 'results' in json_data and json_data['results']:
                    for company in json_data['results']['companies']:
                        if 'company' in company and 'registry_url' in company['company']:
                            # Additional processing could extract logos from company pages
                            pass
            
            # DNS-based Logo Discovery
            elif 'DNS Logo TXT' in api_name or 'DNS Brand TXT' in api_name or 'DNS Assets TXT' in api_name:
                # Parse DNS TXT records for logo URLs
                if 'Answer' in json_data:
                    for record in json_data['Answer']:
                        if record.get('type') == 16:  # TXT record
                            txt_data = record.get('data', '')
                            # Look for logo URLs in TXT records
                            logo_url = self.extract_logo_url_from_txt(txt_data)
                            if logo_url:
                                return logo_url
                elif 'answer' in json_data:  # Google DNS format
                    for record in json_data['answer']:
                        if record.get('type') == 16:
                            txt_data = record.get('data', '')
                            logo_url = self.extract_logo_url_from_txt(txt_data)
                            if logo_url:
                                return logo_url
            
            elif api_name == 'WHOIS Brand API':
                # Extract logo info from WHOIS data
                whois_data = json_data.get('WhoisRecord', {})
                registrant = whois_data.get('registrant', {})
                if 'organization' in registrant:
                    # Could cross-reference with other APIs
                    pass
            
            elif api_name == 'SecurityTrails DNS':
                # Look for logo-related subdomains
                if 'subdomains' in json_data:
                    for subdomain in json_data['subdomains']:
                        if any(keyword in subdomain.lower() for keyword in ['logo', 'brand', 'assets', 'cdn', 'static']):
                            # Try common logo paths on these subdomains
                            potential_url = f"https://{subdomain}.{json_data.get('domain', '')}/logo.png"
                            return potential_url
                        
        except Exception:
            pass
        
        return None
    
    def extract_logo_url_from_txt(self, txt_data: str) -> Optional[str]:
        """Extract logo URL from DNS TXT record data"""
        import re
        
        # Common TXT record patterns for logo URLs
        patterns = [
            r'logo[_-]?url[=:]\s*([^\s"\']+)',  # logo_url=https://...
            r'brand[_-]?logo[=:]\s*([^\s"\']+)',  # brand_logo=https://...
            r'icon[_-]?url[=:]\s*([^\s"\']+)',   # icon_url=https://...
            r'(https?://[^\s"\']+\.(?:png|jpg|jpeg|svg|gif|webp))',  # Direct URL patterns
            r'assets[=:]\s*([^\s"\']+)',  # assets=https://cdn.../logo.png
        ]
        
        for pattern in patterns:
            match = re.search(pattern, txt_data, re.IGNORECASE)
            if match:
                url = match.group(1)
                if url.startswith(('http://', 'https://')):
                    return url
        
        return None
    
    async def download_logo_from_url(self, logo_url: str) -> Optional[Dict]:
        """Download logo from extracted URL"""
        try:
            timeout = aiohttp.ClientTimeout(total=5)
            async with self.session.get(logo_url, timeout=timeout, allow_redirects=True) as response:
                if response.status == 200:
                    content_type = response.headers.get('content-type', '')
                    if 'image' in content_type:
                        content = await response.read()
                        if len(content) > 200:
                            return {
                                'data': content,
                                'url': logo_url,
                                'content_type': content_type,
                                'size': len(content)
                            }
        except Exception:
            pass
        return None
    
    async def extract_logo_tiered(self, website: str, max_tier: int = 8) -> Dict:
        """Extract logo using expanded tiered API approach for 97%+ success"""
        domain = self.clean_domain(website)
        
        result = {
            'website': website,
            'domain': domain,
            'logo_found': False,
            'logo_url': None,
            'logo_data': None,
            'method': 'ultra_enhanced_api',
            'api_service': None,
            'tier_used': None,
            'attempts': 0,
            'error': None
        }
        
        # Try APIs by tier for maximum efficiency
        for tier in range(1, max_tier + 1):
            tier_apis = [api for api in self.logo_apis if api.get('tier') == tier]
            
            # Try all APIs in current tier concurrently
            if tier_apis:
                tasks = [self.try_api_service(api_config, domain) for api_config in tier_apis]
                tier_results = await asyncio.gather(*tasks, return_exceptions=True)
                
                # Check for success in this tier
                for i, logo_result in enumerate(tier_results):
                    if isinstance(logo_result, dict) and logo_result:
                        result.update({
                            'logo_found': True,
                            'logo_url': logo_result['url'],
                            'logo_data': logo_result['data'],
                            'method': 'ultra_enhanced_api',
                            'api_service': tier_apis[i]['name'],
                            'tier_used': tier,
                            'attempts': result['attempts'] + len(tier_apis)
                        })
                        return result
                
                result['attempts'] += len(tier_apis)
                
                # Brief pause between tiers (less for early tiers)
                if tier <= 4:
                    await asyncio.sleep(0.1)
                else:
                    await asyncio.sleep(0.2)  # Longer pause for slower tiers
        
        result['error'] = f'All {result["attempts"]} APIs failed'
        return result
    
    async def extract_logo_exhaustive_retry(self, website: str, max_tier: int = 7) -> Dict:
        """
        EXHAUSTIVE RETRY: Try failed websites against ALL APIs in random order
        This maximizes success rate by trying different API combinations
        """
        domain = self.clean_domain(website)
        
        result = {
            'website': website,
            'domain': domain,
            'logo_found': False,
            'logo_url': None,
            'logo_data': None,
            'method': 'exhaustive_retry',
            'api_service': None,
            'tier_used': None,
            'attempts': 0,
            'error': None
        }
        
        # Get ALL APIs up to max_tier and shuffle them for random order
        import random
        all_apis = [api for api in self.logo_apis if api.get('tier', 1) <= max_tier]
        random.shuffle(all_apis)  # Random order for better coverage
        
        print(f"Exhaustive retry for {domain}: trying {len(all_apis)} APIs")
        
        # Try APIs in smaller chunks to be respectful
        chunk_size = 5
        for i in range(0, len(all_apis), chunk_size):
            chunk = all_apis[i:i + chunk_size]
            
            # Try chunk concurrently
            tasks = [self.try_api_service(api_config, domain) for api_config in chunk]
            chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
            
            # Check for success in this chunk
            for j, logo_result in enumerate(chunk_results):
                if isinstance(logo_result, dict) and logo_result:
                    result.update({
                        'logo_found': True,
                        'logo_url': logo_result['url'],
                        'logo_data': logo_result['data'],
                        'method': 'exhaustive_retry',
                        'api_service': chunk[j]['name'],
                        'tier_used': chunk[j]['tier'],
                        'attempts': result['attempts'] + len(chunk)
                    })
                    print(f"Retry success for {domain}: {chunk[j]['name']}")
                    return result
            
            result['attempts'] += len(chunk)
            
            # Brief pause between chunks
            await asyncio.sleep(0.1)
        
        result['error'] = f'Exhaustive retry failed: {result["attempts"]} APIs tried'
        return result
    
    async def batch_extract_logos_enhanced(self, websites: List[str], max_tier: int = 8) -> List[Dict]:
        """Extract logos for ``websites`` in concurrent batches, then retry failures.

        Each website is passed to ``self.extract_logo_tiered``. Websites are
        processed 30 at a time with a short pause between batches. When at most
        50 websites fail, each failure gets one more attempt through
        ``self.extract_logo_exhaustive_retry``. Progress and a final summary
        (tier/API breakdown and a success-rate assessment) are printed.

        Args:
            websites: Websites to process.
            max_tier: Highest API tier to use; forwarded to both extractors.

        Returns:
            One result dict per website, in input order. Entries created for
            raised exceptions contain 'website', 'logo_found' (False) and
            'error'. An empty ``websites`` list yields an empty list.
        """
        print(f"ULTRA-ENHANCED API extraction: {len(websites)} websites")
        print(f"Using {len([api for api in self.logo_apis if api.get('tier', 1) <= max_tier])} APIs across {max_tier} tiers")

        # Nothing to process: return early instead of dividing by zero when the
        # success rate is computed below.
        if not websites:
            print("No websites provided - nothing to extract")
            return []

        start_time = time.time()

        # Process websites in optimal batch size
        batch_size = 30  # Smaller batches for more APIs
        max_retry_failures = 50  # Skip the exhaustive retry above this many failures
        total_batches = (len(websites) - 1) // batch_size + 1
        all_results = []

        for batch_start in range(0, len(websites), batch_size):
            batch = websites[batch_start:batch_start + batch_size]
            batch_num = batch_start // batch_size + 1

            print(f"   Batch {batch_num}/{total_batches}: {len(batch)} websites")

            # Process batch concurrently; exceptions come back as values so one
            # failing website cannot abort the whole batch.
            tasks = [self.extract_logo_tiered(website, max_tier) for website in batch]
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)

            # Keep dict results as-is; replace anything else with a failure record
            for website, result in zip(batch, batch_results):
                if isinstance(result, dict):
                    all_results.append(result)
                else:
                    all_results.append({
                        'website': website,
                        'logo_found': False,
                        'error': f'Exception: {type(result).__name__}'
                    })

            # Show batch progress
            batch_successful = sum(1 for r in batch_results if isinstance(r, dict) and r.get('logo_found', False))
            print(f"Batch success: {batch_successful}/{len(batch)} ({batch_successful/len(batch)*100:.1f}%)")

            # Brief pause between batches
            await asyncio.sleep(0.3)

        # EXHAUSTIVE RETRY for failed websites.
        # Track failures by position so each retry result is written back to
        # exactly the slot it came from (also correct for duplicate websites).
        failed_indices = [idx for idx, r in enumerate(all_results) if not r.get('logo_found', False)]
        if failed_indices and len(failed_indices) <= max_retry_failures:
            print(f"\nEXHAUSTIVE RETRY PHASE")
            print(f"Retrying {len(failed_indices)} failed websites with ALL APIs...")

            retry_tasks = [
                self.extract_logo_exhaustive_retry(all_results[idx]['website'], max_tier)
                for idx in failed_indices
            ]
            retry_results = await asyncio.gather(*retry_tasks, return_exceptions=True)

            # Update original results with retry successes
            retry_successes = 0
            for idx, retry_result in zip(failed_indices, retry_results):
                if isinstance(retry_result, dict) and retry_result.get('logo_found', False):
                    all_results[idx] = retry_result
                    retry_successes += 1

            if retry_successes > 0:
                print(f"Exhaustive retry recovered {retry_successes} additional logos!")
            else:
                print("No additional logos found in retry phase")

        elif len(failed_indices) > max_retry_failures:
            print(f"\nSkipping exhaustive retry: {len(failed_indices)} failures (too many)")
            print("Consider increasing max_tier or checking network connectivity")

        elapsed = time.time() - start_time
        successful = sum(1 for r in all_results if r.get('logo_found', False))
        success_rate = successful / len(websites) * 100

        print(f"ULTRA-ENHANCED results: {successful}/{len(websites)} in {elapsed:.1f}s")
        print(f"SUCCESS RATE: {success_rate:.1f}%")
        print(f"Speed: {len(websites)/elapsed:.1f} websites/second")

        # Show comprehensive breakdown
        tier_breakdown = defaultdict(int)
        api_breakdown = defaultdict(int)

        for result in all_results:
            if result.get('logo_found', False):
                tier = result.get('tier_used', 'unknown')
                service = result.get('api_service', 'unknown')
                tier_breakdown[f"Tier {tier}"] += 1
                api_breakdown[service] += 1

        print("\nPERFORMANCE BREAKDOWN:")
        print("By Tier:")
        for tier, count in sorted(tier_breakdown.items()):
            percentage = count / successful * 100 if successful > 0 else 0
            print(f"   - {tier}: {count} logos ({percentage:.1f}%)")

        print("Top API Services:")
        for service, count in sorted(api_breakdown.items(), key=lambda x: x[1], reverse=True)[:8]:
            percentage = count / successful * 100 if successful > 0 else 0
            print(f"   - {service}: {count} ({percentage:.1f}%)")

        # Success rate assessment
        if success_rate >= 97:
            print(f"EXCELLENT! {success_rate:.1f}% SUCCESS RATE ACHIEVED!")
        elif success_rate >= 95:
            print(f"\nVERY GOOD! {success_rate:.1f}% success rate")
            print("Close to 97% target - consider adding tier 8 for remaining sites")
        elif success_rate >= 90:
            print(f"\nGOOD! {success_rate:.1f}% success rate")
            print("To reach 97%+: increase max_tier or add more API services")
        else:
            print(f"\n{success_rate:.1f}% success rate - needs improvement")
            print("Try max_tier=7 and check API service availability")

        return all_results

In [None]:

class UnionFind:
    """Union-Find (disjoint-set) data structure for clustering.

    ``find`` applies path compression and ``union`` merges by rank.
    """

    def __init__(self, elements):
        """Create one singleton set per element.

        Args:
            elements: Iterable of hashable items. It is traversed only once, so
                one-shot iterables such as generators are supported.
        """
        self.parent = {elem: elem for elem in elements}
        # Derive ranks from the parent map rather than iterating ``elements``
        # again: a generator would already be exhausted and leave ranks empty.
        self.rank = dict.fromkeys(self.parent, 0)

    def find(self, x):
        """Return the root representative of the set containing ``x``.

        Raises:
            KeyError: If ``x`` was not among the initial elements.
        """
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass (path compression): point every visited node at the root.
        while x != root:
            next_node = self.parent[x]
            self.parent[x] = root
            x = next_node
        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (no-op if already joined)."""
        px, py = self.find(x), self.find(y)
        if px == py:
            return

        # Union by rank: attach the shallower tree under the deeper one.
        if self.rank[px] < self.rank[py]:
            px, py = py, px
        self.parent[py] = px
        if self.rank[px] == self.rank[py]:
            self.rank[px] += 1

    def get_clusters(self):
        """Return the sets that contain more than one element.

        Returns:
            A list of lists; singleton sets are omitted.
        """
        clusters = defaultdict(list)
        for elem in self.parent:
            clusters[self.find(elem)].append(elem)
        return [members for members in clusters.values() if len(members) > 1]


In [None]:
import io

class FourierLogoAnalyzer:
    """Extract and compare logo features with Fourier- and keypoint-based methods.

    Per logo (see ``compute_all_features``) the analyzer computes a 64-bit DCT
    perceptual hash, a low-frequency FFT magnitude vector, an angular
    FFT-magnitude signature, and SIFT/ORB keypoint descriptors.
    ``are_similar`` flags a pair as similar when ANY single method passes its
    threshold.
    """

    def __init__(self):
        self.similarity_threshold_phash = 6  # Max Hamming distance between 64-bit hashes
        self.similarity_threshold_fft = 0.985  # Min cosine similarity
        self.similarity_threshold_fmt = 0.995  # Min Fourier-Mellin correlation
        self.similarity_threshold_sift = 0.3  # Min ratio of good SIFT matches
        self.similarity_threshold_orb = 0.25  # Min ratio of good ORB matches

    @staticmethod
    def _cosine(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
        """Cosine similarity of two 1-D vectors; 0.0 if either has zero norm."""
        a = np.asarray(vec_a, dtype=np.float64).ravel()
        b = np.asarray(vec_b, dtype=np.float64).ravel()
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    @staticmethod
    def _empty_keypoint_features(signature_dim: int) -> Dict:
        """Build the 'no keypoints found' result shared by SIFT and ORB."""
        return {
            'valid': False,
            'keypoints': [],
            'keypoint_count': 0,  # Same key the valid result carries
            'descriptors': np.array([]),
            'signature': np.zeros(signature_dim)
        }

    def compute_phash(self, img: np.ndarray) -> str:
        """Compute perceptual hash using DCT (Fourier cousin).

        Args:
            img: BGR image array (it is converted with ``COLOR_BGR2GRAY``).

        Returns:
            A 64-character string of '0'/'1' bits, one per coefficient of the
            top-left 8x8 DCT block, set when the coefficient exceeds the
            block's median.
        """
        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Resize to 32x32 for DCT
        resized = cv2.resize(gray, (32, 32))

        # Compute DCT (like 2D Fourier but with cosines)
        dct = cv2.dct(np.float32(resized))

        # Take top-left 8x8 (low frequencies)
        dct_low = dct[0:8, 0:8]

        # Compare with median to create binary hash
        median = np.median(dct_low)
        binary = dct_low > median

        # Serialize the bits as a '0'/'1' string
        return ''.join('1' if bit else '0' for bit in binary.flatten())

    def hamming_distance(self, hash1: str, hash2: str) -> int:
        """Calculate Hamming distance between two hashes.

        Positions present in only one hash (unequal lengths) count as
        differing instead of being silently ignored.
        """
        differing = sum(c1 != c2 for c1, c2 in zip(hash1, hash2))
        return differing + abs(len(hash1) - len(hash2))

    def compute_fft_features(self, img: np.ndarray) -> np.ndarray:
        """Compute FFT low-frequency features for global shape.

        Args:
            img: BGR image array.

        Returns:
            L2-normalized, flattened 32x32 block of log-magnitudes taken from
            the centre of the shifted 128x128 spectrum (1024 values).
        """
        # Convert to grayscale and normalize
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = gray.astype(np.float32) / 255.0

        # Resize to square and standard size
        size = 128
        resized = cv2.resize(gray, (size, size))

        # Compute 2D FFT with the zero frequency moved to the centre
        fft_shifted = fftshift(fft2(resized))

        # Take magnitude and apply log (epsilon avoids log(0))
        log_magnitude = np.log(np.abs(fft_shifted) + 1e-8)

        # Extract central low-frequency block (32x32)
        center = size // 2
        crop_size = 16
        low_freq = log_magnitude[
            center-crop_size:center+crop_size,
            center-crop_size:center+crop_size
        ]

        # Flatten and normalize
        features = low_freq.flatten()
        return features / (np.linalg.norm(features) + 1e-8)

    def compute_fourier_mellin_signature(self, img: np.ndarray) -> np.ndarray:
        """Compute the angular (theta) signature of the FFT magnitude.

        The shifted magnitude spectrum is sampled along 64 radial lines
        (32 linearly spaced radii each) and summed per angle, so an image
        rotation appears as a circular shift of the signature.

        Args:
            img: BGR image array.

        Returns:
            L2-normalized array of 64 per-angle sums.
        """
        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = gray.astype(np.float32) / 255.0

        # Resize to square
        size = 128
        resized = cv2.resize(gray, (size, size))

        # Compute FFT and get magnitude
        magnitude = np.abs(fftshift(fft2(resized)))

        # Polar sampling grid around the spectrum centre
        center = size // 2
        theta_samples = 64
        radius_samples = 32
        thetas = np.linspace(0, 2*np.pi, theta_samples, endpoint=False)
        radii = np.linspace(1, center-1, radius_samples)

        # Pixel coordinates for every (theta, radius) pair. Radii stop at
        # center-1, so every index lies in [1, size-1] and needs no clipping.
        xs = (center + np.outer(np.cos(thetas), radii)).astype(int)
        ys = (center + np.outer(np.sin(thetas), radii)).astype(int)

        # Create theta signature by summing over radius
        theta_signature = magnitude[ys, xs].sum(axis=1, dtype=np.float64)

        # Normalize
        return theta_signature / (np.linalg.norm(theta_signature) + 1e-8)

    def compare_fourier_mellin(self, sig1: np.ndarray, sig2: np.ndarray) -> float:
        """Compare Fourier-Mellin signatures with rotation invariance.

        Computes the circular cross-correlation of the two signatures via FFT
        and returns its maximum, i.e. the best score over all cyclic shifts.
        For unit-norm signatures a pure circular shift scores 1.0.

        Returns:
            The maximum correlation, or 0.0 when the signatures are empty or
            differ in length.
        """
        sig1 = np.asarray(sig1, dtype=np.float64)
        sig2 = np.asarray(sig2, dtype=np.float64)
        n = len(sig1)
        if n == 0 or len(sig2) != n:
            return 0.0

        # No zero-padding: padding would turn this into a linear correlation
        # that loses the wrap-around part of a rotated signature.
        cross_spectrum = np.fft.rfft(sig1) * np.conj(np.fft.rfft(sig2))
        correlation = np.fft.irfft(cross_spectrum, n=n)

        # Find maximum correlation (best rotation alignment)
        return float(np.max(correlation))

    def compute_all_features(self, img: np.ndarray) -> Dict:
        """Compute all features including Fourier and keypoint-based methods.

        Returns:
            Dict with keys 'phash', 'fft_features', 'fmt_signature', 'sift'
            and 'orb'.
        """
        return {
            # Fourier-based features
            'phash': self.compute_phash(img),
            'fft_features': self.compute_fft_features(img),
            'fmt_signature': self.compute_fourier_mellin_signature(img),
            # Keypoint-based features
            'sift': self.compute_sift_features(img),
            'orb': self.compute_orb_features(img)
        }

    def compute_sift_features(self, img: np.ndarray) -> Dict:
        """Compute SIFT keypoints and descriptors for logo matching.

        Returns:
            On success: 'valid' True, 'keypoint_count', 'descriptors' and a
            256-dim 'signature' (per-dimension mean and std of the
            descriptors). When nothing is detected or detection raises:
            'valid' False with empty placeholders.
        """
        try:
            # Initialize SIFT detector
            sift = cv2.SIFT_create(nfeatures=100)  # Limit keypoints for logos

            # Detect keypoints and compute descriptors
            keypoints, descriptors = sift.detectAndCompute(img, None)

            if descriptors is None or len(descriptors) == 0:
                return self._empty_keypoint_features(256)

            # Compact global signature: mean and std of descriptors
            desc_mean = np.mean(descriptors, axis=0)
            desc_std = np.std(descriptors, axis=0)

            return {
                'valid': True,
                'keypoint_count': len(keypoints),
                'descriptors': descriptors,
                'signature': np.concatenate([desc_mean, desc_std])  # 256-dim signature
            }

        except Exception:
            # Best effort: a logo without usable keypoints is not an error
            return self._empty_keypoint_features(256)

    def compute_orb_features(self, img: np.ndarray) -> Dict:
        """Compute ORB keypoints and descriptors (faster alternative to SIFT).

        Returns:
            Same layout as ``compute_sift_features`` with a 32-dim 'signature'
            (per-byte mean of the binary descriptors).
        """
        try:
            # Initialize ORB detector
            orb = cv2.ORB_create(nfeatures=50)  # Fewer features for speed

            # Detect keypoints and compute descriptors
            keypoints, descriptors = orb.detectAndCompute(img, None)

            if descriptors is None or len(descriptors) == 0:
                return self._empty_keypoint_features(32)

            # ORB descriptors are binary, create signature differently
            desc_mean = np.mean(descriptors.astype(np.float32), axis=0)

            return {
                'valid': True,
                'keypoint_count': len(keypoints),
                'descriptors': descriptors,
                'signature': desc_mean  # 32-dim signature
            }

        except Exception:
            # Best effort: a logo without usable keypoints is not an error
            return self._empty_keypoint_features(32)

    def match_sift_features(self, desc1: np.ndarray, desc2: np.ndarray) -> float:
        """Match SIFT descriptors using FLANN matcher.

        Returns:
            Number of matches passing Lowe's ratio test (0.7) divided by the
            smaller descriptor count; 0.0 for empty input or on any matcher
            error.
        """
        try:
            if len(desc1) == 0 or len(desc2) == 0:
                return 0.0

            # FLANN parameters for SIFT
            FLANN_INDEX_KDTREE = 1
            index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
            search_params = dict(checks=50)

            flann = cv2.FlannBasedMatcher(index_params, search_params)
            matches = flann.knnMatch(desc1, desc2, k=2)

            # Apply Lowe's ratio test; pairs with fewer than two neighbours
            # cannot be tested and are skipped.
            good_matches = [
                pair[0] for pair in matches
                if len(pair) == 2 and pair[0].distance < 0.7 * pair[1].distance
            ]

            # Return ratio of good matches to total possible matches
            total_features = min(len(desc1), len(desc2))
            return len(good_matches) / max(total_features, 1)

        except Exception:
            return 0.0

    def match_orb_features(self, desc1: np.ndarray, desc2: np.ndarray) -> float:
        """Match ORB descriptors using Hamming distance.

        Returns:
            Number of cross-checked matches with distance below 50 divided by
            the smaller descriptor count; 0.0 for empty input or on any
            matcher error.
        """
        try:
            if len(desc1) == 0 or len(desc2) == 0:
                return 0.0

            # Use BFMatcher for binary descriptors (ORB)
            bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
            matches = bf.match(desc1, desc2)

            # Filter good matches based on distance
            good_matches = [m for m in matches if m.distance < 50]  # Hamming distance threshold

            # Return ratio of good matches
            total_features = min(len(desc1), len(desc2))
            return len(good_matches) / max(total_features, 1)

        except Exception:
            return 0.0

    def are_similar(self, features1: Dict, features2: Dict) -> Tuple[bool, Dict]:
        """Determine if two logos are similar using Fourier and keypoint methods.

        Args:
            features1: Feature dict as produced by ``compute_all_features``.
            features2: Feature dict for the second logo.

        Returns:
            ``(is_similar, metrics)``. ``is_similar`` is True when any of the
            pHash, FFT, Fourier-Mellin, SIFT or ORB checks passes its
            threshold. ``metrics`` holds every score and per-method flag.
        """
        # pHash comparison (Hamming distance)
        phash_distance = self.hamming_distance(features1['phash'], features2['phash'])
        phash_similar = phash_distance <= self.similarity_threshold_phash

        # FFT features comparison (cosine similarity)
        fft_similarity = self._cosine(features1['fft_features'], features2['fft_features'])
        fft_similar = fft_similarity >= self.similarity_threshold_fft

        # Fourier-Mellin comparison (rotation aligned)
        fmt_similarity = self.compare_fourier_mellin(
            features1['fmt_signature'],
            features2['fmt_signature']
        )
        fmt_similar = fmt_similarity >= self.similarity_threshold_fmt

        # SIFT keypoint matching.
        # The decision uses the descriptor match ratio only. The signature is
        # built from means/stds of non-negative descriptors, so its cosine is
        # high for almost any two logos and would pass the match-ratio
        # threshold trivially; it is reported as a separate metric instead.
        sift_similarity = 0.0
        sift_signature_similarity = 0.0
        sift_similar = False
        sift1, sift2 = features1['sift'], features2['sift']
        if sift1['valid'] and sift2['valid']:
            sift_similarity = self.match_sift_features(sift1['descriptors'], sift2['descriptors'])
            if len(sift1['signature']) > 0 and len(sift2['signature']) > 0:
                sift_signature_similarity = self._cosine(sift1['signature'], sift2['signature'])
            sift_similar = sift_similarity >= self.similarity_threshold_sift

        # ORB keypoint matching (same reasoning as for SIFT)
        orb_similarity = 0.0
        orb_signature_similarity = 0.0
        orb_similar = False
        orb1, orb2 = features1['orb'], features2['orb']
        if orb1['valid'] and orb2['valid']:
            orb_similarity = self.match_orb_features(orb1['descriptors'], orb2['descriptors'])
            if len(orb1['signature']) > 0 and len(orb2['signature']) > 0:
                orb_signature_similarity = self._cosine(orb1['signature'], orb2['signature'])
            orb_similar = orb_similarity >= self.similarity_threshold_orb

        # Enhanced combination: Fourier OR keypoint methods
        is_similar = phash_similar or fft_similar or fmt_similar or sift_similar or orb_similar

        metrics = {
            'phash_distance': phash_distance,
            'phash_similar': phash_similar,
            'fft_similarity': fft_similarity,
            'fft_similar': fft_similar,
            'fmt_similarity': fmt_similarity,
            'fmt_similar': fmt_similar,
            'sift_similarity': sift_similarity,
            'sift_signature_similarity': sift_signature_similarity,
            'sift_similar': sift_similar,
            'orb_similarity': orb_similarity,
            'orb_signature_similarity': orb_signature_similarity,
            'orb_similar': orb_similar,
            'overall_similar': is_similar
        }

        return is_similar, metrics

    def preprocess_logo(self, logo_data: bytes) -> Optional[np.ndarray]:
        """Convert logo bytes to a 128x128 BGR numpy array.

        Returns:
            The image array, or None when the bytes cannot be decoded or
            converted.
        """
        try:
            # Convert bytes to PIL Image
            image = Image.open(io.BytesIO(logo_data))

            # Convert to RGB if needed
            # NOTE(review): convert('RGB') discards any alpha channel; confirm
            # transparent logos should not be composited onto a background first.
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Resize to standard size (aspect ratio is not preserved)
            image = image.resize((128, 128), Image.Resampling.LANCZOS)

            # Convert to numpy array (RGB format), then to BGR for OpenCV
            return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        except Exception:
            # Best effort: undecodable logos are reported as None
            return None

    def analyze_logo_batch(self, logos: List[Dict]) -> List[Dict]:
        """Analyze a batch of logos and attach their features.

        Args:
            logos: Dicts that carry the raw image bytes under 'logo_data'.

        Returns:
            A shallow copy of every input dict with an added 'features' entry:
            the output of ``compute_all_features`` plus 'valid': True, or
            ``{'valid': False}`` (with 'error' when an exception was raised).
        """
        print(f"Analyzing {len(logos)} logos for Fourier features...")
        start_time = time.time()

        analyzed_logos = []
        successful_analysis = 0

        for i, logo in enumerate(logos):
            if i % 100 == 0 and i > 0:
                print(f"   Progress: {i}/{len(logos)}")

            try:
                # Preprocess logo data
                img = self.preprocess_logo(logo['logo_data'])

                if img is not None:
                    # Extract all features
                    features = self.compute_all_features(img)
                    features['valid'] = True
                    successful_analysis += 1
                else:
                    features = {'valid': False}

            except Exception as exc:
                # Record the failure so one bad logo does not stop the batch
                features = {'valid': False, 'error': str(exc)}

            # Add features to a copy so the caller's dict is left untouched
            logo_with_features = logo.copy()
            logo_with_features['features'] = features
            analyzed_logos.append(logo_with_features)

        elapsed = time.time() - start_time
        print(f"Feature analysis completed: {successful_analysis}/{len(logos)} valid in {elapsed:.1f}s")

        return analyzed_logos

# Cell-level confirmation message, printed once the cell's definitions have run.
print("Fourier Logo Analyzer implemented with batch analysis support")

In [None]:
class LogoVisualizationPipeline:
    """Create visualizations for logo analysis results.

    Results are supplied through ``load_results_from_memory``. Each
    ``create_*`` method then draws a 2x2 matplotlib figure, saves it as a PNG
    in the current directory and shows it.
    """

    def __init__(self):
        self.results_loaded = False
        self.extraction_data = None
        self.analyzed_logos = None
        self.similar_pairs = None
        self.clusters = None

    def load_results_from_memory(self, extraction_data, analyzed_logos, similar_pairs, clusters):
        """Load results from memory (for notebook use).

        Args:
            extraction_data: Dict read via its 'websites' and
                'successful_logos' entries.
            analyzed_logos: Logo dicts whose 'features' dict has a 'valid' flag.
            similar_pairs: Sequences read as (website, website, score).
            clusters: Sequences of website strings.
        """
        self.extraction_data = extraction_data
        self.analyzed_logos = analyzed_logos
        self.similar_pairs = similar_pairs
        self.clusters = clusters
        self.results_loaded = True
        print("Results loaded into visualizer")

    @staticmethod
    def _hostname(url):
        """Strip the scheme and any path from ``url``, leaving the host part."""
        return url.replace('https://', '').replace('http://', '').split('/')[0]

    @staticmethod
    def _percent(part, whole):
        """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
        return part / whole * 100 if whole else 0.0

    @staticmethod
    def _show_placeholder(ax, message, title):
        """Fill an otherwise empty axes with a centred message and a title."""
        ax.text(0.5, 0.5, message,
                ha='center', va='center', transform=ax.transAxes, fontsize=12)
        ax.set_title(title)

    def _count_valid_features(self):
        """Count analyzed logos whose features are flagged valid."""
        return sum(
            1 for logo in (self.analyzed_logos or [])
            if logo.get('features', {}).get('valid', False)
        )

    def create_extraction_performance_chart(self):
        """Create extraction performance visualization.

        Panels: success-rate pie, successes per API tier, logo file-size
        histogram and valid/invalid feature counts. Saved as
        'extraction_performance_analysis.png'.
        """
        if not self.results_loaded:
            print(" No results loaded")
            return

        # Extraction success breakdown
        successful_logos = self.extraction_data['successful_logos']
        successful = len(successful_logos)
        total = len(self.extraction_data['websites'])
        failed = total - successful

        # Create subplot layout (a single figure; no stray plt.figure() call)
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Logo Extraction Performance Analysis', fontsize=16, fontweight='bold')

        # 1. Success Rate Pie Chart (a pie of all zeros cannot be drawn)
        if total > 0:
            ax1.pie([successful, max(failed, 0)], labels=['Successful', 'Failed'],
                    colors=['#2E8B57', '#DC143C'], autopct='%1.1f%%', startangle=90)
        else:
            ax1.text(0.5, 0.5, 'No websites processed',
                     ha='center', va='center', transform=ax1.transAxes, fontsize=12)
        ax1.set_title(f'Extraction Success Rate\n({successful}/{total} websites)')

        # 2. Tier Usage (if available). Logos are dicts, so look for the key;
        # hasattr() on a dict never reports its keys.
        if any('tier_used' in logo for logo in successful_logos):
            tier_counts = defaultdict(int)
            for logo in successful_logos:
                tier = logo.get('tier_used', 'Unknown')
                tier_counts[f"Tier {tier}"] += 1

            ax2.bar(list(tier_counts.keys()), list(tier_counts.values()), color='#4682B4')
            ax2.set_title('Success by API Tier')
            ax2.set_ylabel('Number of Logos')
            plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45)
        else:
            self._show_placeholder(ax2, 'No tier information available', 'Success by API Tier')

        # 3. Logo File Sizes Distribution
        file_sizes = [
            len(logo['logo_data'])
            for logo in successful_logos
            if 'logo_data' in logo and logo['logo_data']
        ]

        if file_sizes:
            ax3.hist(file_sizes, bins=20, color='#FF6347', alpha=0.7)
            ax3.set_title('Logo File Size Distribution')
            ax3.set_xlabel('File Size (bytes)')
            ax3.set_ylabel('Count')

        # 4. Feature Analysis Success
        if self.analyzed_logos:
            valid_features = self._count_valid_features()
            invalid_features = len(self.analyzed_logos) - valid_features

            ax4.bar(['Valid Features', 'Invalid Features'], [valid_features, invalid_features],
                    color=['#32CD32', '#FF4500'])
            ax4.set_title('Feature Extraction Success')
            ax4.set_ylabel('Number of Logos')

        plt.tight_layout()
        plt.savefig('extraction_performance_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)  # Release the figure once it has been saved and shown

        print("Extraction performance chart created")

    def create_similarity_analysis_chart(self):
        """Create similarity analysis visualization.

        Panels: similarity-score histogram, cluster-size histogram, the ten
        highest-scoring pairs and a text summary. Saved as
        'similarity_analysis_visualization.png'.
        """
        if not self.results_loaded or not self.similar_pairs:
            print("No similarity data available")
            return

        # Create subplot layout (a single figure; no stray plt.figure() call)
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Logo Similarity Analysis Dashboard', fontsize=16, fontweight='bold')

        # Only pairs that actually carry a score can be plotted or ranked
        scored_pairs = [pair for pair in self.similar_pairs if len(pair) >= 3]
        similarity_scores = [pair[2] for pair in scored_pairs]

        # 1. Similarity Score Distribution
        if similarity_scores:
            mean_score = np.mean(similarity_scores)
            ax1.hist(similarity_scores, bins=20, color='#9370DB', alpha=0.7, edgecolor='black')
            ax1.axvline(mean_score, color='red', linestyle='--',
                        label=f'Mean: {mean_score:.3f}')
            ax1.set_title('Similarity Score Distribution')
            ax1.set_xlabel('Similarity Score')
            ax1.set_ylabel('Number of Pairs')
            ax1.legend()

        # 2. Cluster Size Distribution
        clusters = self.clusters or []
        cluster_sizes = [len(cluster) for cluster in clusters if len(cluster) > 1]

        if cluster_sizes:
            ax2.hist(cluster_sizes, bins=max(10, len(set(cluster_sizes))),
                     color='#20B2AA', alpha=0.7, edgecolor='black')
            ax2.set_title('Brand Cluster Size Distribution')
            ax2.set_xlabel('Cluster Size (websites)')
            ax2.set_ylabel('Number of Clusters')
        else:
            self._show_placeholder(ax2, 'No multi-website clusters found', 'Cluster Analysis')

        # 3. Top Similar Pairs
        top_pairs = sorted(scored_pairs, key=lambda pair: pair[2], reverse=True)[:10]
        pair_labels = []
        scores = []

        for pair in top_pairs:
            website1 = self._hostname(pair[0])
            website2 = self._hostname(pair[1])

            # Shorten domain names for display
            domain1 = website1.split('.')[-2] if '.' in website1 else website1
            domain2 = website2.split('.')[-2] if '.' in website2 else website2

            pair_labels.append(f"{domain1}-{domain2}")
            scores.append(pair[2])

        if scores:
            bars = ax3.barh(range(len(scores)), scores, color=plt.cm.viridis(np.linspace(0, 1, len(scores))))
            ax3.set_yticks(range(len(scores)))
            ax3.set_yticklabels(pair_labels)
            ax3.set_title('Top 10 Most Similar Logo Pairs')
            ax3.set_xlabel('Similarity Score')

            # Add value labels
            for bar, score in zip(bars, scores):
                ax3.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
                         f'{score:.3f}', va='center', fontsize=9)

        # 4. Summary Statistics. Values are computed up front: conditionals do
        # not belong in an f-string format spec, and empty inputs must not
        # divide by zero.
        total_websites = len(self.extraction_data['websites'])
        successful_extractions = len(self.extraction_data['successful_logos'])
        valid_features = self._count_valid_features()
        brand_clusters = sum(1 for c in clusters if len(c) > 1)
        largest_cluster = max((len(c) for c in clusters), default=0)
        extraction_rate = self._percent(successful_extractions, total_websites)
        feature_rate = self._percent(valid_features, successful_extractions)

        stats_text = f"""Analysis Summary:
        
• Total Websites: {total_websites}
• Successful Extractions: {successful_extractions}
• Valid Features: {valid_features}
• Similar Pairs Found: {len(self.similar_pairs)}
• Brand Clusters: {brand_clusters}
• Largest Cluster: {largest_cluster} logos

Success Rates:
• Extraction: {extraction_rate:.1f}%
• Feature Analysis: {feature_rate:.1f}%"""

        ax4.text(0.05, 0.95, stats_text, transform=ax4.transAxes, fontsize=11,
                 verticalalignment='top', bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7))
        ax4.set_xlim(0, 1)
        ax4.set_ylim(0, 1)
        ax4.axis('off')
        ax4.set_title('Pipeline Statistics')

        plt.tight_layout()
        plt.savefig('similarity_analysis_visualization.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)  # Release the figure once it has been saved and shown

        print("Similarity analysis chart created")

    def create_cluster_dashboard(self):
        """Create comprehensive cluster analysis dashboard.

        Panels: cluster-size histogram, the ten largest clusters, clustering
        counts and a text quality summary. Saved as
        'cluster_analysis_dashboard.png'.
        """
        if not self.results_loaded or not self.clusters:
            print("No cluster data available")
            return

        # Filter multi-website clusters
        multi_clusters = [cluster for cluster in self.clusters if len(cluster) > 1]

        if not multi_clusters:
            print("No multi-website clusters found")
            return

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Brand Cluster Analysis Dashboard', fontsize=16, fontweight='bold')

        cluster_sizes = [len(cluster) for cluster in multi_clusters]
        mean_cluster_size = np.mean(cluster_sizes)

        # 1. Cluster Size Distribution
        ax1.hist(cluster_sizes, bins=max(5, len(set(cluster_sizes))),
                 color='#FF6B6B', alpha=0.7, edgecolor='black')
        ax1.set_title('Cluster Size Distribution')
        ax1.set_xlabel('Cluster Size (websites)')
        ax1.set_ylabel('Number of Clusters')
        ax1.axvline(mean_cluster_size, color='blue', linestyle='--',
                    label=f'Mean: {mean_cluster_size:.1f}')
        ax1.legend()

        # 2. Top Clusters by Size
        sorted_clusters = sorted(multi_clusters, key=len, reverse=True)[:10]
        cluster_names = []
        sizes = []

        for rank, cluster in enumerate(sorted_clusters, start=1):
            # Create a representative name from the first domain
            brand_name = self._hostname(cluster[0]).split('.')[0]
            cluster_names.append(f"Cluster {rank}\n({brand_name})")
            sizes.append(len(cluster))

        if sizes:
            bars = ax2.bar(range(len(sizes)), sizes, color=plt.cm.Set3(np.linspace(0, 1, len(sizes))))
            ax2.set_xticks(range(len(sizes)))
            ax2.set_xticklabels(cluster_names, rotation=45, ha='right')
            ax2.set_title('Top 10 Largest Brand Clusters')
            ax2.set_ylabel('Number of Websites')

            # Add value labels
            for bar, size in zip(bars, sizes):
                ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                         str(size), ha='center', va='bottom', fontweight='bold')

        # 3. Clustering Efficiency Metrics
        total_websites = len(self.extraction_data['websites'])
        clustered_websites = sum(cluster_sizes)
        single_clusters = len(self.clusters) - len(multi_clusters)

        metrics = {
            'Multi-Brand Clusters': len(multi_clusters),
            'Single Logo Clusters': single_clusters,
            'Clustered Websites': clustered_websites,
            # Websites outside every multi-website cluster (websites minus
            # websites, not websites minus number of clusters)
            'Unclustered Websites': total_websites - clustered_websites
        }

        bars = ax3.bar(range(len(metrics)), list(metrics.values()),
                       color=['#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7'])
        ax3.set_xticks(range(len(metrics)))
        ax3.set_xticklabels(list(metrics.keys()), rotation=45, ha='right')
        ax3.set_title('Clustering Efficiency Metrics')
        ax3.set_ylabel('Count')

        # Add value labels
        label_offset = max(metrics.values()) * 0.01
        for bar, value in zip(bars, metrics.values()):
            ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                     str(value), ha='center', va='bottom', fontweight='bold')

        # 4. Cluster Quality Summary (rates guarded against empty inputs)
        pair_count = len(self.similar_pairs or [])
        clustering_rate = self._percent(clustered_websites, total_websites)
        discovery_rate = self._percent(len(multi_clusters), total_websites)

        quality_text = f"""Cluster Quality Assessment:

📊 Total Brand Families: {len(multi_clusters)}
🏆 Largest Brand Cluster: {max(cluster_sizes)} websites  
📈 Average Cluster Size: {mean_cluster_size:.1f} websites
📉 Smallest Brand Cluster: {min(cluster_sizes)} websites

🎯 Coverage Metrics:
• Websites in Brand Clusters: {clustered_websites:,}
• Clustering Rate: {clustering_rate:.1f}%
• Brand Discovery Rate: {discovery_rate:.2f}%

🔗 Similarity Metrics:
• Similar Pairs Found: {pair_count}
• Avg Pairs per Brand: {pair_count/len(multi_clusters):.1f}"""

        ax4.text(0.05, 0.95, quality_text, transform=ax4.transAxes, fontsize=10,
                 verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.7))
        ax4.set_xlim(0, 1)
        ax4.set_ylim(0, 1)
        ax4.axis('off')
        ax4.set_title('Quality Assessment')

        plt.tight_layout()
        plt.savefig('cluster_analysis_dashboard.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)  # Release the figure once it has been saved and shown

        print("Cluster analysis dashboard created")

    def create_all_visualizations(self):
        """Create all visualization charts.

        Any exception raised by a chart is reported on stdout and swallowed so
        the surrounding pipeline can continue without visualizations.
        """
        print("\n📊 CREATING COMPREHENSIVE VISUALIZATIONS")
        print("-" * 50)

        try:
            self.create_extraction_performance_chart()
            print()
            self.create_similarity_analysis_chart()
            print()
            self.create_cluster_dashboard()

            print(f"\nAll visualizations completed!")
            print("Charts saved as PNG files in current directory")

        except Exception as e:
            print(f"Visualization error: {e}")
            print("Continuing without visualizations...")

# Cell-level banner: confirms the class definition ran and lists what it offers.
print("\n".join([
    " LogoVisualizationPipeline class implemented!",
    "   - Extraction performance charts",
    "   - Similarity analysis dashboard",
    "   - Brand cluster analysis",
    "   - Memory-based loading for notebooks",
]))

# Enhanced Logo Similarity Analysis with SIFT/ORB

## Multi-Method Approach for Maximum Accuracy

This enhanced analysis combines **5 different similarity methods**:

### **Fourier-Based Methods (Global Shape Analysis)**
1. **pHash (DCT)** - Perceptual hashing for near-duplicate detection
2. **FFT Features** - Low-frequency global shape signatures  
3. **Fourier-Mellin** - Rotation and scale invariant matching

### **Keypoint-Based Methods (Local Feature Analysis)**
4. **SIFT Features** - Scale-Invariant Feature Transform for distinctive keypoints
5. **ORB Features** - Oriented FAST and Rotated BRIEF for fast binary matching

### **Why This Combination Works Better:**

- **Fourier methods** excel at detecting logos with similar global shapes/patterns
- **SIFT/ORB methods** excel at matching logos with distinctive keypoints (text, corners, unique shapes)
- **Complementary strengths** cover different types of logo similarities:
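  - Similar overall shapes and intensity layouts → FFT (features are computed on the grayscale image, so color itself is not compared)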
  - Rotated/scaled versions → Fourier-Mellin + SIFT
  - Different backgrounds → SIFT/ORB keypoints
  - Partial logos → ORB features
  
### **Decision Logic:**
**OR combination** - logos are similar if **ANY** method detects similarity, maximizing recall while maintaining precision through carefully tuned thresholds.

In [None]:
async def run_complete_logo_analysis_pipeline(
    sample_size=None,
    max_tier=5,
    create_visuals=True,
    parquet_path='logos.snappy.parquet',
    similarity_threshold=0.7,
):
    """
    Complete end-to-end logo analysis pipeline with all enhancements

    Runs, in order: parquet loading, logo extraction, auto-save of the
    extraction results (pickle + JSON), feature analysis, similarity search,
    Union-Find clustering, optional visualizations, and a printed summary.

    Args:
        sample_size: Number of websites to process (None for all in parquet)
        max_tier: Maximum API tier to use (1-5, higher = more coverage but slower)
        create_visuals: Whether to generate visualization charts
        parquet_path: Parquet file the website list is loaded from
        similarity_threshold: Threshold forwarded to
            ``analyzer.find_similar_pairs``

    Returns:
        dict. On a full run it holds 'websites', 'logo_results',
        'successful_logos', 'analyzed_logos', 'valid_logos', 'similar_pairs',
        'clusters', 'union_trace', 'success_rate', 'processing_time' and
        'visualizations_created'.

        Early exits return partial results instead of discarding work:
        - fewer than 2 extracted logos: the extraction-results dict only;
        - fewer than 2 logos with valid features: the extraction-results
          dict plus 'analyzed_logos' and 'valid_logos' (the same shape
          ``resume_pipeline_from_extraction`` returns in that situation).
    """
    # pickle is not among the notebook's top-level imports, so import it here.
    # os, json, time and datetime come from the notebook's import cell.
    import pickle

    print(" COMPLETE LOGO ANALYSIS PIPELINE WITH ALL ENHANCEMENTS")
    print("=" * 70)

    # CHECK FOR EXISTING SAVED RESULTS
    # Informational only: extraction still runs and overwrites the saved files.
    if os.path.exists('logo_extraction_results.pkl'):
        print("\nFOUND EXISTING EXTRACTION RESULTS!")
        print("You can resume from saved data instead of re-extracting logos.")
        print("To resume: saved_data = load_saved_extraction_results()")
        print("          results = await resume_pipeline_from_extraction(saved_data)")
        print("Continuing with fresh extraction...\n")

    total_start_time = time.time()

    # Step 1: Load Data
    print("\n DATA LOADING")
    print("-" * 30)

    df = LightningParquetProcessor.load_parquet_fast(
        parquet_path,
        sample_size=sample_size
    )

    website_col = LightningParquetProcessor.get_website_column(df)
    websites = df[website_col].dropna().tolist()

    print(f" Processing {len(websites)} websites")

    # Step 2: Enhanced Logo Extraction (targeting 97%+ success)
    print(f"\n ENHANCED LOGO EXTRACTION (Max Tier: {max_tier})")
    print("-" * 50)

    async with EnhancedAPILogoExtractor() as extractor:
        logo_results = await extractor.batch_extract_logos_enhanced(websites, max_tier=max_tier)

    successful_logos = [r for r in logo_results if r['logo_found']]
    # Guard against an empty website list (would otherwise divide by zero).
    success_rate = len(successful_logos) / len(websites) * 100 if websites else 0.0

    print(f"Logo extraction: {len(successful_logos)}/{len(websites)} ({success_rate:.1f}% success)")

    # SAVE EXTRACTION RESULTS IMMEDIATELY (prevent data loss)
    print(f"\n SAVING EXTRACTION RESULTS")
    print("-" * 40)

    # Save extraction results in multiple formats for safety
    extraction_results = {
        'timestamp': datetime.now().isoformat(),
        'websites': websites,
        'logo_results': logo_results,
        'successful_logos': successful_logos,
        'success_rate': success_rate,
        'total_processed': len(websites),
        'total_successful': len(successful_logos),
        'max_tier_used': max_tier
    }

    # Saving is best-effort: a failed write is reported but must not abort
    # the analysis of data that is already in memory.
    try:
        # Save as pickle (preserves binary logo data)
        with open('logo_extraction_results.pkl', 'wb') as f:
            pickle.dump(extraction_results, f)
        print(" Saved extraction results as pickle file")

        # Count successful extractions per API service for the JSON summary
        api_counts = {}
        for result in successful_logos:
            service = result.get('api_service', 'Unknown')
            api_counts[service] = api_counts.get(service, 0) + 1

        # Save metadata as JSON (human-readable backup)
        json_safe_results = {
            'timestamp': extraction_results['timestamp'],
            'websites': extraction_results['websites'],
            'success_rate': extraction_results['success_rate'],
            'total_processed': extraction_results['total_processed'],
            'total_successful': extraction_results['total_successful'],
            'max_tier_used': extraction_results['max_tier_used'],
            'successful_websites': [r['website'] for r in successful_logos],
            'api_breakdown': api_counts
        }

        with open('logo_extraction_metadata.json', 'w') as f:
            json.dump(json_safe_results, f, indent=2)
        print(" Saved extraction metadata as JSON file")

        print(f"📁 Extraction results safely stored in:")
        print(f"   - logo_extraction_results.pkl (complete data)")
        print(f"   - logo_extraction_metadata.json (summary)")

    except Exception as e:
        print(f" Warning: Could not save extraction results: {e}")
        print("Continuing with pipeline...")

    if len(successful_logos) < 2:
        print("\nNeed at least 2 logos for similarity analysis")
        print("💾 Extraction results saved - you can resume later!")
        return extraction_results

    # Step 3: Fourier Feature Analysis
    print(f"\n FOURIER FEATURE ANALYSIS")
    print("-" * 40)

    # Both configurations instantiate the same name; only the log line differs.
    if USE_EXTERNAL_CLASSES:
        print("Using FourierLogoAnalyzer from similarity_pipeline.py")
    else:
        print("Using notebook FourierLogoAnalyzer definition")
    analyzer = FourierLogoAnalyzer()

    analyzed_logos = analyzer.analyze_logo_batch(successful_logos)
    valid_logos = [logo for logo in analyzed_logos if logo['features']['valid']]

    print(f"Feature analysis: {len(valid_logos)}/{len(successful_logos)} logos with valid features")

    if len(valid_logos) < 2:
        print(" Need at least 2 valid logos for similarity analysis")
        # Return the partial results rather than None so the extraction and
        # analysis work is not lost to the caller.
        return {
            **extraction_results,
            'analyzed_logos': analyzed_logos,
            'valid_logos': valid_logos
        }

    # Step 4: Similarity Analysis
    print(f"\nSIMILARITY ANALYSIS")
    print("-" * 35)

    # Use existing FourierLogoAnalyzer directly (no wrapper needed!)
    similar_pairs = analyzer.find_similar_pairs(analyzed_logos, threshold=similarity_threshold)
    print(f"Similarity analysis: {len(similar_pairs)} similar pairs found")

    # Step 5: Union-Find Clustering
    print(f"\nUNION-FIND CLUSTERING")
    print("-" * 35)

    union_trace = []  # Placeholder: union steps are not recorded

    if similar_pairs:
        # Get all websites from valid logos
        all_websites = [logo['website'] for logo in valid_logos]
        # Set for O(1) membership tests while walking the pairs
        # (assumes website values are hashable, e.g. URL strings).
        known_websites = set(all_websites)

        # Use existing UnionFind class directly
        uf = UnionFind(all_websites)

        # Merge only pairs whose both ends have valid features
        for website1, website2, _similarity in similar_pairs:
            if website1 in known_websites and website2 in known_websites:
                uf.union(website1, website2)

        clusters = uf.get_clusters()

        # Filter multi-logo clusters
        multi_clusters = [cluster for cluster in clusters if len(cluster) > 1]
        print(f" Clustering: {len(multi_clusters)} brand clusters discovered")

        # Show largest clusters
        if multi_clusters:
            sorted_clusters = sorted(multi_clusters, key=len, reverse=True)[:5]
            print(" Top brand clusters:")
            for i, cluster in enumerate(sorted_clusters, 1):
                sample_domain = cluster[0].replace('https://', '').replace('http://', '').split('/')[0]
                # First dot-separated label; the whole string when there is no dot.
                brand_name = sample_domain.split('.')[0]
                print(f"   {i}. {brand_name}: {len(cluster)} similar logos")
    else:
        clusters = [[logo['website']] for logo in valid_logos]  # Each logo in its own cluster
        print(" No similar pairs found - each logo in separate cluster")

    # Step 6: Create Visualizations
    if create_visuals:
        print(f"\nVISUALIZATION GENERATION")
        print("-" * 40)

        viz_pipeline = LogoVisualizationPipeline()

        # Prepare extraction results for visualization
        extraction_data = {
            'websites': websites,
            'logo_results': logo_results,
            'successful_logos': successful_logos
        }

        # Load results into visualizer
        viz_pipeline.load_results_from_memory(
            extraction_data,
            analyzed_logos,
            similar_pairs,
            clusters
        )

        # Create all visualizations
        viz_pipeline.create_all_visualizations()

    # Step 7: Summary Report
    total_elapsed = time.time() - total_start_time

    print(f"\n PIPELINE COMPLETE!")
    print("=" * 50)
    print(f" RESULTS SUMMARY:")
    print(f"   - Websites processed: {len(websites)}")
    print(f"   - Logos extracted: {len(successful_logos)} ({success_rate:.1f}% success)")
    print(f"   - Valid features: {len(valid_logos)}")
    print(f"   - Similar pairs: {len(similar_pairs)}")
    # NOTE: this counts every cluster, including single-logo ones.
    print(f"   - Brand clusters: {len(clusters)}")
    print(f"   - Processing time: {total_elapsed:.1f} seconds")
    print(f"   - API tier used: 1-{max_tier}")

    if success_rate >= 97:
        print(f" EXCELLENT! {success_rate:.1f}% success rate achieved!")
    elif success_rate >= 90:
        print(f" GREAT! {success_rate:.1f}% success rate")
    else:
        print(f"🔧 Consider increasing max_tier for better coverage")

    # Return complete results
    return {
        'websites': websites,
        'logo_results': logo_results,
        'successful_logos': successful_logos,
        'analyzed_logos': analyzed_logos,
        'valid_logos': valid_logos,
        'similar_pairs': similar_pairs,
        'clusters': clusters,
        'union_trace': union_trace,
        'success_rate': success_rate,
        'processing_time': total_elapsed,
        'visualizations_created': create_visuals
    }

print(" Complete integrated pipeline ready!")

In [None]:
def load_saved_extraction_results(results_path='logo_extraction_results.pkl'):
    """Load previously saved extraction results to resume pipeline

    Args:
        results_path: Pickle file to load. Defaults to the file the pipeline
            writes after logo extraction.

    Returns:
        The unpickled results object, or None when the file does not exist
        or cannot be loaded (the reason is printed in either case).
    """
    # pickle is not among the notebook's top-level imports, so import it here.
    import pickle

    try:
        if not os.path.exists(results_path):
            print("No saved extraction results found")
            return None

        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load result files this notebook produced itself.
        with open(results_path, 'rb') as f:
            results = pickle.load(f)

        print("LOADED SAVED EXTRACTION RESULTS")
        print("=" * 50)
        print(f"Timestamp: {results['timestamp']}")
        print(f" Websites processed: {results['total_processed']}")
        print(f"Successful extractions: {results['total_successful']}")
        print(f"Success rate: {results['success_rate']:.1f}%")
        print(f"Max tier used: {results['max_tier_used']}")

        return results

    except Exception as e:
        # Best-effort loader: report the failure and let the caller fall
        # back to a fresh extraction.
        print(f"Error loading saved results: {e}")
        return None


async def resume_pipeline_from_extraction(saved_results, create_visuals=True, similarity_threshold=0.7):
    """Resume pipeline from saved extraction results

    Picks up after logo extraction: feature analysis, similarity search,
    Union-Find clustering and (optionally) visualizations.

    Args:
        saved_results: Dict of saved extraction results; must contain
            'websites', 'logo_results' and 'successful_logos'.
        create_visuals: Whether to generate visualization charts.
        similarity_threshold: Threshold forwarded to
            ``analyzer.find_similar_pairs``.

    Returns:
        dict. ``saved_results`` itself when fewer than 2 logos were
        extracted; ``saved_results`` plus 'analyzed_logos' and 'valid_logos'
        when fewer than 2 logos have valid features; otherwise
        ``saved_results`` extended with the analysis, similarity and
        clustering outputs and 'pipeline_completed': True.

    Raises:
        TypeError: If ``saved_results`` is None.
    """
    # The loader returns None when nothing was saved; fail with a clear
    # message instead of an opaque "NoneType is not subscriptable".
    if saved_results is None:
        raise TypeError(
            "saved_results is None - no saved extraction results to resume from"
        )

    print("\nRESUMING PIPELINE FROM SAVED EXTRACTION RESULTS")
    print("=" * 60)

    # Extract data from saved results
    websites = saved_results['websites']
    logo_results = saved_results['logo_results']
    successful_logos = saved_results['successful_logos']

    print(f"Resuming with {len(successful_logos)} successfully extracted logos...")

    if len(successful_logos) < 2:
        print(" Need at least 2 logos for similarity analysis")
        return saved_results

    # Continue with Step 3: Fourier Feature Analysis
    print(f"\n🔍 FOURIER FEATURE ANALYSIS")
    print("-" * 40)

    # Both configurations instantiate the same name; only the log line differs.
    if USE_EXTERNAL_CLASSES:
        print("Using FourierLogoAnalyzer from similarity_pipeline.py")
    else:
        print("Using notebook FourierLogoAnalyzer definition")
    analyzer = FourierLogoAnalyzer()

    analyzed_logos = analyzer.analyze_logo_batch(successful_logos)
    valid_logos = [logo for logo in analyzed_logos if logo['features']['valid']]

    print(f"Feature analysis: {len(valid_logos)}/{len(successful_logos)} logos with valid features")

    if len(valid_logos) < 2:
        print("Need at least 2 valid logos for similarity analysis")
        return {**saved_results, 'analyzed_logos': analyzed_logos, 'valid_logos': valid_logos}

    # Step 4: Similarity Analysis
    print(f"\n SIMILARITY ANALYSIS")
    print("-" * 35)

    similar_pairs = analyzer.find_similar_pairs(analyzed_logos, threshold=similarity_threshold)
    print(f"Similarity analysis: {len(similar_pairs)} similar pairs found")

    # Step 5: Union-Find Clustering
    print(f"\n UNION-FIND CLUSTERING")
    print("-" * 35)

    if similar_pairs:
        all_websites = [logo['website'] for logo in valid_logos]
        # Set for O(1) membership tests while walking the pairs
        # (assumes website values are hashable, e.g. URL strings).
        known_websites = set(all_websites)
        uf = UnionFind(all_websites)

        # Merge only pairs whose both ends have valid features
        for website1, website2, _similarity in similar_pairs:
            if website1 in known_websites and website2 in known_websites:
                uf.union(website1, website2)

        clusters = uf.get_clusters()
        multi_clusters = [cluster for cluster in clusters if len(cluster) > 1]
        print(f"Clustering: {len(multi_clusters)} brand clusters discovered")

        if multi_clusters:
            sorted_clusters = sorted(multi_clusters, key=len, reverse=True)[:5]
            print("Top brand clusters:")
            for i, cluster in enumerate(sorted_clusters, 1):
                sample_domain = cluster[0].replace('https://', '').replace('http://', '').split('/')[0]
                # First dot-separated label; the whole string when there is no dot.
                brand_name = sample_domain.split('.')[0]
                print(f"   {i}. {brand_name}: {len(cluster)} similar logos")
    else:
        # Each logo in its own cluster
        clusters = [[logo['website']] for logo in valid_logos]
        print("No similar pairs found - each logo in separate cluster")

    # Step 6: Create Visualizations
    if create_visuals:
        print(f"\n📊 VISUALIZATION GENERATION")
        print("-" * 40)

        viz_pipeline = LogoVisualizationPipeline()

        extraction_data = {
            'websites': websites,
            'logo_results': logo_results,
            'successful_logos': successful_logos
        }

        viz_pipeline.load_results_from_memory(
            extraction_data,
            analyzed_logos,
            similar_pairs,
            clusters
        )

        viz_pipeline.create_all_visualizations()

    # Complete results
    complete_results = {
        **saved_results,  # Include original extraction results
        'analyzed_logos': analyzed_logos,
        'valid_logos': valid_logos,
        'similar_pairs': similar_pairs,
        'clusters': clusters,
        'union_trace': [],
        'visualizations_created': create_visuals,
        'pipeline_completed': True
    }

    print(f"\n✅ PIPELINE RESUMED AND COMPLETED!")
    print("=" * 50)

    return complete_results


# Cell-completion banner for the save/resume helpers defined above.
print(
    "Data persistence utilities ready!",
    "   - Auto-save after logo extraction",
    "   - Resume pipeline from saved results",
    "   - Multiple backup formats (pickle + JSON)",
    sep="\n",
)

In [None]:
# 💾 EXAMPLE: how to use the recovery features.
# This cell only prints usage text; none of the snippets below are executed.
print(
    "🔄 DATA RECOVERY & RESUME EXAMPLES",
    "=" * 50,
    # Example 1: resume after a failure that happened post-extraction
    "\n1️⃣ RESUME FROM SAVED EXTRACTION:",
    "# If your pipeline failed after logo extraction, you can resume:",
    "saved_data = load_saved_extraction_results()",
    "if saved_data:",
    "    results = await resume_pipeline_from_extraction(saved_data)",
    # Example 2: detect a previous save before starting over
    "\n2️⃣ CHECK FOR EXISTING DATA:",
    "# Before running full pipeline, check if you have saved data:",
    "if os.path.exists('logo_extraction_results.pkl'):",
    "    print('Found saved extraction results!')",
    "    # Option to load and continue vs restart",
    # Example 3: a smaller, faster configuration
    "\n3️⃣ SELECTIVE PROCESSING:",
    "# You can also run just extraction with different settings:",
    "# results = await run_complete_logo_analysis_pipeline(",
    "#     sample_size=100,    # Start small",
    "#     max_tier=3,         # Use fewer tiers for speed",
    "#     create_visuals=False # Skip visuals for now",
    "# )",
    # Feature summary
    "\n🛡️ DATA SAFETY FEATURES:",
    "✅ Auto-save after logo extraction",
    "✅ Multiple backup formats (pickle + JSON)",
    "✅ Resume capability from any saved point",
    "✅ Timestamped results",
    "✅ Detailed metadata logging",
    # Files the pipeline writes
    "\n📁 FILES CREATED:",
    "- logo_extraction_results.pkl (complete data with binary logos)",
    "- logo_extraction_metadata.json (human-readable summary)",
    "\nReady to use! Run your pipeline with confidence! 🚀",
    sep="\n",
)

In [None]:
# OPTION 3: Full Production Pipeline (ALL websites from parquet)
print("\nOPTION 3: Full Production Pipeline - Process ALL websites in parquet file")
print("This will process all websites in the parquet file (may take several minutes)")
print("Uncomment the code below when ready for full production run:")

# Uncomment for full production run:
full_results = await run_complete_logo_analysis_pipeline(
    sample_size=None,     # Process ALL websites in parquet
    max_tier=5,           # Use all API tiers for maximum success rate
    create_visuals=True   # Generate comprehensive visualizations
)

print("\nPipeline configurations ready!")
print("Choose the option that fits your needs and run the cell")