In [3]:
import random
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import os

# Define categories and associated keywords for classification
# This is a sample categorization; keywords can be adjusted for better accuracy
#
# Maps a category label to the lowercase keywords that assign a name to it.
# categorize_advertiser() lowercases the name, walks this dict in insertion
# order and returns the first category that has a whole-word keyword hit, so
# the ORDER of the entries matters: when a name contains keywords from several
# categories, the category listed first wins (e.g. a name containing both
# "digital" and "marketing" is labelled "Tech & Software").
#
# NOTE(review): some keywords are generic tokens rather than industry terms --
# "llc", "inc", "co", "ltd" under "Retail & E-commerce" and "org" under
# "Education & Non-Profit". Because retail is listed first, any name that
# contains one of those suffixes as a whole word is labelled retail regardless
# of its other words. Confirm this is intended before relying on the counts.
CATEGORIES_KEYWORDS = {
    "Retail & E-commerce": ["shop", "store", "market", "retail", "boutique", "goods", "fashion", "apparel", "clothing", "style", "beauty", "cosmetics", "jewelry", "eyewear", "collection", "trading", "brands", "llc", "inc", "co", "ltd", "target", "walmart", "amazon", "etsy", "wayfair", "depot", "sephora", "macys", "kohls"],
    "Media & Entertainment": ["media", "music", "entertainment", "presents", "productions", "records", "magazine", "news", "tv", "radio", "festival", "publishing", "arts", "theatre", "events", "concerts", "live", "times", "post", "journal", "daily", "hulu", "netflix", "disney", "hbo", "spotify", "gaming", "sports"],
    "Tech & Software": ["tech", "digital", "software", "systems", "ai", "labs", "online", "data", "analytics", "cloud", "network", "solutions", "meta", "google", "microsoft", "apple", "nvidia", "sony", "logitech", "hubspot", "adobe", "instacart", "doordash", "uber", "airbnb", "canva"],
    "Business & Financial Services": ["finance", "financial", "bank", "capital", "investment", "wealth", "business", "group", "holdings", "ventures", "consulting", "services", "advisory", "insurance", "payments", "paypal", "mastercard", "visa", "amex", "agency", "marketing", "advertising", "communications", "pr"],
    "Health, Wellness & Food": ["health", "wellness", "medical", "clinic", "pharma", "nutrition", "fitness", "care", "hospital", "biotech", "food", "drink", "coffee", "restaurant", "cafe", "brewing", "catering", "foods", "beverages", "organic", "hellofresh", "optum", "cvs", "walgreens"],
    "Education & Non-Profit": ["university", "college", "academy", "school", "institute", "education", "learning", "foundation", "org", "association", "nonprofit"],
    "Travel & Hospitality": ["travel", "hotels", "resorts", "airlines", "tours", "getaways", "cruises", "booking", "expedia", "marriott", "hilton", "hyatt", "delta"]
}

# Label returned by categorize_advertiser() when no keyword matches the name.
DEFAULT_CATEGORY = "Other"

def extract_advertisers_from_html(html_content):
    """
    Collects the distinct advertiser names found in an HTML document.

    Every <div> whose class attribute is '_2ph_ _a6-p' is treated as holding
    one advertiser name; no check is made on the surrounding markup. The text
    of each such div is whitespace-stripped and empty results are discarded.

    NOTE: per the Beautiful Soup documentation, a multi-class string passed
    as ``class_`` is compared against the attribute's exact string value, so
    the two class names must appear in this order in the document.

    Args:
        html_content (str): The HTML content as a string.
    Returns:
        list: The unique, non-empty advertiser names in sorted order.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    # Lazily pull the stripped text out of every candidate div.
    candidate_texts = (
        node.get_text(strip=True)
        for node in parsed.find_all('div', class_='_2ph_ _a6-p')
    )

    # The set comprehension removes duplicates; sorted() yields the final list.
    return sorted({text for text in candidate_texts if text})

def categorize_advertiser(advertiser_name):
    """
    Assigns an advertiser to the first category whose keywords match its name.

    The name is lowercased and each category in CATEGORIES_KEYWORDS is tried
    in dict order; a keyword matches only as a whole word (regex ``\\b``
    boundaries), so "shop" does not match inside "photoshop".

    Args:
        advertiser_name (str): The name of the advertiser.
    Returns:
        str: The matching category name, or DEFAULT_CATEGORY if none match.
    """
    haystack = advertiser_name.lower()
    for label, terms in CATEGORIES_KEYWORDS.items():
        # re.escape keeps keywords literal even if they contain regex syntax.
        if any(re.search(rf'\b{re.escape(term)}\b', haystack) for term in terms):
            return label
    return DEFAULT_CATEGORY

def select_advertisers(all_advertisers_categorized, total_advertisers_list):
    """
    Draws a random sample of advertisers that covers five categories.

    Procedure:
      1. Keep only categories that hold at least 10 advertisers; at least 5
         such categories are required.
      2. Pick 5 of those categories at random and take 10 advertisers from
         each (fewer only if earlier picks already consumed some of them).
      3. Draw a target size uniformly from 50..200 and, if the sample is
         smaller, top it up at random from the not-yet-selected advertisers
         in ``total_advertisers_list``.

    The returned grouping uses the categories supplied by the caller in
    ``all_advertisers_categorized`` so that the output always agrees with the
    categorization that drove the selection. ``categorize_advertiser`` is
    consulted only for an advertiser that appears in
    ``total_advertisers_list`` but in none of the supplied category lists.

    Args:
        all_advertisers_categorized (dict): Dict with categories as keys and lists of advertisers as values.
        total_advertisers_list (list): List of all unique advertiser names.
    Returns:
        tuple: (sorted list of selected advertisers, defaultdict mapping
        category -> sorted list of selected advertisers), or (None, None)
        when fewer than 5 categories have at least 10 advertisers. In that
        case a diagnostic listing of category sizes is printed.
    """
    # 1. Filter for categories with at least 10 advertisers
    eligible_categories = {
        cat: advertisers
        for cat, advertisers in all_advertisers_categorized.items()
        if len(advertisers) >= 10
    }

    if len(eligible_categories) < 5:
        print(f"Error: Found only {len(eligible_categories)} categories with at least 10 advertisers. Need at least 5.")
        print("Eligible categories and their counts:")
        for cat, advs in eligible_categories.items():
            print(f"- {cat}: {len(advs)}")
        print("\nAll categories and their counts:")
        for cat, advs in all_advertisers_categorized.items():
            print(f"- {cat}: {len(advs)}")
        return None, None

    # Reverse lookup advertiser -> category taken from the caller's data.
    # If a name is listed under several categories the first listing wins.
    known_category = {}
    for cat, advs in all_advertisers_categorized.items():
        for adv in advs:
            known_category.setdefault(adv, cat)

    # 2. Randomly select 5 distinct categories from the eligible ones
    chosen_category_names = random.sample(list(eligible_categories), 5)

    # Maps every selected advertiser to the category it is reported under.
    # Using a dict keeps the selection unique by construction.
    category_of_selected = {}

    # 3. Select 10 advertisers from each of these 5 categories
    for category_name in chosen_category_names:
        # dict.fromkeys drops duplicates inside the list while keeping order,
        # so one draw can never return the same advertiser twice.
        available_for_selection = [
            adv
            for adv in dict.fromkeys(eligible_categories[category_name])
            if adv not in category_of_selected
        ]
        # Normally exactly 10; fewer only when names overlap across categories.
        sample_size = min(10, len(available_for_selection))
        for advertiser in random.sample(available_for_selection, sample_size):
            category_of_selected[advertiser] = category_name

    # 4. Determine the target total number of advertisers to select (between 50 and 200)
    target_sample_size = random.randint(50, 200)

    # 5. Select additional advertisers if needed
    advertisers_needed_more = target_sample_size - len(category_of_selected)
    if advertisers_needed_more > 0:
        # De-duplicate the pool so the top-up really adds that many new names.
        remaining_pool = [
            adv
            for adv in dict.fromkeys(total_advertisers_list)
            if adv not in category_of_selected
        ]
        random.shuffle(remaining_pool)  # Shuffle for random selection

        # Slicing past the end is safe: it simply yields the whole pool.
        for advertiser in remaining_pool[:advertisers_needed_more]:
            if advertiser in known_category:
                category_of_selected[advertiser] = known_category[advertiser]
            else:
                # Not categorized by the caller: fall back to keyword rules.
                category_of_selected[advertiser] = categorize_advertiser(advertiser)

    final_selected_advertisers = sorted(category_of_selected)

    # Group in sorted order so each per-category list is sorted as well.
    final_selected_by_category = defaultdict(list)
    for adv in final_selected_advertisers:
        final_selected_by_category[category_of_selected[adv]].append(adv)

    return final_selected_advertisers, final_selected_by_category


def _write_dummy_advertiser_file(html_file_path):
    """Write a synthetic advertiser HTML file to *html_file_path* for demo runs.

    The file holds five fixed sample entries followed by 100 generated ones.
    Each generated name embeds a randomly chosen keyword from
    CATEGORIES_KEYWORDS so that the keyword-based categorization has
    something to match.
    """
    dummy_parts = ["""
        <html><body>
            <section class="_a6-g"><div class="_2pi8 _2pic _a6-p"><section class="_a6-g"><div class="_2ph_ _a6-p">Advertiser Alpha (Tech)</div></section></div></section>
            <section class="_a6-g"><div class="_2pi8 _2pic _a6-p"><section class="_a6-g"><div class="_2ph_ _a6-p">Advertiser Beta (Retail)</div></section></div></section>
            <section class="_a6-g"><div class="_2pi8 _2pic _a6-p"><section class="_a6-g"><div class="_2ph_ _a6-p">Advertiser Gamma (Tech)</div></section></div></section>
            <section class="_a6-g"><div class="_2pi8 _2pic _a6-p"><section class="_a6-g"><div class="_2ph_ _a6-p">Advertiser Delta (Media)</div></section></div></section>
            <section class="_a6-g"><div class="_2pi8 _2pic _a6-p"><section class="_a6-g"><div class="_2ph_ _a6-p">Advertiser Epsilon (Retail)</div></section></div></section>
        """]
    category_names = list(CATEGORIES_KEYWORDS.keys())
    # Create 100 dummy advertisers to satisfy constraints for testing
    for i in range(100):
        cat_choice = random.choice(category_names)
        keyword_for_cat = random.choice(CATEGORIES_KEYWORDS[cat_choice]).capitalize()
        dummy_parts.append(f"""
            <section class="_a6-g">
                <div class="_2pi8 _2pic _a6-p">
                    <section class="_a6-g">
                        <div class="_2ph_ _a6-p">Dummy Advertiser {i+1} ({keyword_for_cat})</div>
                    </section>
                </div>
            </section>
            """)
    dummy_parts.append("</body></html>")
    with open(html_file_path, 'w', encoding='utf-8') as f:
        f.write("".join(dummy_parts))


def main():
    """Run the full pipeline: load the HTML export, categorize, sample, report.

    Reads the advertiser HTML file from the current working directory. If the
    file is missing, a synthetic one is written in its place first so the
    script can still be demonstrated. All results are printed to stdout;
    nothing is returned.
    """
    # --- Configuration ---
    # User provided file name: advertisers_who_uploaded_a_contact_list_with_your_information.html
    # Actual file name from fetched content that matches this description:
    html_file_path = 'advertisers_using_your_activity_or_information.html'

    # Check if the HTML file exists
    if not os.path.exists(html_file_path):
        print(f"Error: The file '{html_file_path}' was not found.")
        print("Please make sure the HTML file is in the same directory as the script, or provide the correct path.")
        # Attempt to create a dummy file for demonstration if it doesn't exist
        print(f"Creating a dummy '{html_file_path}' for demonstration purposes as it was not found.")
        _write_dummy_advertiser_file(html_file_path)
        print(f"A dummy file '{html_file_path}' has been created with sample data. Please replace it with your actual file.")

    # --- Script Execution ---
    try:
        with open(html_file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except (OSError, UnicodeDecodeError) as e:
        # Only I/O and decoding failures are expected here; anything else is
        # a programming error and should surface with its traceback.
        print(f"Error reading file {html_file_path}: {e}")
        return

    # 1. Extract and clean advertisers
    all_unique_advertisers = extract_advertisers_from_html(html_content)
    if not all_unique_advertisers:
        print("No advertisers found in the HTML file.")
        return
    print(f"Found {len(all_unique_advertisers)} unique advertisers in total.")

    # 2. Categorize all advertisers
    all_categorized_advertisers = defaultdict(list)
    for advertiser_name in all_unique_advertisers:
        all_categorized_advertisers[categorize_advertiser(advertiser_name)].append(advertiser_name)

    print("\n--- Advertiser Categorization Summary (All Advertisers) ---")
    for category, advertisers in sorted(all_categorized_advertisers.items()):
        print(f"- {category}: {len(advertisers)} advertisers")

    # 3. Perform the selection
    selected_advertisers, selected_by_category = select_advertisers(all_categorized_advertisers, all_unique_advertisers)

    if not selected_advertisers:
        print("\nCould not select advertisers based on the criteria.")
        return

    # 4. Output results
    # Reuse the grouping returned by select_advertisers instead of re-running
    # the keyword matcher for every printed row.
    category_of_selected = {
        advertiser: category
        for category, advertisers in selected_by_category.items()
        for advertiser in advertisers
    }
    print(f"\n--- Selected Advertisers ({len(selected_advertisers)} total) ---")
    for position, advertiser in enumerate(selected_advertisers, start=1):
        print(f"{position}. {advertiser} (Category: {category_of_selected[advertiser]})")

    print("\n--- Summary of Selected Advertisers by Category ---")
    total_selected_count = 0
    for category, advertisers in sorted(selected_by_category.items()):
        count = len(advertisers)
        print(f"- {category}: {count} advertisers")
        total_selected_count += count

    # This should match len(selected_advertisers)
    print(f"Total verified selected: {total_selected_count}")

    # all_unique_advertisers is non-empty here (checked above), so the
    # division is safe without a zero guard.
    percentage_selected = (len(selected_advertisers) / len(all_unique_advertisers)) * 100
    print(f"\nThe selected advertisers represent {percentage_selected:.2f}% of the total unique advertisers.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Found 1824 unique advertisers in total.

--- Advertiser Categorization Summary (All Advertisers) ---
- Business & Financial Services: 77 advertisers
- Education & Non-Profit: 19 advertisers
- Health, Wellness & Food: 43 advertisers
- Media & Entertainment: 162 advertisers
- Other: 1334 advertisers
- Retail & E-commerce: 121 advertisers
- Tech & Software: 58 advertisers
- Travel & Hospitality: 10 advertisers

--- Selected Advertisers (87 total) ---
1. 999 CLUB (Category: Other)
2. AMSOIL INC. (Category: Retail & E-commerce)
3. AdParlor (Category: Other)
4. Adobe (Category: Tech & Software)
5. Air New Zealand (Category: Other)
6. Allegiance Fundraising Group - Sarasota Division (Category: Business & Financial Services)
7. Allied Global Marketing (Category: Business & Financial Services)
8. Argo Prep (Category: Other)
9. BOL7 Digital Marketing Agency (Category: Tech & Software)
10. Bayer Aspirin (Category: Other)
11. Beiersdorf Inc (Category: Retail & E-commerce)
12. Big Agnes (Category: 