<div style="background-color: rgb(28, 97, 154); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    1. Main Code
</div>

- It can be used for most of the banks except for ING, Argenta, KBC and BNP Paribas Fortis.

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import time
import json
import os
import re
import google.generativeai as genai
from tqdm import tqdm
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# ---------------------- CONFIG ----------------------
genai.configure(api_key="AIzaSyBzOT2O03scMENbdWouWexYa10v4K4OVPE")

# ---------------------- STEP 1: Load Data ----------------------
def load_file(file_path):
    if file_path.endswith(".csv"):
        df = pd.read_csv(file_path)
    elif file_path.endswith(".xlsx"):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format")
    return df

# ---------------------- STEP 2: Typology Classification ----------------------needs to updated
def classify_url(url):
    url = url.lower()
    if any(term in url for term in ["product", "producten", "produits",  # general
                                    "lenen", "loan", "pret",  # loans
                                    "sparen", "saving", "epargne",  # savings
                                    "rekening", "account", "compte",  # accounts
                                    "beleggen", "investment", "investir",  # investments
                                    "hypotheek", "mortgage", "hypothecaire",  # mortgage
                                    "verzekering", "insurance", "assurance",  # insurance
                                    "kaart", "card", "carte",  # cards
                                    "bankieren", "banking", "banque"]):
        return "Product"
    elif any(term in url for term in ["faq", "support", "help", "hulp", "ondersteuning", "aide", "questions", 
                                       "klantenservice", "clientservice", "contactcenter", "assistance"]):
        return "FAQ"
    elif any(term in url for term in ["legal", "juridisch", "juridique", "voorwaarden", "terms", 
                                      "conditions", "privacy", "beleid", "policy", "cookie", 
                                      "gdpr", "compliance", "disclaimer", "protection", "gegevensbescherming"]):
        return "Legal"
    elif any(term in url for term in ["contact", "locatie", "location", "agences", "branches", "agents", 
                                      "kantoren", "bureaux", "afspraak", "appointment", "form", 
                                      "formulier", "trouver", "bereikbaarheid"]):
        return "Contact"
    elif "blog" in url:
        return "Blog"
    else:
        return "Other"

# ---------------------- STEP 3: Stratified Sampling ----------------------
def stratified_sample(df):
    df["Page Type"] = df["Address"].apply(classify_url)
    sample = []
    for typ in ["Product", "FAQ", "Legal", "Contact"]:
        group = df[df["Page Type"] == typ]
        sample.extend(group.sample(min(3, len(group)), random_state=42).to_dict("records"))
    remaining = 30 - len(sample)
    other = df[df["Page Type"] == "Other"]
    sample.extend(other.sample(min(remaining, len(other)), random_state=42).to_dict("records"))
    return pd.DataFrame(sample)

# ---------------------- STEP 4: Extract Page Text ----------------------
def extract_clean_text(url):
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        for tag in soup(['script', 'style', 'nav', 'footer']):
            tag.decompose()
        text = soup.get_text(separator=' ', strip=True)
        return ' '.join(text.split())[:10000]  # Truncate to token-safe length
    except:
        return ""

def extract_clean_text_with_selenium(url):
    try:
        options = Options()
        options.headless = True
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        driver = webdriver.Chrome(options=options)
        driver.get(url)

        # Wait for full page load (you can add WebDriverWait here for better reliability)
        driver.implicitly_wait(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        driver.quit()

        # Remove unwanted tags
        for tag in soup(['script', 'style', 'nav', 'footer']):
            tag.decompose()

        text = soup.get_text(separator=' ', strip=True)
        return ' '.join(text.split())[:10000]

    except Exception as e:
        print(f"‚ùå Error with Selenium extraction: {e}")
        return ""
    

def clean_extracted_text(raw_text):
    """
    Universal text cleaning function for Belgian banks (KBC, BNP Paribas, Belfius, ING)
    Handles multi-language content (NL/FR/EN) and bank-specific patterns
    """
    if not raw_text or len(raw_text.strip()) < 20:
        return ""
    
    # Universal navigation and header patterns
    navigation_patterns = [
        # Skip to content links
        r'Skip to .*?Log in',
        r'Retour au .*?Se connecter',
        r'Terug naar de inhoud',
        r'Overslaan en naar de inhoud gaan',
        r'Aller au contenu principal',
        r'Skip to main content',
        
        # Language switchers
        r'FR\s+NL\s+EN',
        r'Nederlands\s+Fran√ßais\s+English',
        r'NL\s+FR\s+DE',
        r'\bFR\s+NL\b',
        r'\bFran√ßais\b\s*\bNederlands\b',
        
        # Main navigation menus
        r'Home.*?Contact.*?Login',
        r'Accueil.*?Contact.*?Connexion',
        r'Thuis.*?Contact.*?Inloggen',
        r'Menu\s+Sluiten',
        r'Menu\s+Fermer',
        r'Close\s+Menu',
        
        # Search functionality
        r'Zoeken \(Optioneel\).*?Contact',
        r'Rechercher \(En option\).*?Contact',
        r'Search \(Optional\).*?Contact',
        r'Zoeken.*?Zoek',
        r'Rechercher.*?Recherche',
        r'Search.*?Search',
        
        # Bank-specific navigation
        # KBC specific
        r'KBC.*?Inloggen',
        r'KBC.*?Se connecter',
        r'Online Banking.*?KBC',
        
        # BNP Paribas specific
        r'BNP Paribas Fortis.*?Inloggen',
        r'BNP Paribas Fortis.*?Se connecter',
        r'Word klant.*?Beobank Online',
        r'Devenir client.*?Beobank Online',
        
        # Belfius specific
        r'Belfius.*?Inloggen',
        r'Belfius.*?Se connecter',
        r'Belfius Direct Net',
        
        # ING specific
        r'ING.*?Inloggen',
        r'ING.*?Se connecter',
        r'Mijn ING.*?Inloggen',
    ]
    
    # Cookie and privacy notices (comprehensive)
    cookie_patterns = [
        # Cookie acceptance
        r'Accept all cookies.*?Manage cookies',
        r'Accepter tous les cookies.*?G√©rer les cookies',
        r'Alle cookies accepteren.*?Cookies beheren',
        r'Deze website gebruikt cookies.*?Alles accepteren',
        r'Ce site utilise des cookies.*?Tout accepter',
        r'This website uses cookies.*?Accept all',
        
        # Cookie management sections
        r'Cookie settings.*?Save preferences',
        r'Param√®tres des cookies.*?Sauvegarder',
        r'Cookie-instellingen.*?Voorkeuren opslaan',
        r'Mijn cookies beheren.*?Alles accepteren',
        r'G√©rer mes cookies.*?Tout accepter',
        r'Manage my cookies.*?Accept all',
        
        # Cookie descriptions
        r'Functionele cookies.*?verbeteren\.',
        r'Les cookies fonctionnels.*?par des tiers\.',
        r'Functional cookies.*?third parties\.',
        r'Analytische cookies.*?voorkeuren zijn\.',
        r'Les cookies de mesure.*?leurs pr√©f√©rences\.',
        r'Analytics cookies.*?their preferences\.',
        r'Marketing cookies.*?te tonen\.',
        r'Les cookies publicitaires.*?pertinentes\.',
        r'Marketing cookies.*?relevant\.',
        
        # Privacy policy links
        r'Privacy policy.*?Terms',
        r'Politique de confidentialit√©.*?Conditions',
        r'Privacybeleid.*?Voorwaarden',
    ]
    
    # Technical and browser notices
    technical_patterns = [
        r'Voor een betere surfervaring.*?Chrome',
        r'Pour une meilleure exp√©rience.*?Chrome\.',
        r'For a better browsing experience.*?Chrome',
        r'Adblock detection:.*?Sluiten',
        r'Adblock detection:.*?Fermer',
        r'Adblock detection:.*?Close',
        r'You have not yet given permission.*?Load video',
        r'JavaScript is disabled.*?Enable JavaScript',
        r'Loading\.\.\.',
        r'Laden\.\.\.',
        r'Chargement\.\.\.',
    ]
    
    # Footer and related content
    footer_patterns = [
        # Related articles
        r'Other articles that might interest you.*',
        r'Autres articles qui pourraient vous int√©resser.*',
        r'Andere artikels die u kunnen interesseren.*',
        r'Gerelateerde concepten.*?Lees meer',
        r'Termes li√©s.*?Lire la suite',
        r'Related terms.*?Read more',
        
        # Newsletter and blog subscriptions
        r'Ontdek de.*?blog.*?Fran√ßais',
        r'D√©couvrir le blog.*?Nederlands',
        r'Discover the.*?blog.*?Dutch',
        r'Schrijf u in op onze nieuwsbrief.*?Inschrijven',
        r'Inscrivez-vous √† notre newsletter.*?S\'inscrire',
        r'Subscribe to our newsletter.*?Subscribe',
        
        # Legal and compliance
        r'Terms and conditions.*?Privacy',
        r'Termes et conditions.*?Confidentialit√©',
        r'Algemene voorwaarden.*?Privacy',
        r'Disclaimer.*?Copyright',
        r'Avertissement.*?Droits d\'auteur',
        r'Vrijwaring.*?Auteursrecht',
        
        # Copyright notices
        r'¬©.*?\d{4}.*?(KBC|BNP|Belfius|ING)',
        r'Alle rechten voorbehouden',
        r'Tous droits r√©serv√©s',
        r'All rights reserved',
    ]
    
    # Banking-specific call-to-action patterns
    banking_cta_patterns = [
        # Appointment booking
        r'Maak een afspraak!.*?',
        r'Prenez rendez-vous.*?',
        r'Make an appointment.*?',
        r'Boek een gesprek.*?',
        r'R√©servez un entretien.*?',
        
        # Investment advice
        r'Ontdek ons advies.*?',
        r'Laissez-vous conseiller.*?',
        r'Discover our advice.*?',
        r'Klaar om te beleggen\?.*?Maak een afspraak!',
        r'Pr√™t\(e\) √† investir\?.*?Prenez rendez-vous',
        r'Ready to invest\?.*?Make an appointment',
        
        # Product promotions
        r'Ontdek onze.*?producten',
        r'D√©couvrez nos.*?produits',
        r'Discover our.*?products',
        r'Meer informatie.*?aanvragen',
        r'Plus d\'informations.*?demander',
        r'More information.*?request',
    ]
    
    # Social media and sharing
    social_patterns = [
        r'Share on.*?Facebook',
        r'Partager sur.*?Facebook',
        r'Delen op.*?Facebook',
        r'Tweet.*?Twitter',
        r'Tweeter.*?Twitter',
        r'LinkedIn.*?delen',
        r'LinkedIn.*?partager',
        r'LinkedIn.*?share',
        r'WhatsApp.*?delen',
        r'WhatsApp.*?partager',
        r'WhatsApp.*?share',
        r'E-mail.*?versturen',
        r'E-mail.*?envoyer',
        r'E-mail.*?send',
        r'Print this page',
        r'Imprimez cette page',
        r'Print deze pagina',
        r'Download PDF',
        r'T√©l√©charger PDF',
        r'PDF downloaden',
    ]
    
    # Breadcrumb and metadata
    metadata_patterns = [
        r'Home\s*‚Ä∫.*?‚Ä∫',
        r'Accueil\s*‚Ä∫.*?‚Ä∫',
        r'Thuis\s*‚Ä∫.*?‚Ä∫',
        r'Last updated:.*?\d{4}',
        r'Derni√®re mise √† jour:.*?\d{4}',
        r'Laatst bijgewerkt:.*?\d{4}',
        r'Posted on.*?\d{4}',
        r'Publi√© le.*?\d{4}',
        r'Geplaatst op.*?\d{4}',
        r'Tags:.*?(?=\n|\.|$)',
        r'√âtiquettes:.*?(?=\n|\.|$)',
        r'Labels:.*?(?=\n|\.|$)',
        r'\d+\s+min read',
        r'\d+\s+min de lecture',
        r'\d+\s+min lezen',
    ]
    
    # Combine all patterns
    all_patterns = (navigation_patterns + cookie_patterns + technical_patterns + 
                   footer_patterns + banking_cta_patterns + social_patterns + metadata_patterns)
    
    # Apply cleaning
    cleaned = raw_text
    for pattern in all_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE | re.DOTALL)
    
    # Remove repeated "Read more" links in all languages
    read_more_patterns = [
        r'Lees meer\s*',
        r'Lire la suite\s*',
        r'Read more\s*',
        r'Meer lezen\s*',
        r'En savoir plus\s*',
        r'Learn more\s*'
    ]
    
    for pattern in read_more_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    
    # Remove investment sidebar content
    cleaned = re.sub(r'Beleggen in \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Investir dans \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Investing in \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    
    # Remove standalone navigation and form words
    standalone_words = [
        'Contact', 'Zoeken', 'Rechercher', 'Search',
        'Email adres', 'Adresse email', 'Email address',
        'Inschrijven', 'S\'inscrire', 'Subscribe',
        'Versturen', 'Envoyer', 'Send',
        'Annuleren', 'Annuler', 'Cancel',
        'Bevestigen', 'Confirmer', 'Confirm'
    ]
    
    for word in standalone_words:
        cleaned = re.sub(rf'\b{re.escape(word)}\b', '', cleaned, flags=re.IGNORECASE)
    
    # Clean up formatting issues
    # Remove excessive punctuation
    cleaned = re.sub(r'[.]{2,}', '.', cleaned)
    cleaned = re.sub(r'[-]{3,}', '', cleaned)
    cleaned = re.sub(r'[_]{3,}', '', cleaned)
    cleaned = re.sub(r'(\b\w+\b)(\s+\1){2,}', r'\1', cleaned)  # Remove repeated words
    
    # Fix spacing around punctuation
    cleaned = re.sub(r'\s+([.,!?;:])', r'\1', cleaned)
    cleaned = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1 \2', cleaned)
    
    # Remove empty brackets and parentheses
    cleaned = re.sub(r'\(\s*\)', '', cleaned)
    cleaned = re.sub(r'\[\s*\]', '', cleaned)
    cleaned = re.sub(r'\{\s*\}', '', cleaned)
    
    # Normalize whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'\n\s*\n', '\n\n', cleaned)
    
    return cleaned.strip()


# ---------------------- STEP 5: Score with Gemini ----------------------
def score_page_with_gemini(text, page_type):
    prompt = f"""
**Context:** This prompt is designed for the Gemini language model to evaluate the CEFR B2 level compliance of webpage content from retail banking websites for regulatory compliance. The evaluation focuses on vocabulary, grammar, clarity, and coherence to determine if the text is easily understandable for someone at a B2 level in English, French, or Dutch. The desired output includes the compliance level percentage and individual scores for vocabulary complexity, grammatical structures, overall clarity, and coherence, with a detailed rationale for each evaluated address presented in a single cell of an output file (e.g., CSV or Excel). The goal is to ensure the evaluation effectively differentiates between webpages with varying levels of B2 compliance, leading to a wider range of scores, and that the rationale is comprehensive yet concise enough to fit within a single cell per address. **It is important to consider that these are banking websites, and some technical or financial terms may be inherent to the content.**

**Task:** Assess the CEFR B2 compliance level of the provided webpage content, ensuring a variable range of scores and a detailed, single-cell rationale for each evaluated address, **while acknowledging the potential presence of necessary banking terminology.**

**Instructions:**

1. **Identify Language:** Determine if the input text is in English, French, or Dutch.

2. **Evaluate B2 Compliance with Granularity (Considering Banking Terms):** Analyze the text against the CEFR B2 criteria for the identified language, critically and precisely assessing the following aspects on a scale of 0 to 10. Avoid assigning only 0 or 10; use the full scale based on nuance and subtlety. ‚ÄúDo not hesitate to assign low (0‚Äì4) or high (8‚Äì10) scores when the text clearly deserves it. Avoid accumulating around 6‚Äì7 unless the text is truly average.‚Äù Remember the compliancy threshold is 70% (7/10) for B2 level. Therefore if a text is generally compliant it should receive a total score of higher than or equal to 70.

- **Vocabulary Complexity (0‚Äì10)**
  - 10 ‚Üí very simple, common words, basic banking terms, no jargon
  - 7‚Äì9 ‚Üí mostly common words, occasional technical terms explained
  - 4‚Äì6 ‚Üí mix of general and technical terms, some unnecessarily complex or rare words
  - 1‚Äì3 ‚Üí frequent use of complex, low-frequency words or jargon, often unexplained
  - 0 ‚Üí highly complex, dense language with rare or unexplained terms everywhere

- **Grammatical Structures (0‚Äì10)**
  - 10 ‚Üí simple sentences, clear structure, active voice, no complex clauses
  - 7‚Äì9 ‚Üí mostly simple, some moderate clauses, minor passive use
  - 4‚Äì6 ‚Üí mix of simple and complex sentences, occasional embedded or passive forms
  - 1‚Äì3 ‚Üí mostly long, embedded, or passive structures, hard to follow
  - 0 ‚Üí extremely complex grammar, frequent embedding, difficult to parse

- **Overall Clarity (0‚Äì10)**
  - 10 ‚Üí very clear, easy to understand, minimal effort required
  - 7‚Äì9 ‚Üí mostly clear, small moments of complexity
  - 4‚Äì6 ‚Üí mixed clarity, occasional confusion or ambiguity
  - 1‚Äì3 ‚Üí often unclear, requires effort to interpret
  - 0 ‚Üí very unclear, confusing, hard to follow

- **Coherence (0‚Äì10)**
  - 10 ‚Üí logical flow, clear organization, excellent connectors
  - 7‚Äì9 ‚Üí mostly logical, some jumps, minor missing links
  - 4‚Äì6 ‚Üí mixed coherence, weak transitions, partial disorganization
  - 1‚Äì3 ‚Üí often disorganized, unclear connections
  - 0 ‚Üí no logical order, chaotic, fragmented

3. **Provide Detailed Rationale (Single Cell):** Explain the reasoning behind each of the four scores within a single text string suitable for one Excel cell. Explicitly point out specific linguistic features (vocabulary, grammar, discourse markers) that contribute to the assigned level of complexity or simplicity for each criterion. When discussing vocabulary, specifically comment on the presence and handling of banking terminology. Justify why the text is or is not strictly at the B2 level for each aspect. Use clear separators (e.g., "; ") between the rationale for each criterion to ensure readability within the single cell.
```xml
<rationale>Vocabulary: [Explanation with examples, noting banking terms]; Grammar: [Explanation with examples]; Clarity: [Explanation with examples, considering banking terms]; Coherence: [Explanation with examples]</rationale>

**Output Format:**
Return the evaluation in the following XML format, ensuring all information for a single evaluated webpage address can be represented as a single row in an output file:
```xml
<vocabulary_complexity>Y</vocabulary_complexity>
<grammatical_structures>Z</grammatical_structures>
<overall_clarity>W</overall_clarity>
<coherence>V</coherence>
<rationale>Vocabulary: [Explanation with examples, noting banking terms], Grammar: [Explanation with examples], Clarity: [Explanation with examples, considering banking terms], Coherence: [Explanation with examples]</rationale>

Examples of B2 Compliant Texts and C1 Texts Which Are Not B2 Compliant
 To help you understand the evaluation criteria, here are some examples of texts rated at B2 and C1 levels:

English
B2 level text
 Source: LinguaPress Unsolved mysteries ‚Äì a short story by Sarah Wollbach
 Megan‚Äôs acting career began one morning a couple of years ago, when a woman approached her in the parking lot of her neighborhood grocery store. ‚ÄúExcuse me,‚Äù she said, ‚Äúbut have you ever taken acting lessons?‚Äù ‚Äî ‚ÄúNo,‚Äù she answered hesitantly. 
 The woman reached into her pocket and handed Megan a card. ‚ÄúI‚Äôm a casting director for Unsolved Mysteries,‚Äù she said, shaking her hand. Megan had always been stage-struck. 
 For years she'd fantasized about being an actor, sure that deep within her lurked a brilliant chameleon like Meryl Streep or Julia Roberts. Maybe this was her big break. 
 ‚ÄúThe show‚Äôs doing a feature about a woman who was kidnapped,‚Äù the lady continued, ‚Äúand you look exactly like her. The resemblance is amazing. Would you be interested in auditioning?‚Äù 
 The episode aired the next week, with a couple of thousand dollars for two days‚Äô work, plus travel, lodging, and food expenses.


C1 level text
 Source: LinguaPress The Enigma of the Missing Manuscript by John Doe
 The mystery of the missing manuscript has eluded generations of writers. It was said to contain the final, unpublished works and annotations of the author, whose sudden disappearance 
 had only added to the intrigue. The manuscript was believed to be hidden somewhere in the old mansion, a labyrinthine structure filled with secret passages and hidden rooms. 
 Many had tried to find it, but all had failed. The clues were cryptic, the dangers real, and the stakes high. For those who dared to search, it was a journey into the unknown, a test of wit and courage.



French
B2 level text
 Source: LinguaPress Myst√®res non r√©solus ‚Äì une histoire courte par Sarah Wollbach
 La carri√®re d‚Äôactrice de Megan a commenc√© un matin il y a quelques ann√©es, lorsqu‚Äôune femme l‚Äôa abord√©e dans le parking de son √©picerie de quartier. 
 ‚ÄúExcusez-moi,‚Äù dit-elle, ‚Äúmais avez-vous d√©j√† pris des cours de th√©√¢tre?‚Äù ‚Äî ‚ÄúNon,‚Äù r√©pondit-elle avec h√©sitation. La femme a fouill√© dans sa poche et tendu une carte √† Megan. 
 ‚ÄúJe suis directrice de casting de Myst√®res non r√©solus,‚Äù dit-elle en lui serrant la main. Megan avait toujours √©t√© fascin√©e par la sc√®ne. 
 Pendant des ann√©es, elle avait nourri en secret le r√™ve d‚Äô√™tre actrice, convaincue qu‚Äôau fond d‚Äôelle-m√™me se cachait un brillant cam√©l√©on comme Meryl Streep ou Julia Roberts. 
 Peut-√™tre que c‚Äô√©tait sa grande chance. ‚ÄúL‚Äô√©mission fait un reportage sur une femme qui a √©t√© kidnapp√©e,‚Äù continua la dame, ‚Äúet vous lui ressemblez exactement. 
 La ressemblance est incroyable. Seriez-vous int√©ress√©e par une audition?‚Äù Elle expliqua que le r√¥le valait quelques milliers de dollars pour deux jours de travail, plus les frais de voyage, de logement et de nourriture.


C1 level text
 Source: LinguaPress L‚ÄôEnigme du Manuscrit Disparu par Jean Dupont
 Le myst√®re du manuscrit disparu que tout le monde tentait de percer depuis des d√©cennies. On disait qu‚Äôil contenait les derni√®res ≈ìuvres finales, non publi√©es, d‚Äôun auteur renomm√©, 
 dont la disparition soudaine n‚Äôavait fait qu‚Äôajouter √† l‚Äôintrigue. On croyait que le manuscrit √©tait cach√© quelque part dans le vieux manoir, une structure labyrinthique remplie de passages secrets et de pi√®ces cach√©es. 
 Beaucoup avaient essay√© de le trouver, mais tous avaient √©chou√©. Les indices √©taient cryptiques, les dangers r√©els, et les enjeux √©lev√©s. Pour ceux qui osaient chercher, c‚Äô√©tait un voyage dans l‚Äôinconnu, un test d‚Äôesprit et de courage.



Dutch
B2 level text
 Source: LinguaPress Opgeloste mysteries ‚Äì een kortverhaal door Sarah Wollbach
 Megan‚Äôs acteercarri√®re begon op een ochtend een paar jaar geleden, toen een vrouw haar benaderde op de parkeerplaats van haar buurtwinkel. ‚ÄúExcuseer me,‚Äù zei ze, ‚Äúmaar heb je ooit acteerlessen gevolgd?‚Äù ‚Äî ‚ÄúNee,‚Äù antwoordde ze aarzelend. 
 De vrouw stak haar hand in haar zak en gaf Megan een kaartje. ‚ÄúIk ben een castingdirecteur voor Opgeloste mysteries,‚Äù zei ze, terwijl ze haar hand schudde. Megan was altijd al gefascineerd door het toneel. 
 Jarenlang had ze gefantaseerd over het zijn van een actrice, ervan overtuigd dat diep vanbinnen een briljante actrice zoals Meryl Streep of Julia Roberts schuilde. Misschien was dit haar grote doorbraak. 
 ‚ÄúDe show doet een reportage over een vrouw die ontvoerd is,‚Äù vervolgde de dame, ‚Äúen je lijkt precies op haar. De gelijkenis is verbazingwekkend. Zou je ge√Ønteresseerd zijn in een auditie?‚Äù 
 Ze zette uit dat dit alles een paar duizend dollar waard was voor twee dagen werk, plus reis-, verblijf- en voedselkosten.


C1 level text
 Source: LinguaPress Het Raadsel van het Verdwenen Manuscript door Jan Jansen
 Het mysterie van het verdwenen manuscript dat generaties schrijvers decennialang verbijsterd. Er werd gezegd dat het de laatste, ongepubliceerde werken van een beroemde auteur bevatte, wiens plotselinge verdwijning alleen maar bijdroeg aan de intrige. 
 Het gerucht deed de ronde dat het manuscript ergens in het oude herenhuis verborgen was, een labyrintische structuur vol geheime gangen en verborgen kamers. Velen hadden geprobeerd het te vinden, maar allemaal waren ze mislukt. 
 De aanwijzingen waren cryptisch, de gevaren echt, en de inzet hoog. Voor degenen die durfden te zoeken, was het een reis in het onbekende, een test van verstand en moed.


Input Text content to check: \"\"\"{text}\"\"\" 
"""
    try:

        model = genai.GenerativeModel("gemini-2.5-flash")
        response = model.generate_content(
            prompt,
            generation_config={"temperature": 0.2}
        )
        output = response.text.strip()

        if output.startswith("```"):
            output = output.strip("` \n").replace("xml", "").strip()

        # Extract scores using regex from the XML
        scores = {
            "vocabulary_complexity": extract_xml_score(output, "vocabulary_complexity"),
            "grammatical_structures": extract_xml_score(output, "grammatical_structures"),
            "overall_clarity": extract_xml_score(output, "overall_clarity"),
            "coherence": extract_xml_score(output, "coherence"),
            "rationale": extract_xml_rationale(output),
        }

        return scores

    except Exception as e:
        print(f"‚ùå Error scoring page with Gemini: {e}")
        return {
            "compliance_level": 0,
            "vocabulary_complexity": 0,
            "grammatical_structures": 0,
            "overall_clarity": 0,
            "coherence": 0,
            "rationale": "Error occurred during evaluation."
        }

def extract_xml_score(xml_text, tag):
    match = re.search(fr"<{tag}>(\d+)</{tag}>", xml_text)
    return int(match.group(1)) if match else 0

def extract_xml_rationale(xml_text):
    match = re.search(r"<rationale>(.*?)</rationale>", xml_text, re.DOTALL)
    return match.group(1).strip() if match else "No rationale found."

# ---------------------- STEP 6: Evaluation + Warning ----------------------

def evaluate_accessibility(df, file_path):
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    if base_name.endswith("_urls"):
        base_name = base_name.replace("_urls", "")
    scores = []
    checkpoint_file = f"{base_name}_b2_accessibility_checkpoint.xlsx"
    log_file = f"{base_name}_b2_accessibility_log.txt"

    # Check if log exists ‚Üí skip already processed URLs
    processed_urls = set()
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            processed_urls = set(line.strip() for line in f.readlines())

    # Resume from checkpoint if exists
    if os.path.exists(checkpoint_file):
        scores_df = pd.read_excel(checkpoint_file)
        scores = scores_df.to_dict(orient='records')
    else:
        scores_df = pd.DataFrame()

    CHECKPOINT_EVERY = 20

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring pages"):
        url = row['Address']

        if url in processed_urls:
            continue

        print(f"Scoring: {url}")
        try:
            text = extract_clean_text_with_selenium(row['Address'])
            cleaned_text = clean_extracted_text(text)
            result = score_page_with_gemini(cleaned_text, row['Page Type'])

            sub_scores = [
                result.get("vocabulary_complexity", 0),
                result.get("grammatical_structures", 0),
                result.get("overall_clarity", 0),
                result.get("coherence", 0),
            ]
            compliance_value = round(sum(sub_scores) / 4 * 10) if all(isinstance(score, int) and 0 <= score <= 10 for score in sub_scores) else 0

        except Exception as e:
            print(f"Error scoring page {url}: {e}")
            compliance_value = 0
            result = {
                "vocabulary_complexity": 0,
                "grammatical_structures": 0,
                "overall_clarity": 0,
                "coherence": 0,
                "rationale": "Error occurred during evaluation.",
            }

        scores.append({
            "URL": url,
            "Page Type": row['Page Type'],
            "Compliance Level": compliance_value,
            "Vocabulary Complexity": result.get("vocabulary_complexity"),
            "Grammatical Structures": result.get("grammatical_structures"),
            "Overall Clarity": result.get("overall_clarity"),
            "Coherence": result.get("coherence"),
            "Rationale": result.get("rationale"),
        })

        # Log processed URL
        with open(log_file, 'a') as f:
            f.write(url + '\n')

        # Save checkpoint every N steps
        if len(scores) % CHECKPOINT_EVERY == 0:
            pd.DataFrame(scores).to_excel(checkpoint_file, index=False)
            print(f"‚úÖ Checkpoint saved at {len(scores)} items")

        time.sleep(2)

    # Final save
    final_df = pd.DataFrame(scores)
    final_df.to_excel(f"{base_name}_b2_accessibility_final.xlsx", index=False)
    print("‚úÖ Final results saved")

    return final_df

# ---------------------- STEP 7: Output & Summary ----------------------
def output_summary(result_df, input_path):
    # Extract filename without extension (e.g., "belfius_urls" ‚Üí "belfius")
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    if base_name.endswith("_urls"):
        base_name = base_name.replace("_urls", "")

    output_filename = f"{base_name}_b2_accessibility_scores.xlsx"
    output_path = os.path.join("/Users/furkandemir/Desktop/Sailpeak/Accesibility", output_filename)

    result_df.to_excel(output_path, index=False)
    print(f"\n‚úÖ Results saved to {output_filename}\n")

    # Score summaries
    overall_score = result_df["Compliance Level"].mean()
    print(f"üåê Overall CEFR B2 Accessibility Score: {overall_score:.2f}%")

    typology_avg = result_df.groupby("Page Type")["Compliance Level"].mean()
    for typ, score in typology_avg.items():
        if score < 70:
            print(f"‚ö†Ô∏è {typ} pages may require language simplification (avg score: {score:.2f})")

# ---------------------- Main Runner ----------------------
def main(file_path):
    df = load_file(file_path)
    sampled_df = stratified_sample(df) #Sampling
    df['Address'] = df['Address'].apply(lambda x: urlparse(x).path) #Sampling
    df["Page Type"] = df["Address"].apply(classify_url)
    #sampled_df = df  # Score all URLs
    result_df = evaluate_accessibility(sampled_df, file_path)
    output_summary(result_df, file_path)  # <-- pass file_path here


# Example usage:
# main("crelan_internal_html.xlsx")

<div style="background-color: rgb(28, 97, 154); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    2. Main Code Runner
</div>

In [3]:
main("Belfius/belfius_urls.xlsx")

Scoring pages:   0%|          | 0/30 [00:00<?, ?it/s]

Scoring: https://www.belfius.be/retail/nl/producten/sparen-beleggen/info-publicaties/publicaties/uw-beleggingen/2019-12/index.aspx
‚ùå Error with Selenium extraction: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=137.0.7151.120)
Stacktrace:
0   chromedriver                        0x0000000101311d14 cxxbridge1$str$ptr + 2735276
1   chromedriver                        0x0000000101309f88 cxxbridge1$str$ptr + 2703136
2   chromedriver                        0x0000000100e5a6f0 cxxbridge1$string$len + 90424
3   chromedriver                        0x0000000100e34720 chromedriver + 132896
4   chromedriver                        0x0000000100ec9cc4 cxxbridge1$string$len + 546572
5   chromedriver                        0x0000000100ee2c08 cxxbridge1$string$len + 648784
6   chromedriver                        0x0000000100e95be8 cxxbridge1$string$len + 333360
7   chromedriver                        0x00000001012d5598 cxxbridge1$st

Scoring pages:   0%|          | 0/30 [00:05<?, ?it/s]


KeyboardInterrupt: 

<div style="background-color: rgb(2, 124, 18); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    3. Specialized Code for BNP Paribas Fortis
</div>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
import re
import google.generativeai as genai
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from functools import lru_cache

# ---------------------- CONFIG ----------------------
genai.configure(api_key="AIzaSyBzOT2O03scMENbdWouWexYa10v4K4OVPE")

# Thread-local storage for WebDriver instances
thread_local = threading.local()

def get_driver():
    """Get a WebDriver instance for the current thread"""
    if not hasattr(thread_local, 'driver'):
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-images')  # Speed up loading
        options.add_argument('--disable-javascript')  # Speed up loading
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        
        thread_local.driver = webdriver.Chrome(options=options)
        thread_local.driver.set_page_load_timeout(30)  # Reduced timeout
        thread_local.driver.implicitly_wait(5)  # Reduced wait
    
    return thread_local.driver

def cleanup_driver():
    """Clean up the WebDriver for the current thread"""
    if hasattr(thread_local, 'driver'):
        thread_local.driver.quit()
        del thread_local.driver

# ---------------------- URL Classification ----------------------
@lru_cache(maxsize=1000)  # Cache classification results
def classify_bnp_url(url):
    """Enhanced URL classification specifically for BNP Paribas Fortis"""
    url = url.lower()
    
    if any(term in url for term in [
        "product", "producten", "produits", "sparen", "saving", "epargne", 
        "lenen", "loan", "pret", "credit", "rekening", "account", "compte",
        "beleggen", "investment", "investir", "hypotheek", "mortgage", 
        "verzekering", "insurance", "assurance", "kaart", "card", "carte",
        "bankieren", "banking", "banque", "easy-banking", "business-banking"
    ]):
        return "Product"
    
    elif any(term in url for term in [
        "faq", "support", "help", "hulp", "ondersteuning", "aide", 
        "questions", "klantenservice", "clientservice", "assistance"
    ]):
        return "FAQ"
    
    elif any(term in url for term in [
        "legal", "juridisch", "juridique", "voorwaarden", "terms",
        "conditions", "privacy", "beleid", "policy", "cookie", "gdpr"
    ]):
        return "Legal"
    
    elif any(term in url for term in [
        "contact", "locatie", "location", "agences", "branches", 
        "kantoren", "afspraak", "appointment"
    ]):
        return "Contact"
    
    elif any(term in url for term in ["blog", "nieuws", "news", "actualites"]):
        return "Blog"
    
    else:
        return "Other"

# ---------------------- Text Extraction ----------------------
def extract_clean_text_bnp(url):
    """Optimized text extraction for BNP Paribas Fortis"""
    try:
        driver = get_driver()
        driver.get(url)
        
        # Reduced wait time
        try:
            WebDriverWait(driver, 8).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
        except:
            time.sleep(2)  # Reduced sleep
        
        # Quick cookie handling
        try:
            cookie_selectors = ['[data-testid*="accept"]', '[class*="accept"]']
            for selector in cookie_selectors:
                buttons = driver.find_elements(By.CSS_SELECTOR, selector)
                for button in buttons[:1]:  # Only try first button
                    if button.is_displayed():
                        driver.execute_script("arguments[0].click();", button)
                        time.sleep(1)
                        break
                if buttons:
                    break
        except:
            pass
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Remove unwanted elements
        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            if tag:
                tag.decompose()
        
        # Extract main content
        main_content = soup.find('main') or soup.find('article') or soup.find('.content')
        if main_content:
            text = main_content.get_text(separator=' ', strip=True)
        else:
            text = soup.get_text(separator=' ', strip=True)
        
        if text and len(text.strip()) > 100:
            result = ' '.join(text.split())[:10000]
            return result
        else:
            return ""

    except Exception as e:
        return ""

def clean_bnp_text(raw_text):
    """
    Universal text cleaning function for Belgian banks (KBC, BNP Paribas, Belfius, ING)
    Handles multi-language content (NL/FR/EN) and bank-specific patterns
    """
    if not raw_text or len(raw_text.strip()) < 20:
        return ""
    
    # Universal navigation and header patterns
    navigation_patterns = [
        # Skip to content links
        r'Skip to .*?Log in',
        r'Retour au .*?Se connecter',
        r'Terug naar de inhoud',
        r'Overslaan en naar de inhoud gaan',
        r'Aller au contenu principal',
        r'Skip to main content',
        
        # Language switchers
        r'FR\s+NL\s+EN',
        r'Nederlands\s+Fran√ßais\s+English',
        r'NL\s+FR\s+DE',
        r'\bFR\s+NL\b',
        r'\bFran√ßais\b\s*\bNederlands\b',
        
        # Main navigation menus
        r'Home.*?Contact.*?Login',
        r'Accueil.*?Contact.*?Connexion',
        r'Thuis.*?Contact.*?Inloggen',
        r'Menu\s+Sluiten',
        r'Menu\s+Fermer',
        r'Close\s+Menu',
        
        # Search functionality
        r'Zoeken \(Optioneel\).*?Contact',
        r'Rechercher \(En option\).*?Contact',
        r'Search \(Optional\).*?Contact',
        r'Zoeken.*?Zoek',
        r'Rechercher.*?Recherche',
        r'Search.*?Search',
        
        # Bank-specific navigation
        # KBC specific
        r'KBC.*?Inloggen',
        r'KBC.*?Se connecter',
        r'Online Banking.*?KBC',
        
        # BNP Paribas specific
        r'BNP Paribas Fortis.*?Inloggen',
        r'BNP Paribas Fortis.*?Se connecter',
        r'Word klant.*?Beobank Online',
        r'Devenir client.*?Beobank Online',
        
        # Belfius specific
        r'Belfius.*?Inloggen',
        r'Belfius.*?Se connecter',
        r'Belfius Direct Net',
        
        # ING specific
        r'ING.*?Inloggen',
        r'ING.*?Se connecter',
        r'Mijn ING.*?Inloggen',
    ]
    
    # Cookie and privacy notices (comprehensive)
    cookie_patterns = [
        # Cookie acceptance
        r'Accept all cookies.*?Manage cookies',
        r'Accepter tous les cookies.*?G√©rer les cookies',
        r'Alle cookies accepteren.*?Cookies beheren',
        r'Deze website gebruikt cookies.*?Alles accepteren',
        r'Ce site utilise des cookies.*?Tout accepter',
        r'This website uses cookies.*?Accept all',
        
        # Cookie management sections
        r'Cookie settings.*?Save preferences',
        r'Param√®tres des cookies.*?Sauvegarder',
        r'Cookie-instellingen.*?Voorkeuren opslaan',
        r'Mijn cookies beheren.*?Alles accepteren',
        r'G√©rer mes cookies.*?Tout accepter',
        r'Manage my cookies.*?Accept all',
        
        # Cookie descriptions
        r'Functionele cookies.*?verbeteren\.',
        r'Les cookies fonctionnels.*?par des tiers\.',
        r'Functional cookies.*?third parties\.',
        r'Analytische cookies.*?voorkeuren zijn\.',
        r'Les cookies de mesure.*?leurs pr√©f√©rences\.',
        r'Analytics cookies.*?their preferences\.',
        r'Marketing cookies.*?te tonen\.',
        r'Les cookies publicitaires.*?pertinentes\.',
        r'Marketing cookies.*?relevant\.',
        
        # Privacy policy links
        r'Privacy policy.*?Terms',
        r'Politique de confidentialit√©.*?Conditions',
        r'Privacybeleid.*?Voorwaarden',
    ]
    
    # Technical and browser notices
    technical_patterns = [
        r'Voor een betere surfervaring.*?Chrome',
        r'Pour une meilleure exp√©rience.*?Chrome\.',
        r'For a better browsing experience.*?Chrome',
        r'Adblock detection:.*?Sluiten',
        r'Adblock detection:.*?Fermer',
        r'Adblock detection:.*?Close',
        r'You have not yet given permission.*?Load video',
        r'JavaScript is disabled.*?Enable JavaScript',
        r'Loading\.\.\.',
        r'Laden\.\.\.',
        r'Chargement\.\.\.',
    ]
    
    # Footer and related content
    footer_patterns = [
        # Related articles
        r'Other articles that might interest you.*',
        r'Autres articles qui pourraient vous int√©resser.*',
        r'Andere artikels die u kunnen interesseren.*',
        r'Gerelateerde concepten.*?Lees meer',
        r'Termes li√©s.*?Lire la suite',
        r'Related terms.*?Read more',
        
        # Newsletter and blog subscriptions
        r'Ontdek de.*?blog.*?Fran√ßais',
        r'D√©couvrir le blog.*?Nederlands',
        r'Discover the.*?blog.*?Dutch',
        r'Schrijf u in op onze nieuwsbrief.*?Inschrijven',
        r'Inscrivez-vous √† notre newsletter.*?S\'inscrire',
        r'Subscribe to our newsletter.*?Subscribe',
        
        # Legal and compliance
        r'Terms and conditions.*?Privacy',
        r'Termes et conditions.*?Confidentialit√©',
        r'Algemene voorwaarden.*?Privacy',
        r'Disclaimer.*?Copyright',
        r'Avertissement.*?Droits d\'auteur',
        r'Vrijwaring.*?Auteursrecht',
        
        # Copyright notices
        r'¬©.*?\d{4}.*?(KBC|BNP|Belfius|ING)',
        r'Alle rechten voorbehouden',
        r'Tous droits r√©serv√©s',
        r'All rights reserved',
    ]
    
    # Banking-specific call-to-action patterns
    banking_cta_patterns = [
        # Appointment booking
        r'Maak een afspraak!.*?',
        r'Prenez rendez-vous.*?',
        r'Make an appointment.*?',
        r'Boek een gesprek.*?',
        r'R√©servez un entretien.*?',
        
        # Investment advice
        r'Ontdek ons advies.*?',
        r'Laissez-vous conseiller.*?',
        r'Discover our advice.*?',
        r'Klaar om te beleggen\?.*?Maak een afspraak!',
        r'Pr√™t\(e\) √† investir\?.*?Prenez rendez-vous',
        r'Ready to invest\?.*?Make an appointment',
        
        # Product promotions
        r'Ontdek onze.*?producten',
        r'D√©couvrez nos.*?produits',
        r'Discover our.*?products',
        r'Meer informatie.*?aanvragen',
        r'Plus d\'informations.*?demander',
        r'More information.*?request',
    ]
    
    # Social media and sharing
    social_patterns = [
        r'Share on.*?Facebook',
        r'Partager sur.*?Facebook',
        r'Delen op.*?Facebook',
        r'Tweet.*?Twitter',
        r'Tweeter.*?Twitter',
        r'LinkedIn.*?delen',
        r'LinkedIn.*?partager',
        r'LinkedIn.*?share',
        r'WhatsApp.*?delen',
        r'WhatsApp.*?partager',
        r'WhatsApp.*?share',
        r'E-mail.*?versturen',
        r'E-mail.*?envoyer',
        r'E-mail.*?send',
        r'Print this page',
        r'Imprimez cette page',
        r'Print deze pagina',
        r'Download PDF',
        r'T√©l√©charger PDF',
        r'PDF downloaden',
    ]
    
    # Breadcrumb and metadata
    metadata_patterns = [
        r'Home\s*‚Ä∫.*?‚Ä∫',
        r'Accueil\s*‚Ä∫.*?‚Ä∫',
        r'Thuis\s*‚Ä∫.*?‚Ä∫',
        r'Last updated:.*?\d{4}',
        r'Derni√®re mise √† jour:.*?\d{4}',
        r'Laatst bijgewerkt:.*?\d{4}',
        r'Posted on.*?\d{4}',
        r'Publi√© le.*?\d{4}',
        r'Geplaatst op.*?\d{4}',
        r'Tags:.*?(?=\n|\.|$)',
        r'√âtiquettes:.*?(?=\n|\.|$)',
        r'Labels:.*?(?=\n|\.|$)',
        r'\d+\s+min read',
        r'\d+\s+min de lecture',
        r'\d+\s+min lezen',
    ]
    
    # Combine all patterns
    all_patterns = (navigation_patterns + cookie_patterns + technical_patterns + 
                   footer_patterns + banking_cta_patterns + social_patterns + metadata_patterns)
    
    # Apply cleaning
    cleaned = raw_text
    for pattern in all_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE | re.DOTALL)
    
    # Remove repeated "Read more" links in all languages
    read_more_patterns = [
        r'Lees meer\s*',
        r'Lire la suite\s*',
        r'Read more\s*',
        r'Meer lezen\s*',
        r'En savoir plus\s*',
        r'Learn more\s*'
    ]
    
    for pattern in read_more_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    
    # Remove investment sidebar content
    cleaned = re.sub(r'Beleggen in \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Investir dans \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Investing in \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    
    # Remove standalone navigation and form words
    standalone_words = [
        'Contact', 'Zoeken', 'Rechercher', 'Search',
        'Email adres', 'Adresse email', 'Email address',
        'Inschrijven', 'S\'inscrire', 'Subscribe',
        'Versturen', 'Envoyer', 'Send',
        'Annuleren', 'Annuler', 'Cancel',
        'Bevestigen', 'Confirmer', 'Confirm'
    ]
    
    for word in standalone_words:
        cleaned = re.sub(rf'\b{re.escape(word)}\b', '', cleaned, flags=re.IGNORECASE)
    
    # Clean up formatting issues
    # Remove excessive punctuation
    cleaned = re.sub(r'[.]{2,}', '.', cleaned)
    cleaned = re.sub(r'[-]{3,}', '', cleaned)
    cleaned = re.sub(r'[_]{3,}', '', cleaned)
    cleaned = re.sub(r'(\b\w+\b)(\s+\1){2,}', r'\1', cleaned)  # Remove repeated words
    
    # Fix spacing around punctuation
    cleaned = re.sub(r'\s+([.,!?;:])', r'\1', cleaned)
    cleaned = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1 \2', cleaned)
    
    # Remove empty brackets and parentheses
    cleaned = re.sub(r'\(\s*\)', '', cleaned)
    cleaned = re.sub(r'\[\s*\]', '', cleaned)
    cleaned = re.sub(r'\{\s*\}', '', cleaned)
    
    # Normalize whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'\n\s*\n', '\n\n', cleaned)
    
    return cleaned.strip()

# ---------------------- Gemini Scoring ----------------------
def score_page_with_gemini(text, page_type):
    prompt = f"""
**Context:** This prompt is designed for the Gemini language model to evaluate the CEFR B2 level compliance of webpage content from retail banking websites for regulatory compliance. The evaluation focuses on vocabulary, grammar, clarity, and coherence to determine if the text is easily understandable for someone at a B2 level in English, French, or Dutch. The desired output includes the compliance level percentage and individual scores for vocabulary complexity, grammatical structures, overall clarity, and coherence, with a detailed rationale for each evaluated address presented in a single cell of an output file (e.g., CSV or Excel). The goal is to ensure the evaluation effectively differentiates between webpages with varying levels of B2 compliance, leading to a wider range of scores, and that the rationale is comprehensive yet concise enough to fit within a single cell per address. **It is important to consider that these are banking websites, and some technical or financial terms may be inherent to the content.**

**Task:** Assess the CEFR B2 compliance level of the provided webpage content, ensuring a variable range of scores and a detailed, single-cell rationale for each evaluated address, **while acknowledging the potential presence of necessary banking terminology.**

**Instructions:**

1. **Identify Language:** Determine if the input text is in English, French, or Dutch.

2. **Evaluate B2 Compliance with Granularity (Considering Banking Terms):** Analyze the text against the CEFR B2 criteria for the identified language, critically and precisely assessing the following aspects on a scale of 0 to 10. Avoid assigning only 0 or 10; use the full scale based on nuance and subtlety. ‚ÄúDo not hesitate to assign low (0‚Äì4) or high (8‚Äì10) scores when the text clearly deserves it. Avoid accumulating around 6‚Äì7 unless the text is truly average.‚Äù Remember the compliancy threshold is 70% (7/10) for B2 level. Therefore if a text is generally compliant it should receive a total score of higher than or equal to 70.

- **Vocabulary Complexity (0‚Äì10)**
  - 10 ‚Üí very simple, common words, basic banking terms, no jargon
  - 7‚Äì9 ‚Üí mostly common words, occasional technical terms explained
  - 4‚Äì6 ‚Üí mix of general and technical terms, some unnecessarily complex or rare words
  - 1‚Äì3 ‚Üí frequent use of complex, low-frequency words or jargon, often unexplained
  - 0 ‚Üí highly complex, dense language with rare or unexplained terms everywhere

- **Grammatical Structures (0‚Äì10)**
  - 10 ‚Üí simple sentences, clear structure, active voice, no complex clauses
  - 7‚Äì9 ‚Üí mostly simple, some moderate clauses, minor passive use
  - 4‚Äì6 ‚Üí mix of simple and complex sentences, occasional embedded or passive forms
  - 1‚Äì3 ‚Üí mostly long, embedded, or passive structures, hard to follow
  - 0 ‚Üí extremely complex grammar, frequent embedding, difficult to parse

- **Overall Clarity (0‚Äì10)**
  - 10 ‚Üí very clear, easy to understand, minimal effort required
  - 7‚Äì9 ‚Üí mostly clear, small moments of complexity
  - 4‚Äì6 ‚Üí mixed clarity, occasional confusion or ambiguity
  - 1‚Äì3 ‚Üí often unclear, requires effort to interpret
  - 0 ‚Üí very unclear, confusing, hard to follow

- **Coherence (0‚Äì10)**
  - 10 ‚Üí logical flow, clear organization, excellent connectors
  - 7‚Äì9 ‚Üí mostly logical, some jumps, minor missing links
  - 4‚Äì6 ‚Üí mixed coherence, weak transitions, partial disorganization
  - 1‚Äì3 ‚Üí often disorganized, unclear connections
  - 0 ‚Üí no logical order, chaotic, fragmented

3. **Provide Detailed Rationale (Single Cell):** Explain the reasoning behind each of the four scores within a single text string suitable for one Excel cell. Explicitly point out specific linguistic features (vocabulary, grammar, discourse markers) that contribute to the assigned level of complexity or simplicity for each criterion. When discussing vocabulary, specifically comment on the presence and handling of banking terminology. Justify why the text is or is not strictly at the B2 level for each aspect. Use clear separators (e.g., "; ") between the rationale for each criterion to ensure readability within the single cell.
```xml
<rationale>Vocabulary: [Explanation with examples, noting banking terms]; Grammar: [Explanation with examples]; Clarity: [Explanation with examples, considering banking terms]; Coherence: [Explanation with examples]</rationale>

**Output Format:**
Return the evaluation in the following XML format, ensuring all information for a single evaluated webpage address can be represented as a single row in an output file:
```xml
<vocabulary_complexity>Y</vocabulary_complexity>
<grammatical_structures>Z</grammatical_structures>
<overall_clarity>W</overall_clarity>
<coherence>V</coherence>
<rationale>Vocabulary: [Explanation with examples, noting banking terms], Grammar: [Explanation with examples], Clarity: [Explanation with examples, considering banking terms], Coherence: [Explanation with examples]</rationale>

Examples of B2 Compliant Texts and C1 Texts Which Are Not B2 Compliant
 To help you understand the evaluation criteria, here are some examples of texts rated at B2 and C1 levels:

English
B2 level text
 Source: LinguaPress Unsolved mysteries ‚Äì a short story by Sarah Wollbach
 Megan‚Äôs acting career began one morning a couple of years ago, when a woman approached her in the parking lot of her neighborhood grocery store. ‚ÄúExcuse me,‚Äù she said, ‚Äúbut have you ever taken acting lessons?‚Äù ‚Äî ‚ÄúNo,‚Äù she answered hesitantly. 
 The woman reached into her pocket and handed Megan a card. ‚ÄúI‚Äôm a casting director for Unsolved Mysteries,‚Äù she said, shaking her hand. Megan had always been stage-struck. 
 For years she'd fantasized about being an actor, sure that deep within her lurked a brilliant chameleon like Meryl Streep or Julia Roberts. Maybe this was her big break. 
 ‚ÄúThe show‚Äôs doing a feature about a woman who was kidnapped,‚Äù the lady continued, ‚Äúand you look exactly like her. The resemblance is amazing. Would you be interested in auditioning?‚Äù 
 The episode aired the next week, with a couple of thousand dollars for two days‚Äô work, plus travel, lodging, and food expenses.


C1 level text
 Source: LinguaPress The Enigma of the Missing Manuscript by John Doe
 The mystery of the missing manuscript has eluded generations of writers. It was said to contain the final, unpublished works and annotations of the author, whose sudden disappearance 
 had only added to the intrigue. The manuscript was believed to be hidden somewhere in the old mansion, a labyrinthine structure filled with secret passages and hidden rooms. 
 Many had tried to find it, but all had failed. The clues were cryptic, the dangers real, and the stakes high. For those who dared to search, it was a journey into the unknown, a test of wit and courage.



French
B2 level text
 Source: LinguaPress Myst√®res non r√©solus ‚Äì une histoire courte par Sarah Wollbach
 La carri√®re d‚Äôactrice de Megan a commenc√© un matin il y a quelques ann√©es, lorsqu‚Äôune femme l‚Äôa abord√©e dans le parking de son √©picerie de quartier. 
 ‚ÄúExcusez-moi,‚Äù dit-elle, ‚Äúmais avez-vous d√©j√† pris des cours de th√©√¢tre?‚Äù ‚Äî ‚ÄúNon,‚Äù r√©pondit-elle avec h√©sitation. La femme a fouill√© dans sa poche et tendu une carte √† Megan. 
 ‚ÄúJe suis directrice de casting de Myst√®res non r√©solus,‚Äù dit-elle en lui serrant la main. Megan avait toujours √©t√© fascin√©e par la sc√®ne. 
 Pendant des ann√©es, elle avait nourri en secret le r√™ve d‚Äô√™tre actrice, convaincue qu‚Äôau fond d‚Äôelle-m√™me se cachait un brillant cam√©l√©on comme Meryl Streep ou Julia Roberts. 
 Peut-√™tre que c‚Äô√©tait sa grande chance. ‚ÄúL‚Äô√©mission fait un reportage sur une femme qui a √©t√© kidnapp√©e,‚Äù continua la dame, ‚Äúet vous lui ressemblez exactement. 
 La ressemblance est incroyable. Seriez-vous int√©ress√©e par une audition?‚Äù Elle expliqua que le r√¥le valait quelques milliers de dollars pour deux jours de travail, plus les frais de voyage, de logement et de nourriture.


C1 level text
 Source: LinguaPress L‚ÄôEnigme du Manuscrit Disparu par Jean Dupont
 Le myst√®re du manuscrit disparu que tout le monde tentait de percer depuis des d√©cennies. On disait qu‚Äôil contenait les derni√®res ≈ìuvres finales, non publi√©es, d‚Äôun auteur renomm√©, 
 dont la disparition soudaine n‚Äôavait fait qu‚Äôajouter √† l‚Äôintrigue. On croyait que le manuscrit √©tait cach√© quelque part dans le vieux manoir, une structure labyrinthique remplie de passages secrets et de pi√®ces cach√©es. 
 Beaucoup avaient essay√© de le trouver, mais tous avaient √©chou√©. Les indices √©taient cryptiques, les dangers r√©els, et les enjeux √©lev√©s. Pour ceux qui osaient chercher, c‚Äô√©tait un voyage dans l‚Äôinconnu, un test d‚Äôesprit et de courage.



Dutch
B2 level text
 Source: LinguaPress Opgeloste mysteries ‚Äì een kortverhaal door Sarah Wollbach
 Megan‚Äôs acteercarri√®re begon op een ochtend een paar jaar geleden, toen een vrouw haar benaderde op de parkeerplaats van haar buurtwinkel. ‚ÄúExcuseer me,‚Äù zei ze, ‚Äúmaar heb je ooit acteerlessen gevolgd?‚Äù ‚Äî ‚ÄúNee,‚Äù antwoordde ze aarzelend. 
 De vrouw stak haar hand in haar zak en gaf Megan een kaartje. ‚ÄúIk ben een castingdirecteur voor Opgeloste mysteries,‚Äù zei ze, terwijl ze haar hand schudde. Megan was altijd al gefascineerd door het toneel. 
 Jarenlang had ze gefantaseerd over het zijn van een actrice, ervan overtuigd dat diep vanbinnen een briljante actrice zoals Meryl Streep of Julia Roberts schuilde. Misschien was dit haar grote doorbraak. 
 ‚ÄúDe show doet een reportage over een vrouw die ontvoerd is,‚Äù vervolgde de dame, ‚Äúen je lijkt precies op haar. De gelijkenis is verbazingwekkend. Zou je ge√Ønteresseerd zijn in een auditie?‚Äù 
 Ze zette uit dat dit alles een paar duizend dollar waard was voor twee dagen werk, plus reis-, verblijf- en voedselkosten.


C1 level text
 Source: LinguaPress Het Raadsel van het Verdwenen Manuscript door Jan Jansen
 Het mysterie van het verdwenen manuscript dat generaties schrijvers decennialang verbijsterd. Er werd gezegd dat het de laatste, ongepubliceerde werken van een beroemde auteur bevatte, wiens plotselinge verdwijning alleen maar bijdroeg aan de intrige. 
 Het gerucht deed de ronde dat het manuscript ergens in het oude herenhuis verborgen was, een labyrintische structuur vol geheime gangen en verborgen kamers. Velen hadden geprobeerd het te vinden, maar allemaal waren ze mislukt. 
 De aanwijzingen waren cryptisch, de gevaren echt, en de inzet hoog. Voor degenen die durfden te zoeken, was het een reis in het onbekende, een test van verstand en moed.


Input Text content to check: \"\"\"{text}\"\"\" 
"""
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")
        response = model.generate_content(prompt, generation_config={"temperature": 0.2})
        output = response.text.strip()

        if output.startswith("```"):
            output = output.strip("` \n").replace("xml", "").strip()

        scores = {
            "vocabulary_complexity": extract_xml_score(output, "vocabulary_complexity"),
            "grammatical_structures": extract_xml_score(output, "grammatical_structures"),
            "overall_clarity": extract_xml_score(output, "overall_clarity"),
            "coherence": extract_xml_score(output, "coherence"),
            "rationale": extract_xml_rationale(output),
        }
        return scores

    except Exception as e:
        return {
            "vocabulary_complexity": 0,
            "grammatical_structures": 0,
            "overall_clarity": 0,
            "coherence": 0,
            "rationale": f"Error: {str(e)}",
        }

def extract_xml_score(xml_text, tag):
    match = re.search(fr"<{tag}>(\d+)</{tag}>", xml_text)
    return int(match.group(1)) if match else 0

def extract_xml_rationale(xml_text):
    match = re.search(r"<rationale>(.*?)</rationale>", xml_text, re.DOTALL)
    return match.group(1).strip() if match else "No rationale found."

# ---------------------- Processing Function ----------------------
def process_single_url(url_data):
    """Process a single URL - designed for parallel execution"""
    url, page_type = url_data
    
    try:
        # Extract and clean text
        text = extract_clean_text_bnp(url)
        if not text.strip():
            return create_error_result(url, page_type, "No text extracted")
        
        cleaned_text = clean_bnp_text(text)
        
        # Score with Gemini
        result = score_page_with_gemini(cleaned_text, page_type)
        
        sub_scores = [
            result.get("vocabulary_complexity", 0),
            result.get("grammatical_structures", 0),
            result.get("overall_clarity", 0),
            result.get("coherence", 0),
        ]
        
        compliance_value = round(sum(sub_scores) / 4 * 10) if all(
            isinstance(score, int) and 0 <= score <= 10 for score in sub_scores
        ) else 0
        
        return {
            "URL": url,
            "Page Type": page_type,
            "Compliance Level": compliance_value,
            "Vocabulary Complexity": result.get("vocabulary_complexity"),
            "Grammatical Structures": result.get("grammatical_structures"),
            "Overall Clarity": result.get("overall_clarity"),
            "Coherence": result.get("coherence"),
            "Rationale": result.get("rationale"),
        }
        
    except Exception as e:
        return create_error_result(url, page_type, str(e))

def create_error_result(url, page_type, error_msg):
    """Create error result dictionary"""
    return {
        "URL": url,
        "Page Type": page_type,
        "Compliance Level": 0,
        "Vocabulary Complexity": 0,
        "Grammatical Structures": 0,
        "Overall Clarity": 0,
        "Coherence": 0,
        "Rationale": f"Error: {error_msg}",
    }

# ---------------------- Main Analysis ----------------------
def analyze_bnp_b2_compliance(excel_file_path, max_workers=4, batch_size=20):
    """Optimized main function with parallel processing"""
    
    print("üöÄ BNP Paribas Fortis B2 Compliance Analysis (Optimized)")
    print("="*60)
    
    # Load URLs from Excel
    try:
        df_urls = pd.read_excel(excel_file_path)
        url_columns = ['Address', 'URL', 'url', 'address', 'link', 'Link']
        url_column = None
        for col in url_columns:
            if col in df_urls.columns:
                url_column = col
                break
        
        if url_column is None:
            url_column = df_urls.columns[0]
        
        urls = df_urls[url_column].dropna().tolist()
        print(f"‚úÖ Loaded {len(urls)} URLs from {url_column} column")
        
    except Exception as e:
        print(f"‚ùå Error loading Excel file: {e}")
        return
    
    # Create DataFrame and classify URLs
    df = pd.DataFrame({'Address': urls})
    df["Page Type"] = df["Address"].apply(classify_bnp_url)
    
    print(f"\nüìä URL Distribution:")
    print(df['Page Type'].value_counts())
    
    # Setup files
    checkpoint_file = "BNPPF/bnp_paribas_fortis_b2_checkpoint.csv"
    log_file = "BNPPF/bnp_paribas_fortis_b2_log.txt"
    
    # Resume from checkpoint
    processed_urls = set()
    scores = []
    
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            processed_urls = set(line.strip() for line in f.readlines())
    
    if os.path.exists(checkpoint_file):
        scores_df = pd.read_csv(checkpoint_file)
        scores = scores_df.to_dict(orient='records')
        print(f"üìÇ Resuming from checkpoint with {len(scores)} existing scores")
    
    # Filter unprocessed URLs
    unprocessed_data = [
        (url, page_type) for url, page_type in zip(df['Address'], df['Page Type'])
        if url not in processed_urls
    ]
    
    if not unprocessed_data:
        print("‚úÖ All URLs already processed!")
        return pd.DataFrame(scores)
    
    print(f"üîÑ Processing {len(unprocessed_data)} remaining URLs with {max_workers} workers")
    
    # Process in batches with parallel execution
    for i in range(0, len(unprocessed_data), batch_size):
        batch = unprocessed_data[i:i + batch_size]
        batch_results = []
        
        print(f"\nüì¶ Processing batch {i//batch_size + 1}/{(len(unprocessed_data)-1)//batch_size + 1}")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks in the batch
            future_to_url = {
                executor.submit(process_single_url, url_data): url_data[0] 
                for url_data in batch
            }
            
            # Collect results with progress bar
            for future in tqdm(as_completed(future_to_url), total=len(batch), desc="ü§ñ Scoring"):
                url = future_to_url[future]
                try:
                    result = future.result(timeout=60)  # 60 second timeout per URL
                    batch_results.append(result)
                    
                    # Log processed URL
                    with open(log_file, 'a') as f:
                        f.write(url + '\n')
                        
                except Exception as e:
                    print(f"‚ùå Failed to process {url}: {e}")
                    # Add error result
                    batch_results.append(create_error_result(url, "Unknown", str(e)))
        
        # Add batch results to main scores
        scores.extend(batch_results)
        
        # Save checkpoint after each batch
        pd.DataFrame(scores).to_csv(checkpoint_file, index=False)
        print(f"üíæ Checkpoint saved: {len(scores)} total items processed")
        
        # Cleanup drivers for this batch
        for _ in range(max_workers):
            try:
                cleanup_driver()
            except:
                pass
        
        # Brief pause between batches
        time.sleep(2)
    
    # Final save and cleanup
    final_df = pd.DataFrame(scores)
    final_df.to_excel("BNPPF/bnp_paribas_fortis_b2_final_results.xlsx", index=False)
    
    # Summary
    if len(final_df) > 0:
        overall_score = final_df["Compliance Level"].mean()
        print(f"\nüåê Overall CEFR B2 Accessibility Score: {overall_score:.2f}%")
        
        typology_avg = final_df.groupby("Page Type")["Compliance Level"].mean()
        for typ, score in typology_avg.items():
            status = "‚úÖ" if score >= 70 else "‚ö†Ô∏è"
            print(f"{status} {typ} pages: {score:.2f}% average compliance")
    
    print(f"\n‚úÖ Analysis complete! Results saved to: bnp_paribas_fortis_b2_final_results.xlsx")
    return final_df

# ---------------------- EXECUTION ----------------------
if __name__ == "__main__":
    excel_file_path = "BNPPF/bnppf_urls.xlsx"
    # Adjust parameters for your system:
    # max_workers: Number of parallel threads (4-8 recommended)
    # batch_size: URLs processed before checkpoint (20-50 recommended)
    analyze_bnp_b2_compliance(excel_file_path, max_workers=6, batch_size=30)

üöÄ BNP Paribas Fortis B2 Compliance Analysis (Optimized)
‚úÖ Loaded 4078 URLs from Address column

üìä URL Distribution:
Page Type
Product    2207
Other      1094
FAQ         738
Legal        29
Contact       6
Blog          4
Name: count, dtype: int64
üìÇ Resuming from checkpoint with 1470 existing scores
üîÑ Processing 2535 remaining URLs with 6 workers

üì¶ Processing batch 1/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.68s/it]


üíæ Checkpoint saved: 1500 total items processed

üì¶ Processing batch 2/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:14<00:00,  4.49s/it]


üíæ Checkpoint saved: 1530 total items processed

üì¶ Processing batch 3/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.57s/it]


üíæ Checkpoint saved: 1560 total items processed

üì¶ Processing batch 4/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.72s/it]


üíæ Checkpoint saved: 1590 total items processed

üì¶ Processing batch 5/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.68s/it]


üíæ Checkpoint saved: 1620 total items processed

üì¶ Processing batch 6/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.63s/it]


üíæ Checkpoint saved: 1650 total items processed

üì¶ Processing batch 7/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:23<00:00,  4.79s/it]


üíæ Checkpoint saved: 1680 total items processed

üì¶ Processing batch 8/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:22<00:00,  4.75s/it]


üíæ Checkpoint saved: 1710 total items processed

üì¶ Processing batch 9/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.60s/it]


üíæ Checkpoint saved: 1740 total items processed

üì¶ Processing batch 10/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.61s/it]


üíæ Checkpoint saved: 1770 total items processed

üì¶ Processing batch 11/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.68s/it]


üíæ Checkpoint saved: 1800 total items processed

üì¶ Processing batch 12/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:34<00:00,  5.16s/it]


üíæ Checkpoint saved: 1830 total items processed

üì¶ Processing batch 13/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.65s/it]


üíæ Checkpoint saved: 1860 total items processed

üì¶ Processing batch 14/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:16<00:00,  4.56s/it]


üíæ Checkpoint saved: 1890 total items processed

üì¶ Processing batch 15/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.71s/it]


üíæ Checkpoint saved: 1920 total items processed

üì¶ Processing batch 16/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.58s/it]


üíæ Checkpoint saved: 1950 total items processed

üì¶ Processing batch 17/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.58s/it]


üíæ Checkpoint saved: 1980 total items processed

üì¶ Processing batch 18/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.70s/it]


üíæ Checkpoint saved: 2010 total items processed

üì¶ Processing batch 19/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.59s/it]


üíæ Checkpoint saved: 2040 total items processed

üì¶ Processing batch 20/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.58s/it]


üíæ Checkpoint saved: 2070 total items processed

üì¶ Processing batch 21/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:23<00:00,  4.80s/it]


üíæ Checkpoint saved: 2100 total items processed

üì¶ Processing batch 22/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.64s/it]


üíæ Checkpoint saved: 2130 total items processed

üì¶ Processing batch 23/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.69s/it]


üíæ Checkpoint saved: 2160 total items processed

üì¶ Processing batch 24/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.59s/it]


üíæ Checkpoint saved: 2190 total items processed

üì¶ Processing batch 25/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.64s/it]


üíæ Checkpoint saved: 2220 total items processed

üì¶ Processing batch 26/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:24<00:00,  4.81s/it]


üíæ Checkpoint saved: 2250 total items processed

üì¶ Processing batch 27/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.73s/it]


üíæ Checkpoint saved: 2280 total items processed

üì¶ Processing batch 28/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.70s/it]


üíæ Checkpoint saved: 2310 total items processed

üì¶ Processing batch 29/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.70s/it]


üíæ Checkpoint saved: 2340 total items processed

üì¶ Processing batch 30/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.67s/it]


üíæ Checkpoint saved: 2370 total items processed

üì¶ Processing batch 31/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.71s/it]


üíæ Checkpoint saved: 2400 total items processed

üì¶ Processing batch 32/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:30<00:00,  5.01s/it]


üíæ Checkpoint saved: 2430 total items processed

üì¶ Processing batch 33/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.62s/it]


üíæ Checkpoint saved: 2460 total items processed

üì¶ Processing batch 34/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.62s/it]


üíæ Checkpoint saved: 2490 total items processed

üì¶ Processing batch 35/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.66s/it]


üíæ Checkpoint saved: 2520 total items processed

üì¶ Processing batch 36/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.62s/it]


üíæ Checkpoint saved: 2550 total items processed

üì¶ Processing batch 37/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:22<00:00,  4.74s/it]


üíæ Checkpoint saved: 2580 total items processed

üì¶ Processing batch 38/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:23<00:00,  4.77s/it]


üíæ Checkpoint saved: 2610 total items processed

üì¶ Processing batch 39/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.72s/it]


üíæ Checkpoint saved: 2640 total items processed

üì¶ Processing batch 40/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:25<00:00,  4.86s/it]


üíæ Checkpoint saved: 2670 total items processed

üì¶ Processing batch 41/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.69s/it]


üíæ Checkpoint saved: 2700 total items processed

üì¶ Processing batch 42/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.69s/it]


üíæ Checkpoint saved: 2730 total items processed

üì¶ Processing batch 43/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:22<00:00,  4.75s/it]


üíæ Checkpoint saved: 2760 total items processed

üì¶ Processing batch 44/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.60s/it]


üíæ Checkpoint saved: 2790 total items processed

üì¶ Processing batch 45/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:32<00:00,  5.10s/it]


üíæ Checkpoint saved: 2820 total items processed

üì¶ Processing batch 46/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.66s/it]


üíæ Checkpoint saved: 2850 total items processed

üì¶ Processing batch 47/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:28<00:00,  4.94s/it]


üíæ Checkpoint saved: 2880 total items processed

üì¶ Processing batch 48/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.68s/it]


üíæ Checkpoint saved: 2910 total items processed

üì¶ Processing batch 49/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.68s/it]


üíæ Checkpoint saved: 2940 total items processed

üì¶ Processing batch 50/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.67s/it]


üíæ Checkpoint saved: 2970 total items processed

üì¶ Processing batch 51/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.67s/it]


üíæ Checkpoint saved: 3000 total items processed

üì¶ Processing batch 52/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:23<00:00,  4.80s/it]


üíæ Checkpoint saved: 3030 total items processed

üì¶ Processing batch 53/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.67s/it]


üíæ Checkpoint saved: 3060 total items processed

üì¶ Processing batch 54/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.63s/it]


üíæ Checkpoint saved: 3090 total items processed

üì¶ Processing batch 55/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:22<00:00,  4.74s/it]


üíæ Checkpoint saved: 3120 total items processed

üì¶ Processing batch 56/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:16<00:00,  4.56s/it]


üíæ Checkpoint saved: 3150 total items processed

üì¶ Processing batch 57/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.71s/it]


üíæ Checkpoint saved: 3180 total items processed

üì¶ Processing batch 58/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.67s/it]


üíæ Checkpoint saved: 3210 total items processed

üì¶ Processing batch 59/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.71s/it]


üíæ Checkpoint saved: 3240 total items processed

üì¶ Processing batch 60/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.60s/it]


üíæ Checkpoint saved: 3270 total items processed

üì¶ Processing batch 61/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.61s/it]


üíæ Checkpoint saved: 3300 total items processed

üì¶ Processing batch 62/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.73s/it]


üíæ Checkpoint saved: 3330 total items processed

üì¶ Processing batch 63/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:23<00:00,  4.77s/it]


üíæ Checkpoint saved: 3360 total items processed

üì¶ Processing batch 64/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.69s/it]


üíæ Checkpoint saved: 3390 total items processed

üì¶ Processing batch 65/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.67s/it]


üíæ Checkpoint saved: 3420 total items processed

üì¶ Processing batch 66/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.65s/it]


üíæ Checkpoint saved: 3450 total items processed

üì¶ Processing batch 67/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.62s/it]


üíæ Checkpoint saved: 3480 total items processed

üì¶ Processing batch 68/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.69s/it]


üíæ Checkpoint saved: 3510 total items processed

üì¶ Processing batch 69/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.70s/it]


üíæ Checkpoint saved: 3540 total items processed

üì¶ Processing batch 70/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.70s/it]


üíæ Checkpoint saved: 3570 total items processed

üì¶ Processing batch 71/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:16<00:00,  4.55s/it]


üíæ Checkpoint saved: 3600 total items processed

üì¶ Processing batch 72/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.57s/it]


üíæ Checkpoint saved: 3630 total items processed

üì¶ Processing batch 73/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.66s/it]


üíæ Checkpoint saved: 3660 total items processed

üì¶ Processing batch 74/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.60s/it]


üíæ Checkpoint saved: 3690 total items processed

üì¶ Processing batch 75/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:22<00:00,  4.74s/it]


üíæ Checkpoint saved: 3720 total items processed

üì¶ Processing batch 76/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:30<00:00,  5.02s/it]


üíæ Checkpoint saved: 3750 total items processed

üì¶ Processing batch 77/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:23<00:00,  4.79s/it]


üíæ Checkpoint saved: 3780 total items processed

üì¶ Processing batch 78/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:23<00:00,  4.78s/it]


üíæ Checkpoint saved: 3810 total items processed

üì¶ Processing batch 79/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:18<00:00,  4.60s/it]


üíæ Checkpoint saved: 3840 total items processed

üì¶ Processing batch 80/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:20<00:00,  4.67s/it]


üíæ Checkpoint saved: 3870 total items processed

üì¶ Processing batch 81/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:19<00:00,  4.64s/it]


üíæ Checkpoint saved: 3900 total items processed

üì¶ Processing batch 82/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:40<00:00,  5.34s/it]


üíæ Checkpoint saved: 3930 total items processed

üì¶ Processing batch 83/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:17<00:00,  4.58s/it]


üíæ Checkpoint saved: 3960 total items processed

üì¶ Processing batch 84/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:21<00:00,  4.71s/it]


üíæ Checkpoint saved: 3990 total items processed

üì¶ Processing batch 85/85


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [01:26<00:00,  5.77s/it]


üíæ Checkpoint saved: 4005 total items processed

üåê Overall CEFR B2 Accessibility Score: 69.73%
‚úÖ Blog pages: 78.75% average compliance
‚ö†Ô∏è Contact pages: 68.17% average compliance
‚ö†Ô∏è FAQ pages: 68.01% average compliance
‚ö†Ô∏è Legal pages: 64.23% average compliance
‚ö†Ô∏è Other pages: 65.49% average compliance
‚úÖ Product pages: 72.48% average compliance

‚úÖ Analysis complete! Results saved to: bnp_paribas_fortis_b2_final_results.xlsx


<div style="background-color: rgb(35, 137, 206); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    4. Specialized Code for KBC
</div>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
import re
import google.generativeai as genai
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from functools import lru_cache

# ---------------------- CONFIG ----------------------
genai.configure(api_key="AIzaSyBzOT2O03scMENbdWouWexYa10v4K4OVPE")

# Thread-local storage for WebDriver instances
thread_local = threading.local()

def get_driver():
    """Get a WebDriver instance for the current thread"""
    if not hasattr(thread_local, 'driver'):
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-images')  # Speed up loading
        options.add_argument('--disable-javascript')  # Speed up loading
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        
        thread_local.driver = webdriver.Chrome(options=options)
        thread_local.driver.set_page_load_timeout(30)  # Reduced timeout
        thread_local.driver.implicitly_wait(5)  # Reduced wait
    
    return thread_local.driver

def cleanup_driver():
    """Clean up the WebDriver for the current thread"""
    if hasattr(thread_local, 'driver'):
        thread_local.driver.quit()
        del thread_local.driver

# ---------------------- URL Classification ----------------------
@lru_cache(maxsize=1000)  # Cache classification results
def classify_kbc_url(url):
   """Enhanced URL classification specifically for KBC Bank (EN/FR/NL/DE)"""
   url = url.lower()
   
   if any(term in url for term in [
       # Products - EN/FR/NL/DE
       "product", "producten", "produits", "produkte", "sparen", "saving", "epargne",
       "lenen", "loan", "pret", "kredit", "credit", "rekening", "account", "compte", "konto",
       "beleggen", "investment", "investir", "investieren", "hypotheek", "mortgage",
       "verzekering", "insurance", "assurance", "versicherung", "kaart", "card", "carte", "karte",
       "bankieren", "banking", "banque", "kbc-mobile", "kbc-plus", "kbc-basic", "kbc-premium"
   ]):
       return "Product"
   
   elif any(term in url for term in [
       # Support - EN/FR/NL/DE
       "faq", "support", "help", "hulp", "ondersteuning", "aide", "hilfe",
       "questions", "klantenservice", "service-client", "kundendienst", "assistance",
       "kbc-live", "live-chat"
   ]):
       return "FAQ"
   
   elif any(term in url for term in [
       # Legal - EN/FR/NL/DE
       "legal", "juridisch", "juridique", "rechtlich", "voorwaarden", "terms",
       "conditions", "bedingungen", "privacy", "beleid", "policy", "datenschutz",
       "cookie", "gdpr", "compliance", "tarieven", "tarifs", "fees", "gebuehren"
   ]):
       return "Legal"
   
   elif any(term in url for term in [
       # Contact - EN/FR/NL/DE
       "contact", "locatie", "location", "standort", "agences", "branches", "filialen",
       "kantoren", "afspraak", "appointment", "rendez-vous", "termin"
   ]):
       return "Contact"
   
   elif any(term in url for term in [
       # News/Blog - EN/FR/NL/DE
       "blog", "nieuws", "news", "actualites", "nachrichten", "insights", "perspectives", "moments-cles"
   ]):
       return "Blog"
   
   else:
       return "Other"

# ---------------------- Text Extraction ----------------------
def extract_clean_text_kbc(url):
    """Optimized text extraction for BNP Paribas Fortis"""
    try:
        driver = get_driver()
        driver.get(url)
        
        # Reduced wait time
        try:
            WebDriverWait(driver, 8).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
        except:
            time.sleep(2)  # Reduced sleep
        
        # Quick cookie handling
        try:
            cookie_selectors = ['[data-testid*="accept"]', '[class*="accept"]']
            for selector in cookie_selectors:
                buttons = driver.find_elements(By.CSS_SELECTOR, selector)
                for button in buttons[:1]:  # Only try first button
                    if button.is_displayed():
                        driver.execute_script("arguments[0].click();", button)
                        time.sleep(1)
                        break
                if buttons:
                    break
        except:
            pass
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Remove unwanted elements
        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            if tag:
                tag.decompose()
        
        # Extract main content
        main_content = soup.find('main') or soup.find('article') or soup.find('.content')
        if main_content:
            text = main_content.get_text(separator=' ', strip=True)
        else:
            text = soup.get_text(separator=' ', strip=True)
        
        if text and len(text.strip()) > 100:
            result = ' '.join(text.split())[:10000]
            return result
        else:
            return ""

    except Exception as e:
        return ""

def clean_kbc_text(raw_text):
    """
    Universal text cleaning function for Belgian banks (KBC, BNP Paribas, Belfius, ING)
    Handles multi-language content (NL/FR/EN) and bank-specific patterns
    """
    if not raw_text or len(raw_text.strip()) < 20:
        return ""
    
    # Universal navigation and header patterns
    navigation_patterns = [
        # Skip to content links
        r'Skip to .*?Log in',
        r'Retour au .*?Se connecter',
        r'Terug naar de inhoud',
        r'Overslaan en naar de inhoud gaan',
        r'Aller au contenu principal',
        r'Skip to main content',
        
        # Language switchers
        r'FR\s+NL\s+EN',
        r'Nederlands\s+Fran√ßais\s+English',
        r'NL\s+FR\s+DE',
        r'\bFR\s+NL\b',
        r'\bFran√ßais\b\s*\bNederlands\b',
        
        # Main navigation menus
        r'Home.*?Contact.*?Login',
        r'Accueil.*?Contact.*?Connexion',
        r'Thuis.*?Contact.*?Inloggen',
        r'Menu\s+Sluiten',
        r'Menu\s+Fermer',
        r'Close\s+Menu',
        
        # Search functionality
        r'Zoeken \(Optioneel\).*?Contact',
        r'Rechercher \(En option\).*?Contact',
        r'Search \(Optional\).*?Contact',
        r'Zoeken.*?Zoek',
        r'Rechercher.*?Recherche',
        r'Search.*?Search',
        
        # Bank-specific navigation
        # KBC specific
        r'KBC.*?Inloggen',
        r'KBC.*?Se connecter',
        r'Online Banking.*?KBC',
        
        # BNP Paribas specific
        r'BNP Paribas Fortis.*?Inloggen',
        r'BNP Paribas Fortis.*?Se connecter',
        r'Word klant.*?Beobank Online',
        r'Devenir client.*?Beobank Online',
        
        # Belfius specific
        r'Belfius.*?Inloggen',
        r'Belfius.*?Se connecter',
        r'Belfius Direct Net',
        
        # ING specific
        r'ING.*?Inloggen',
        r'ING.*?Se connecter',
        r'Mijn ING.*?Inloggen',
    ]
    
    # Cookie and privacy notices (comprehensive)
    cookie_patterns = [
        # Cookie acceptance
        r'Accept all cookies.*?Manage cookies',
        r'Accepter tous les cookies.*?G√©rer les cookies',
        r'Alle cookies accepteren.*?Cookies beheren',
        r'Deze website gebruikt cookies.*?Alles accepteren',
        r'Ce site utilise des cookies.*?Tout accepter',
        r'This website uses cookies.*?Accept all',
        
        # Cookie management sections
        r'Cookie settings.*?Save preferences',
        r'Param√®tres des cookies.*?Sauvegarder',
        r'Cookie-instellingen.*?Voorkeuren opslaan',
        r'Mijn cookies beheren.*?Alles accepteren',
        r'G√©rer mes cookies.*?Tout accepter',
        r'Manage my cookies.*?Accept all',
        
        # Cookie descriptions
        r'Functionele cookies.*?verbeteren\.',
        r'Les cookies fonctionnels.*?par des tiers\.',
        r'Functional cookies.*?third parties\.',
        r'Analytische cookies.*?voorkeuren zijn\.',
        r'Les cookies de mesure.*?leurs pr√©f√©rences\.',
        r'Analytics cookies.*?their preferences\.',
        r'Marketing cookies.*?te tonen\.',
        r'Les cookies publicitaires.*?pertinentes\.',
        r'Marketing cookies.*?relevant\.',
        
        # Privacy policy links
        r'Privacy policy.*?Terms',
        r'Politique de confidentialit√©.*?Conditions',
        r'Privacybeleid.*?Voorwaarden',
    ]
    
    # Technical and browser notices
    technical_patterns = [
        r'Voor een betere surfervaring.*?Chrome',
        r'Pour une meilleure exp√©rience.*?Chrome\.',
        r'For a better browsing experience.*?Chrome',
        r'Adblock detection:.*?Sluiten',
        r'Adblock detection:.*?Fermer',
        r'Adblock detection:.*?Close',
        r'You have not yet given permission.*?Load video',
        r'JavaScript is disabled.*?Enable JavaScript',
        r'Loading\.\.\.',
        r'Laden\.\.\.',
        r'Chargement\.\.\.',
    ]
    
    # Footer and related content
    footer_patterns = [
        # Related articles
        r'Other articles that might interest you.*',
        r'Autres articles qui pourraient vous int√©resser.*',
        r'Andere artikels die u kunnen interesseren.*',
        r'Gerelateerde concepten.*?Lees meer',
        r'Termes li√©s.*?Lire la suite',
        r'Related terms.*?Read more',
        
        # Newsletter and blog subscriptions
        r'Ontdek de.*?blog.*?Fran√ßais',
        r'D√©couvrir le blog.*?Nederlands',
        r'Discover the.*?blog.*?Dutch',
        r'Schrijf u in op onze nieuwsbrief.*?Inschrijven',
        r'Inscrivez-vous √† notre newsletter.*?S\'inscrire',
        r'Subscribe to our newsletter.*?Subscribe',
        
        # Legal and compliance
        r'Terms and conditions.*?Privacy',
        r'Termes et conditions.*?Confidentialit√©',
        r'Algemene voorwaarden.*?Privacy',
        r'Disclaimer.*?Copyright',
        r'Avertissement.*?Droits d\'auteur',
        r'Vrijwaring.*?Auteursrecht',
        
        # Copyright notices
        r'¬©.*?\d{4}.*?(KBC|BNP|Belfius|ING)',
        r'Alle rechten voorbehouden',
        r'Tous droits r√©serv√©s',
        r'All rights reserved',
    ]
    
    # Banking-specific call-to-action patterns
    banking_cta_patterns = [
        # Appointment booking
        r'Maak een afspraak!.*?',
        r'Prenez rendez-vous.*?',
        r'Make an appointment.*?',
        r'Boek een gesprek.*?',
        r'R√©servez un entretien.*?',
        
        # Investment advice
        r'Ontdek ons advies.*?',
        r'Laissez-vous conseiller.*?',
        r'Discover our advice.*?',
        r'Klaar om te beleggen\?.*?Maak een afspraak!',
        r'Pr√™t\(e\) √† investir\?.*?Prenez rendez-vous',
        r'Ready to invest\?.*?Make an appointment',
        
        # Product promotions
        r'Ontdek onze.*?producten',
        r'D√©couvrez nos.*?produits',
        r'Discover our.*?products',
        r'Meer informatie.*?aanvragen',
        r'Plus d\'informations.*?demander',
        r'More information.*?request',
    ]
    
    # Social media and sharing
    social_patterns = [
        r'Share on.*?Facebook',
        r'Partager sur.*?Facebook',
        r'Delen op.*?Facebook',
        r'Tweet.*?Twitter',
        r'Tweeter.*?Twitter',
        r'LinkedIn.*?delen',
        r'LinkedIn.*?partager',
        r'LinkedIn.*?share',
        r'WhatsApp.*?delen',
        r'WhatsApp.*?partager',
        r'WhatsApp.*?share',
        r'E-mail.*?versturen',
        r'E-mail.*?envoyer',
        r'E-mail.*?send',
        r'Print this page',
        r'Imprimez cette page',
        r'Print deze pagina',
        r'Download PDF',
        r'T√©l√©charger PDF',
        r'PDF downloaden',
    ]
    
    # Breadcrumb and metadata
    metadata_patterns = [
        r'Home\s*‚Ä∫.*?‚Ä∫',
        r'Accueil\s*‚Ä∫.*?‚Ä∫',
        r'Thuis\s*‚Ä∫.*?‚Ä∫',
        r'Last updated:.*?\d{4}',
        r'Derni√®re mise √† jour:.*?\d{4}',
        r'Laatst bijgewerkt:.*?\d{4}',
        r'Posted on.*?\d{4}',
        r'Publi√© le.*?\d{4}',
        r'Geplaatst op.*?\d{4}',
        r'Tags:.*?(?=\n|\.|$)',
        r'√âtiquettes:.*?(?=\n|\.|$)',
        r'Labels:.*?(?=\n|\.|$)',
        r'\d+\s+min read',
        r'\d+\s+min de lecture',
        r'\d+\s+min lezen',
    ]
    
    # Combine all patterns
    all_patterns = (navigation_patterns + cookie_patterns + technical_patterns + 
                   footer_patterns + banking_cta_patterns + social_patterns + metadata_patterns)
    
    # Apply cleaning
    cleaned = raw_text
    for pattern in all_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE | re.DOTALL)
    
    # Remove repeated "Read more" links in all languages
    read_more_patterns = [
        r'Lees meer\s*',
        r'Lire la suite\s*',
        r'Read more\s*',
        r'Meer lezen\s*',
        r'En savoir plus\s*',
        r'Learn more\s*'
    ]
    
    for pattern in read_more_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    
    # Remove investment sidebar content
    cleaned = re.sub(r'Beleggen in \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Investir dans \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Investing in \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    
    # Remove standalone navigation and form words
    standalone_words = [
        'Contact', 'Zoeken', 'Rechercher', 'Search',
        'Email adres', 'Adresse email', 'Email address',
        'Inschrijven', 'S\'inscrire', 'Subscribe',
        'Versturen', 'Envoyer', 'Send',
        'Annuleren', 'Annuler', 'Cancel',
        'Bevestigen', 'Confirmer', 'Confirm'
    ]
    
    for word in standalone_words:
        cleaned = re.sub(rf'\b{re.escape(word)}\b', '', cleaned, flags=re.IGNORECASE)
    
    # Clean up formatting issues
    # Remove excessive punctuation
    cleaned = re.sub(r'[.]{2,}', '.', cleaned)
    cleaned = re.sub(r'[-]{3,}', '', cleaned)
    cleaned = re.sub(r'[_]{3,}', '', cleaned)
    cleaned = re.sub(r'(\b\w+\b)(\s+\1){2,}', r'\1', cleaned)  # Remove repeated words
    
    # Fix spacing around punctuation
    cleaned = re.sub(r'\s+([.,!?;:])', r'\1', cleaned)
    cleaned = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1 \2', cleaned)
    
    # Remove empty brackets and parentheses
    cleaned = re.sub(r'\(\s*\)', '', cleaned)
    cleaned = re.sub(r'\[\s*\]', '', cleaned)
    cleaned = re.sub(r'\{\s*\}', '', cleaned)
    
    # Normalize whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'\n\s*\n', '\n\n', cleaned)
    
    return cleaned.strip()

# ---------------------- Gemini Scoring ----------------------
def score_page_with_gemini(text, page_type):
    prompt = f"""
**Context:** This prompt is designed for the Gemini language model to evaluate the CEFR B2 level compliance of webpage content from retail banking websites for regulatory compliance. The evaluation focuses on vocabulary, grammar, clarity, and coherence to determine if the text is easily understandable for someone at a B2 level in English, French, Dutch or German. The desired output includes the compliance level percentage and individual scores for vocabulary complexity, grammatical structures, overall clarity, and coherence, with a detailed rationale for each evaluated address presented in a single cell of an output file (e.g., CSV or Excel). The goal is to ensure the evaluation effectively differentiates between webpages with varying levels of B2 compliance, leading to a wider range of scores, and that the rationale is comprehensive yet concise enough to fit within a single cell per address. **It is important to consider that these are banking websites, and some technical or financial terms may be inherent to the content.**

**Task:** Assess the CEFR B2 compliance level of the provided webpage content, ensuring a variable range of scores and a detailed, single-cell rationale for each evaluated address, **while acknowledging the potential presence of necessary banking terminology.**

**Instructions:**

1. **Identify Language:** Determine if the input text is in English, French, Dutch or German.

2. **Evaluate B2 Compliance with Granularity (Considering Banking Terms):** Analyze the text against the CEFR B2 criteria for the identified language, critically and precisely assessing the following aspects on a scale of 0 to 10. Avoid assigning only 0 or 10; use the full scale based on nuance and subtlety. ‚ÄúDo not hesitate to assign low (0‚Äì4) or high (8‚Äì10) scores when the text clearly deserves it. Avoid accumulating around 6‚Äì7 unless the text is truly average.‚Äù Remember the compliancy threshold is 70% (7/10) for B2 level. Therefore if a text is generally compliant it should receive a total score of higher than or equal to 70.

- **Vocabulary Complexity (0‚Äì10)**
  - 10 ‚Üí very simple, common words, basic banking terms, no jargon
  - 7‚Äì9 ‚Üí mostly common words, occasional technical terms explained
  - 4‚Äì6 ‚Üí mix of general and technical terms, some unnecessarily complex or rare words
  - 1‚Äì3 ‚Üí frequent use of complex, low-frequency words or jargon, often unexplained
  - 0 ‚Üí highly complex, dense language with rare or unexplained terms everywhere

- **Grammatical Structures (0‚Äì10)**
  - 10 ‚Üí simple sentences, clear structure, active voice, no complex clauses
  - 7‚Äì9 ‚Üí mostly simple, some moderate clauses, minor passive use
  - 4‚Äì6 ‚Üí mix of simple and complex sentences, occasional embedded or passive forms
  - 1‚Äì3 ‚Üí mostly long, embedded, or passive structures, hard to follow
  - 0 ‚Üí extremely complex grammar, frequent embedding, difficult to parse

- **Overall Clarity (0‚Äì10)**
  - 10 ‚Üí very clear, easy to understand, minimal effort required
  - 7‚Äì9 ‚Üí mostly clear, small moments of complexity
  - 4‚Äì6 ‚Üí mixed clarity, occasional confusion or ambiguity
  - 1‚Äì3 ‚Üí often unclear, requires effort to interpret
  - 0 ‚Üí very unclear, confusing, hard to follow

- **Coherence (0‚Äì10)**
  - 10 ‚Üí logical flow, clear organization, excellent connectors
  - 7‚Äì9 ‚Üí mostly logical, some jumps, minor missing links
  - 4‚Äì6 ‚Üí mixed coherence, weak transitions, partial disorganization
  - 1‚Äì3 ‚Üí often disorganized, unclear connections
  - 0 ‚Üí no logical order, chaotic, fragmented

3. **Provide Detailed Rationale (Single Cell):** Explain the reasoning behind each of the four scores within a single text string suitable for one Excel cell. Explicitly point out specific linguistic features (vocabulary, grammar, discourse markers) that contribute to the assigned level of complexity or simplicity for each criterion. When discussing vocabulary, specifically comment on the presence and handling of banking terminology. Justify why the text is or is not strictly at the B2 level for each aspect. Use clear separators (e.g., "; ") between the rationale for each criterion to ensure readability within the single cell.
```xml
<rationale>Vocabulary: [Explanation with examples, noting banking terms]; Grammar: [Explanation with examples]; Clarity: [Explanation with examples, considering banking terms]; Coherence: [Explanation with examples]</rationale>

**Output Format:**
Return the evaluation in the following XML format, ensuring all information for a single evaluated webpage address can be represented as a single row in an output file:
```xml
<vocabulary_complexity>Y</vocabulary_complexity>
<grammatical_structures>Z</grammatical_structures>
<overall_clarity>W</overall_clarity>
<coherence>V</coherence>
<rationale>Vocabulary: [Explanation with examples, noting banking terms], Grammar: [Explanation with examples], Clarity: [Explanation with examples, considering banking terms], Coherence: [Explanation with examples]</rationale>

Examples of B2 Compliant Texts and C1 Texts Which Are Not B2 Compliant
 To help you understand the evaluation criteria, here are some examples of texts rated at B2 and C1 levels:

English
B2 level text
 Source: LinguaPress Unsolved mysteries ‚Äì a short story by Sarah Wollbach
 Megan‚Äôs acting career began one morning a couple of years ago, when a woman approached her in the parking lot of her neighborhood grocery store. ‚ÄúExcuse me,‚Äù she said, ‚Äúbut have you ever taken acting lessons?‚Äù ‚Äî ‚ÄúNo,‚Äù she answered hesitantly. 
 The woman reached into her pocket and handed Megan a card. ‚ÄúI‚Äôm a casting director for Unsolved Mysteries,‚Äù she said, shaking her hand. Megan had always been stage-struck. 
 For years she'd fantasized about being an actor, sure that deep within her lurked a brilliant chameleon like Meryl Streep or Julia Roberts. Maybe this was her big break. 
 ‚ÄúThe show‚Äôs doing a feature about a woman who was kidnapped,‚Äù the lady continued, ‚Äúand you look exactly like her. The resemblance is amazing. Would you be interested in auditioning?‚Äù 
 The episode aired the next week, with a couple of thousand dollars for two days‚Äô work, plus travel, lodging, and food expenses.


C1 level text
 Source: LinguaPress The Enigma of the Missing Manuscript by John Doe
 The mystery of the missing manuscript has eluded generations of writers. It was said to contain the final, unpublished works and annotations of the author, whose sudden disappearance 
 had only added to the intrigue. The manuscript was believed to be hidden somewhere in the old mansion, a labyrinthine structure filled with secret passages and hidden rooms. 
 Many had tried to find it, but all had failed. The clues were cryptic, the dangers real, and the stakes high. For those who dared to search, it was a journey into the unknown, a test of wit and courage.



French
B2 level text
 Source: LinguaPress Myst√®res non r√©solus ‚Äì une histoire courte par Sarah Wollbach
 La carri√®re d‚Äôactrice de Megan a commenc√© un matin il y a quelques ann√©es, lorsqu‚Äôune femme l‚Äôa abord√©e dans le parking de son √©picerie de quartier. 
 ‚ÄúExcusez-moi,‚Äù dit-elle, ‚Äúmais avez-vous d√©j√† pris des cours de th√©√¢tre?‚Äù ‚Äî ‚ÄúNon,‚Äù r√©pondit-elle avec h√©sitation. La femme a fouill√© dans sa poche et tendu une carte √† Megan. 
 ‚ÄúJe suis directrice de casting de Myst√®res non r√©solus,‚Äù dit-elle en lui serrant la main. Megan avait toujours √©t√© fascin√©e par la sc√®ne. 
 Pendant des ann√©es, elle avait nourri en secret le r√™ve d‚Äô√™tre actrice, convaincue qu‚Äôau fond d‚Äôelle-m√™me se cachait un brillant cam√©l√©on comme Meryl Streep ou Julia Roberts. 
 Peut-√™tre que c‚Äô√©tait sa grande chance. ‚ÄúL‚Äô√©mission fait un reportage sur une femme qui a √©t√© kidnapp√©e,‚Äù continua la dame, ‚Äúet vous lui ressemblez exactement. 
 La ressemblance est incroyable. Seriez-vous int√©ress√©e par une audition?‚Äù Elle expliqua que le r√¥le valait quelques milliers de dollars pour deux jours de travail, plus les frais de voyage, de logement et de nourriture.


C1 level text
 Source: LinguaPress L‚ÄôEnigme du Manuscrit Disparu par Jean Dupont
 Le myst√®re du manuscrit disparu que tout le monde tentait de percer depuis des d√©cennies. On disait qu‚Äôil contenait les derni√®res ≈ìuvres finales, non publi√©es, d‚Äôun auteur renomm√©, 
 dont la disparition soudaine n‚Äôavait fait qu‚Äôajouter √† l‚Äôintrigue. On croyait que le manuscrit √©tait cach√© quelque part dans le vieux manoir, une structure labyrinthique remplie de passages secrets et de pi√®ces cach√©es. 
 Beaucoup avaient essay√© de le trouver, mais tous avaient √©chou√©. Les indices √©taient cryptiques, les dangers r√©els, et les enjeux √©lev√©s. Pour ceux qui osaient chercher, c‚Äô√©tait un voyage dans l‚Äôinconnu, un test d‚Äôesprit et de courage.



Dutch
B2 level text
 Source: LinguaPress Opgeloste mysteries ‚Äì een kortverhaal door Sarah Wollbach
 Megan‚Äôs acteercarri√®re begon op een ochtend een paar jaar geleden, toen een vrouw haar benaderde op de parkeerplaats van haar buurtwinkel. ‚ÄúExcuseer me,‚Äù zei ze, ‚Äúmaar heb je ooit acteerlessen gevolgd?‚Äù ‚Äî ‚ÄúNee,‚Äù antwoordde ze aarzelend. 
 De vrouw stak haar hand in haar zak en gaf Megan een kaartje. ‚ÄúIk ben een castingdirecteur voor Opgeloste mysteries,‚Äù zei ze, terwijl ze haar hand schudde. Megan was altijd al gefascineerd door het toneel. 
 Jarenlang had ze gefantaseerd over het zijn van een actrice, ervan overtuigd dat diep vanbinnen een briljante actrice zoals Meryl Streep of Julia Roberts schuilde. Misschien was dit haar grote doorbraak. 
 ‚ÄúDe show doet een reportage over een vrouw die ontvoerd is,‚Äù vervolgde de dame, ‚Äúen je lijkt precies op haar. De gelijkenis is verbazingwekkend. Zou je ge√Ønteresseerd zijn in een auditie?‚Äù 
 Ze zette uit dat dit alles een paar duizend dollar waard was voor twee dagen werk, plus reis-, verblijf- en voedselkosten.


C1 level text
 Source: LinguaPress Het Raadsel van het Verdwenen Manuscript door Jan Jansen
 Het mysterie van het verdwenen manuscript dat generaties schrijvers decennialang verbijsterd. Er werd gezegd dat het de laatste, ongepubliceerde werken van een beroemde auteur bevatte, wiens plotselinge verdwijning alleen maar bijdroeg aan de intrige. 
 Het gerucht deed de ronde dat het manuscript ergens in het oude herenhuis verborgen was, een labyrintische structuur vol geheime gangen en verborgen kamers. Velen hadden geprobeerd het te vinden, maar allemaal waren ze mislukt. 
 De aanwijzingen waren cryptisch, de gevaren echt, en de inzet hoog. Voor degenen die durfden te zoeken, was het een reis in het onbekende, een test van verstand en moed.


Input Text content to check: \"\"\"{text}\"\"\" 
"""
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")
        response = model.generate_content(prompt, generation_config={"temperature": 0.2})
        output = response.text.strip()

        if output.startswith("```"):
            output = output.strip("` \n").replace("xml", "").strip()

        scores = {
            "vocabulary_complexity": extract_xml_score(output, "vocabulary_complexity"),
            "grammatical_structures": extract_xml_score(output, "grammatical_structures"),
            "overall_clarity": extract_xml_score(output, "overall_clarity"),
            "coherence": extract_xml_score(output, "coherence"),
            "rationale": extract_xml_rationale(output),
        }
        return scores

    except Exception as e:
        return {
            "vocabulary_complexity": 0,
            "grammatical_structures": 0,
            "overall_clarity": 0,
            "coherence": 0,
            "rationale": f"Error: {str(e)}",
        }

def extract_xml_score(xml_text, tag):
    match = re.search(fr"<{tag}>(\d+)</{tag}>", xml_text)
    return int(match.group(1)) if match else 0

def extract_xml_rationale(xml_text):
    match = re.search(r"<rationale>(.*?)</rationale>", xml_text, re.DOTALL)
    return match.group(1).strip() if match else "No rationale found."

# ---------------------- Processing Function ----------------------
def process_single_url(url_data):
    """Process a single URL - designed for parallel execution"""
    url, page_type = url_data
    
    try:
        # Extract and clean text
        text = extract_clean_text_kbc(url)
        if not text.strip():
            return create_error_result(url, page_type, "No text extracted")
        
        cleaned_text = clean_kbc_text(text)
        
        # Score with Gemini
        result = score_page_with_gemini(cleaned_text, page_type)
        
        sub_scores = [
            result.get("vocabulary_complexity", 0),
            result.get("grammatical_structures", 0),
            result.get("overall_clarity", 0),
            result.get("coherence", 0),
        ]
        
        compliance_value = round(sum(sub_scores) / 4 * 10) if all(
            isinstance(score, int) and 0 <= score <= 10 for score in sub_scores
        ) else 0
        
        return {
            "URL": url,
            "Page Type": page_type,
            "Compliance Level": compliance_value,
            "Vocabulary Complexity": result.get("vocabulary_complexity"),
            "Grammatical Structures": result.get("grammatical_structures"),
            "Overall Clarity": result.get("overall_clarity"),
            "Coherence": result.get("coherence"),
            "Rationale": result.get("rationale"),
        }
        
    except Exception as e:
        return create_error_result(url, page_type, str(e))

def create_error_result(url, page_type, error_msg):
    """Create error result dictionary"""
    return {
        "URL": url,
        "Page Type": page_type,
        "Compliance Level": 0,
        "Vocabulary Complexity": 0,
        "Grammatical Structures": 0,
        "Overall Clarity": 0,
        "Coherence": 0,
        "Rationale": f"Error: {error_msg}",
    }

# ---------------------- Main Analysis ----------------------
def analyze_kbc_b2_compliance(excel_file_path, max_workers=4, batch_size=20):
    """Optimized main function with parallel processing"""
    
    print("üöÄ KBC B2 Compliance Analysis (Optimized)")
    print("="*60)
    
    # Load URLs from Excel
    try:
        df_urls = pd.read_excel(excel_file_path)
        url_columns = ['Address', 'URL', 'url', 'address', 'link', 'Link']
        url_column = None
        for col in url_columns:
            if col in df_urls.columns:
                url_column = col
                break
        
        if url_column is None:
            url_column = df_urls.columns[0]
        
        urls = df_urls[url_column].dropna().tolist()
        print(f"‚úÖ Loaded {len(urls)} URLs from {url_column} column")
        
    except Exception as e:
        print(f"‚ùå Error loading Excel file: {e}")
        return
    
    # Create DataFrame and classify URLs
    df = pd.DataFrame({'Address': urls})
    df["Page Type"] = df["Address"].apply(classify_kbc_url)
    
    print(f"\nüìä URL Distribution:")
    print(df['Page Type'].value_counts())
    
    # Setup files
    checkpoint_file = "KBC/kbc_b2_checkpoint.csv"
    log_file = "KBC/kbc_b2_log.txt"
    
    # Resume from checkpoint
    processed_urls = set()
    scores = []
    
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            processed_urls = set(line.strip() for line in f.readlines())
    
    if os.path.exists(checkpoint_file):
        scores_df = pd.read_csv(checkpoint_file)
        scores = scores_df.to_dict(orient='records')
        print(f"üìÇ Resuming from checkpoint with {len(scores)} existing scores")
    
    # Filter unprocessed URLs
    unprocessed_data = [
        (url, page_type) for url, page_type in zip(df['Address'], df['Page Type'])
        if url not in processed_urls
    ]
    
    if not unprocessed_data:
        print("‚úÖ All URLs already processed!")
        return pd.DataFrame(scores)
    
    print(f"üîÑ Processing {len(unprocessed_data)} remaining URLs with {max_workers} workers")
    
    # Process in batches with parallel execution
    for i in range(0, len(unprocessed_data), batch_size):
        batch = unprocessed_data[i:i + batch_size]
        batch_results = []
        
        print(f"\nüì¶ Processing batch {i//batch_size + 1}/{(len(unprocessed_data)-1)//batch_size + 1}")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks in the batch
            future_to_url = {
                executor.submit(process_single_url, url_data): url_data[0] 
                for url_data in batch
            }
            
            # Collect results with progress bar
            for future in tqdm(as_completed(future_to_url), total=len(batch), desc="ü§ñ Scoring"):
                url = future_to_url[future]
                try:
                    result = future.result(timeout=60)  # 60 second timeout per URL
                    batch_results.append(result)
                    
                    # Log processed URL
                    with open(log_file, 'a') as f:
                        f.write(url + '\n')
                        
                except Exception as e:
                    print(f"‚ùå Failed to process {url}: {e}")
                    # Add error result
                    batch_results.append(create_error_result(url, "Unknown", str(e)))
        
        # Add batch results to main scores
        scores.extend(batch_results)
        
        # Save checkpoint after each batch
        pd.DataFrame(scores).to_csv(checkpoint_file, index=False)
        print(f"üíæ Checkpoint saved: {len(scores)} total items processed")
        
        # Cleanup drivers for this batch
        for _ in range(max_workers):
            try:
                cleanup_driver()
            except:
                pass
        
        # Brief pause between batches
        time.sleep(2)
    
    # Final save and cleanup
    final_df = pd.DataFrame(scores)
    final_df.to_excel("KBC/kbc_b2_final_results.xlsx", index=False)
    
    # Summary
    if len(final_df) > 0:
        overall_score = final_df["Compliance Level"].mean()
        print(f"\nüåê Overall CEFR B2 Accessibility Score: {overall_score:.2f}%")
        
        typology_avg = final_df.groupby("Page Type")["Compliance Level"].mean()
        for typ, score in typology_avg.items():
            status = "‚úÖ" if score >= 70 else "‚ö†Ô∏è"
            print(f"{status} {typ} pages: {score:.2f}% average compliance")
    
    print(f"\n‚úÖ Analysis complete! Results saved to: kbc_b2_final_results.xlsx")
    return final_df

# ---------------------- EXECUTION ----------------------
if __name__ == "__main__":
    excel_file_path = "KBC/kbc_urls.xlsx"
    # Adjust parameters for your system:
    # max_workers: Number of parallel threads (4-8 recommended)
    # batch_size: URLs processed before checkpoint (20-50 recommended)
    analyze_kbc_b2_compliance(excel_file_path, max_workers=6, batch_size=30)

üöÄ KBC B2 Compliance Analysis (Optimized)
‚úÖ Loaded 3109 URLs from Address column

üìä URL Distribution:
Page Type
Product    1615
Other      1349
Legal        58
Contact      30
FAQ          30
Blog         27
Name: count, dtype: int64
üìÇ Resuming from checkpoint with 1470 existing scores
üîÑ Processing 1593 remaining URLs with 6 workers

üì¶ Processing batch 1/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:11<00:00,  2.38s/it]


üíæ Checkpoint saved: 1500 total items processed

üì¶ Processing batch 2/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:57<00:00,  1.91s/it]


üíæ Checkpoint saved: 1530 total items processed

üì¶ Processing batch 3/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:10<00:00,  2.34s/it]


üíæ Checkpoint saved: 1560 total items processed

üì¶ Processing batch 4/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:29<00:00,  2.99s/it]


üíæ Checkpoint saved: 1590 total items processed

üì¶ Processing batch 5/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:56<00:00,  1.89s/it]


üíæ Checkpoint saved: 1620 total items processed

üì¶ Processing batch 6/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:59<00:00,  2.00s/it]


üíæ Checkpoint saved: 1650 total items processed

üì¶ Processing batch 7/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:04<00:00,  2.14s/it]


üíæ Checkpoint saved: 1680 total items processed

üì¶ Processing batch 8/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:04<00:00,  2.14s/it]


üíæ Checkpoint saved: 1710 total items processed

üì¶ Processing batch 9/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:01<00:00,  2.04s/it]


üíæ Checkpoint saved: 1740 total items processed

üì¶ Processing batch 10/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:59<00:00,  1.97s/it]


üíæ Checkpoint saved: 1770 total items processed

üì¶ Processing batch 11/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:56<00:00,  1.89s/it]


üíæ Checkpoint saved: 1800 total items processed

üì¶ Processing batch 12/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.00s/it]


üíæ Checkpoint saved: 1830 total items processed

üì¶ Processing batch 13/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:56<00:00,  1.87s/it]


üíæ Checkpoint saved: 1860 total items processed

üì¶ Processing batch 14/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:01<00:00,  2.06s/it]


üíæ Checkpoint saved: 1890 total items processed

üì¶ Processing batch 15/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.01s/it]


üíæ Checkpoint saved: 1920 total items processed

üì¶ Processing batch 16/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:58<00:00,  1.95s/it]


üíæ Checkpoint saved: 1950 total items processed

üì¶ Processing batch 17/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.01s/it]


üíæ Checkpoint saved: 1980 total items processed

üì¶ Processing batch 18/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:55<00:00,  1.86s/it]


üíæ Checkpoint saved: 2010 total items processed

üì¶ Processing batch 19/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:57<00:00,  1.91s/it]


üíæ Checkpoint saved: 2040 total items processed

üì¶ Processing batch 20/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:03<00:00,  2.10s/it]


üíæ Checkpoint saved: 2070 total items processed

üì¶ Processing batch 21/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:02<00:00,  2.07s/it]


üíæ Checkpoint saved: 2100 total items processed

üì¶ Processing batch 22/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.02s/it]


üíæ Checkpoint saved: 2130 total items processed

üì¶ Processing batch 23/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.03s/it]


üíæ Checkpoint saved: 2160 total items processed

üì¶ Processing batch 24/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:02<00:00,  2.09s/it]


üíæ Checkpoint saved: 2190 total items processed

üì¶ Processing batch 25/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:57<00:00,  1.91s/it]


üíæ Checkpoint saved: 2220 total items processed

üì¶ Processing batch 26/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:04<00:00,  2.15s/it]


üíæ Checkpoint saved: 2250 total items processed

üì¶ Processing batch 27/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.00s/it]


üíæ Checkpoint saved: 2280 total items processed

üì¶ Processing batch 28/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:58<00:00,  1.94s/it]


üíæ Checkpoint saved: 2310 total items processed

üì¶ Processing batch 29/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:59<00:00,  1.99s/it]


üíæ Checkpoint saved: 2340 total items processed

üì¶ Processing batch 30/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:01<00:00,  2.03s/it]


üíæ Checkpoint saved: 2370 total items processed

üì¶ Processing batch 31/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:04<00:00,  2.14s/it]


üíæ Checkpoint saved: 2400 total items processed

üì¶ Processing batch 32/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:04<00:00,  2.16s/it]


üíæ Checkpoint saved: 2430 total items processed

üì¶ Processing batch 33/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:07<00:00,  2.23s/it]


üíæ Checkpoint saved: 2460 total items processed

üì¶ Processing batch 34/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:03<00:00,  2.11s/it]


üíæ Checkpoint saved: 2490 total items processed

üì¶ Processing batch 35/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:04<00:00,  2.15s/it]


üíæ Checkpoint saved: 2520 total items processed

üì¶ Processing batch 36/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:57<00:00,  1.92s/it]


üíæ Checkpoint saved: 2550 total items processed

üì¶ Processing batch 37/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.01s/it]


üíæ Checkpoint saved: 2580 total items processed

üì¶ Processing batch 38/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:55<00:00,  1.85s/it]


üíæ Checkpoint saved: 2610 total items processed

üì¶ Processing batch 39/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.03s/it]


üíæ Checkpoint saved: 2640 total items processed

üì¶ Processing batch 40/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:03<00:00,  2.12s/it]


üíæ Checkpoint saved: 2670 total items processed

üì¶ Processing batch 41/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.01s/it]


üíæ Checkpoint saved: 2700 total items processed

üì¶ Processing batch 42/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:10<00:00,  2.34s/it]


üíæ Checkpoint saved: 2730 total items processed

üì¶ Processing batch 43/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:56<00:00,  1.89s/it]


üíæ Checkpoint saved: 2760 total items processed

üì¶ Processing batch 44/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:55<00:00,  1.87s/it]


üíæ Checkpoint saved: 2790 total items processed

üì¶ Processing batch 45/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:00<00:00,  2.03s/it]


üíæ Checkpoint saved: 2820 total items processed

üì¶ Processing batch 46/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:59<00:00,  1.98s/it]


üíæ Checkpoint saved: 2850 total items processed

üì¶ Processing batch 47/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:57<00:00,  1.92s/it]


üíæ Checkpoint saved: 2880 total items processed

üì¶ Processing batch 48/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:58<00:00,  1.94s/it]


üíæ Checkpoint saved: 2910 total items processed

üì¶ Processing batch 49/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:01<00:00,  2.05s/it]


üíæ Checkpoint saved: 2940 total items processed

üì¶ Processing batch 50/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:02<00:00,  2.08s/it]


üíæ Checkpoint saved: 2970 total items processed

üì¶ Processing batch 51/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:03<00:00,  2.11s/it]


üíæ Checkpoint saved: 3000 total items processed

üì¶ Processing batch 52/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:02<00:00,  2.08s/it]


üíæ Checkpoint saved: 3030 total items processed

üì¶ Processing batch 53/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:11<00:00,  2.40s/it]


üíæ Checkpoint saved: 3060 total items processed

üì¶ Processing batch 54/54


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:11<00:00,  3.72s/it]


üíæ Checkpoint saved: 3063 total items processed

üåê Overall CEFR B2 Accessibility Score: 68.74%
‚ö†Ô∏è Blog pages: 64.15% average compliance
‚úÖ Contact pages: 80.90% average compliance
‚úÖ FAQ pages: 73.38% average compliance
‚úÖ Legal pages: 70.25% average compliance
‚ö†Ô∏è Other pages: 66.80% average compliance
‚úÖ Product pages: 70.06% average compliance

‚úÖ Analysis complete! Results saved to: kbc_b2_final_results.xlsx


<div style="background-color: rgb(2, 124, 18); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    5. Specialized Code for Argenta
</div>

FITTED SCRAPER
- It uses a web driver that acts like a real browser and additional settings to disable blockers.
- I tried selenium, playwrigt and nothing worked except for this one.

In [12]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import time
import json
import os
import re
import google.generativeai as genai
from tqdm import tqdm
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# ---------------------- CONFIG ----------------------
genai.configure(api_key="AIzaSyBzOT2O03scMENbdWouWexYa10v4K4OVPE")  # Replace with your actual API key

# ---------------------- STEP 1: Load Data ----------------------
def load_file(file_path):
    """Load data from CSV or Excel file"""
    if file_path.endswith(".csv"):
        df = pd.read_csv(file_path)
    elif file_path.endswith(".xlsx"):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format")
    return df

# ---------------------- STEP 2: Typology Classification ----------------------
def classify_url(url):
    """Classify URL by page type based on URL patterns"""
    url = url.lower()
    
    if any(term in url for term in ["product", "producten", "produits",  # general
                                  "lenen", "loan", "pret",  # loans
                                  "sparen", "saving", "epargne",  # savings
                                  "rekening", "account", "compte",  # accounts
                                  "beleggen", "investment", "investir",  # investments
                                  "hypotheek", "mortgage", "hypothecaire",  # mortgage
                                  "verzekering", "insurance", "assurance",  # insurance
                                  "kaart", "card", "carte",  # cards
                                  "bankieren", "banking", "banque"]):
        return "Product"
    elif any(term in url for term in ["faq", "support", "help", "hulp", "ondersteuning", "aide", "questions", 
                                     "klantenservice", "clientservice", "contactcenter", "assistance"]):
        return "FAQ"
    elif any(term in url for term in ["legal", "juridisch", "juridique", "voorwaarden", "terms", 
                                    "conditions", "privacy", "beleid", "policy", "cookie", 
                                    "gdpr", "compliance", "disclaimer", "protection", "gegevensbescherming"]):
        return "Legal"
    elif any(term in url for term in ["contact", "locatie", "location", "agences", "branches", "agents", 
                                     "kantoren", "bureaux", "afspraak", "appointment", "form", 
                                     "formulier", "trouver", "bereikbaarheid"]):
        return "Contact"
    elif "blog" in url:
        return "Blog"
    else:
        return "Other"

# ---------------------- STEP 3: Optimized Argenta Content Extractor ----------------------
def extract_clean_text_with_selenium_argenta(url, timeout=10):
    """
    Fast and efficient text extraction for Argenta banking website
    """
    driver = None
    try:
        # Minimal but effective options
        options = Options()
        options.headless = True  # Run headless for speed
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--disable-extensions')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--window-size=1420,1080')
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36')
        
        # Disable images and CSS for faster loading
        prefs = {
            'profile.default_content_setting_values': {
                'images': 2,  # 2 = block images
                'plugins': 2,
                'popups': 2,
                'geolocation': 2,
                'notifications': 2,
                'auto_select_certificate': 2,
                'fullscreen': 2,
                'mouselock': 2,
                'mixed_script': 2,
                'media_stream': 2,
                'media_stream_mic': 2,
                'media_stream_camera': 2,
                'protocol_handlers': 2,
                'ppapi_broker': 2,
                'automatic_downloads': 2,
                'midi_sysex': 2,
                'push_messaging': 2,
                'ssl_cert_decisions': 2,
                'metro_switch_to_desktop': 2,
                'protected_media_identifier': 2,
                'app_banner': 2,
                'site_engagement': 2,
                'durable_storage': 2
            },
            'disk-cache-size': 4096
        }
        options.add_experimental_option('prefs', prefs)
        
        # Use default Chrome driver
        driver = webdriver.Chrome(options=options)
        
        # Mask webdriver flag
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        
        # Navigate with page load strategy
        driver.set_page_load_timeout(timeout)
        driver.get(url)
        
        # Short wait for critical content
        wait = WebDriverWait(driver, 5)
        
        # Handle cookie banner quickly - one attempt only
        try:
            cookie_btn = wait.until(EC.element_to_be_clickable((
                By.XPATH, "//button[contains(text(), 'Accepter tous les cookies') or contains(text(), 'Alle cookies aanvaarden')]"
            )))
            cookie_btn.click()
        except:
            pass  # Continue if cookie banner isn't found or clickable
        
        # Let page render minimally - very short wait
        time.sleep(1.5)
        
        # Parse page source once
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Check if we got a JavaScript warning
        if "ne prend pas en charge JavaScript" in soup.text or "does not support JavaScript" in soup.text:
            # Try an alternative approach if JavaScript warning is detected
            driver.execute_script("window.scrollTo(0, 300);")
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Focused content extraction - only look for important content areas
        content_elements = []
        
        # Try to find main content containers
        main_content = soup.select_one('main, #content, article, .container, .content')
        
        if main_content:
            # If a main container is found, use that
            content_elements = main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])
        else:
            # Fall back to all body paragraphs and headers
            content_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])
        
        # Extract and clean text
        text = ' '.join([elem.get_text(strip=True) for elem in content_elements if elem.get_text(strip=True)])
        
        # If we got very little text, try a simpler approach
        if len(text) < 100:
            # Remove just scripts and styles
            for tag in soup(['script', 'style']):
                tag.decompose()
            text = soup.get_text(separator=' ', strip=True)
            text = ' '.join(text.split())
        
        return text[:4000]  # Limit to 4000 chars as requested in the original
        
    except Exception as e:
        print(f"Error: {str(e)}")
        # If we have driver contents, try to extract something
        if driver:
            try:
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                return soup.get_text(separator=' ', strip=True)[:4000]
            except:
                pass
        return ""
        
    finally:
        # Always close the driver
        if driver:
            driver.quit()

# General extraction function that calls Argenta-specific function for Argenta URLs
def extract_clean_text_with_selenium(url):
    """Extract text based on URL domain"""
    if "argenta.be" in url:
        return extract_clean_text_with_selenium_argenta(url)
    else:
        # Call the appropriate scraper for other banks
        # For now, defaulting to Argenta scraper as a fallback
        return extract_clean_text_with_selenium_argenta(url)

# ---------------------- STEP 4: Text Cleaning ----------------------
def clean_extracted_text(text):
    """Clean extracted text for better analysis"""
    if not text:
        return ""
        
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove control characters
    text = ''.join(c for c in text if c.isprintable() or c.isspace())
    
    # Remove any script remnants
    text = re.sub(r'<script.*?</script>', '', text, flags=re.DOTALL)
    
    # Remove any style remnants  
    text = re.sub(r'<style.*?</style>', '', text, flags=re.DOTALL)
    
    # Normalize spaces again
    text = ' '.join(text.split())
    
    return text

# ---------------------- STEP 5: Score with Gemini ----------------------
def score_page_with_gemini(text, page_type):
    """Use Gemini AI to score the page for CEFR B2 compliance"""
    prompt = f"""
**Context:** This prompt is designed for the Gemini language model to evaluate the CEFR B2 level compliance of webpage content from retail banking websites for regulatory compliance. The evaluation focuses on vocabulary, grammar, clarity, and coherence to determine if the text is easily understandable for someone at a B2 level in English, French, or Dutch. The desired output includes the compliance level percentage and individual scores for vocabulary complexity, grammatical structures, overall clarity, and coherence, with a detailed rationale for each evaluated address presented in a single cell of an output file (e.g., CSV or Excel). The goal is to ensure the evaluation effectively differentiates between webpages with varying levels of B2 compliance, leading to a wider range of scores, and that the rationale is comprehensive yet concise enough to fit within a single cell per address. **It is important to consider that these are banking websites, and some technical or financial terms may be inherent to the content.**

**Task:** Assess the CEFR B2 compliance level of the provided webpage content, ensuring a variable range of scores and a detailed, single-cell rationale for each evaluated address, **while acknowledging the potential presence of necessary banking terminology.**

**Instructions:**

1. **Identify Language:** Determine if the input text is in English, French, or Dutch.

2. **Evaluate B2 Compliance with Granularity (Considering Banking Terms):** Analyze the text against the CEFR B2 criteria for the identified language, critically and precisely assessing the following aspects on a scale of 0 to 10. Avoid assigning only 0 or 10; use the full scale based on nuance and subtlety. ‚ÄúDo not hesitate to assign low (0‚Äì4) or high (8‚Äì10) scores when the text clearly deserves it. Avoid accumulating around 6‚Äì7 unless the text is truly average.‚Äù Remember the compliancy threshold is 70% (7/10) for B2 level. Therefore if a text is generally compliant it should receive a total score of higher than or equal to 70.

- **Vocabulary Complexity (0‚Äì10)**
  - 10 ‚Üí very simple, common words, basic banking terms, no jargon
  - 7‚Äì9 ‚Üí mostly common words, occasional technical terms explained
  - 4‚Äì6 ‚Üí mix of general and technical terms, some unnecessarily complex or rare words
  - 1‚Äì3 ‚Üí frequent use of complex, low-frequency words or jargon, often unexplained
  - 0 ‚Üí highly complex, dense language with rare or unexplained terms everywhere

- **Grammatical Structures (0‚Äì10)**
  - 10 ‚Üí simple sentences, clear structure, active voice, no complex clauses
  - 7‚Äì9 ‚Üí mostly simple, some moderate clauses, minor passive use
  - 4‚Äì6 ‚Üí mix of simple and complex sentences, occasional embedded or passive forms
  - 1‚Äì3 ‚Üí mostly long, embedded, or passive structures, hard to follow
  - 0 ‚Üí extremely complex grammar, frequent embedding, difficult to parse

- **Overall Clarity (0‚Äì10)**
  - 10 ‚Üí very clear, easy to understand, minimal effort required
  - 7‚Äì9 ‚Üí mostly clear, small moments of complexity
  - 4‚Äì6 ‚Üí mixed clarity, occasional confusion or ambiguity
  - 1‚Äì3 ‚Üí often unclear, requires effort to interpret
  - 0 ‚Üí very unclear, confusing, hard to follow

- **Coherence (0‚Äì10)**
  - 10 ‚Üí logical flow, clear organization, excellent connectors
  - 7‚Äì9 ‚Üí mostly logical, some jumps, minor missing links
  - 4‚Äì6 ‚Üí mixed coherence, weak transitions, partial disorganization
  - 1‚Äì3 ‚Üí often disorganized, unclear connections
  - 0 ‚Üí no logical order, chaotic, fragmented

3. **Provide Detailed Rationale (Single Cell):** Explain the reasoning behind each of the four scores within a single text string suitable for one Excel cell. Explicitly point out specific linguistic features (vocabulary, grammar, discourse markers) that contribute to the assigned level of complexity or simplicity for each criterion. When discussing vocabulary, specifically comment on the presence and handling of banking terminology. Justify why the text is or is not strictly at the B2 level for each aspect. Use clear separators (e.g., "; ") between the rationale for each criterion to ensure readability within the single cell.
```xml
<rationale>Vocabulary: [Explanation with examples, noting banking terms]; Grammar: [Explanation with examples]; Clarity: [Explanation with examples, considering banking terms]; Coherence: [Explanation with examples]</rationale>

**Output Format:**
Return the evaluation in the following XML format, ensuring all information for a single evaluated webpage address can be represented as a single row in an output file:
```xml
<vocabulary_complexity>Y</vocabulary_complexity>
<grammatical_structures>Z</grammatical_structures>
<overall_clarity>W</overall_clarity>
<coherence>V</coherence>
<rationale>Vocabulary: [Explanation with examples, noting banking terms], Grammar: [Explanation with examples], Clarity: [Explanation with examples, considering banking terms], Coherence: [Explanation with examples]</rationale>

Examples of B2 Compliant Texts and C1 Texts Which Are Not B2 Compliant
 To help you understand the evaluation criteria, here are some examples of texts rated at B2 and C1 levels:

English
B2 level text
 Source: LinguaPress Unsolved mysteries ‚Äì a short story by Sarah Wollbach
 Megan‚Äôs acting career began one morning a couple of years ago, when a woman approached her in the parking lot of her neighborhood grocery store. ‚ÄúExcuse me,‚Äù she said, ‚Äúbut have you ever taken acting lessons?‚Äù ‚Äî ‚ÄúNo,‚Äù she answered hesitantly. 
 The woman reached into her pocket and handed Megan a card. ‚ÄúI‚Äôm a casting director for Unsolved Mysteries,‚Äù she said, shaking her hand. Megan had always been stage-struck. 
 For years she'd fantasized about being an actor, sure that deep within her lurked a brilliant chameleon like Meryl Streep or Julia Roberts. Maybe this was her big break. 
 ‚ÄúThe show‚Äôs doing a feature about a woman who was kidnapped,‚Äù the lady continued, ‚Äúand you look exactly like her. The resemblance is amazing. Would you be interested in auditioning?‚Äù 
 The episode aired the next week, with a couple of thousand dollars for two days‚Äô work, plus travel, lodging, and food expenses.


C1 level text
 Source: LinguaPress The Enigma of the Missing Manuscript by John Doe
 The mystery of the missing manuscript has eluded generations of writers. It was said to contain the final, unpublished works and annotations of the author, whose sudden disappearance 
 had only added to the intrigue. The manuscript was believed to be hidden somewhere in the old mansion, a labyrinthine structure filled with secret passages and hidden rooms. 
 Many had tried to find it, but all had failed. The clues were cryptic, the dangers real, and the stakes high. For those who dared to search, it was a journey into the unknown, a test of wit and courage.



French
B2 level text
 Source: LinguaPress Myst√®res non r√©solus ‚Äì une histoire courte par Sarah Wollbach
 La carri√®re d‚Äôactrice de Megan a commenc√© un matin il y a quelques ann√©es, lorsqu‚Äôune femme l‚Äôa abord√©e dans le parking de son √©picerie de quartier. 
 ‚ÄúExcusez-moi,‚Äù dit-elle, ‚Äúmais avez-vous d√©j√† pris des cours de th√©√¢tre?‚Äù ‚Äî ‚ÄúNon,‚Äù r√©pondit-elle avec h√©sitation. La femme a fouill√© dans sa poche et tendu une carte √† Megan. 
 ‚ÄúJe suis directrice de casting de Myst√®res non r√©solus,‚Äù dit-elle en lui serrant la main. Megan avait toujours √©t√© fascin√©e par la sc√®ne. 
 Pendant des ann√©es, elle avait nourri en secret le r√™ve d‚Äô√™tre actrice, convaincue qu‚Äôau fond d‚Äôelle-m√™me se cachait un brillant cam√©l√©on comme Meryl Streep ou Julia Roberts. 
 Peut-√™tre que c‚Äô√©tait sa grande chance. ‚ÄúL‚Äô√©mission fait un reportage sur une femme qui a √©t√© kidnapp√©e,‚Äù continua la dame, ‚Äúet vous lui ressemblez exactement. 
 La ressemblance est incroyable. Seriez-vous int√©ress√©e par une audition?‚Äù Elle expliqua que le r√¥le valait quelques milliers de dollars pour deux jours de travail, plus les frais de voyage, de logement et de nourriture.


C1 level text
 Source: LinguaPress L‚ÄôEnigme du Manuscrit Disparu par Jean Dupont
 Le myst√®re du manuscrit disparu que tout le monde tentait de percer depuis des d√©cennies. On disait qu‚Äôil contenait les derni√®res ≈ìuvres finales, non publi√©es, d‚Äôun auteur renomm√©, 
 dont la disparition soudaine n‚Äôavait fait qu‚Äôajouter √† l‚Äôintrigue. On croyait que le manuscrit √©tait cach√© quelque part dans le vieux manoir, une structure labyrinthique remplie de passages secrets et de pi√®ces cach√©es. 
 Beaucoup avaient essay√© de le trouver, mais tous avaient √©chou√©. Les indices √©taient cryptiques, les dangers r√©els, et les enjeux √©lev√©s. Pour ceux qui osaient chercher, c‚Äô√©tait un voyage dans l‚Äôinconnu, un test d‚Äôesprit et de courage.



Dutch
B2 level text
 Source: LinguaPress Opgeloste mysteries ‚Äì een kortverhaal door Sarah Wollbach
 Megan‚Äôs acteercarri√®re begon op een ochtend een paar jaar geleden, toen een vrouw haar benaderde op de parkeerplaats van haar buurtwinkel. ‚ÄúExcuseer me,‚Äù zei ze, ‚Äúmaar heb je ooit acteerlessen gevolgd?‚Äù ‚Äî ‚ÄúNee,‚Äù antwoordde ze aarzelend. 
 De vrouw stak haar hand in haar zak en gaf Megan een kaartje. ‚ÄúIk ben een castingdirecteur voor Opgeloste mysteries,‚Äù zei ze, terwijl ze haar hand schudde. Megan was altijd al gefascineerd door het toneel. 
 Jarenlang had ze gefantaseerd over het zijn van een actrice, ervan overtuigd dat diep vanbinnen een briljante actrice zoals Meryl Streep of Julia Roberts schuilde. Misschien was dit haar grote doorbraak. 
 ‚ÄúDe show doet een reportage over een vrouw die ontvoerd is,‚Äù vervolgde de dame, ‚Äúen je lijkt precies op haar. De gelijkenis is verbazingwekkend. Zou je ge√Ønteresseerd zijn in een auditie?‚Äù 
 Ze zette uit dat dit alles een paar duizend dollar waard was voor twee dagen werk, plus reis-, verblijf- en voedselkosten.


C1 level text
 Source: LinguaPress Het Raadsel van het Verdwenen Manuscript door Jan Jansen
 Het mysterie van het verdwenen manuscript dat generaties schrijvers decennialang verbijsterd. Er werd gezegd dat het de laatste, ongepubliceerde werken van een beroemde auteur bevatte, wiens plotselinge verdwijning alleen maar bijdroeg aan de intrige. 
 Het gerucht deed de ronde dat het manuscript ergens in het oude herenhuis verborgen was, een labyrintische structuur vol geheime gangen en verborgen kamers. Velen hadden geprobeerd het te vinden, maar allemaal waren ze mislukt. 
 De aanwijzingen waren cryptisch, de gevaren echt, en de inzet hoog. Voor degenen die durfden te zoeken, was het een reis in het onbekende, een test van verstand en moed.


Input Text content to check: \"\"\"{text}\"\"\" 
"""
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")
        response = model.generate_content(
            prompt,
            generation_config={"temperature": 0.2}
        )
        output = response.text.strip()

        if output.startswith("```"):
            output = output.strip("` \n").replace("xml", "").strip()

        # Extract scores using regex from the XML
        scores = {
            "vocabulary_complexity": extract_xml_score(output, "vocabulary_complexity"),
            "grammatical_structures": extract_xml_score(output, "grammatical_structures"),
            "overall_clarity": extract_xml_score(output, "overall_clarity"),
            "coherence": extract_xml_score(output, "coherence"),
            "rationale": extract_xml_rationale(output),
        }

        return scores

    except Exception as e:
        print(f"‚ùå Error scoring page with Gemini: {e}")
        return {
            "vocabulary_complexity": 0,
            "grammatical_structures": 0,
            "overall_clarity": 0,
            "coherence": 0,
            "rationale": f"Error occurred during evaluation: {str(e)}"
        }

def extract_xml_score(xml_text, tag):
    match = re.search(fr"<{tag}>(\d+)</{tag}>", xml_text)
    return int(match.group(1)) if match else 0

def extract_xml_rationale(xml_text):
    match = re.search(r"<rationale>(.*?)</rationale>", xml_text, re.DOTALL)
    return match.group(1).strip() if match else "No rationale found."

# ---------------------- STEP 6: Evaluation + Warning ----------------------
def evaluate_accessibility(df, file_path):
    """Process all URLs and evaluate their CEFR B2 accessibility"""
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    if base_name.endswith("_urls"):
        base_name = base_name.replace("_urls", "")
    scores = []
    checkpoint_file = f"{base_name}_b2_accessibility_checkpoint.xlsx"
    log_file = f"{base_name}_b2_accessibility_log.txt"

    # Check if log exists ‚Üí skip already processed URLs
    processed_urls = set()
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            processed_urls = set(line.strip() for line in f.readlines())

    # Resume from checkpoint if exists
    if os.path.exists(checkpoint_file):
        scores_df = pd.read_excel(checkpoint_file)
        scores = scores_df.to_dict(orient='records')
        print(f"‚úÖ Resuming from checkpoint with {len(scores)} already processed URLs")
    else:
        scores_df = pd.DataFrame()

    CHECKPOINT_EVERY = 20 

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring pages"):
        url = row['Address']

        if url in processed_urls:
            print(f"Skipping already processed URL: {url}")
            continue

        print(f"Scoring: {url}")
        try:
            text = extract_clean_text_with_selenium(url)
            
            if not text or len(text.strip()) < 50:
                print(f"‚ö†Ô∏è Warning: Very little text extracted from {url} - skipping")
                continue
                
            cleaned_text = clean_extracted_text(text)
            result = score_page_with_gemini(cleaned_text, row['Page Type'])

            sub_scores = [
                result.get("vocabulary_complexity", 0),
                result.get("grammatical_structures", 0),
                result.get("overall_clarity", 0),
                result.get("coherence", 0),
            ]
            compliance_value = round(sum(sub_scores) / 4 * 10) if all(isinstance(score, int) and 0 <= score <= 10 for score in sub_scores) else 0

        except Exception as e:
            print(f"Error scoring page {url}: {e}")
            compliance_value = 0
            result = {
                "vocabulary_complexity": 0,
                "grammatical_structures": 0,
                "overall_clarity": 0,
                "coherence": 0,
                "rationale": f"Error occurred during evaluation: {str(e)}",
            }

        scores.append({
            "URL": url,
            "Page Type": row['Page Type'],
            "Compliance Level": compliance_value,
            "Vocabulary Complexity": result.get("vocabulary_complexity"),
            "Grammatical Structures": result.get("grammatical_structures"),
            "Overall Clarity": result.get("overall_clarity"),
            "Coherence": result.get("coherence"),
            "Rationale": result.get("rationale"),
        })

        # Log processed URL
        with open(log_file, 'a') as f:
            f.write(url + '\n')

        # Save checkpoint more frequently for Argenta
        if len(scores) % CHECKPOINT_EVERY == 0:
            checkpoint_df = pd.DataFrame(scores)
            checkpoint_df.to_excel(checkpoint_file, index=False)
            print(f"‚úÖ Checkpoint saved at {len(scores)} items")

        # Add a small delay to avoid overloading servers (and Gemini API)
        time.sleep(1)

    # Final save
    final_df = pd.DataFrame(scores)
    final_df.to_excel(f"{base_name}_b2_accessibility_final.xlsx", index=False)
    print("‚úÖ Final results saved")

    return final_df

# ---------------------- STEP 7: Output & Summary ----------------------
def output_summary(result_df, input_path):
    """Generate summary of the results"""
    # Extract filename without extension (e.g., "belfius_urls" ‚Üí "belfius")
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    if base_name.endswith("_urls"):
        base_name = base_name.replace("_urls", "")

    output_filename = f"{base_name}_b2_accessibility_scores.xlsx"
    
    # Save to current directory instead of hardcoded path
    result_df.to_excel(output_filename, index=False)
    print(f"\n‚úÖ Results saved to {output_filename}\n")

    # Score summaries
    overall_score = result_df["Compliance Level"].mean()
    print(f"üåê Overall CEFR B2 Accessibility Score: {overall_score:.2f}%")

    typology_avg = result_df.groupby("Page Type")["Compliance Level"].mean()
    for typ, score in typology_avg.items():
        if score < 70:
            print(f"‚ö†Ô∏è {typ} pages may require language simplification (avg score: {score:.2f})")
        else:
            print(f"‚úÖ {typ} pages meet accessibility requirements (avg score: {score:.2f})")

# ---------------------- URL Collection Function for Argenta ----------------------
def collect_argenta_urls(base_url="https://www.argenta.be", max_urls=50):
    """Collect Argenta URLs to analyze"""
    print(f"Collecting Argenta URLs from {base_url}")
    
    urls = set()
    visited = set()
    to_visit = [base_url]
    
    # Language variants to include
    langs = ["/fr/", "/nl/"]
    
    # Add some key product pages directly
    seed_urls = [
        "https://www.argenta.be/fr/investir/epargner-pour-votre-avenir.html",
        "https://www.argenta.be/fr/epargner-et-investir/investir/argenta-life-plan.html",
        "https://www.argenta.be/fr/pret-habitation.html",
        "https://www.argenta.be/fr/pret-auto.html",
        "https://www.argenta.be/fr/epargner-et-investir/assurances/assurance-solde-restant-du.html",
        "https://www.argenta.be/nl/lenen/woonkrediet.html",
        "https://www.argenta.be/nl/sparen-en-beleggen/beleggen/argenta-life-plan.html"
    ]
    
    for url in seed_urls:
        if url not in visited:
            to_visit.append(url)
    
    driver = None
    try:
        # Setup headless browser for link collection
        options = Options()
        options.headless = True
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36')
        
        driver = webdriver.Chrome(options=options)
        
        # Timeout for loading pages
        driver.set_page_load_timeout(10)
        
        while to_visit and len(urls) < max_urls:
            current_url = to_visit.pop(0)
            
            if current_url in visited:
                continue
                
            visited.add(current_url)
            
            try:
                print(f"Visiting {current_url} to collect links")
                driver.get(current_url)
                
                # Click cookie banner if exists
                try:
                    cookie_btn = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((
                        By.XPATH, "//button[contains(text(), 'Accepter tous les cookies') or contains(text(), 'Alle cookies aanvaarden')]"
                    )))
                    cookie_btn.click()
                except:
                    pass
                
                # Wait for page to load fully
                time.sleep(1)
                
                # Get all links
                elements = driver.find_elements(By.TAG_NAME, "a")
                page_links = [elem.get_attribute('href') for elem in elements]
                
                # Filter and add valid links
                for link in page_links:
                    if not link:
                        continue
                        
                    if link.startswith(base_url) and any(lang in link for lang in langs):
                        # Skip certain non-content URLs
                        if any(skip in link.lower() for skip in [
                            "javascript:", "#", "tel:", "mailto:", 
                            "/cookie", "/login", "/search", "?", 
                            ".pdf", ".jpg", ".png", ".gif"
                        ]):
                            continue
                            
                        if link not in visited and link not in to_visit:
                            to_visit.append(link)
                
                # Add current URL to collected set if it's valid
                if current_url.startswith(base_url) and any(lang in current_url for lang in langs):
                    urls.add(current_url)
                    print(f"Added URL #{len(urls)}: {current_url}")
                
            except Exception as e:
                print(f"Error collecting links from {current_url}: {e}")
                
            time.sleep(0.5)  # Small delay between requests
            
    except Exception as e:
        print(f"Error during URL collection: {e}")
    finally:
        if driver:
            driver.quit()
    
    print(f"Collected {len(urls)} unique Argenta URLs")
    return list(urls)

# ---------------------- Create Argenta URL file ----------------------
def create_argenta_url_file(max_urls=50, output_file="argenta_urls_output.xlsx"):
    """Create an Excel file with Argenta URLs to analyze"""
    urls = collect_argenta_urls(max_urls=max_urls)
    
    df = pd.DataFrame({
        "Address": urls
    })
    
    df.to_excel(output_file, index=False)
    print(f"‚úÖ Created URL file with {len(urls)} Argenta URLs: {output_file}")
    return output_file

# ---------------------- Main Runner ----------------------
def main(file_path=None, max_urls=50):
    """Main function to run the entire pipeline"""
    # If no file provided, create one with Argenta URLs
    if not file_path:
        file_path = create_argenta_url_file(max_urls=max_urls)
    
    # Load and process the URLs
    df = load_file(file_path)
    
    # Add page type classification
    df["Page Type"] = df["Address"].apply(classify_url)
    
    # Score all URLs
    result_df = evaluate_accessibility(df, file_path)
    
    # Output summary
    output_summary(result_df, file_path)

# Example usage
#if __name__ == "__main__":
    # To create a new Argenta URL file and analyze it (collecting 30 URLs):
    # main(max_urls=30)
    
    # Or to use an existing URL file:
    # main("argenta_urls.xlsx")

In [16]:
main("/Users/furkandemir/Desktop/Sailpeak/Accesibility/Argenta/argenta_urls_nl.xlsx")

‚úÖ Resuming from checkpoint with 660 already processed URLs


Scoring pages:   0%|          | 0/894 [00:00<?, ?it/s]

Skipping already processed URL: https://www.argenta.be/nl.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/rekeninguittreksels.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/argenta-app.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/argenta-app/systeemvereisten.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/argenta-app/argenta-app-updaten.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/argenta-app/argenta-app-installeren.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/argenta-app/demo.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/argenta-app/veiligheid.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/argenta-app/kredieten-in-de-argenta-app.html
Skipping already processed URL: https://www.argenta.be/nl/bankieren/argenta-app/verzekeren-in-de-argenta-app.html
Skipping already processed URL

Scoring pages:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 716/894 [00:32<00:08, 21.93it/s]

Scoring: https://www.argenta.be/nl/kantoren/steven-mertens-bv-9544.html
Scoring: https://www.argenta.be/nl/kantoren/tim-fripon-bv-3394.html


Scoring pages:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 718/894 [01:00<00:17,  9.88it/s]

Scoring: https://www.argenta.be/nl/kantoren/smedts-katleen-bv-3198.html


Scoring pages:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 719/894 [01:14<00:24,  7.20it/s]

Scoring: https://www.argenta.be/nl/kantoren/wouter-danis-bv-3295.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 720/894 [01:27<00:33,  5.23it/s]

Scoring: https://www.argenta.be/nl/kantoren/anl-services-bv-3517.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 721/894 [01:40<00:46,  3.73it/s]

Scoring: https://www.argenta.be/nl/kantoren/trust-in-us-bv-3538.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 722/894 [01:54<01:05,  2.64it/s]

Scoring: https://www.argenta.be/nl/kantoren/kim-lemahieu-bv-3716.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 723/894 [02:08<01:30,  1.89it/s]

Scoring: https://www.argenta.be/nl/kantoren/katia-mertens-bv-3860.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 724/894 [02:22<02:06,  1.35it/s]

Scoring: https://www.argenta.be/nl/kantoren/pedes-bv-3000.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 725/894 [02:36<02:55,  1.04s/it]

Scoring: https://www.argenta.be/nl/kantoren/arsys-bv-2869.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 726/894 [02:50<04:02,  1.44s/it]

Scoring: https://www.argenta.be/nl/kantoren/danberg-bv-3509.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 727/894 [03:07<05:56,  2.14s/it]

Scoring: https://www.argenta.be/nl/kantoren/ellen-veraghtert-bv-3613.html


Scoring pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 728/894 [03:21<07:48,  2.82s/it]

Scoring: https://www.argenta.be/nl/kantoren/meulepas-kathleen-bv-2896.html


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 729/894 [03:36<10:16,  3.74s/it]

Scoring: https://www.argenta.be/nl/kantoren/siegfried-boelanders-bv-3744.html


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 730/894 [03:51<13:15,  4.85s/it]

Scoring: https://www.argenta.be/nl/kantoren/geert-vervaet-bv-3269.html


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 731/894 [04:06<16:15,  5.98s/it]

Scoring: https://www.argenta.be/nl/kantoren/maarten-bellemans-bv-3614.html


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 732/894 [04:19<19:20,  7.16s/it]

Scoring: https://www.argenta.be/nl/kantoren/karolien-vandamme-bv-3605.html


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 733/894 [04:33<22:22,  8.34s/it]

Scoring: https://www.argenta.be/nl/kantoren/laurent-baetsle-bv-3833.html


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 734/894 [04:46<24:35,  9.22s/it]

Scoring: https://www.argenta.be/nl/kantoren/laurent-baetsle-bv-3834.html
‚úÖ Checkpoint saved at 680 items


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 735/894 [04:59<26:37, 10.05s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-van-der-borght-bv-3387.html


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 736/894 [05:13<28:46, 10.93s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-van-der-borght-bv-3687.html


Scoring pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 737/894 [05:26<29:59, 11.46s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-dirk-geysels-bv-2605.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 738/894 [05:40<31:33, 12.14s/it]

Scoring: https://www.argenta.be/nl/kantoren/ilse-blommaert-bv-2591.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 739/894 [05:54<32:48, 12.70s/it]

Scoring: https://www.argenta.be/nl/kantoren/de-loecker-liesbeth-bv-2834.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 740/894 [06:11<36:01, 14.03s/it]

Scoring: https://www.argenta.be/nl/kantoren/ruben-jackers-bv-3179.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 741/894 [06:25<35:42, 14.00s/it]

Scoring: https://www.argenta.be/nl/kantoren/tilburghs-didier-bv-2933.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 742/894 [06:38<34:46, 13.73s/it]

Scoring: https://www.argenta.be/nl/kantoren/raf-nys-bv-3108.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 743/894 [06:53<35:06, 13.95s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3885.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 744/894 [07:10<36:58, 14.79s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-dirk-meeus-bv-3824.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 745/894 [07:23<35:49, 14.43s/it]

Scoring: https://www.argenta.be/nl/kantoren/thiry-sabrine-bv-2631.html


Scoring pages:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 746/894 [07:58<50:22, 20.42s/it]

Scoring: https://www.argenta.be/nl/kantoren/schepers-bv-3448.html


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 747/894 [08:14<46:41, 19.05s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-vlasselaer-bv-3470.html


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 748/894 [08:28<43:07, 17.73s/it]

Scoring: https://www.argenta.be/nl/kantoren/tom-geens-bv-3346.html


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 749/894 [08:46<42:54, 17.75s/it]

Scoring: https://www.argenta.be/nl/kantoren/frank-riviere-bv-3638.html


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 750/894 [09:00<39:47, 16.58s/it]

Scoring: https://www.argenta.be/nl/kantoren/frank-riviere-bv-3865.html


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 751/894 [09:13<37:08, 15.59s/it]

Scoring: https://www.argenta.be/nl/kantoren/appelmans-andy-bv-3624.html


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 752/894 [09:26<35:22, 14.95s/it]

Scoring: https://www.argenta.be/nl/kantoren/hilde-van-hecke-bv-3655.html


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 753/894 [09:40<34:10, 14.54s/it]

Scoring: https://www.argenta.be/nl/kantoren/dominique-rasker-bv-3118.html


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 754/894 [09:53<32:53, 14.09s/it]

Scoring: https://www.argenta.be/nl/kantoren/maxi-invest-bv-9238.html
‚úÖ Checkpoint saved at 700 items


Scoring pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 755/894 [10:11<35:14, 15.21s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-bogaert-bv-3232.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 756/894 [10:25<34:26, 14.98s/it]

Scoring: https://www.argenta.be/nl/kantoren/de-boeck-bv-946.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 757/894 [10:39<33:15, 14.56s/it]

Scoring: https://www.argenta.be/nl/kantoren/fincoaching-bv-2808.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 758/894 [10:53<32:36, 14.38s/it]

Scoring: https://www.argenta.be/nl/kantoren/kumeco-bv-3446.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 759/894 [11:07<32:25, 14.41s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-derdeyn-bv-3105.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 760/894 [11:24<33:25, 14.97s/it]

Scoring: https://www.argenta.be/nl/kantoren/els-de-baerdemaeker-bv-3628.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 761/894 [11:37<32:18, 14.58s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-kenis-bv-3243.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 762/894 [11:52<32:14, 14.65s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-kenis-bv-3033.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 763/894 [12:05<31:06, 14.25s/it]

Scoring: https://www.argenta.be/nl/kantoren/ruben-jackers-bv-3877.html


Scoring pages:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 764/894 [12:19<30:36, 14.13s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3881.html


Scoring pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 765/894 [12:33<29:58, 13.94s/it]

Scoring: https://www.argenta.be/nl/kantoren/vanhove-bv-2924.html


Scoring pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 766/894 [12:47<30:04, 14.09s/it]

Scoring: https://www.argenta.be/nl/kantoren/olivier-guillaume-bv-3873.html


Scoring pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 767/894 [13:01<29:35, 13.98s/it]

Scoring: https://www.argenta.be/nl/kantoren/somers-bv-831.html


Scoring pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 768/894 [13:15<29:10, 13.90s/it]

Scoring: https://www.argenta.be/nl/kantoren/nick-cambre-bv-2236.html


Scoring pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 769/894 [13:28<28:47, 13.82s/it]

Scoring: https://www.argenta.be/nl/kantoren/dirkx-bv-3622.html


Scoring pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 770/894 [13:44<29:41, 14.37s/it]

Skipping already processed URL: https://www.argenta.be/nl/kantoren/stein-echelpoels-bv-3869.html
Scoring: https://www.argenta.be/nl/kantoren/christophe-monsart-bv-3579.html


Scoring pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 772/894 [13:58<22:09, 10.89s/it]

Scoring: https://www.argenta.be/nl/kantoren/heylen--noots-bv-3882.html


Scoring pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 773/894 [14:13<23:58, 11.89s/it]

Scoring: https://www.argenta.be/nl/kantoren/jonathan-samyn-bv-2630.html


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 774/894 [14:28<25:32, 12.77s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-sonja-kinds-bv-3242.html


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 775/894 [14:42<26:20, 13.28s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-graux-bv-844.html
‚úÖ Checkpoint saved at 720 items


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 776/894 [14:57<26:31, 13.49s/it]

Scoring: https://www.argenta.be/nl/kantoren/akin-sariyildiz-bv-3879.html


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 777/894 [15:11<26:42, 13.70s/it]

Scoring: https://www.argenta.be/nl/kantoren/estelle-somja-srl-3886.html


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 778/894 [15:28<28:21, 14.67s/it]

Scoring: https://www.argenta.be/nl/kantoren/arpil-bv-3528.html


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 779/894 [15:42<27:36, 14.41s/it]

Scoring: https://www.argenta.be/nl/kantoren/peter-van-campfort-bv-1890.html


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 780/894 [15:55<26:56, 14.18s/it]

Scoring: https://www.argenta.be/nl/kantoren/koen-van-der-steen-bv-2551.html


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 781/894 [16:10<26:54, 14.29s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-maarten-janssens-bv-3816.html


Scoring pages:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 782/894 [16:25<27:28, 14.72s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-lismont-bv-2264.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 783/894 [16:40<27:01, 14.61s/it]

Scoring: https://www.argenta.be/nl/kantoren/caroline-denturck-bv-3570.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 784/894 [16:54<26:21, 14.38s/it]

Scoring: https://www.argenta.be/nl/kantoren/katia-mertens-bv-3861.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 785/894 [17:08<26:08, 14.39s/it]

Scoring: https://www.argenta.be/nl/kantoren/bert-annick-bv-3085.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 786/894 [17:22<25:39, 14.25s/it]

Scoring: https://www.argenta.be/nl/kantoren/akin-sariyildiz-bv-3664.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 787/894 [17:36<25:18, 14.19s/it]

Scoring: https://www.argenta.be/nl/kantoren/lb-finance-bv-3678.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 788/894 [17:50<24:45, 14.02s/it]

Scoring: https://www.argenta.be/nl/kantoren/lb-finance-bv-3808.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 789/894 [18:04<24:35, 14.05s/it]

Scoring: https://www.argenta.be/nl/kantoren/paulusse-michiels-bv-2659.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 790/894 [18:17<24:04, 13.89s/it]

Scoring: https://www.argenta.be/nl/kantoren/bert-wouters-bv-3552.html


Scoring pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 791/894 [18:31<23:31, 13.70s/it]

Scoring: https://www.argenta.be/nl/kantoren/loua-bv-3268.html


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 792/894 [18:45<23:29, 13.82s/it]

Scoring: https://www.argenta.be/nl/kantoren/nico-herpoel-bv-3758.html


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 793/894 [18:58<23:00, 13.67s/it]

Scoring: https://www.argenta.be/nl/kantoren/alex-decraemer-bv-3890.html


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 794/894 [19:12<22:53, 13.74s/it]

Scoring: https://www.argenta.be/nl/kantoren/alex-decraemer-bv-3891.html


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 795/894 [19:26<22:36, 13.70s/it]

Scoring: https://www.argenta.be/nl/kantoren/caroline-denturck-bv-3887.html
‚úÖ Checkpoint saved at 740 items


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 796/894 [19:39<22:20, 13.68s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-vos-dejaegher-bv-3432.html


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 797/894 [19:54<22:44, 14.06s/it]

Scoring: https://www.argenta.be/nl/kantoren/silvie-vermeiren-bv-3900.html


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 798/894 [20:11<24:00, 15.01s/it]

Scoring: https://www.argenta.be/nl/kantoren/ibrahimovic--vanhaeren-bv-3898.html


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 799/894 [20:25<23:16, 14.70s/it]

Scoring: https://www.argenta.be/nl/kantoren/barbe-borloo-bv-3719.html


Scoring pages:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 800/894 [20:39<22:45, 14.53s/it]

Scoring: https://www.argenta.be/nl/kantoren/barbe-borloo-bv-2277.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 801/894 [20:54<22:23, 14.45s/it]

Scoring: https://www.argenta.be/nl/kantoren/kristiaan-de-belder-bv-1128.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 802/894 [21:07<21:51, 14.26s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3903.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 803/894 [21:21<21:08, 13.94s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3904.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 804/894 [21:34<20:37, 13.75s/it]

Scoring: https://www.argenta.be/nl/kantoren/joel-van-haute-bv-2892.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 805/894 [21:51<21:49, 14.71s/it]

Scoring: https://www.argenta.be/nl/kantoren/elcey-bv-3055.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 806/894 [22:08<22:28, 15.32s/it]

Scoring: https://www.argenta.be/nl/kantoren/vyncke-staelens-bv-3906.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 807/894 [22:21<21:13, 14.64s/it]

Scoring: https://www.argenta.be/nl/kantoren/vyncke-staelens-bv-3907.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 808/894 [22:35<21:00, 14.66s/it]

Scoring: https://www.argenta.be/nl/kantoren/cindy-vanneste-bv-3339.html


Scoring pages:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 809/894 [22:53<21:47, 15.38s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-vanhaecke-bv-2374.html


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 810/894 [23:09<22:08, 15.82s/it]

Scoring: https://www.argenta.be/nl/kantoren/isabel-van-aelst-bv-3880.html


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 811/894 [23:23<21:08, 15.28s/it]

Scoring: https://www.argenta.be/nl/kantoren/isabel-van-aelst-bv-3568.html


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 812/894 [23:37<20:22, 14.91s/it]

Scoring: https://www.argenta.be/nl/kantoren/johan-vancaester-bv-2890.html


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 813/894 [23:55<21:22, 15.84s/it]

Scoring: https://www.argenta.be/nl/kantoren/dierckx-persoons-bv-3920.html


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 814/894 [24:13<22:00, 16.50s/it]

Scoring: https://www.argenta.be/nl/kantoren/dierckx-persoons-bv-3922.html


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 815/894 [24:30<21:32, 16.37s/it]

Scoring: https://www.argenta.be/nl/kantoren/kathleen-van-hunsel-bv-3524.html
‚úÖ Checkpoint saved at 760 items


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 816/894 [24:43<20:10, 15.52s/it]

Scoring: https://www.argenta.be/nl/kantoren/prinsen-bv-3204.html


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 817/894 [24:56<19:05, 14.88s/it]

Scoring: https://www.argenta.be/nl/kantoren/nick-wanzeele-bv-3911.html


Scoring pages:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 818/894 [25:10<18:31, 14.63s/it]

Scoring: https://www.argenta.be/nl/kantoren/nick-wanzeele-bv-3912.html


Scoring pages:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 819/894 [25:27<19:08, 15.31s/it]

Scoring: https://www.argenta.be/nl/kantoren/sergeant--de-reu-bv-3913.html


Scoring pages:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 820/894 [25:45<19:47, 16.05s/it]

Scoring: https://www.argenta.be/nl/kantoren/claeys-katleen-bv-3917.html


Scoring pages:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 821/894 [26:03<20:06, 16.53s/it]

Scoring: https://www.argenta.be/nl/kantoren/alain-tits-bv-3044.html


Scoring pages:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 822/894 [26:16<18:36, 15.51s/it]

Scoring: https://www.argenta.be/nl/kantoren/vercauteren-lambrechts-bv-3091.html


Scoring pages:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 823/894 [26:30<17:41, 14.95s/it]

Scoring: https://www.argenta.be/nl/kantoren/dhondt-tommy-bv-2582.html


Scoring pages:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 824/894 [26:44<17:10, 14.72s/it]

Scoring: https://www.argenta.be/nl/kantoren/leen-tanghe-bv-2779.html


Scoring pages:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 825/894 [26:57<16:27, 14.31s/it]

Scoring: https://www.argenta.be/nl/kantoren/maarten-steurbaut-bv-3915.html


Scoring pages:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 826/894 [27:12<16:16, 14.37s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3931.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 827/894 [27:26<16:06, 14.43s/it]

Scoring: https://www.argenta.be/nl/kantoren/roosen-hermans-bv-3561.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 828/894 [27:41<15:57, 14.51s/it]

Scoring: https://www.argenta.be/nl/kantoren/donatus-finance-bv-3645.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 829/894 [27:55<15:39, 14.46s/it]

Scoring: https://www.argenta.be/nl/kantoren/bruyninckx-oc-bv-3313.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 830/894 [28:13<16:33, 15.53s/it]

Scoring: https://www.argenta.be/nl/kantoren/silvie-daemen-bv-3578.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 831/894 [28:28<16:05, 15.32s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-houwe-bure-bv-3924.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 832/894 [28:43<15:48, 15.31s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-houwe-bure-bv-3926.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 833/894 [28:57<15:06, 14.86s/it]

Scoring: https://www.argenta.be/nl/kantoren/bart-vandenborne-bv-3479.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 834/894 [29:12<14:53, 14.89s/it]

Scoring: https://www.argenta.be/nl/kantoren/evy-vos-bv-3596.html


Scoring pages:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 835/894 [29:26<14:25, 14.66s/it]

Scoring: https://www.argenta.be/nl/kantoren/evy-vos-bv-3901.html
‚úÖ Checkpoint saved at 780 items


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 836/894 [29:40<13:52, 14.35s/it]

Scoring: https://www.argenta.be/nl/kantoren/hans-renty-bv-3909.html


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 837/894 [29:54<13:31, 14.24s/it]

Scoring: https://www.argenta.be/nl/kantoren/stock-lieven-bv-2954.html


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 838/894 [30:08<13:14, 14.19s/it]

Scoring: https://www.argenta.be/nl/kantoren/david-vanbiervliet-bv-3936.html


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 839/894 [30:21<12:47, 13.96s/it]

Scoring: https://www.argenta.be/nl/kantoren/johan-verdonck-bv-2821.html


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 840/894 [30:35<12:34, 13.97s/it]

Scoring: https://www.argenta.be/nl/kantoren/isabelle-hannes-bv-3551.html


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 841/894 [30:50<12:23, 14.02s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-laere-rijckaert-bv-3928.html


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 842/894 [31:06<12:46, 14.73s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-laere-rijckaert-bv-3929.html


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 843/894 [31:21<12:31, 14.74s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3937.html


Scoring pages:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 844/894 [31:35<12:10, 14.62s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3941.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 845/894 [31:49<11:40, 14.29s/it]

Scoring: https://www.argenta.be/nl/kantoren/moens--muylaert-bv-3943.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 846/894 [32:02<11:17, 14.12s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3952.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 847/894 [32:19<11:38, 14.86s/it]

Scoring: https://www.argenta.be/nl/kantoren/wieland-verlinden-bv-3933.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 848/894 [32:32<11:00, 14.36s/it]

Scoring: https://www.argenta.be/nl/kantoren/kantoor-verheecke-bv-3939.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 849/894 [32:45<10:30, 14.01s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3935.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 850/894 [32:59<10:09, 13.84s/it]

Scoring: https://www.argenta.be/nl/kantoren/peter-gelders-bv-2680.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 851/894 [33:13<10:00, 13.96s/it]

Scoring: https://www.argenta.be/nl/kantoren/patrick-de-mecheleer-bv-3368.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 852/894 [33:26<09:37, 13.75s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-johan-de-wil-bv-3696.html


Scoring pages:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 853/894 [33:39<09:12, 13.48s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-johan-de-wil-bv-3938.html


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 854/894 [33:52<08:54, 13.37s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3838.html


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 855/894 [34:06<08:51, 13.64s/it]

Scoring: https://www.argenta.be/nl/kantoren/tim-fripon-bv-3947.html
‚úÖ Checkpoint saved at 800 items


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 856/894 [34:20<08:36, 13.60s/it]

Scoring: https://www.argenta.be/nl/kantoren/tim-fripon-bv-3949.html


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 857/894 [34:34<08:23, 13.61s/it]

Scoring: https://www.argenta.be/nl/kantoren/vermant--dekeersmaeker-bv-3944.html


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 858/894 [34:47<08:10, 13.63s/it]

Scoring: https://www.argenta.be/nl/kantoren/vermant--dekeersmaeker-bv-3945.html


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 859/894 [35:00<07:51, 13.46s/it]

Scoring: https://www.argenta.be/nl/kantoren/zakenkantoor-haesendonck--beynen-bv-3953.html


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 860/894 [35:13<07:32, 13.31s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-camp-apostolou-bv-3871.html


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 861/894 [35:27<07:27, 13.55s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-camp-apostolou-bv-3872.html


Scoring pages:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 862/894 [35:41<07:15, 13.62s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-dijck--bekaert-bv-2799.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 863/894 [35:58<07:30, 14.53s/it]

Scoring: https://www.argenta.be/nl/kantoren/van-dijck--bekaert-bv-3946.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 864/894 [36:12<07:13, 14.46s/it]

Scoring: https://www.argenta.be/nl/kantoren/tom-geens-bv-3940.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 865/894 [36:29<07:21, 15.22s/it]

Scoring: https://www.argenta.be/nl/kantoren/jennes-moonen-bv-3951.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 866/894 [36:43<06:55, 14.83s/it]

Scoring: https://www.argenta.be/nl/kantoren/sergeant--de-reu-bv-3914.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 867/894 [36:57<06:32, 14.53s/it]

Scoring: https://www.argenta.be/nl/kantoren/frederic-dereppe-bv-3709.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 868/894 [37:11<06:13, 14.38s/it]

Scoring: https://www.argenta.be/nl/kantoren/dennis-verheyen-bv-2610.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 869/894 [37:24<05:53, 14.15s/it]

Scoring: https://www.argenta.be/nl/kantoren/olivier-trouillard-bv-3962.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 870/894 [37:38<05:35, 13.99s/it]

Scoring: https://www.argenta.be/nl/kantoren/olivier-trouillard-bv-3963.html


Scoring pages:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 871/894 [37:53<05:24, 14.13s/it]

Scoring: https://www.argenta.be/nl/kantoren/estelle-somja-bv-3960.html


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 872/894 [38:10<05:30, 15.02s/it]

Scoring: https://www.argenta.be/nl/kantoren/kenneth-van-nuffel-bv-3225.html


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 873/894 [38:23<05:05, 14.54s/it]

Scoring: https://www.argenta.be/nl/kantoren/helsen-huveneers-vandeurzen-bv-2722.html


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 874/894 [38:37<04:48, 14.42s/it]

Scoring: https://www.argenta.be/nl/kantoren/helsen-huveneers-vandeurzen-bv-3896.html


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 875/894 [38:50<04:27, 14.08s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3964.html
‚úÖ Checkpoint saved at 820 items


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 876/894 [39:05<04:14, 14.16s/it]

Scoring: https://www.argenta.be/nl/kantoren/vincent-de-vries-bv-3968.html


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 877/894 [39:21<04:11, 14.80s/it]

Scoring: https://www.argenta.be/nl/kantoren/vincent-de-vries-bv-3969.html


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 878/894 [39:36<03:54, 14.69s/it]

Scoring: https://www.argenta.be/nl/kantoren/jan-maes-bv-3428.html


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 879/894 [39:50<03:37, 14.50s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3966.html


Scoring pages:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 880/894 [40:03<03:20, 14.29s/it]

Scoring: https://www.argenta.be/nl/kantoren/frederic-dereppe-bv-3942.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 881/894 [40:17<03:01, 13.99s/it]

Scoring: https://www.argenta.be/nl/kantoren/filiz-turkkol-srl--3961.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 882/894 [40:31<02:50, 14.22s/it]

Scoring: https://www.argenta.be/nl/kantoren/jan-maes-bv-3971.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 883/894 [40:46<02:36, 14.19s/it]

Scoring: https://www.argenta.be/nl/kantoren/steven-roymans-bv-3723.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 884/894 [40:59<02:20, 14.05s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-4002.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 885/894 [41:16<02:13, 14.81s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-4003.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 886/894 [41:30<01:57, 14.64s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-4004.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 887/894 [41:44<01:41, 14.49s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-4005.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 888/894 [41:59<01:27, 14.56s/it]

Scoring: https://www.argenta.be/nl/kantoren/lokarg-bv-3985.html


Scoring pages:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 889/894 [42:14<01:12, 14.59s/it]

Scoring: https://www.argenta.be/nl/kantoren/argenta-spaarbank-nv-3976.html


Scoring pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 890/894 [42:28<00:57, 14.45s/it]

Scoring: https://www.argenta.be/nl/inschrijven-nieuwsbrief.html


Scoring pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 891/894 [42:41<00:42, 14.14s/it]

Scoring: https://www.argenta.be/nl/vacatures.html


Scoring pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 892/894 [42:55<00:28, 14.01s/it]

Scoring: https://www.argenta.be/nl/simpel-gezegd.html


Scoring pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 893/894 [43:09<00:14, 14.08s/it]

Scoring: https://www.argenta.be/nl/toegankelijkheid.html


Scoring pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 894/894 [43:24<00:00,  2.91s/it]


‚úÖ Final results saved

‚úÖ Results saved to argenta_urls_nl_b2_accessibility_scores.xlsx

üåê Overall CEFR B2 Accessibility Score: 75.36%
‚úÖ Contact pages meet accessibility requirements (avg score: 81.68)
‚úÖ FAQ pages meet accessibility requirements (avg score: 75.00)
‚ö†Ô∏è Legal pages may require language simplification (avg score: 68.10)
‚úÖ Other pages meet accessibility requirements (avg score: 73.97)
‚ö†Ô∏è Product pages may require language simplification (avg score: 67.20)


<div style="background-color: rgb(240, 142, 23); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    6. Specialized Code for ING
</div>

In [None]:
import requests
import re

def extract_multi_lang_clean_text_from_ing_api(url_path="/nl/particulieren"):
    api_url = "https://api.www.ing.be/be/public/pagemodel"
    params = {"pageUrl": url_path}
    try:
        response = requests.get(api_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Recursive extractor with group labels
        def extract_text(obj, section="GENERAL"):
            texts = []
            if isinstance(obj, dict):
                for key, value in obj.items():
                    key_upper = key.upper()
                    # Label common ING section keys
                    if "title" in key.lower():
                        sec_label = "[TITLE]"
                    elif "header" in key.lower():
                        sec_label = "[HEADER]"
                    elif "description" in key.lower():
                        sec_label = "[DESCRIPTION]"
                    elif "cta" in key.lower():
                        sec_label = "[CTA]"
                    elif "paragraph" in key.lower():
                        sec_label = "[PARAGRAPH]"
                    elif "product" in key.lower():
                        sec_label = "[PRODUCT]"
                    elif "article" in key.lower():
                        sec_label = "[ARTICLE]"
                    else:
                        sec_label = section
                    texts.extend(extract_text(value, sec_label))
            elif isinstance(obj, list):
                for item in obj:
                    texts.extend(extract_text(item, section))
            elif isinstance(obj, str):
                texts.append(f"{section} {obj}".strip())
            return texts

        all_texts = extract_text(data)

        combined_text = " ".join(all_texts)

        # === CLEANING STEPS ===

        # Remove URLs and file paths
        cleaned_text = re.sub(r'https?://\S+', '', combined_text)
        cleaned_text = re.sub(r'/[a-zA-Z0-9\-/_.]+', '', cleaned_text)

        # Remove tech noise, image labels, system keywords
        cleaned_text = re.sub(
            r'\b(productCard|serviceCard|articleCard|cards|cta|paragraph|sectionTitle|left|icon fraud|MIDSECTION OF WOMAN SITTING IN BUS|sitting on sofa at home|zijn bezig op hun mobiel|GROUP OF PEOPLE IN TRADITIONAL CLOTHING|ING_210324_01\s*\d*|image|png|jpg|svg|Safe money|LOW SECTION OF PERSON STANDING ON ROAD|of woman photographing against sky|ING_210629_01)\b',
            '',
            cleaned_text,
            flags=re.IGNORECASE,
        )

        # Remove HTML tags
        cleaned_text = re.sub(r'<.*?>', '', cleaned_text)

        # Remove standalone numbers but keep percentages, time periods
        cleaned_text = re.sub(r'\b\d{1,4}\b(?!\s*(%|jaar|ans|maand|mois|per maand|par mois))', '', cleaned_text)

        # Normalize currency phrases
        cleaned_text = re.sub(r'‚Ç¨\s*/\s*maand', '‚Ç¨ per maand', cleaned_text)
        cleaned_text = re.sub(r'‚Ç¨\s*/\s*mois', '‚Ç¨ par mois', cleaned_text)
        cleaned_text = re.sub(r'‚Ç¨/maand', '‚Ç¨ per maand', cleaned_text)
        cleaned_text = re.sub(r'‚Ç¨/mois', '‚Ç¨ par mois', cleaned_text)

        # Remove ‚Äúclick here‚Äù noise
        cleaned_text = re.sub(r'Klik hier voor meer info\.', '', cleaned_text, flags=re.IGNORECASE)
        cleaned_text = re.sub(r'Cliquez ici pour plus dinfo\.', '', cleaned_text, flags=re.IGNORECASE)

        # Clean whitespace
        cleaned_text = re.sub(r'\.\s*\.', '.', cleaned_text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

        return cleaned_text.strip()[:12000]

    except Exception as e:
        print(f"‚ùå Error fetching ING API content: {e}")
        return f"‚ùå API fetch error: {str(e)}"


In [None]:
import re

def collapse_general_and_headers(text):
    """
    Cleans ING API text by:
    - Removing labels (GENERAL, product small/medium, etc.)
    - Collapsing repeated tags like [TITLE][TITLE]
    - Grouping [TITLE]/[DESCRIPTION] + following text into blocks
    - Removing empty [] and extra whitespace
    """
    # Remove known system labels
    cleaned = re.sub(r'\b(GENERAL|cards_x003a_\w+|personalizationTaxonomy|product (small|medium)|service small|article medium)\b', '', text)

    # Collapse repeated headers like [TITLE][TITLE][TITLE] ‚Üí [TITLE]
    cleaned = re.sub(r'(\[[A-Z]+\])(?:\s*\1)+', r'\1', cleaned)

    # Remove empty brackets and stray double quotes
    cleaned = re.sub(r'\[\]', '', cleaned)
    cleaned = re.sub(r'""', '', cleaned)

    # Split by space (since raw text uses space separator)
    tokens = cleaned.split()
    grouped = []
    current_tag = ''
    buffer = []

    for token in tokens:
        tag_match = re.match(r'^\[([A-Z]+)\]$', token)
        if tag_match:
            # Save previous section when a new tag appears
            if buffer:
                grouped.append(f"[{current_tag}] {' '.join(buffer)}")
                buffer = []
            current_tag = tag_match.group(1)
        else:
            buffer.append(token)

    # Add last buffer if exists
    if buffer:
        grouped.append(f"[{current_tag}] {' '.join(buffer)}")

    # Join sections with double line breaks for readability
    result = '\n\n'.join(grouped)

    # Final cleanup: reduce multiple spaces, trim
    result = re.sub(r'\s+', ' ', result).strip()

    return result


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import time
import json
import os
import re
import google.generativeai as genai
from tqdm import tqdm
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# ---------------------- CONFIG ----------------------
genai.configure(api_key="AIzaSyBzOT2O03scMENbdWouWexYa10v4K4OVPE")

# ---------------------- STEP 1: Load Data ----------------------
def load_file(file_path):
    if file_path.endswith(".csv"):
        df = pd.read_csv(file_path)
    elif file_path.endswith(".xlsx"):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format")
    return df

# ---------------------- STEP 2: Typology Classification ----------------------needs to updated
def classify_url(url):
    url = url.lower()
    if any(term in url for term in ["product", "producten", "produits",  # general
                                    "lenen", "loan", "pret",  # loans
                                    "sparen", "saving", "epargne",  # savings
                                    "rekening", "account", "compte",  # accounts
                                    "beleggen", "investment", "investir",  # investments
                                    "hypotheek", "mortgage", "hypothecaire",  # mortgage
                                    "verzekering", "insurance", "assurance",  # insurance
                                    "kaart", "card", "carte",  # cards
                                    "bankieren", "banking", "banque"]):
        return "Product"
    elif any(term in url for term in ["faq", "support", "help", "hulp", "ondersteuning", "aide", "questions", 
                                       "klantenservice", "clientservice", "contactcenter", "assistance"]):
        return "FAQ"
    elif any(term in url for term in ["legal", "juridisch", "juridique", "voorwaarden", "terms", 
                                      "conditions", "privacy", "beleid", "policy", "cookie", 
                                      "gdpr", "compliance", "disclaimer", "protection", "gegevensbescherming"]):
        return "Legal"
    elif any(term in url for term in ["contact", "locatie", "location", "agences", "branches", "agents", 
                                      "kantoren", "bureaux", "afspraak", "appointment", "form", 
                                      "formulier", "trouver", "bereikbaarheid"]):
        return "Contact"
    else:
        return "Other"

# ---------------------- STEP 3: Stratified Sampling ----------------------
def stratified_sample(df):
    df["Page Type"] = df["Address"].apply(classify_url)
    sample = []
    for typ in ["Product", "FAQ", "Legal", "Contact"]:
        group = df[df["Page Type"] == typ]
        sample.extend(group.sample(min(2, len(group)), random_state=42).to_dict("records"))
    remaining = 10 - len(sample)
    other = df[df["Page Type"] == "Other"]
    sample.extend(other.sample(min(remaining, len(other)), random_state=42).to_dict("records"))
    return pd.DataFrame(sample)

# ---------------------- STEP 4: Extract Page Text ----------------------
def extract_clean_text(url):
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        for tag in soup(['script', 'style', 'nav', 'footer']):
            tag.decompose()
        text = soup.get_text(separator=' ', strip=True)
        return ' '.join(text.split())[:4000]  # Truncate to token-safe length
    except:
        return ""

def extract_clean_text_with_selenium(url):
    try:
        options = Options()
        options.headless = True
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        driver = webdriver.Chrome(options=options)
        driver.get(url)

        # Wait for full page load (you can add WebDriverWait here for better reliability)
        driver.implicitly_wait(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        driver.quit()

        # Remove unwanted tags
        for tag in soup(['script', 'style', 'nav', 'footer']):
            tag.decompose()

        text = soup.get_text(separator=' ', strip=True)
        return ' '.join(text.split())[:4000]

    except Exception as e:
        print(f"‚ùå Error with Selenium extraction: {e}")
        return ""


# ---------------------- STEP 5: Score with Gemini ----------------------
def score_page_with_gemini(text, page_type):
    prompt = f"""
**Context:** This prompt is designed for the Gemini language model to evaluate the CEFR B2 level compliance of webpage content from retail banking websites for regulatory compliance. The evaluation focuses on vocabulary, grammar, clarity, and coherence to determine if the text is easily understandable for someone at a B2 level in English, French, or Dutch. The desired output includes the compliance level percentage and individual scores for vocabulary complexity, grammatical structures, overall clarity, and coherence, with a detailed rationale for each evaluated address presented in a single cell of an output file (e.g., CSV or Excel). The goal is to ensure the evaluation effectively differentiates between webpages with varying levels of B2 compliance, leading to a wider range of scores, and that the rationale is comprehensive yet concise enough to fit within a single cell per address. **It is important to consider that these are banking websites, and some technical or financial terms may be inherent to the content.**

**Task:** Assess the CEFR B2 compliance level of the provided webpage content, ensuring a variable range of scores and a detailed, single-cell rationale for each evaluated address, **while acknowledging the potential presence of necessary banking terminology.**

**Instructions:**

1. **Identify Language:** Determine if the input text is in English, French, or Dutch.

2. **Evaluate B2 Compliance with Granularity (Considering Banking Terms):** Analyze the text against the CEFR B2 criteria for the identified language, critically and precisely assessing the following aspects on a scale of 0 to 10. Avoid assigning only 0 or 10; use the full scale based on nuance and subtlety. ‚ÄúDo not hesitate to assign low (0‚Äì4) or high (8‚Äì10) scores when the text clearly deserves it. Avoid accumulating around 6‚Äì7 unless the text is truly average.‚Äù Remember the compliancy threshold is 70% (7/10) for B2 level. Therefore if a text is generally compliant it should receive a total score of higher than or equal to 70.

- **Vocabulary Complexity (0‚Äì10)**
  - 10 ‚Üí very simple, common words, basic banking terms, no jargon
  - 7‚Äì9 ‚Üí mostly common words, occasional technical terms explained
  - 4‚Äì6 ‚Üí mix of general and technical terms, some unnecessarily complex or rare words
  - 1‚Äì3 ‚Üí frequent use of complex, low-frequency words or jargon, often unexplained
  - 0 ‚Üí highly complex, dense language with rare or unexplained terms everywhere

- **Grammatical Structures (0‚Äì10)**
  - 10 ‚Üí simple sentences, clear structure, active voice, no complex clauses
  - 7‚Äì9 ‚Üí mostly simple, some moderate clauses, minor passive use
  - 4‚Äì6 ‚Üí mix of simple and complex sentences, occasional embedded or passive forms
  - 1‚Äì3 ‚Üí mostly long, embedded, or passive structures, hard to follow
  - 0 ‚Üí extremely complex grammar, frequent embedding, difficult to parse

- **Overall Clarity (0‚Äì10)**
  - 10 ‚Üí very clear, easy to understand, minimal effort required
  - 7‚Äì9 ‚Üí mostly clear, small moments of complexity
  - 4‚Äì6 ‚Üí mixed clarity, occasional confusion or ambiguity
  - 1‚Äì3 ‚Üí often unclear, requires effort to interpret
  - 0 ‚Üí very unclear, confusing, hard to follow

- **Coherence (0‚Äì10)**
  - 10 ‚Üí logical flow, clear organization, excellent connectors
  - 7‚Äì9 ‚Üí mostly logical, some jumps, minor missing links
  - 4‚Äì6 ‚Üí mixed coherence, weak transitions, partial disorganization
  - 1‚Äì3 ‚Üí often disorganized, unclear connections
  - 0 ‚Üí no logical order, chaotic, fragmented

3. **Provide Detailed Rationale (Single Cell):** Explain the reasoning behind each of the four scores within a single text string suitable for one Excel cell. Explicitly point out specific linguistic features (vocabulary, grammar, discourse markers) that contribute to the assigned level of complexity or simplicity for each criterion. When discussing vocabulary, specifically comment on the presence and handling of banking terminology. Justify why the text is or is not strictly at the B2 level for each aspect. Use clear separators (e.g., "; ") between the rationale for each criterion to ensure readability within the single cell.
```xml
<rationale>Vocabulary: [Explanation with examples, noting banking terms]; Grammar: [Explanation with examples]; Clarity: [Explanation with examples, considering banking terms]; Coherence: [Explanation with examples]</rationale>

**Output Format:**
Return the evaluation in the following XML format, ensuring all information for a single evaluated webpage address can be represented as a single row in an output file:
```xml
<vocabulary_complexity>Y</vocabulary_complexity>
<grammatical_structures>Z</grammatical_structures>
<overall_clarity>W</overall_clarity>
<coherence>V</coherence>
<rationale>Vocabulary: [Explanation with examples, noting banking terms], Grammar: [Explanation with examples], Clarity: [Explanation with examples, considering banking terms], Coherence: [Explanation with examples]</rationale>

Examples of B2 Compliant Texts and C1 Texts Which Are Not B2 Compliant
 To help you understand the evaluation criteria, here are some examples of texts rated at B2 and C1 levels:

English
B2 level text
 Source: LinguaPress Unsolved mysteries ‚Äì a short story by Sarah Wollbach
 Megan‚Äôs acting career began one morning a couple of years ago, when a woman approached her in the parking lot of her neighborhood grocery store. ‚ÄúExcuse me,‚Äù she said, ‚Äúbut have you ever taken acting lessons?‚Äù ‚Äî ‚ÄúNo,‚Äù she answered hesitantly. 
 The woman reached into her pocket and handed Megan a card. ‚ÄúI‚Äôm a casting director for Unsolved Mysteries,‚Äù she said, shaking her hand. Megan had always been stage-struck. 
 For years she'd fantasized about being an actor, sure that deep within her lurked a brilliant chameleon like Meryl Streep or Julia Roberts. Maybe this was her big break. 
 ‚ÄúThe show‚Äôs doing a feature about a woman who was kidnapped,‚Äù the lady continued, ‚Äúand you look exactly like her. The resemblance is amazing. Would you be interested in auditioning?‚Äù 
 The episode aired the next week, with a couple of thousand dollars for two days‚Äô work, plus travel, lodging, and food expenses.


C1 level text
 Source: LinguaPress The Enigma of the Missing Manuscript by John Doe
 The mystery of the missing manuscript has eluded generations of writers. It was said to contain the final, unpublished works and annotations of the author, whose sudden disappearance 
 had only added to the intrigue. The manuscript was believed to be hidden somewhere in the old mansion, a labyrinthine structure filled with secret passages and hidden rooms. 
 Many had tried to find it, but all had failed. The clues were cryptic, the dangers real, and the stakes high. For those who dared to search, it was a journey into the unknown, a test of wit and courage.



French
B2 level text
 Source: LinguaPress Myst√®res non r√©solus ‚Äì une histoire courte par Sarah Wollbach
 La carri√®re d‚Äôactrice de Megan a commenc√© un matin il y a quelques ann√©es, lorsqu‚Äôune femme l‚Äôa abord√©e dans le parking de son √©picerie de quartier. 
 ‚ÄúExcusez-moi,‚Äù dit-elle, ‚Äúmais avez-vous d√©j√† pris des cours de th√©√¢tre?‚Äù ‚Äî ‚ÄúNon,‚Äù r√©pondit-elle avec h√©sitation. La femme a fouill√© dans sa poche et tendu une carte √† Megan. 
 ‚ÄúJe suis directrice de casting de Myst√®res non r√©solus,‚Äù dit-elle en lui serrant la main. Megan avait toujours √©t√© fascin√©e par la sc√®ne. 
 Pendant des ann√©es, elle avait nourri en secret le r√™ve d‚Äô√™tre actrice, convaincue qu‚Äôau fond d‚Äôelle-m√™me se cachait un brillant cam√©l√©on comme Meryl Streep ou Julia Roberts. 
 Peut-√™tre que c‚Äô√©tait sa grande chance. ‚ÄúL‚Äô√©mission fait un reportage sur une femme qui a √©t√© kidnapp√©e,‚Äù continua la dame, ‚Äúet vous lui ressemblez exactement. 
 La ressemblance est incroyable. Seriez-vous int√©ress√©e par une audition?‚Äù Elle expliqua que le r√¥le valait quelques milliers de dollars pour deux jours de travail, plus les frais de voyage, de logement et de nourriture.


C1 level text
 Source: LinguaPress L‚ÄôEnigme du Manuscrit Disparu par Jean Dupont
 Le myst√®re du manuscrit disparu que tout le monde tentait de percer depuis des d√©cennies. On disait qu‚Äôil contenait les derni√®res ≈ìuvres finales, non publi√©es, d‚Äôun auteur renomm√©, 
 dont la disparition soudaine n‚Äôavait fait qu‚Äôajouter √† l‚Äôintrigue. On croyait que le manuscrit √©tait cach√© quelque part dans le vieux manoir, une structure labyrinthique remplie de passages secrets et de pi√®ces cach√©es. 
 Beaucoup avaient essay√© de le trouver, mais tous avaient √©chou√©. Les indices √©taient cryptiques, les dangers r√©els, et les enjeux √©lev√©s. Pour ceux qui osaient chercher, c‚Äô√©tait un voyage dans l‚Äôinconnu, un test d‚Äôesprit et de courage.



Dutch
B2 level text
 Source: LinguaPress Opgeloste mysteries ‚Äì een kortverhaal door Sarah Wollbach
 Megan‚Äôs acteercarri√®re begon op een ochtend een paar jaar geleden, toen een vrouw haar benaderde op de parkeerplaats van haar buurtwinkel. ‚ÄúExcuseer me,‚Äù zei ze, ‚Äúmaar heb je ooit acteerlessen gevolgd?‚Äù ‚Äî ‚ÄúNee,‚Äù antwoordde ze aarzelend. 
 De vrouw stak haar hand in haar zak en gaf Megan een kaartje. ‚ÄúIk ben een castingdirecteur voor Opgeloste mysteries,‚Äù zei ze, terwijl ze haar hand schudde. Megan was altijd al gefascineerd door het toneel. 
 Jarenlang had ze gefantaseerd over het zijn van een actrice, ervan overtuigd dat diep vanbinnen een briljante actrice zoals Meryl Streep of Julia Roberts schuilde. Misschien was dit haar grote doorbraak. 
 ‚ÄúDe show doet een reportage over een vrouw die ontvoerd is,‚Äù vervolgde de dame, ‚Äúen je lijkt precies op haar. De gelijkenis is verbazingwekkend. Zou je ge√Ønteresseerd zijn in een auditie?‚Äù 
 Ze zette uit dat dit alles een paar duizend dollar waard was voor twee dagen werk, plus reis-, verblijf- en voedselkosten.


C1 level text
 Source: LinguaPress Het Raadsel van het Verdwenen Manuscript door Jan Jansen
 Het mysterie van het verdwenen manuscript dat generaties schrijvers decennialang verbijsterd. Er werd gezegd dat het de laatste, ongepubliceerde werken van een beroemde auteur bevatte, wiens plotselinge verdwijning alleen maar bijdroeg aan de intrige. 
 Het gerucht deed de ronde dat het manuscript ergens in het oude herenhuis verborgen was, een labyrintische structuur vol geheime gangen en verborgen kamers. Velen hadden geprobeerd het te vinden, maar allemaal waren ze mislukt. 
 De aanwijzingen waren cryptisch, de gevaren echt, en de inzet hoog. Voor degenen die durfden te zoeken, was het een reis in het onbekende, een test van verstand en moed.


Input Text content to check: \"\"\"{text}\"\"\" 
"""
    try:

        model = genai.GenerativeModel("gemini-2.0-flash")
        response = model.generate_content(
            prompt,
            generation_config={"temperature": 0.2}
        )
        output = response.text.strip()

        if output.startswith("```"):
            output = output.strip("` \n").replace("xml", "").strip()

        # Extract scores using regex from the XML
        scores = {
            "vocabulary_complexity": extract_xml_score(output, "vocabulary_complexity"),
            "grammatical_structures": extract_xml_score(output, "grammatical_structures"),
            "overall_clarity": extract_xml_score(output, "overall_clarity"),
            "coherence": extract_xml_score(output, "coherence"),
            "rationale": extract_xml_rationale(output),
        }

        return scores

    except Exception as e:
        print(f"‚ùå Error scoring page with Gemini: {e}")
        return {
            "compliance_level": 0,
            "vocabulary_complexity": 0,
            "grammatical_structures": 0,
            "overall_clarity": 0,
            "coherence": 0,
            "rationale": "Error occurred during evaluation."
        }

def extract_xml_score(xml_text, tag):
    match = re.search(fr"<{tag}>(\d+)</{tag}>", xml_text)
    return int(match.group(1)) if match else 0

def extract_xml_rationale(xml_text):
    match = re.search(r"<rationale>(.*?)</rationale>", xml_text, re.DOTALL)
    return match.group(1).strip() if match else "No rationale found."

# ---------------------- STEP 6: Evaluation + Warning ----------------------

def evaluate_accessibility(df, file_path):
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    if base_name.endswith("_urls"):
        base_name = base_name.replace("_urls", "")
    scores = []
    checkpoint_file = f"{base_name}_b2_accessibility_checkpoint.xlsx"
    log_file = f"{base_name}_b2_accessibility_log.txt"

    # Check if log exists ‚Üí skip already processed URLs
    processed_urls = set()
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            processed_urls = set(line.strip() for line in f.readlines())

    # Resume from checkpoint if exists
    if os.path.exists(checkpoint_file):
        scores_df = pd.read_excel(checkpoint_file)
        scores = scores_df.to_dict(orient='records')
    else:
        scores_df = pd.DataFrame()

    CHECKPOINT_EVERY = 20

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring pages"):
        url = row['Address']

        if url in processed_urls:
            continue

        print(f"Scoring: {url}")
        try:
            raw_text = extract_multi_lang_clean_text_from_ing_api(urlparse(url).path)
            cleaned_text = collapse_general_and_headers(raw_text)
            result = score_page_with_gemini(cleaned_text, row['Page Type'])

            sub_scores = [
                result.get("vocabulary_complexity", 0),
                result.get("grammatical_structures", 0),
                result.get("overall_clarity", 0),
                result.get("coherence", 0),
            ]
            compliance_value = round(sum(sub_scores) / 4 * 10) if all(isinstance(score, int) and 0 <= score <= 10 for score in sub_scores) else 0

        except Exception as e:
            print(f"Error scoring page {url}: {e}")
            compliance_value = 0
            result = {
                "vocabulary_complexity": 0,
                "grammatical_structures": 0,
                "overall_clarity": 0,
                "coherence": 0,
                "rationale": "Error occurred during evaluation.",
            }

        scores.append({
            "URL": url,
            "Page Type": row['Page Type'],
            "Compliance Level": compliance_value,
            "Vocabulary Complexity": result.get("vocabulary_complexity"),
            "Grammatical Structures": result.get("grammatical_structures"),
            "Overall Clarity": result.get("overall_clarity"),
            "Coherence": result.get("coherence"),
            "Rationale": result.get("rationale"),
        })

        # Log processed URL
        with open(log_file, 'a') as f:
            f.write(url + '\n')

        # Save checkpoint every N steps
        if len(scores) % CHECKPOINT_EVERY == 0:
            pd.DataFrame(scores).to_excel(checkpoint_file, index=False)
            print(f"‚úÖ Checkpoint saved at {len(scores)} items")

        time.sleep(2)

    # Final save
    final_df = pd.DataFrame(scores)
    final_df.to_excel("b2_accessibility_final.xlsx", index=False)
    print("‚úÖ Final results saved")

    return final_df

# ---------------------- STEP 7: Output & Summary ----------------------
def output_summary(result_df, input_path):
    # Extract filename without extension (e.g., "belfius_urls" ‚Üí "belfius")
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    if base_name.endswith("_urls"):
        base_name = base_name.replace("_urls", "")

    output_filename = f"{base_name}_b2_accessibility_scores.xlsx"
    output_path = os.path.join("/Users/furkandemir/Desktop/Sailpeak", output_filename)

    result_df.to_excel(output_path, index=False)
    print(f"\n‚úÖ Results saved to {output_filename}\n")

    # Score summaries
    overall_score = result_df["Compliance Level"].mean()
    print(f"üåê Overall CEFR B2 Accessibility Score: {overall_score:.2f}%")

    typology_avg = result_df.groupby("Page Type")["Compliance Level"].mean()
    for typ, score in typology_avg.items():
        if score < 70:
            print(f"‚ö†Ô∏è {typ} pages may require language simplification (avg score: {score:.2f})")

# ---------------------- Main Runner ----------------------
def main(file_path):
    df = load_file(file_path)
    #sampled_df = stratified_sample(df)
    df['Address'] = df['Address'].apply(lambda x: urlparse(x).path)
    df["Page Type"] = df["Address"].apply(classify_url)
    sampled_df = df  # Score all URLs
    result_df = evaluate_accessibility(sampled_df, file_path)
    output_summary(result_df, file_path)  # <-- pass file_path here


# Example usage:
# main("crelan_internal_html.xlsx")

<div style="background-color: rgb(61, 167, 29); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    7. Specialized Code for Crelan
</div>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
import re
import google.generativeai as genai
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from functools import lru_cache

# ---------------------- CONFIG ----------------------
genai.configure(api_key="AIzaSyBzOT2O03scMENbdWouWexYa10v4K4OVPE")

# Thread-local storage for WebDriver instances
thread_local = threading.local()

def get_driver():
    """Get a WebDriver instance for the current thread"""
    if not hasattr(thread_local, 'driver'):
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-images')  # Speed up loading
        options.add_argument('--disable-javascript')  # Speed up loading
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        
        thread_local.driver = webdriver.Chrome(options=options)
        thread_local.driver.set_page_load_timeout(30)  # Reduced timeout
        thread_local.driver.implicitly_wait(5)  # Reduced wait
    
    return thread_local.driver

def cleanup_driver():
    """Clean up the WebDriver for the current thread"""
    if hasattr(thread_local, 'driver'):
        thread_local.driver.quit()
        del thread_local.driver

# ---------------------- URL Classification ----------------------
@lru_cache(maxsize=1000)  # Cache classification results
def classify_crelan_url(url):
   """Enhanced URL classification specifically for KBC Bank (EN/FR/NL/DE)"""
   url = url.lower()
   
   if any(term in url for term in [
       # Products - EN/FR/NL/DE
       "product", "producten", "produits", "produkte", "sparen", "saving", "epargne",
       "lenen", "loan", "pret", "kredit", "credit", "rekening", "account", "compte", "konto",
       "beleggen", "investment", "investir", "investieren", "hypotheek", "mortgage",
       "verzekering", "insurance", "assurance", "versicherung", "kaart", "card", "carte", "karte",
       "bankieren", "banking", "banque"
   ]):
       return "Product"
   
   elif any(term in url for term in [
       # Support - EN/FR/NL/DE
       "faq", "support", "help", "hulp", "ondersteuning", "aide", "hilfe",
       "questions", "klantenservice", "service-client", "kundendienst", "assistance", "live-chat"
   ]):
       return "FAQ"
   
   elif any(term in url for term in [
       # Legal - EN/FR/NL/DE
       "legal", "juridisch", "juridique", "rechtlich", "voorwaarden", "terms",
       "conditions", "bedingungen", "privacy", "beleid", "policy", "datenschutz",
       "cookie", "gdpr", "compliance", "tarieven", "tarifs", "fees", "gebuehren"
   ]):
       return "Legal"
   
   elif any(term in url for term in [
       # Contact - EN/FR/NL/DE
       "contact", "locatie", "location", "standort", "agences", "branches", "filialen",
       "kantoren", "afspraak", "appointment", "rendez-vous", "termin"
   ]):
       return "Contact"
   
   elif any(term in url for term in [
       # News/Blog - EN/FR/NL/DE
       "blog", "nieuws", "news", "actualites", "nachrichten", "insights", "perspectives", "moments-cles"
   ]):
       return "Blog"
   
   else:
       return "Other"

# ---------------------- Text Extraction ----------------------
def extract_clean_text_crelan(url):
    """Optimized text extraction for Crelan"""
    try:
        driver = get_driver()
        driver.get(url)
        
        # Reduced wait time
        try:
            WebDriverWait(driver, 8).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
        except:
            time.sleep(2)  # Reduced sleep
        
        # Quick cookie handling
        try:
            cookie_selectors = ['[data-testid*="accept"]', '[class*="accept"]']
            for selector in cookie_selectors:
                buttons = driver.find_elements(By.CSS_SELECTOR, selector)
                for button in buttons[:1]:  # Only try first button
                    if button.is_displayed():
                        driver.execute_script("arguments[0].click();", button)
                        time.sleep(1)
                        break
                if buttons:
                    break
        except:
            pass
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Remove unwanted elements
        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            if tag:
                tag.decompose()
        
        # Extract main content
        main_content = soup.find('main') or soup.find('article') or soup.find('.content')
        if main_content:
            text = main_content.get_text(separator=' ', strip=True)
        else:
            text = soup.get_text(separator=' ', strip=True)
        
        if text and len(text.strip()) > 100:
            result = ' '.join(text.split())[:10000]
            return result
        else:
            return ""

    except Exception as e:
        return ""

def clean_crelan_text(raw_text):
    """
    Universal text cleaning function for Belgian banks (KBC, BNP Paribas, Belfius, ING, Crelan)
    Handles multi-language content (NL/FR/EN) and bank-specific patterns
    """
    if not raw_text or len(raw_text.strip()) < 20:
        return ""
    
    # Universal navigation and header patterns
    navigation_patterns = [
        # Skip to content links
        r'Skip to .*?Log in',
        r'Retour au .*?Se connecter',
        r'Terug naar de inhoud',
        r'Overslaan en naar de inhoud gaan',
        r'Aller au contenu principal',
        r'Skip to main content',
        
        # Language switchers
        r'FR\s+NL\s+EN',
        r'Nederlands\s+Fran√ßais\s+English',
        r'NL\s+FR\s+DE',
        r'\bFR\s+NL\b',
        r'\bFran√ßais\b\s*\bNederlands\b',
        
        # Main navigation menus
        r'Home.*?Contact.*?Login',
        r'Accueil.*?Contact.*?Connexion',
        r'Thuis.*?Contact.*?Inloggen',
        r'Menu\s+Sluiten',
        r'Menu\s+Fermer',
        r'Close\s+Menu',
        
        # Search functionality
        r'Zoeken \(Optioneel\).*?Contact',
        r'Rechercher \(En option\).*?Contact',
        r'Search \(Optional\).*?Contact',
        r'Zoeken.*?Zoek',
        r'Rechercher.*?Recherche',
        r'Search.*?Search',

        # Crelan specific
        r'Crelan.*?Inloggen',
        r'Crelan.*?Se connecter',
        r'Crelan Online.*?Login',
        r'Crelan Bank.*?Connexion',
        
    ]
    
    # Cookie and privacy notices (comprehensive)
    cookie_patterns = [
        # Cookie acceptance
        r'Accept all cookies.*?Manage cookies',
        r'Accepter tous les cookies.*?G√©rer les cookies',
        r'Alle cookies accepteren.*?Cookies beheren',
        r'Deze website gebruikt cookies.*?Alles accepteren',
        r'Ce site utilise des cookies.*?Tout accepter',
        r'This website uses cookies.*?Accept all',
        
        # Cookie management sections
        r'Cookie settings.*?Save preferences',
        r'Param√®tres des cookies.*?Sauvegarder',
        r'Cookie-instellingen.*?Voorkeuren opslaan',
        r'Mijn cookies beheren.*?Alles accepteren',
        r'G√©rer mes cookies.*?Tout accepter',
        r'Manage my cookies.*?Accept all',
        
        # Cookie descriptions
        r'Functionele cookies.*?verbeteren\.',
        r'Les cookies fonctionnels.*?par des tiers\.',
        r'Functional cookies.*?third parties\.',
        r'Analytische cookies.*?voorkeuren zijn\.',
        r'Les cookies de mesure.*?leurs pr√©f√©rences\.',
        r'Analytics cookies.*?their preferences\.',
        r'Marketing cookies.*?te tonen\.',
        r'Les cookies publicitaires.*?pertinentes\.',
        r'Marketing cookies.*?relevant\.',
        
        # Privacy policy links
        r'Privacy policy.*?Terms',
        r'Politique de confidentialit√©.*?Conditions',
        r'Privacybeleid.*?Voorwaarden',
    ]
    
    # Technical and browser notices
    technical_patterns = [
        r'Voor een betere surfervaring.*?Chrome',
        r'Pour une meilleure exp√©rience.*?Chrome\.',
        r'For a better browsing experience.*?Chrome',
        r'Adblock detection:.*?Sluiten',
        r'Adblock detection:.*?Fermer',
        r'Adblock detection:.*?Close',
        r'You have not yet given permission.*?Load video',
        r'JavaScript is disabled.*?Enable JavaScript',
        r'Loading\.\.\.',
        r'Laden\.\.\.',
        r'Chargement\.\.\.',
    ]
    
    # Footer and related content
    footer_patterns = [
        # Related articles
        r'Other articles that might interest you.*',
        r'Autres articles qui pourraient vous int√©resser.*',
        r'Andere artikels die u kunnen interesseren.*',
        r'Gerelateerde concepten.*?Lees meer',
        r'Termes li√©s.*?Lire la suite',
        r'Related terms.*?Read more',
        
        # Newsletter and blog subscriptions
        r'Ontdek de.*?blog.*?Fran√ßais',
        r'D√©couvrir le blog.*?Nederlands',
        r'Discover the.*?blog.*?Dutch',
        r'Schrijf u in op onze nieuwsbrief.*?Inschrijven',
        r'Inscrivez-vous √† notre newsletter.*?S\'inscrire',
        r'Subscribe to our newsletter.*?Subscribe',
        
        # Legal and compliance
        r'Terms and conditions.*?Privacy',
        r'Termes et conditions.*?Confidentialit√©',
        r'Algemene voorwaarden.*?Privacy',
        r'Disclaimer.*?Copyright',
        r'Avertissement.*?Droits d\'auteur',
        r'Vrijwaring.*?Auteursrecht',
        
        # Copyright notices
        r'¬©.*?\d{4}.*?(KBC|BNP|Belfius|ING)',
        r'Alle rechten voorbehouden',
        r'Tous droits r√©serv√©s',
        r'All rights reserved',
    ]
    
    # Banking-specific call-to-action patterns
    banking_cta_patterns = [
        # Appointment booking
        r'Maak een afspraak!.*?',
        r'Prenez rendez-vous.*?',
        r'Make an appointment.*?',
        r'Boek een gesprek.*?',
        r'R√©servez un entretien.*?',
        
        # Investment advice
        r'Ontdek ons advies.*?',
        r'Laissez-vous conseiller.*?',
        r'Discover our advice.*?',
        r'Klaar om te beleggen\?.*?Maak een afspraak!',
        r'Pr√™t\(e\) √† investir\?.*?Prenez rendez-vous',
        r'Ready to invest\?.*?Make an appointment',
        
        # Product promotions
        r'Ontdek onze.*?producten',
        r'D√©couvrez nos.*?produits',
        r'Discover our.*?products',
        r'Meer informatie.*?aanvragen',
        r'Plus d\'informations.*?demander',
        r'More information.*?request',
    ]
    
    # Social media and sharing
    social_patterns = [
        r'Share on.*?Facebook',
        r'Partager sur.*?Facebook',
        r'Delen op.*?Facebook',
        r'Tweet.*?Twitter',
        r'Tweeter.*?Twitter',
        r'LinkedIn.*?delen',
        r'LinkedIn.*?partager',
        r'LinkedIn.*?share',
        r'WhatsApp.*?delen',
        r'WhatsApp.*?partager',
        r'WhatsApp.*?share',
        r'E-mail.*?versturen',
        r'E-mail.*?envoyer',
        r'E-mail.*?send',
        r'Print this page',
        r'Imprimez cette page',
        r'Print deze pagina',
        r'Download PDF',
        r'T√©l√©charger PDF',
        r'PDF downloaden',
    ]
    
    # Breadcrumb and metadata
    metadata_patterns = [
        r'Home\s*‚Ä∫.*?‚Ä∫',
        r'Accueil\s*‚Ä∫.*?‚Ä∫',
        r'Thuis\s*‚Ä∫.*?‚Ä∫',
        r'Last updated:.*?\d{4}',
        r'Derni√®re mise √† jour:.*?\d{4}',
        r'Laatst bijgewerkt:.*?\d{4}',
        r'Posted on.*?\d{4}',
        r'Publi√© le.*?\d{4}',
        r'Geplaatst op.*?\d{4}',
        r'Tags:.*?(?=\n|\.|$)',
        r'√âtiquettes:.*?(?=\n|\.|$)',
        r'Labels:.*?(?=\n|\.|$)',
        r'\d+\s+min read',
        r'\d+\s+min de lecture',
        r'\d+\s+min lezen',
    ]
    
    # Combine all patterns
    all_patterns = (navigation_patterns + cookie_patterns + technical_patterns + 
                   footer_patterns + banking_cta_patterns + social_patterns + metadata_patterns)
    
    # Apply cleaning
    cleaned = raw_text
    for pattern in all_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE | re.DOTALL)
    
    # Remove repeated "Read more" links in all languages
    read_more_patterns = [
        r'Lees meer\s*',
        r'Lire la suite\s*',
        r'Read more\s*',
        r'Meer lezen\s*',
        r'En savoir plus\s*',
        r'Learn more\s*'
    ]
    
    for pattern in read_more_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    
    # Remove investment sidebar content
    cleaned = re.sub(r'Beleggen in \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Investir dans \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Investing in \w+\s+[A-Z].*?\.{3}', '', cleaned, flags=re.DOTALL)
    
    # Remove standalone navigation and form words
    standalone_words = [
        'Contact', 'Zoeken', 'Rechercher', 'Search',
        'Email adres', 'Adresse email', 'Email address',
        'Inschrijven', 'S\'inscrire', 'Subscribe',
        'Versturen', 'Envoyer', 'Send',
        'Annuleren', 'Annuler', 'Cancel',
        'Bevestigen', 'Confirmer', 'Confirm'
    ]
    
    for word in standalone_words:
        cleaned = re.sub(rf'\b{re.escape(word)}\b', '', cleaned, flags=re.IGNORECASE)
    
    # Clean up formatting issues
    # Remove excessive punctuation
    cleaned = re.sub(r'[.]{2,}', '.', cleaned)
    cleaned = re.sub(r'[-]{3,}', '', cleaned)
    cleaned = re.sub(r'[_]{3,}', '', cleaned)
    cleaned = re.sub(r'(\b\w+\b)(\s+\1){2,}', r'\1', cleaned)  # Remove repeated words
    
    # Fix spacing around punctuation
    cleaned = re.sub(r'\s+([.,!?;:])', r'\1', cleaned)
    cleaned = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1 \2', cleaned)
    
    # Remove empty brackets and parentheses
    cleaned = re.sub(r'\(\s*\)', '', cleaned)
    cleaned = re.sub(r'\[\s*\]', '', cleaned)
    cleaned = re.sub(r'\{\s*\}', '', cleaned)
    
    # Normalize whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'\n\s*\n', '\n\n', cleaned)
    
    return cleaned.strip()

# ---------------------- Gemini Scoring ----------------------
def score_page_with_gemini(text, page_type):
    prompt = f"""
**Context:** This prompt is designed for the Gemini language model to evaluate the CEFR B2 level compliance of webpage content from retail banking websites for regulatory compliance. The evaluation focuses on vocabulary, grammar, clarity, and coherence to determine if the text is easily understandable for someone at a B2 level in English, French, Dutch or German. The desired output includes the compliance level percentage and individual scores for vocabulary complexity, grammatical structures, overall clarity, and coherence, with a detailed rationale for each evaluated address presented in a single cell of an output file (e.g., CSV or Excel). The goal is to ensure the evaluation effectively differentiates between webpages with varying levels of B2 compliance, leading to a wider range of scores, and that the rationale is comprehensive yet concise enough to fit within a single cell per address. **It is important to consider that these are banking websites, and some technical or financial terms may be inherent to the content.**

**Task:** Assess the CEFR B2 compliance level of the provided webpage content, ensuring a variable range of scores and a detailed, single-cell rationale for each evaluated address, **while acknowledging the potential presence of necessary banking terminology.**

**Instructions:**

1. **Identify Language:** Determine if the input text is in English, French, Dutch or German.

2. **Evaluate B2 Compliance with Granularity (Considering Banking Terms):** Analyze the text against the CEFR B2 criteria for the identified language, critically and precisely assessing the following aspects on a scale of 0 to 10. Avoid assigning only 0 or 10; use the full scale based on nuance and subtlety. ‚ÄúDo not hesitate to assign low (0‚Äì4) or high (8‚Äì10) scores when the text clearly deserves it. Avoid accumulating around 6‚Äì7 unless the text is truly average.‚Äù Remember the compliancy threshold is 70% (7/10) for B2 level. Therefore if a text is generally compliant it should receive a total score of higher than or equal to 70.

- **Vocabulary Complexity (0‚Äì10)**
  - 10 ‚Üí very simple, common words, basic banking terms, no jargon
  - 7‚Äì9 ‚Üí mostly common words, occasional technical terms explained
  - 4‚Äì6 ‚Üí mix of general and technical terms, some unnecessarily complex or rare words
  - 1‚Äì3 ‚Üí frequent use of complex, low-frequency words or jargon, often unexplained
  - 0 ‚Üí highly complex, dense language with rare or unexplained terms everywhere

- **Grammatical Structures (0‚Äì10)**
  - 10 ‚Üí simple sentences, clear structure, active voice, no complex clauses
  - 7‚Äì9 ‚Üí mostly simple, some moderate clauses, minor passive use
  - 4‚Äì6 ‚Üí mix of simple and complex sentences, occasional embedded or passive forms
  - 1‚Äì3 ‚Üí mostly long, embedded, or passive structures, hard to follow
  - 0 ‚Üí extremely complex grammar, frequent embedding, difficult to parse

- **Overall Clarity (0‚Äì10)**
  - 10 ‚Üí very clear, easy to understand, minimal effort required
  - 7‚Äì9 ‚Üí mostly clear, small moments of complexity
  - 4‚Äì6 ‚Üí mixed clarity, occasional confusion or ambiguity
  - 1‚Äì3 ‚Üí often unclear, requires effort to interpret
  - 0 ‚Üí very unclear, confusing, hard to follow

- **Coherence (0‚Äì10)**
  - 10 ‚Üí logical flow, clear organization, excellent connectors
  - 7‚Äì9 ‚Üí mostly logical, some jumps, minor missing links
  - 4‚Äì6 ‚Üí mixed coherence, weak transitions, partial disorganization
  - 1‚Äì3 ‚Üí often disorganized, unclear connections
  - 0 ‚Üí no logical order, chaotic, fragmented

3. **Provide Detailed Rationale (Single Cell):** Explain the reasoning behind each of the four scores within a single text string suitable for one Excel cell. Explicitly point out specific linguistic features (vocabulary, grammar, discourse markers) that contribute to the assigned level of complexity or simplicity for each criterion. When discussing vocabulary, specifically comment on the presence and handling of banking terminology. Justify why the text is or is not strictly at the B2 level for each aspect. Use clear separators (e.g., "; ") between the rationale for each criterion to ensure readability within the single cell.
```xml
<rationale>Vocabulary: [Explanation with examples, noting banking terms]; Grammar: [Explanation with examples]; Clarity: [Explanation with examples, considering banking terms]; Coherence: [Explanation with examples]</rationale>

**Output Format:**
Return the evaluation in the following XML format, ensuring all information for a single evaluated webpage address can be represented as a single row in an output file:
```xml
<vocabulary_complexity>Y</vocabulary_complexity>
<grammatical_structures>Z</grammatical_structures>
<overall_clarity>W</overall_clarity>
<coherence>V</coherence>
<rationale>Vocabulary: [Explanation with examples, noting banking terms], Grammar: [Explanation with examples], Clarity: [Explanation with examples, considering banking terms], Coherence: [Explanation with examples]</rationale>

Examples of B2 Compliant Texts and C1 Texts Which Are Not B2 Compliant
 To help you understand the evaluation criteria, here are some examples of texts rated at B2 and C1 levels:

English
B2 level text
 Source: LinguaPress Unsolved mysteries ‚Äì a short story by Sarah Wollbach
 Megan‚Äôs acting career began one morning a couple of years ago, when a woman approached her in the parking lot of her neighborhood grocery store. ‚ÄúExcuse me,‚Äù she said, ‚Äúbut have you ever taken acting lessons?‚Äù ‚Äî ‚ÄúNo,‚Äù she answered hesitantly. 
 The woman reached into her pocket and handed Megan a card. ‚ÄúI‚Äôm a casting director for Unsolved Mysteries,‚Äù she said, shaking her hand. Megan had always been stage-struck. 
 For years she'd fantasized about being an actor, sure that deep within her lurked a brilliant chameleon like Meryl Streep or Julia Roberts. Maybe this was her big break. 
 ‚ÄúThe show‚Äôs doing a feature about a woman who was kidnapped,‚Äù the lady continued, ‚Äúand you look exactly like her. The resemblance is amazing. Would you be interested in auditioning?‚Äù 
 The episode aired the next week, with a couple of thousand dollars for two days‚Äô work, plus travel, lodging, and food expenses.


C1 level text
 Source: LinguaPress The Enigma of the Missing Manuscript by John Doe
 The mystery of the missing manuscript has eluded generations of writers. It was said to contain the final, unpublished works and annotations of the author, whose sudden disappearance 
 had only added to the intrigue. The manuscript was believed to be hidden somewhere in the old mansion, a labyrinthine structure filled with secret passages and hidden rooms. 
 Many had tried to find it, but all had failed. The clues were cryptic, the dangers real, and the stakes high. For those who dared to search, it was a journey into the unknown, a test of wit and courage.



French
B2 level text
 Source: LinguaPress Myst√®res non r√©solus ‚Äì une histoire courte par Sarah Wollbach
 La carri√®re d‚Äôactrice de Megan a commenc√© un matin il y a quelques ann√©es, lorsqu‚Äôune femme l‚Äôa abord√©e dans le parking de son √©picerie de quartier. 
 ‚ÄúExcusez-moi,‚Äù dit-elle, ‚Äúmais avez-vous d√©j√† pris des cours de th√©√¢tre?‚Äù ‚Äî ‚ÄúNon,‚Äù r√©pondit-elle avec h√©sitation. La femme a fouill√© dans sa poche et tendu une carte √† Megan. 
 ‚ÄúJe suis directrice de casting de Myst√®res non r√©solus,‚Äù dit-elle en lui serrant la main. Megan avait toujours √©t√© fascin√©e par la sc√®ne. 
 Pendant des ann√©es, elle avait nourri en secret le r√™ve d‚Äô√™tre actrice, convaincue qu‚Äôau fond d‚Äôelle-m√™me se cachait un brillant cam√©l√©on comme Meryl Streep ou Julia Roberts. 
 Peut-√™tre que c‚Äô√©tait sa grande chance. ‚ÄúL‚Äô√©mission fait un reportage sur une femme qui a √©t√© kidnapp√©e,‚Äù continua la dame, ‚Äúet vous lui ressemblez exactement. 
 La ressemblance est incroyable. Seriez-vous int√©ress√©e par une audition?‚Äù Elle expliqua que le r√¥le valait quelques milliers de dollars pour deux jours de travail, plus les frais de voyage, de logement et de nourriture.


C1 level text
 Source: LinguaPress L‚ÄôEnigme du Manuscrit Disparu par Jean Dupont
 Le myst√®re du manuscrit disparu que tout le monde tentait de percer depuis des d√©cennies. On disait qu‚Äôil contenait les derni√®res ≈ìuvres finales, non publi√©es, d‚Äôun auteur renomm√©, 
 dont la disparition soudaine n‚Äôavait fait qu‚Äôajouter √† l‚Äôintrigue. On croyait que le manuscrit √©tait cach√© quelque part dans le vieux manoir, une structure labyrinthique remplie de passages secrets et de pi√®ces cach√©es. 
 Beaucoup avaient essay√© de le trouver, mais tous avaient √©chou√©. Les indices √©taient cryptiques, les dangers r√©els, et les enjeux √©lev√©s. Pour ceux qui osaient chercher, c‚Äô√©tait un voyage dans l‚Äôinconnu, un test d‚Äôesprit et de courage.



Dutch
B2 level text
 Source: LinguaPress Opgeloste mysteries ‚Äì een kortverhaal door Sarah Wollbach
 Megan‚Äôs acteercarri√®re begon op een ochtend een paar jaar geleden, toen een vrouw haar benaderde op de parkeerplaats van haar buurtwinkel. ‚ÄúExcuseer me,‚Äù zei ze, ‚Äúmaar heb je ooit acteerlessen gevolgd?‚Äù ‚Äî ‚ÄúNee,‚Äù antwoordde ze aarzelend. 
 De vrouw stak haar hand in haar zak en gaf Megan een kaartje. ‚ÄúIk ben een castingdirecteur voor Opgeloste mysteries,‚Äù zei ze, terwijl ze haar hand schudde. Megan was altijd al gefascineerd door het toneel. 
 Jarenlang had ze gefantaseerd over het zijn van een actrice, ervan overtuigd dat diep vanbinnen een briljante actrice zoals Meryl Streep of Julia Roberts schuilde. Misschien was dit haar grote doorbraak. 
 ‚ÄúDe show doet een reportage over een vrouw die ontvoerd is,‚Äù vervolgde de dame, ‚Äúen je lijkt precies op haar. De gelijkenis is verbazingwekkend. Zou je ge√Ønteresseerd zijn in een auditie?‚Äù 
 Ze zette uit dat dit alles een paar duizend dollar waard was voor twee dagen werk, plus reis-, verblijf- en voedselkosten.


C1 level text
 Source: LinguaPress Het Raadsel van het Verdwenen Manuscript door Jan Jansen
 Het mysterie van het verdwenen manuscript dat generaties schrijvers decennialang verbijsterd. Er werd gezegd dat het de laatste, ongepubliceerde werken van een beroemde auteur bevatte, wiens plotselinge verdwijning alleen maar bijdroeg aan de intrige. 
 Het gerucht deed de ronde dat het manuscript ergens in het oude herenhuis verborgen was, een labyrintische structuur vol geheime gangen en verborgen kamers. Velen hadden geprobeerd het te vinden, maar allemaal waren ze mislukt. 
 De aanwijzingen waren cryptisch, de gevaren echt, en de inzet hoog. Voor degenen die durfden te zoeken, was het een reis in het onbekende, een test van verstand en moed.


Input Text content to check: \"\"\"{text}\"\"\" 
"""
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")
        response = model.generate_content(prompt, generation_config={"temperature": 0.2})
        output = response.text.strip()

        if output.startswith("```"):
            output = output.strip("` \n").replace("xml", "").strip()

        scores = {
            "vocabulary_complexity": extract_xml_score(output, "vocabulary_complexity"),
            "grammatical_structures": extract_xml_score(output, "grammatical_structures"),
            "overall_clarity": extract_xml_score(output, "overall_clarity"),
            "coherence": extract_xml_score(output, "coherence"),
            "rationale": extract_xml_rationale(output),
        }
        return scores

    except Exception as e:
        return {
            "vocabulary_complexity": 0,
            "grammatical_structures": 0,
            "overall_clarity": 0,
            "coherence": 0,
            "rationale": f"Error: {str(e)}",
        }

def extract_xml_score(xml_text, tag):
    match = re.search(fr"<{tag}>(\d+)</{tag}>", xml_text)
    return int(match.group(1)) if match else 0

def extract_xml_rationale(xml_text):
    match = re.search(r"<rationale>(.*?)</rationale>", xml_text, re.DOTALL)
    return match.group(1).strip() if match else "No rationale found."

# ---------------------- Processing Function ----------------------
def process_single_url(url_data):
    """Process a single URL - designed for parallel execution"""
    url, page_type = url_data
    
    try:
        # Extract and clean text
        text = extract_clean_text_crelan(url)
        if not text.strip():
            return create_error_result(url, page_type, "No text extracted")
        
        cleaned_text = clean_crelan_text(text)
        
        # Score with Gemini
        result = score_page_with_gemini(cleaned_text, page_type)
        
        sub_scores = [
            result.get("vocabulary_complexity", 0),
            result.get("grammatical_structures", 0),
            result.get("overall_clarity", 0),
            result.get("coherence", 0),
        ]
        
        compliance_value = round(sum(sub_scores) / 4 * 10) if all(
            isinstance(score, int) and 0 <= score <= 10 for score in sub_scores
        ) else 0
        
        return {
            "URL": url,
            "Page Type": page_type,
            "Compliance Level": compliance_value,
            "Vocabulary Complexity": result.get("vocabulary_complexity"),
            "Grammatical Structures": result.get("grammatical_structures"),
            "Overall Clarity": result.get("overall_clarity"),
            "Coherence": result.get("coherence"),
            "Rationale": result.get("rationale"),
        }
        
    except Exception as e:
        return create_error_result(url, page_type, str(e))

def create_error_result(url, page_type, error_msg):
    """Create error result dictionary"""
    return {
        "URL": url,
        "Page Type": page_type,
        "Compliance Level": 0,
        "Vocabulary Complexity": 0,
        "Grammatical Structures": 0,
        "Overall Clarity": 0,
        "Coherence": 0,
        "Rationale": f"Error: {error_msg}",
    }

# ---------------------- Main Analysis ----------------------
def analyze_crelan_b2_compliance(excel_file_path, max_workers=4, batch_size=20):
    """Optimized main function with parallel processing"""
    
    print("üöÄ KBC B2 Compliance Analysis (Optimized)")
    print("="*60)
    
    # Load URLs from Excel
    try:
        df_urls = pd.read_excel(excel_file_path)
        url_columns = ['Address', 'URL', 'url', 'address', 'link', 'Link']
        url_column = None
        for col in url_columns:
            if col in df_urls.columns:
                url_column = col
                break
        
        if url_column is None:
            url_column = df_urls.columns[0]
        
        urls = df_urls[url_column].dropna().tolist()
        print(f"‚úÖ Loaded {len(urls)} URLs from {url_column} column")
        
    except Exception as e:
        print(f"‚ùå Error loading Excel file: {e}")
        return
    
    # Create DataFrame and classify URLs
    df = pd.DataFrame({'Address': urls})
    df["Page Type"] = df["Address"].apply(classify_crelan_url)
    
    print(f"\nüìä URL Distribution:")
    print(df['Page Type'].value_counts())
    
    # Setup files
    checkpoint_file = "Crelan/crelan_b2_checkpoint.csv"
    log_file = "Crelan/crelan_b2_log.txt"
    
    # Resume from checkpoint
    processed_urls = set()
    scores = []
    
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            processed_urls = set(line.strip() for line in f.readlines())
    
    if os.path.exists(checkpoint_file):
        scores_df = pd.read_csv(checkpoint_file)
        scores = scores_df.to_dict(orient='records')
        print(f"üìÇ Resuming from checkpoint with {len(scores)} existing scores")
    
    # Filter unprocessed URLs
    unprocessed_data = [
        (url, page_type) for url, page_type in zip(df['Address'], df['Page Type'])
        if url not in processed_urls
    ]
    
    if not unprocessed_data:
        print("‚úÖ All URLs already processed!")
        return pd.DataFrame(scores)
    
    print(f"üîÑ Processing {len(unprocessed_data)} remaining URLs with {max_workers} workers")
    
    # Process in batches with parallel execution
    for i in range(0, len(unprocessed_data), batch_size):
        batch = unprocessed_data[i:i + batch_size]
        batch_results = []
        
        print(f"\nüì¶ Processing batch {i//batch_size + 1}/{(len(unprocessed_data)-1)//batch_size + 1}")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks in the batch
            future_to_url = {
                executor.submit(process_single_url, url_data): url_data[0] 
                for url_data in batch
            }
            
            # Collect results with progress bar
            for future in tqdm(as_completed(future_to_url), total=len(batch), desc="ü§ñ Scoring"):
                url = future_to_url[future]
                try:
                    result = future.result(timeout=60)  # 60 second timeout per URL
                    batch_results.append(result)
                    
                    # Log processed URL
                    with open(log_file, 'a') as f:
                        f.write(url + '\n')
                        
                except Exception as e:
                    print(f"‚ùå Failed to process {url}: {e}")
                    # Add error result
                    batch_results.append(create_error_result(url, "Unknown", str(e)))
        
        # Add batch results to main scores
        scores.extend(batch_results)
        
        # Save checkpoint after each batch
        pd.DataFrame(scores).to_csv(checkpoint_file, index=False)
        print(f"üíæ Checkpoint saved: {len(scores)} total items processed")
        
        # Cleanup drivers for this batch
        for _ in range(max_workers):
            try:
                cleanup_driver()
            except:
                pass
        
        # Brief pause between batches
        time.sleep(2)
    
    # Final save and cleanup
    final_df = pd.DataFrame(scores)
    final_df.to_excel("Crelan/crelan_b2_final_results.xlsx", index=False)
    
    # Summary
    if len(final_df) > 0:
        overall_score = final_df["Compliance Level"].mean()
        print(f"\nüåê Overall CEFR B2 Accessibility Score: {overall_score:.2f}%")
        
        typology_avg = final_df.groupby("Page Type")["Compliance Level"].mean()
        for typ, score in typology_avg.items():
            status = "‚úÖ" if score >= 70 else "‚ö†Ô∏è"
            print(f"{status} {typ} pages: {score:.2f}% average compliance")
    
    print(f"\n‚úÖ Analysis complete! Results saved to: kbc_b2_final_results.xlsx")
    return final_df

# ---------------------- EXECUTION ----------------------
if __name__ == "__main__":
    excel_file_path = "Crelan/crelan_urls.xlsx"
    # Adjust parameters for your system:
    # max_workers: Number of parallel threads (4-8 recommended)
    # batch_size: URLs processed before checkpoint (20-50 recommended)
    analyze_crelan_b2_compliance(excel_file_path, max_workers=6, batch_size=30)

üöÄ KBC B2 Compliance Analysis (Optimized)
‚úÖ Loaded 1377 URLs from Address column

üìä URL Distribution:
Page Type
Other      974
Product    316
Legal       32
FAQ         30
Contact     17
Blog         8
Name: count, dtype: int64
üìÇ Resuming from checkpoint with 180 existing scores
üîÑ Processing 1179 remaining URLs with 6 workers

üì¶ Processing batch 1/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:23<00:00,  2.79s/it]


üíæ Checkpoint saved: 210 total items processed

üì¶ Processing batch 2/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:11<00:00,  2.39s/it]


üíæ Checkpoint saved: 240 total items processed

üì¶ Processing batch 3/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:11<00:00,  2.39s/it]


üíæ Checkpoint saved: 270 total items processed

üì¶ Processing batch 4/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:12<00:00,  2.41s/it]


üíæ Checkpoint saved: 300 total items processed

üì¶ Processing batch 5/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:12<00:00,  2.41s/it]


üíæ Checkpoint saved: 330 total items processed

üì¶ Processing batch 6/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:10<00:00,  2.36s/it]


üíæ Checkpoint saved: 360 total items processed

üì¶ Processing batch 7/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:12<00:00,  2.43s/it]


üíæ Checkpoint saved: 390 total items processed

üì¶ Processing batch 8/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:11<00:00,  2.39s/it]


üíæ Checkpoint saved: 420 total items processed

üì¶ Processing batch 9/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:14<00:00,  2.48s/it]


üíæ Checkpoint saved: 450 total items processed

üì¶ Processing batch 10/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:10<00:00,  2.36s/it]


üíæ Checkpoint saved: 480 total items processed

üì¶ Processing batch 11/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:11<00:00,  2.39s/it]


üíæ Checkpoint saved: 510 total items processed

üì¶ Processing batch 12/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:14<00:00,  2.50s/it]


üíæ Checkpoint saved: 540 total items processed

üì¶ Processing batch 13/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:14<00:00,  2.49s/it]


üíæ Checkpoint saved: 570 total items processed

üì¶ Processing batch 14/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:16<00:00,  2.53s/it]


üíæ Checkpoint saved: 600 total items processed

üì¶ Processing batch 15/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:15<00:00,  2.52s/it]


üíæ Checkpoint saved: 630 total items processed

üì¶ Processing batch 16/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:12<00:00,  2.42s/it]


üíæ Checkpoint saved: 660 total items processed

üì¶ Processing batch 17/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:12<00:00,  2.42s/it]


üíæ Checkpoint saved: 690 total items processed

üì¶ Processing batch 18/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:12<00:00,  2.40s/it]


üíæ Checkpoint saved: 720 total items processed

üì¶ Processing batch 19/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:15<00:00,  2.50s/it]


üíæ Checkpoint saved: 750 total items processed

üì¶ Processing batch 20/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:15<00:00,  2.52s/it]


üíæ Checkpoint saved: 780 total items processed

üì¶ Processing batch 21/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:16<00:00,  2.56s/it]


üíæ Checkpoint saved: 810 total items processed

üì¶ Processing batch 22/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:16<00:00,  2.53s/it]


üíæ Checkpoint saved: 840 total items processed

üì¶ Processing batch 23/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:16<00:00,  2.57s/it]


üíæ Checkpoint saved: 870 total items processed

üì¶ Processing batch 24/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:39<00:00,  3.32s/it]


üíæ Checkpoint saved: 900 total items processed

üì¶ Processing batch 25/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:15<00:00,  2.53s/it]


üíæ Checkpoint saved: 930 total items processed

üì¶ Processing batch 26/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:15<00:00,  2.53s/it]


üíæ Checkpoint saved: 960 total items processed

üì¶ Processing batch 27/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:26<00:00,  2.89s/it]


üíæ Checkpoint saved: 990 total items processed

üì¶ Processing batch 28/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:14<00:00,  2.47s/it]


üíæ Checkpoint saved: 1020 total items processed

üì¶ Processing batch 29/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:14<00:00,  2.50s/it]


üíæ Checkpoint saved: 1050 total items processed

üì¶ Processing batch 30/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:16<00:00,  2.56s/it]


üíæ Checkpoint saved: 1080 total items processed

üì¶ Processing batch 31/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:17<00:00,  2.59s/it]


üíæ Checkpoint saved: 1110 total items processed

üì¶ Processing batch 32/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:18<00:00,  2.60s/it]


üíæ Checkpoint saved: 1140 total items processed

üì¶ Processing batch 33/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:18<00:00,  2.60s/it]


üíæ Checkpoint saved: 1170 total items processed

üì¶ Processing batch 34/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:17<00:00,  2.59s/it]


üíæ Checkpoint saved: 1200 total items processed

üì¶ Processing batch 35/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:22<00:00,  2.74s/it]


üíæ Checkpoint saved: 1230 total items processed

üì¶ Processing batch 36/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:09<00:00,  2.32s/it]


üíæ Checkpoint saved: 1260 total items processed

üì¶ Processing batch 37/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:11<00:00,  2.39s/it]


üíæ Checkpoint saved: 1290 total items processed

üì¶ Processing batch 38/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:32<00:00,  1.10s/it]


üíæ Checkpoint saved: 1320 total items processed

üì¶ Processing batch 39/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [15:54<00:00, 31.83s/it]  


üíæ Checkpoint saved: 1350 total items processed

üì¶ Processing batch 40/40


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [00:28<00:00,  3.20s/it]


üíæ Checkpoint saved: 1359 total items processed

üåê Overall CEFR B2 Accessibility Score: 67.82%
‚ö†Ô∏è Blog pages: 52.00% average compliance
‚ö†Ô∏è Contact pages: 64.59% average compliance
‚úÖ FAQ pages: 70.73% average compliance
‚ö†Ô∏è Legal pages: 55.41% average compliance
‚ö†Ô∏è Other pages: 69.13% average compliance
‚ö†Ô∏è Product pages: 65.41% average compliance

‚úÖ Analysis complete! Results saved to: kbc_b2_final_results.xlsx


<div style="background-color: rgb(240, 142, 23); color: rgba(255, 255, 255, 1); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    8. Specialized Code for ING
</div>

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
import re
import google.generativeai as genai
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from functools import lru_cache

# ---------------------- CONFIG ----------------------
genai.configure(api_key="AIzaSyBzOT2O03scMENbdWouWexYa10v4K4OVPE")

# ---------------------- URL Classification ----------------------
@lru_cache(maxsize=1000)
def classify_ing_url(url):
   """Enhanced URL classification specifically for ING Bank (EN/FR/NL)"""
   url = url.lower()
   
   if any(term in url for term in [
       # Products - EN/FR/NL
       "product", "producten", "produits", "sparen", "saving", "epargne",
       "lenen", "loan", "pret", "credit", "rekening", "account", "compte",
       "beleggen", "investment", "investir", "hypotheek", "mortgage",
       "verzekering", "insurance", "assurance", "kaart", "card", "carte",
       "bankieren", "banking", "banque", "ing-plus", "orange", "zakelijk"
   ]):
       return "Product"
   
   elif any(term in url for term in [
       # Support - EN/FR/NL
       "faq", "support", "help", "hulp", "ondersteuning", "aide",
       "questions", "klantenservice", "service-client", "assistance", "live-chat"
   ]):
       return "FAQ"
   
   elif any(term in url for term in [
       # Legal - EN/FR/NL
       "legal", "juridisch", "juridique", "voorwaarden", "terms",
       "conditions", "privacy", "beleid", "policy",
       "cookie", "gdpr", "compliance", "tarieven", "tarifs", "fees"
   ]):
       return "Legal"
   
   elif any(term in url for term in [
       # Contact - EN/FR/NL
       "contact", "locatie", "location", "agences", "branches", "filialen",
       "kantoren", "afspraak", "appointment", "rendez-vous"
   ]):
       return "Contact"
   
   elif any(term in url for term in [
       # News/Blog - EN/FR/NL
       "blog", "nieuws", "news", "actualites", "insights", "perspectives"
   ]):
       return "Blog"
   
   else:
       return "Other"

# ---------------------- Text Extraction ----------------------
def extract_clean_text_ing(url):
    """Extract text from ING using their API - optimized for JSON structure"""
    try:
        from urllib.parse import urlparse
        parsed = urlparse(url)
        url_path = parsed.path
        
        api_url = "https://api.www.ing.be/be/public/pagemodel"
        params = {"pageUrl": url_path}
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'Referer': 'https://www.ing.be/',
        }
        
        response = requests.get(api_url, params=params, headers=headers, timeout=15)
        response.raise_for_status()
        data = response.json()

        # Extract meaningful text from ING's JSON structure
        def extract_text_from_ing_json(obj):
            texts = []
            
            if isinstance(obj, dict):
                for key, value in obj.items():
                    # Extract from specific meaningful fields
                    if key in ['title', 'body', 'text', 'intro', 'subtitle', 'promoText', 'description']:
                        if isinstance(value, str) and value.strip():
                            texts.append(value.strip())
                    
                    # Extract from richBody.value (common in ING responses)
                    elif key == 'richBody' and isinstance(value, dict):
                        if 'value' in value and isinstance(value['value'], str):
                            # Clean HTML tags from richBody content
                            from bs4 import BeautifulSoup
                            clean_html = BeautifulSoup(value['value'], 'html.parser').get_text()
                            texts.append(clean_html.strip())
                    
                    # Extract from textLinks
                    elif key == 'textLinks' and isinstance(value, list):
                        for link in value:
                            if isinstance(link, dict) and 'text' in link:
                                if isinstance(link['text'], str) and link['text'].strip():
                                    texts.append(link['text'].strip())
                    
                    # Extract from steps in applicationProcess
                    elif key == 'stepList' and isinstance(value, list):
                        for step in value:
                            if isinstance(step, dict):
                                if 'title' in step and isinstance(step['title'], str):
                                    texts.append(step['title'].strip())
                                if 'body' in step and isinstance(step['body'], str):
                                    texts.append(step['body'].strip())
                    
                    # Extract from USPs (unique selling points)
                    elif key == 'usps' and isinstance(value, list):
                        for usp in value:
                            if isinstance(usp, dict) and 'body' in usp:
                                if isinstance(usp['body'], str):
                                    texts.append(usp['body'].strip())
                    
                    # Recursively process nested objects
                    else:
                        texts.extend(extract_text_from_ing_json(value))
            
            elif isinstance(obj, list):
                for item in obj:
                    texts.extend(extract_text_from_ing_json(item))
            
            return texts

        # Extract all text content
        all_texts = extract_text_from_ing_json(data)
        
        # Join and clean
        combined_text = " ".join(all_texts)
        cleaned_text = clean_ing_text(combined_text)
        
        if cleaned_text and len(cleaned_text.strip()) > 100:
            return cleaned_text[:10000]
        else:
            return ""

    except Exception as e:
        print(f"‚ùå Error fetching ING API content for {url}: {e}")
        return ""

def clean_ing_text(raw_text):
    """
    Generalized text cleaning function for ING bank API responses
    Handles multi-language content (NL/FR/EN) and common ING patterns
    """
    if not raw_text or len(raw_text.strip()) < 20:
        return ""
    
    cleaned = raw_text
    
    # Remove ING API technical elements
    ing_technical_patterns = [
        r'componentType',
        r'transformBaseUrl',
        r'cardType',
        r'cardSize',
        r'flexComponents',
        r'flexPageMetadata',
        r'mainHeaderZone',
        r'flexZone',
        r'robotInstruction',
        r'\bposition\b',
        r'\bstepList\b',
        r'\btextLink\b',
        r'https://[^\s]*',
        r'/[a-z]{2}/[a-zA-Z\-/]*',  # Generic language paths
        r'\.(png|jpg|jpeg|svg|pdf)\b',
    ]
    
    for pattern in ing_technical_patterns:
        cleaned = re.sub(pattern, ' ', cleaned, flags=re.IGNORECASE)
    
    # Fix HTML entities and escaped characters
    html_entity_fixes = [
        (r'&gt;', '>'),
        (r'&lt;', '<'),
        (r'&amp;', '&'),
        (r'&quot;', '"'),
        (r'&apos;', "'"),
        (r'&#39;', "'"),
        (r'\\\'', "'"),
        (r'\\"', '"'),
        (r'\'', "'"),
    ]
    
    for pattern, replacement in html_entity_fixes:
        cleaned = re.sub(pattern, replacement, cleaned)
    
    # Fix navigation breadcrumbs
    navigation_breadcrumb_fixes = [
        (r'www\.ing\.\s*be\s*>\s*([^>]+)\s*>\s*([^>]+)\s*>\s*([^>]+)', r'www.ing.be under \1, \2, \3'),
        (r'www\.ing\.\s*be\s*>\s*([^>]+)\s*>\s*([^>]+)', r'www.ing.be under \1, \2'),
        (r'www\.ing\.\s*be\s*>\s*([^>]+)', r'www.ing.be under \1'),
        (r'\s*>\s*', ' > '),
    ]
    
    for pattern, replacement in navigation_breadcrumb_fixes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)
    
    # Fix common grammatical patterns
    grammatical_fixes = [
        # Missing auxiliary verbs
        (r'\bwill sent to', r'will be sent to'),
        (r'\bwill available', r'will be available'),
        (r'\bwill happy to', r'will be happy to'),
        (r'\bwill generated', r'will be generated'),
        (r'\bshall renewed', r'shall be renewed'),
        (r'\bshall tacitly renewed', r'shall be tacitly renewed'),
        (r'\bmay extended', r'may be extended'),
        (r'\bcan insured', r'can be insured'),
        (r'\bcan subscribed', r'can be subscribed'),
        (r'\bis by phone', r'is available by phone'),
        
        # Missing prepositions
        (r'\ba result of', r'as a result of'),
        (r'\bcovered this', r'covered by this'),
        (r'\bget to (\d+%)', r'get up to \1'),
        (r'\bup to (\d+) years old\b', r'up to \1 years old'),
        
        # Missing conditional words
        (r'\bapplies only you', r'applies only if you'),
        (r'\bonly applies the', r'only applies if the'),
        (r'\bonly subscribed the', r'only be subscribed if the'),
        
        # Question formation fixes
        (r'\bHow can ([a-z]+) the', r'How can I \1 the'),
        (r'\bWhat need to do\?', r'What do I need to do?'),
        (r'\bCan ([a-z]+) premium', r'Can I \1 premium'),
        
        # Common missing words
        (r'\bIt\'s to you', r'It\'s up to you'),
        (r'\bfor to (\d+)', r'for up to \1'),
        (r'\blong you don\'t', r'As long as you don\'t'),
        
        # ING-specific terms
        (r'\bHome\'Bank', r'Home Bank'),
        (r'\'([^\']+)\'', r'"\1"'),
    ]
    
    for pattern, replacement in grammatical_fixes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)
    
    # Remove common ING navigation and CTA patterns
    navigation_patterns = [
        r'Log in to receive your personal [a-zA-Z\s]+',
        r'Not a (client|customer) yet\?[^.]*',
        r'Click here[^.]*',
        r'Call the ING [^)]+\)',
        r'Call us on \+32 [0-9 ]+',
        r'Monday to Friday[^.]+',
        r'available \d+/\d+',
        r'Or,?\s*available \d+/\d+',
    ]
    
    for pattern in navigation_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    
    # Remove standalone technical words
    technical_words = [
        'componentType', 'transformBaseUrl', 'cardType', 'cardSize', 
        'position', 'stepList', 'textLink', 'richBody', 'flexZone',
        'mainHeaderZone', 'alignedImage', 'usps', 'cta', 'sectionTitle'
    ]
    
    for word in technical_words:
        cleaned = re.sub(rf'\b{re.escape(word)}\b', '', cleaned, flags=re.IGNORECASE)
    
    # Fix repetitive words
    cleaned = re.sub(r'\b(\w+)(\s+\1){2,}', r'\1', cleaned)
    
    # Clean up formatting and spacing
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'[.]{2,}', '.', cleaned)
    cleaned = re.sub(r'\s+([.,!?;:])', r'\1', cleaned)
    cleaned = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1 \2', cleaned)
    
    # Fix spacing around special characters
    cleaned = re.sub(r'\u200b', '', cleaned)
    cleaned = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned)
    
    # Remove empty brackets and orphaned punctuation
    cleaned = re.sub(r'\(\s*\)', '', cleaned)
    cleaned = re.sub(r'\[\s*\]', '', cleaned)
    cleaned = re.sub(r'\{\s*\}', '', cleaned)
    cleaned = re.sub(r'\s*,\s*\.', '.', cleaned)
    cleaned = re.sub(r'^\s*[,.]', '', cleaned)
    cleaned = re.sub(r'\s*[>]+\s*$', '', cleaned)
    cleaned = re.sub(r'^\s*[>]+', '', cleaned)
    
    # Fix orphaned punctuation and incomplete sentences
    cleaned = re.sub(r'\s+\?\s+', '. ', cleaned)  # Orphaned question marks
    cleaned = re.sub(r'\s+\.\s+', '. ', cleaned)  # Orphaned periods
    cleaned = re.sub(r':\s*$', '', cleaned)  # Trailing colons
    cleaned = re.sub(r'\s*\(\s*$', '', cleaned)  # Orphaned opening parentheses
    cleaned = re.sub(r'^\s*\)', '', cleaned)  # Orphaned closing parentheses
    cleaned = re.sub(r'\s+Or\s*$', '', cleaned)  # Trailing "Or"
    
    # Remove incomplete sentences at the end (be more conservative)
    cleaned = re.sub(r'\s+[A-Z][a-z]{0,3}\s*$', '', cleaned)  # Very short incomplete words
    cleaned = re.sub(r'\s+\.$', '.', cleaned)
    
    # Fix specific common fragments
    cleaned = re.sub(r'assistance\?\s*', 'assistance? ', cleaned)
    cleaned = re.sub(r'\?\s*You can still get', '. You can still get', cleaned)
    cleaned = re.sub(r'Are you a tenant instead\?\s*\.', 'Are you a tenant instead?', cleaned)
    
    # Final cleanup
    cleaned = re.sub(r'\s+', ' ', cleaned)
    
    return cleaned.strip()

# ---------------------- Gemini Scoring ----------------------
def score_page_with_gemini(text, page_type):
    prompt = f"""
**Context:** This prompt is designed for the Gemini language model to evaluate the CEFR B2 level compliance of webpage content from retail banking websites for regulatory compliance. The evaluation focuses on vocabulary, grammar, clarity, and coherence to determine if the text is easily understandable for someone at a B2 level in English, French, Dutch or German. The desired output includes the compliance level percentage and individual scores for vocabulary complexity, grammatical structures, overall clarity, and coherence, with a detailed rationale for each evaluated address presented in a single cell of an output file (e.g., CSV or Excel). The goal is to ensure the evaluation effectively differentiates between webpages with varying levels of B2 compliance, leading to a wider range of scores, and that the rationale is comprehensive yet concise enough to fit within a single cell per address. **It is important to consider that these are banking websites, and some technical or financial terms may be inherent to the content.**

**Task:** Assess the CEFR B2 compliance level of the provided webpage content, ensuring a variable range of scores and a detailed, single-cell rationale for each evaluated address, **while acknowledging the potential presence of necessary banking terminology.**

**Instructions:**

1. **Identify Language:** Determine if the input text is in English, French, Dutch or German.

2. **Evaluate B2 Compliance with Granularity (Considering Banking Terms):** Analyze the text against the CEFR B2 criteria for the identified language, critically and precisely assessing the following aspects on a scale of 0 to 10. Avoid assigning only 0 or 10; use the full scale based on nuance and subtlety. ‚ÄúDo not hesitate to assign low (0‚Äì4) or high (8‚Äì10) scores when the text clearly deserves it. Avoid accumulating around 6‚Äì7 unless the text is truly average.‚Äù Remember the compliancy threshold is 70% (7/10) for B2 level. Therefore if a text is generally compliant it should receive a total score of higher than or equal to 70.

- **Vocabulary Complexity (0‚Äì10)**
  - 10 ‚Üí very simple, common words, basic banking terms, no jargon
  - 7‚Äì9 ‚Üí mostly common words, occasional technical terms explained
  - 4‚Äì6 ‚Üí mix of general and technical terms, some unnecessarily complex or rare words
  - 1‚Äì3 ‚Üí frequent use of complex, low-frequency words or jargon, often unexplained
  - 0 ‚Üí highly complex, dense language with rare or unexplained terms everywhere

- **Grammatical Structures (0‚Äì10)**
  - 10 ‚Üí simple sentences, clear structure, active voice, no complex clauses
  - 7‚Äì9 ‚Üí mostly simple, some moderate clauses, minor passive use
  - 4‚Äì6 ‚Üí mix of simple and complex sentences, occasional embedded or passive forms
  - 1‚Äì3 ‚Üí mostly long, embedded, or passive structures, hard to follow
  - 0 ‚Üí extremely complex grammar, frequent embedding, difficult to parse

- **Overall Clarity (0‚Äì10)**
  - 10 ‚Üí very clear, easy to understand, minimal effort required
  - 7‚Äì9 ‚Üí mostly clear, small moments of complexity
  - 4‚Äì6 ‚Üí mixed clarity, occasional confusion or ambiguity
  - 1‚Äì3 ‚Üí often unclear, requires effort to interpret
  - 0 ‚Üí very unclear, confusing, hard to follow

- **Coherence (0‚Äì10)**
  - 10 ‚Üí logical flow, clear organization, excellent connectors
  - 7‚Äì9 ‚Üí mostly logical, some jumps, minor missing links
  - 4‚Äì6 ‚Üí mixed coherence, weak transitions, partial disorganization
  - 1‚Äì3 ‚Üí often disorganized, unclear connections
  - 0 ‚Üí no logical order, chaotic, fragmented

3. **Provide Detailed Rationale (Single Cell):** Explain the reasoning behind each of the four scores within a single text string suitable for one Excel cell. Explicitly point out specific linguistic features (vocabulary, grammar, discourse markers) that contribute to the assigned level of complexity or simplicity for each criterion. When discussing vocabulary, specifically comment on the presence and handling of banking terminology. Justify why the text is or is not strictly at the B2 level for each aspect. Use clear separators (e.g., "; ") between the rationale for each criterion to ensure readability within the single cell.
```xml
<rationale>Vocabulary: [Explanation with examples, noting banking terms]; Grammar: [Explanation with examples]; Clarity: [Explanation with examples, considering banking terms]; Coherence: [Explanation with examples]</rationale>

**Output Format:**
Return the evaluation in the following XML format, ensuring all information for a single evaluated webpage address can be represented as a single row in an output file:
```xml
<vocabulary_complexity>Y</vocabulary_complexity>
<grammatical_structures>Z</grammatical_structures>
<overall_clarity>W</overall_clarity>
<coherence>V</coherence>
<rationale>Vocabulary: [Explanation with examples, noting banking terms], Grammar: [Explanation with examples], Clarity: [Explanation with examples, considering banking terms], Coherence: [Explanation with examples]</rationale>

Examples of B2 Compliant Texts and C1 Texts Which Are Not B2 Compliant
 To help you understand the evaluation criteria, here are some examples of texts rated at B2 and C1 levels:

English
B2 level text
 Source: LinguaPress Unsolved mysteries ‚Äì a short story by Sarah Wollbach
 Megan‚Äôs acting career began one morning a couple of years ago, when a woman approached her in the parking lot of her neighborhood grocery store. ‚ÄúExcuse me,‚Äù she said, ‚Äúbut have you ever taken acting lessons?‚Äù ‚Äî ‚ÄúNo,‚Äù she answered hesitantly. 
 The woman reached into her pocket and handed Megan a card. ‚ÄúI‚Äôm a casting director for Unsolved Mysteries,‚Äù she said, shaking her hand. Megan had always been stage-struck. 
 For years she'd fantasized about being an actor, sure that deep within her lurked a brilliant chameleon like Meryl Streep or Julia Roberts. Maybe this was her big break. 
 ‚ÄúThe show‚Äôs doing a feature about a woman who was kidnapped,‚Äù the lady continued, ‚Äúand you look exactly like her. The resemblance is amazing. Would you be interested in auditioning?‚Äù 
 The episode aired the next week, with a couple of thousand dollars for two days‚Äô work, plus travel, lodging, and food expenses.


C1 level text
 Source: LinguaPress The Enigma of the Missing Manuscript by John Doe
 The mystery of the missing manuscript has eluded generations of writers. It was said to contain the final, unpublished works and annotations of the author, whose sudden disappearance 
 had only added to the intrigue. The manuscript was believed to be hidden somewhere in the old mansion, a labyrinthine structure filled with secret passages and hidden rooms. 
 Many had tried to find it, but all had failed. The clues were cryptic, the dangers real, and the stakes high. For those who dared to search, it was a journey into the unknown, a test of wit and courage.



French
B2 level text
 Source: LinguaPress Myst√®res non r√©solus ‚Äì une histoire courte par Sarah Wollbach
 La carri√®re d‚Äôactrice de Megan a commenc√© un matin il y a quelques ann√©es, lorsqu‚Äôune femme l‚Äôa abord√©e dans le parking de son √©picerie de quartier. 
 ‚ÄúExcusez-moi,‚Äù dit-elle, ‚Äúmais avez-vous d√©j√† pris des cours de th√©√¢tre?‚Äù ‚Äî ‚ÄúNon,‚Äù r√©pondit-elle avec h√©sitation. La femme a fouill√© dans sa poche et tendu une carte √† Megan. 
 ‚ÄúJe suis directrice de casting de Myst√®res non r√©solus,‚Äù dit-elle en lui serrant la main. Megan avait toujours √©t√© fascin√©e par la sc√®ne. 
 Pendant des ann√©es, elle avait nourri en secret le r√™ve d‚Äô√™tre actrice, convaincue qu‚Äôau fond d‚Äôelle-m√™me se cachait un brillant cam√©l√©on comme Meryl Streep ou Julia Roberts. 
 Peut-√™tre que c‚Äô√©tait sa grande chance. ‚ÄúL‚Äô√©mission fait un reportage sur une femme qui a √©t√© kidnapp√©e,‚Äù continua la dame, ‚Äúet vous lui ressemblez exactement. 
 La ressemblance est incroyable. Seriez-vous int√©ress√©e par une audition?‚Äù Elle expliqua que le r√¥le valait quelques milliers de dollars pour deux jours de travail, plus les frais de voyage, de logement et de nourriture.


C1 level text
 Source: LinguaPress L‚ÄôEnigme du Manuscrit Disparu par Jean Dupont
 Le myst√®re du manuscrit disparu que tout le monde tentait de percer depuis des d√©cennies. On disait qu‚Äôil contenait les derni√®res ≈ìuvres finales, non publi√©es, d‚Äôun auteur renomm√©, 
 dont la disparition soudaine n‚Äôavait fait qu‚Äôajouter √† l‚Äôintrigue. On croyait que le manuscrit √©tait cach√© quelque part dans le vieux manoir, une structure labyrinthique remplie de passages secrets et de pi√®ces cach√©es. 
 Beaucoup avaient essay√© de le trouver, mais tous avaient √©chou√©. Les indices √©taient cryptiques, les dangers r√©els, et les enjeux √©lev√©s. Pour ceux qui osaient chercher, c‚Äô√©tait un voyage dans l‚Äôinconnu, un test d‚Äôesprit et de courage.



Dutch
B2 level text
 Source: LinguaPress Opgeloste mysteries ‚Äì een kortverhaal door Sarah Wollbach
 Megan‚Äôs acteercarri√®re begon op een ochtend een paar jaar geleden, toen een vrouw haar benaderde op de parkeerplaats van haar buurtwinkel. ‚ÄúExcuseer me,‚Äù zei ze, ‚Äúmaar heb je ooit acteerlessen gevolgd?‚Äù ‚Äî ‚ÄúNee,‚Äù antwoordde ze aarzelend. 
 De vrouw stak haar hand in haar zak en gaf Megan een kaartje. ‚ÄúIk ben een castingdirecteur voor Opgeloste mysteries,‚Äù zei ze, terwijl ze haar hand schudde. Megan was altijd al gefascineerd door het toneel. 
 Jarenlang had ze gefantaseerd over het zijn van een actrice, ervan overtuigd dat diep vanbinnen een briljante actrice zoals Meryl Streep of Julia Roberts schuilde. Misschien was dit haar grote doorbraak. 
 ‚ÄúDe show doet een reportage over een vrouw die ontvoerd is,‚Äù vervolgde de dame, ‚Äúen je lijkt precies op haar. De gelijkenis is verbazingwekkend. Zou je ge√Ønteresseerd zijn in een auditie?‚Äù 
 Ze zette uit dat dit alles een paar duizend dollar waard was voor twee dagen werk, plus reis-, verblijf- en voedselkosten.


C1 level text
 Source: LinguaPress Het Raadsel van het Verdwenen Manuscript door Jan Jansen
 Het mysterie van het verdwenen manuscript dat generaties schrijvers decennialang verbijsterd. Er werd gezegd dat het de laatste, ongepubliceerde werken van een beroemde auteur bevatte, wiens plotselinge verdwijning alleen maar bijdroeg aan de intrige. 
 Het gerucht deed de ronde dat het manuscript ergens in het oude herenhuis verborgen was, een labyrintische structuur vol geheime gangen en verborgen kamers. Velen hadden geprobeerd het te vinden, maar allemaal waren ze mislukt. 
 De aanwijzingen waren cryptisch, de gevaren echt, en de inzet hoog. Voor degenen die durfden te zoeken, was het een reis in het onbekende, een test van verstand en moed.


Input Text content to check: \"\"\"{text}\"\"\" 
"""
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")
        response = model.generate_content(prompt, generation_config={"temperature": 0.2})
        output = response.text.strip()

        if output.startswith("```"):
            output = output.strip("` \n").replace("xml", "").strip()

        scores = {
            "vocabulary_complexity": extract_xml_score(output, "vocabulary_complexity"),
            "grammatical_structures": extract_xml_score(output, "grammatical_structures"),
            "overall_clarity": extract_xml_score(output, "overall_clarity"),
            "coherence": extract_xml_score(output, "coherence"),
            "rationale": extract_xml_rationale(output),
        }
        return scores

    except Exception as e:
        return {
            "vocabulary_complexity": 0,
            "grammatical_structures": 0,
            "overall_clarity": 0,
            "coherence": 0,
            "rationale": f"Error: {str(e)}",
        }

def extract_xml_score(xml_text, tag):
    match = re.search(fr"<{tag}>(\d+)</{tag}>", xml_text)
    return int(match.group(1)) if match else 0

def extract_xml_rationale(xml_text):
    match = re.search(r"<rationale>(.*?)</rationale>", xml_text, re.DOTALL)
    return match.group(1).strip() if match else "No rationale found."

# ---------------------- Processing Function ----------------------
def process_single_url(url_data):
    """Process a single URL - designed for parallel execution"""
    url, page_type = url_data
    
    try:
        # Extract and clean text using API
        text = extract_clean_text_ing(url)
        if not text.strip():
            return create_error_result(url, page_type, "No text extracted from API")
        
        cleaned_text = clean_ing_text(text)
        
        # Score with Gemini
        result = score_page_with_gemini(cleaned_text, page_type)
        
        sub_scores = [
            result.get("vocabulary_complexity", 0),
            result.get("grammatical_structures", 0),
            result.get("overall_clarity", 0),
            result.get("coherence", 0),
        ]
        
        compliance_value = round(sum(sub_scores) / 4 * 10) if all(
            isinstance(score, int) and 0 <= score <= 10 for score in sub_scores
        ) else 0
        
        return {
            "URL": url,
            "Page Type": page_type,
            "Compliance Level": compliance_value,
            "Vocabulary Complexity": result.get("vocabulary_complexity"),
            "Grammatical Structures": result.get("grammatical_structures"),
            "Overall Clarity": result.get("overall_clarity"),
            "Coherence": result.get("coherence"),
            "Rationale": result.get("rationale"),
        }
        
    except Exception as e:
        return create_error_result(url, page_type, str(e))

def create_error_result(url, page_type, error_msg):
    """Create error result dictionary"""
    return {
        "URL": url,
        "Page Type": page_type,
        "Compliance Level": 0,
        "Vocabulary Complexity": 0,
        "Grammatical Structures": 0,
        "Overall Clarity": 0,
        "Coherence": 0,
        "Rationale": f"Error: {error_msg}",
    }

# ---------------------- Main Analysis ----------------------
def analyze_ing_b2_compliance(excel_file_path, max_workers=4, batch_size=20):
    """Optimized main function with parallel processing"""
    
    print("üöÄ KBC B2 Compliance Analysis (Optimized)")
    print("="*60)
    
    # Load URLs from Excel
    try:
        df_urls = pd.read_excel(excel_file_path)
        url_columns = ['Address', 'URL', 'url', 'address', 'link', 'Link']
        url_column = None
        for col in url_columns:
            if col in df_urls.columns:
                url_column = col
                break
        
        if url_column is None:
            url_column = df_urls.columns[0]
        
        urls = df_urls[url_column].dropna().tolist()
        print(f"‚úÖ Loaded {len(urls)} URLs from {url_column} column")
        
    except Exception as e:
        print(f"‚ùå Error loading Excel file: {e}")
        return
    
    # Create DataFrame and classify URLs
    df = pd.DataFrame({'Address': urls})
    df["Page Type"] = df["Address"].apply(classify_ing_url)
    
    print(f"\nüìä URL Distribution:")
    print(df['Page Type'].value_counts())
    
    # Setup files
    checkpoint_file = "ING/ing_b2_checkpoint.csv"
    log_file = "ING/ing_b2_log.txt"
    
    # Resume from checkpoint
    processed_urls = set()
    scores = []
    
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            processed_urls = set(line.strip() for line in f.readlines())
    
    if os.path.exists(checkpoint_file):
        scores_df = pd.read_csv(checkpoint_file)
        scores = scores_df.to_dict(orient='records')
        print(f"üìÇ Resuming from checkpoint with {len(scores)} existing scores")
    
    # Filter unprocessed URLs
    unprocessed_data = [
        (url, page_type) for url, page_type in zip(df['Address'], df['Page Type'])
        if url not in processed_urls
    ]
    
    if not unprocessed_data:
        print("‚úÖ All URLs already processed!")
        return pd.DataFrame(scores)
    
    print(f"üîÑ Processing {len(unprocessed_data)} remaining URLs with {max_workers} workers")
    
    # Process in batches with parallel execution
    for i in range(0, len(unprocessed_data), batch_size):
        batch = unprocessed_data[i:i + batch_size]
        batch_results = []
        
        print(f"\nüì¶ Processing batch {i//batch_size + 1}/{(len(unprocessed_data)-1)//batch_size + 1}")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks in the batch
            future_to_url = {
                executor.submit(process_single_url, url_data): url_data[0] 
                for url_data in batch
            }
            
            # Collect results with progress bar
            for future in tqdm(as_completed(future_to_url), total=len(batch), desc="ü§ñ Scoring"):
                url = future_to_url[future]
                try:
                    result = future.result(timeout=60)  # 60 second timeout per URL
                    batch_results.append(result)
                    
                    # Log processed URL
                    with open(log_file, 'a') as f:
                        f.write(url + '\n')
                        
                except Exception as e:
                    print(f"‚ùå Failed to process {url}: {e}")
                    # Add error result
                    batch_results.append(create_error_result(url, "Unknown", str(e)))
        
        # Add batch results to main scores
        scores.extend(batch_results)
        
        # Save checkpoint after each batch
        pd.DataFrame(scores).to_csv(checkpoint_file, index=False)
        print(f"üíæ Checkpoint saved: {len(scores)} total items processed")
        
        # Brief pause between batches
        time.sleep(2)
    
    # Final save and cleanup
    final_df = pd.DataFrame(scores)
    final_df.to_excel("ING/ing_b2_final_results.xlsx", index=False)
    
    # Summary
    if len(final_df) > 0:
        overall_score = final_df["Compliance Level"].mean()
        print(f"\nüåê Overall CEFR B2 Accessibility Score: {overall_score:.2f}%")
        
        typology_avg = final_df.groupby("Page Type")["Compliance Level"].mean()
        for typ, score in typology_avg.items():
            status = "‚úÖ" if score >= 70 else "‚ö†Ô∏è"
            print(f"{status} {typ} pages: {score:.2f}% average compliance")
    
    print(f"\n‚úÖ Analysis complete! Results saved to: kbc_b2_final_results.xlsx")
    return final_df


In [25]:
text = extract_clean_text_ing("https://www.ing.be/en/individuals/insurance/insure-my-home/home-insurance-owners")
clean_text = clean_ing_text(text)
clean_text

'Protect your home with our reliable ING Property Insurance for homeowners. Cover water damage, fire, storms and vandalism. Immediate coverage, easy online application - even for non-clients! From ‚Ç¨11 a month Discounts 24-hour assistance? Calculate your price here Home insurance for owners Simulate and get your ING Property Insurance | Insurer: NN Non-life Promo: 15% discount with promocode HOME* Home Insurance Are you a tenant instead? ING Home Insurance covers damage from fire, water damage, theft, third-party damage and loss of keys.Have a question or need assistance with a claim? Our assistance team is available by phone (+32 2 725 15 00).Choose ING Home Insurance and get up to 40% discount for homes under construction and up to 20% for homes less than 7 years old. How to take out ING Property Insurance? Calculate Calculate the premium in a few steps. Apply Choose your cover and complete your application in 5 minutes. Confirm Your policies will be sent to you by email and you‚Äôl

In [27]:
text_2 = extract_clean_text_ing("https://www.ing.be/fr/particuliers/pension/fonds-epargne-pension")
clean_text_2 = clean_ing_text(text_2)
clean_text_2

'D√©couvrez Star Fund, notre offre de fonds d"√©pargne pension pour compl√©ter votre pension l√©gale tout en b√©n√©ficiant d"une r√©duction d"imp√¥t jusqu"√† 30%! Compl√©tez votre pension l√©gale √Ä partir de 10‚Ç¨/mois (120‚Ç¨ par ann√©e civile) Jusqu"√† 30% d"avantage fiscal Pr√©parer ma pension Simuler ma pension √âpargner pour votre pension tout en payant moins d"imp√¥ts? Compl√©tez votre pension l√©gale en investissant dans une offre diversifi√©e et b√©n√©ficiez d"un avantage fiscal annuel allant jusqu"√† 30%. ING_201230_02 67 Fonds d"√©pargne-pension Star Fund 10 tickets duo "ING Night" √† gagner* Pension *10 tickets duo "ING Night" √† gagner. Une soir√©e inoubliable au rythme d"artistes belges √† ne pas manquer. N"attendez plus et ouvrez un nouveau compte Star Fund Investment ou une assurance √©pargne-pension ING Life Star Plan. Consultez les conditions g√©n√©rales de cette action. Notre offre Star Fund en bref Faites d‚Äôune pierre deux coups et profitez d‚Äôun avantage fiscal 

In [4]:
if __name__ == "__main__":
    excel_file_path = "ING/ing_urls.xlsx"
    analyze_ing_b2_compliance(excel_file_path, max_workers=6, batch_size=30)

üöÄ KBC B2 Compliance Analysis (Optimized)
‚úÖ Loaded 3712 URLs from Address column

üìä URL Distribution:
Page Type
Other      1778
Product    1626
Blog        195
FAQ          44
Contact      42
Legal        27
Name: count, dtype: int64
üìÇ Resuming from checkpoint with 2010 existing scores
üîÑ Processing 1677 remaining URLs with 6 workers

üì¶ Processing batch 1/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.03it/s]


üíæ Checkpoint saved: 2040 total items processed

üì¶ Processing batch 2/56


ü§ñ Scoring:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [00:08<00:04,  3.15it/s]

‚ùå Error fetching ING API content for https://www.ing.be/fr/particuliers/assurer/adaptation-contrat: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Ffr%2Fparticuliers%2Fassurer%2Fadaptation-contrat


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.03it/s]


üíæ Checkpoint saved: 2070 total items processed

üì¶ Processing batch 3/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.00it/s]


üíæ Checkpoint saved: 2100 total items processed

üì¶ Processing batch 4/56


ü§ñ Scoring:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 21/30 [00:08<00:01,  6.13it/s]

‚ùå Error fetching ING API content for https://www.ing.be/fr/professionnel/financer/intakeformbusinesscreditcenter: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Ffr%2Fprofessionnel%2Ffinancer%2Fintakeformbusinesscreditcenter


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:12<00:00,  2.38it/s]


üíæ Checkpoint saved: 2130 total items processed

üì¶ Processing batch 5/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.04it/s]


üíæ Checkpoint saved: 2160 total items processed

üì¶ Processing batch 6/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.95it/s]


üíæ Checkpoint saved: 2190 total items processed

üì¶ Processing batch 7/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.94it/s]


üíæ Checkpoint saved: 2220 total items processed

üì¶ Processing batch 8/56


ü§ñ Scoring:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 26/30 [00:13<00:01,  2.51it/s]

‚ùå Error fetching ING API content for https://www.ing.be/fr/private-banking/publications/actualite-juridique-et-fiscale/articles-sur-limmobilier: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Ffr%2Fprivate-banking%2Fpublications%2Factualite-juridique-et-fiscale%2Farticles-sur-limmobilier


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.90it/s]


üíæ Checkpoint saved: 2250 total items processed

üì¶ Processing batch 9/56


ü§ñ Scoring:  13%|‚ñà‚ñé        | 4/30 [00:00<00:00, 39.68it/s]

‚ùå Error fetching ING API content for https://www.ing.be/fr/private-banking/publications/actualite-juridique-et-fiscale/articles-sur-linvestissements: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Ffr%2Fprivate-banking%2Fpublications%2Factualite-juridique-et-fiscale%2Farticles-sur-linvestissements
‚ùå Error fetching ING API content for https://www.ing.be/fr/private-banking/publications/actualite-juridique-et-fiscale/articles-varia: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Ffr%2Fprivate-banking%2Fpublications%2Factualite-juridique-et-fiscale%2Farticles-varia
‚ùå Error fetching ING API content for https://www.ing.be/fr/private-banking/publications/actualite-juridique-et-fiscale/articles-sur-limmobilier-a-letranger: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Ffr%2Fprivate-banking%2Fpublications%2Factualite-juridique-et-fiscale%2Farticles-sur-limmobilier-a-letranger
‚ùå Error f

ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:13<00:00,  2.23it/s]


üíæ Checkpoint saved: 2280 total items processed

üì¶ Processing batch 10/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:13<00:00,  2.19it/s]


üíæ Checkpoint saved: 2310 total items processed

üì¶ Processing batch 11/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.06it/s]


üíæ Checkpoint saved: 2340 total items processed

üì¶ Processing batch 12/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.03it/s]


üíæ Checkpoint saved: 2370 total items processed

üì¶ Processing batch 13/56


ü§ñ Scoring:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 24/30 [00:11<00:01,  3.77it/s]

‚ùå Error fetching ING API content for https://www.ing.be/fr/particuliers/gerer-le-quotidien/quel-est-linteret-davoir-une-carte-de-credit: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Ffr%2Fparticuliers%2Fgerer-le-quotidien%2Fquel-est-linteret-davoir-une-carte-de-credit
‚ùå Error fetching ING API content for https://www.ing.be/fr/private-banking/publications/actualite-juridique-et-fiscale/nouvelle-regularisation-fiscale-dans-laccord-de-gouvernement-federal-2025-2029: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Ffr%2Fprivate-banking%2Fpublications%2Factualite-juridique-et-fiscale%2Fnouvelle-regularisation-fiscale-dans-laccord-de-gouvernement-federal-2025-2029


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:13<00:00,  2.18it/s]


üíæ Checkpoint saved: 2400 total items processed

üì¶ Processing batch 14/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.02it/s]


üíæ Checkpoint saved: 2430 total items processed

üì¶ Processing batch 15/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.97it/s]


üíæ Checkpoint saved: 2460 total items processed

üì¶ Processing batch 16/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.89it/s]


üíæ Checkpoint saved: 2490 total items processed

üì¶ Processing batch 17/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.97it/s]


üíæ Checkpoint saved: 2520 total items processed

üì¶ Processing batch 18/56


ü§ñ Scoring:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [00:05<00:07,  2.67it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/campaign/bb-pi-carfair-2025: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fcampaign%2Fbb-pi-carfair-2025


ü§ñ Scoring:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [00:08<00:05,  2.58it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/professioneel/financieren/financiele-leasing-wagen: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fprofessioneel%2Ffinancieren%2Ffinanciele-leasing-wagen


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.99it/s]


üíæ Checkpoint saved: 2550 total items processed

üì¶ Processing batch 19/56


ü§ñ Scoring:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 16/30 [00:08<00:05,  2.54it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/professioneel/betalen/betaaloplossingen: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fprofessioneel%2Fbetalen%2Fbetaaloplossingen


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.02it/s]


üíæ Checkpoint saved: 2580 total items processed

üì¶ Processing batch 20/56


ü§ñ Scoring:  40%|‚ñà‚ñà‚ñà‚ñà      | 12/30 [00:06<00:07,  2.56it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/campaign/pi-carfair-2025: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fcampaign%2Fpi-carfair-2025


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.07it/s]


üíæ Checkpoint saved: 2610 total items processed

üì¶ Processing batch 21/56


ü§ñ Scoring:   0%|          | 0/30 [00:00<?, ?it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/voor-elke-dag/toegankelijkheid: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fvoor-elke-dag%2Ftoegankelijkheid


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.10it/s]


üíæ Checkpoint saved: 2640 total items processed

üì¶ Processing batch 22/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.05it/s]


üíæ Checkpoint saved: 2670 total items processed

üì¶ Processing batch 23/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.93it/s]


üíæ Checkpoint saved: 2700 total items processed

üì¶ Processing batch 24/56


ü§ñ Scoring:  13%|‚ñà‚ñé        | 4/30 [00:03<00:15,  1.70it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/beleggen/invest-academy-nl: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fbeleggen%2Finvest-academy-nl


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.01it/s]


üíæ Checkpoint saved: 2730 total items processed

üì¶ Processing batch 25/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.08it/s]


üíæ Checkpoint saved: 2760 total items processed

üì¶ Processing batch 26/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.90it/s]


üíæ Checkpoint saved: 2790 total items processed

üì¶ Processing batch 27/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.07it/s]


üíæ Checkpoint saved: 2820 total items processed

üì¶ Processing batch 28/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  2.00it/s]


üíæ Checkpoint saved: 2850 total items processed

üì¶ Processing batch 29/56


ü§ñ Scoring:  13%|‚ñà‚ñé        | 4/30 [00:02<00:11,  2.23it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/beleggen/nieuw-platform-private-banking: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fbeleggen%2Fnieuw-platform-private-banking
‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/beleggen/nieuw-platform-legal-entities: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fbeleggen%2Fnieuw-platform-legal-entities
‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/beleggen/nieuw-platform-private-individuals: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fbeleggen%2Fnieuw-platform-private-individuals
‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/beleggen/nieuw-platform-beleggen-2023: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulie

ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:13<00:00,  2.22it/s]


üíæ Checkpoint saved: 2880 total items processed

üì¶ Processing batch 30/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.10it/s]


üíæ Checkpoint saved: 2910 total items processed

üì¶ Processing batch 31/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.09it/s]


üíæ Checkpoint saved: 2940 total items processed

üì¶ Processing batch 32/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.05it/s]


üíæ Checkpoint saved: 2970 total items processed

üì¶ Processing batch 33/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:13<00:00,  2.21it/s]


üíæ Checkpoint saved: 3000 total items processed

üì¶ Processing batch 34/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.09it/s]


üíæ Checkpoint saved: 3030 total items processed

üì¶ Processing batch 35/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.08it/s]


üíæ Checkpoint saved: 3060 total items processed

üì¶ Processing batch 36/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.96it/s]


üíæ Checkpoint saved: 3090 total items processed

üì¶ Processing batch 37/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.93it/s]


üíæ Checkpoint saved: 3120 total items processed

üì¶ Processing batch 38/56


ü§ñ Scoring:  13%|‚ñà‚ñé        | 4/30 [00:00<00:00, 38.17it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/professioneel/financieren/intakeformbusinesscreditcenter: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fprofessioneel%2Ffinancieren%2Fintakeformbusinesscreditcenter


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.12it/s]


üíæ Checkpoint saved: 3150 total items processed

üì¶ Processing batch 39/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.96it/s]


üíæ Checkpoint saved: 3180 total items processed

üì¶ Processing batch 40/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.11it/s]


üíæ Checkpoint saved: 3210 total items processed

üì¶ Processing batch 41/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:16<00:00,  1.87it/s]


üíæ Checkpoint saved: 3240 total items processed

üì¶ Processing batch 42/56


ü§ñ Scoring:   0%|          | 0/30 [00:00<?, ?it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/mijn-leven/wonen/tweede-verblijf-bestemming: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fmijn-leven%2Fwonen%2Ftweede-verblijf-bestemming


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.06it/s]


üíæ Checkpoint saved: 3270 total items processed

üì¶ Processing batch 43/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.09it/s]


üíæ Checkpoint saved: 3300 total items processed

üì¶ Processing batch 44/56


ü§ñ Scoring:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23/30 [00:11<00:02,  2.55it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/mijn-vermogen-beheren/beleggingsacademie/vastgoed-kopen-als-natuurlijk-persoon-of-als-vennootschap: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fmijn-vermogen-beheren%2Fbeleggingsacademie%2Fvastgoed-kopen-als-natuurlijk-persoon-of-als-vennootschap


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.99it/s]


üíæ Checkpoint saved: 3330 total items processed

üì¶ Processing batch 45/56


ü§ñ Scoring:  20%|‚ñà‚ñà        | 6/30 [00:02<00:06,  3.88it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/mijn-vermogen-beheren/successieplanning/bescherm-de-langstlevende-echtgenoot: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fmijn-vermogen-beheren%2Fsuccessieplanning%2Fbescherm-de-langstlevende-echtgenoot


ü§ñ Scoring:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [00:05<00:09,  2.02it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/mijn-vermogen-beheren/successieplanning/cadeau-of-schenking: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fmijn-vermogen-beheren%2Fsuccessieplanning%2Fcadeau-of-schenking
‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/mijn-vermogen-beheren/successieplanning/schenken-welke-mogelijkheden: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fmijn-vermogen-beheren%2Fsuccessieplanning%2Fschenken-welke-mogelijkheden


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.13it/s]


üíæ Checkpoint saved: 3360 total items processed

üì¶ Processing batch 46/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.95it/s]


üíæ Checkpoint saved: 3390 total items processed

üì¶ Processing batch 47/56


ü§ñ Scoring:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 18/30 [00:08<00:04,  2.61it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/diensten/ken-onze-klant: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fdiensten%2Fken-onze-klant


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:13<00:00,  2.23it/s]


üíæ Checkpoint saved: 3420 total items processed

üì¶ Processing batch 48/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:14<00:00,  2.06it/s]


üíæ Checkpoint saved: 3450 total items processed

üì¶ Processing batch 49/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.97it/s]


üíæ Checkpoint saved: 3480 total items processed

üì¶ Processing batch 50/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:16<00:00,  1.78it/s]


üíæ Checkpoint saved: 3510 total items processed

üì¶ Processing batch 51/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.93it/s]


üíæ Checkpoint saved: 3540 total items processed

üì¶ Processing batch 52/56


ü§ñ Scoring:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [00:08<00:04,  3.15it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/private-banking/nieuws/juridisch-en-fiscaal-nieuws/artikels-uw-familiebedrijf: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fprivate-banking%2Fnieuws%2Fjuridisch-en-fiscaal-nieuws%2Fartikels-uw-familiebedrijf
‚ùå Error fetching ING API content for https://www.ing.be/nl/private-banking/nieuws/juridisch-en-fiscaal-nieuws/artikels-varia: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fprivate-banking%2Fnieuws%2Fjuridisch-en-fiscaal-nieuws%2Fartikels-varia


ü§ñ Scoring:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19/30 [00:09<00:03,  3.60it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/private-banking/nieuws/juridisch-en-fiscaal-nieuws/artikels-beleggen: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fprivate-banking%2Fnieuws%2Fjuridisch-en-fiscaal-nieuws%2Fartikels-beleggen


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:13<00:00,  2.18it/s]


üíæ Checkpoint saved: 3570 total items processed

üì¶ Processing batch 53/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.99it/s]


üíæ Checkpoint saved: 3600 total items processed

üì¶ Processing batch 54/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.97it/s]


üíæ Checkpoint saved: 3630 total items processed

üì¶ Processing batch 55/56


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:15<00:00,  1.96it/s]


üíæ Checkpoint saved: 3660 total items processed

üì¶ Processing batch 56/56


ü§ñ Scoring:  37%|‚ñà‚ñà‚ñà‚ñã      | 10/27 [00:05<00:05,  2.85it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/particulieren/voor-elke-dag/wat-is-het-nut-van-een-kredietkaart: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fparticulieren%2Fvoor-elke-dag%2Fwat-is-het-nut-van-een-kredietkaart


ü§ñ Scoring:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 16/27 [00:08<00:04,  2.41it/s]

‚ùå Error fetching ING API content for https://www.ing.be/nl/private-banking/nieuws/juridisch-en-fiscaal-nieuws/nieuwe-fiscale-regularisatieronde-in-het-federale-regeerakkoord-2025-2029: 404 Client Error:  for url: https://api.www.ing.be/be/public/pagemodel?pageUrl=%2Fnl%2Fprivate-banking%2Fnieuws%2Fjuridisch-en-fiscaal-nieuws%2Fnieuwe-fiscale-regularisatieronde-in-het-federale-regeerakkoord-2025-2029


ü§ñ Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27/27 [00:13<00:00,  1.97it/s]


üíæ Checkpoint saved: 3687 total items processed

üåê Overall CEFR B2 Accessibility Score: 66.21%
‚ö†Ô∏è Blog pages: 68.87% average compliance
‚úÖ Contact pages: 73.10% average compliance
‚úÖ FAQ pages: 70.39% average compliance
‚ö†Ô∏è Legal pages: 60.67% average compliance
‚ö†Ô∏è Other pages: 66.09% average compliance
‚ö†Ô∏è Product pages: 65.82% average compliance

‚úÖ Analysis complete! Results saved to: kbc_b2_final_results.xlsx
