In [None]:
import html
import json
import os
import re

import textstat
from bs4 import BeautifulSoup
from groq import Groq

# Directory that is scanned (non-recursively, via os.listdir) for '.xhtml' files.
folder_path = 'data/100_clean'
# Cut-off used by is_difficult(): a word counts as difficult when its
# textstat.flesch_kincaid_grade score is strictly greater than this value.
# NOTE(review): despite the name, the value is compared against a Flesch-Kincaid
# grade, not a CEFR level.
CEFR_LEVEL_THRESHOLD = 20
# NOTE(review): not referenced anywhere in the visible code; presumably meant as a
# pause (in seconds) between API requests — TODO confirm or remove.
throttle_delay = 1

def is_difficult(word):
    """Tell whether ``word`` scores above the difficulty cut-off.

    The word is scored with ``textstat.flesch_kincaid_grade`` and the score is
    compared against ``CEFR_LEVEL_THRESHOLD``.

    Args:
        word: The text to score.

    Returns:
        True when the score is strictly greater than the threshold; False
        otherwise, including when scoring or comparing raises any ``Exception``.
    """
    try:
        grade = textstat.flesch_kincaid_grade(word)
        verdict = grade > CEFR_LEVEL_THRESHOLD
    except Exception:
        # Best effort: anything that cannot be scored is treated as not difficult.
        verdict = False
    return verdict

# Step 1: Gather the lower-cased difficult words found in the text of every
# '.xhtml' file in folder_path.
difficult_words = set()
for entry_name in os.listdir(folder_path):
    if not entry_name.endswith('.xhtml'):
        continue
    with open(os.path.join(folder_path, entry_name), 'r', encoding='utf-8') as handle:
        # Work on the text content only; markup is dropped by get_text().
        page_text = BeautifulSoup(handle, 'html.parser').get_text()
        tokens = re.findall(r'\b\w+\b', page_text, flags=re.UNICODE)
        difficult_words.update(
            lowered
            for lowered in map(str.lower, tokens)
            if is_difficult(lowered)
        )

# Step 2: Build a dictionary mapping each difficult word to its "help" definition.
# The Groq API key is read from the GROQ_API_KEY environment variable so that the
# secret never lives in the notebook. Any key that was previously pasted into this
# cell must be treated as leaked and rotated.
if "GROQ_API_KEY" not in os.environ:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this cell.")
client = Groq(api_key=os.environ["GROQ_API_KEY"])
difficult_word_to_help = {}  # lower-cased word -> help text returned by the model
difficult_words_list = list(difficult_words)  # list form so it can be sliced into batches

# Process difficult words in batches of BATCH_SIZE, one chat request per batch.
# Each batch is tried against the models in MODEL_FALLBACK_ORDER, in order, until
# one of them returns a usable JSON answer.
# NOTE: no cap on the total number of requests is enforced here.
BATCH_SIZE = 100
MODEL_FALLBACK_ORDER = ("gemma2-9b-it", "llama-3.3-70b-versatile")


def _parse_definitions(response_text):
    """Turn a model reply into a ``{lower-cased word: help}`` dictionary.

    Args:
        response_text: Message content returned by the model, already stripped.

    Returns:
        dict mapping each entry's lower-cased 'word' value to its 'help' value.

    Raises:
        json.JSONDecodeError: The reply is not valid JSON.
        KeyError, TypeError, AttributeError: The JSON is not a sequence of
            objects carrying string 'word' and 'help' entries.
    """
    # Models often wrap the JSON in a Markdown code fence despite the prompt;
    # unwrap it so an otherwise valid answer is not discarded.
    fenced = re.fullmatch(r"```(?:json)?\s*(.*?)\s*```", response_text, flags=re.DOTALL)
    if fenced:
        response_text = fenced.group(1)
    definitions = json.loads(response_text)
    # Normalize keys to lowercase for consistency. Building the dict first means a
    # half-valid reply contributes nothing instead of a partial batch.
    return {entry['word'].lower(): entry['help'] for entry in definitions}


# Ceiling division, so a final partial batch is counted in the progress message.
total_batches = (len(difficult_words_list) + BATCH_SIZE - 1) // BATCH_SIZE

for i in range(0, len(difficult_words_list), BATCH_SIZE):
    print(f"Processing batch {i // BATCH_SIZE + 1} of {total_batches}")
    batch = difficult_words_list[i:i + BATCH_SIZE]
    words_str = ", ".join(batch)
    prompt = (
        f"Provide a defining translation in English for each of the following words: {words_str}. "
        "Use no more than three words per defining translation. "
        "Return the result as a JSON array where each element is an object with the keys 'word' and 'help'. "
        "Do not include any extra text outside the JSON."
    )

    last_error = None
    response_text = None
    for model in MODEL_FALLBACK_ORDER:
        response_text = None  # reset so a failure report never shows another model's reply
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model,
                stream=False,
            )
            response_text = chat_completion.choices[0].message.content.strip()
            batch_definitions = _parse_definitions(response_text)
        except Exception as error:
            # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the loop; any API or parsing failure moves on to the
            # next model.
            last_error = error
            continue
        difficult_word_to_help.update(batch_definitions)
        break
    else:
        # Every model failed for this batch: report it and carry on with the next
        # batch instead of aborting the whole run.
        print(f"Failed to get definitions for batch: {batch}")
        print(f"Last error: {last_error!r}")
        print("Response was:")
        print(response_text)

In [None]:
# Step 3: Process each XHTML file and replace difficult words with "original_word [help]".

# Keep only words that have a non-empty definition. Empty keys are dropped because
# an empty regex alternative would match at every word boundary. Help text comes
# from a language model, so it is escaped before being written into markup.
annotations = {
    word: html.escape(str(help_text), quote=False)
    for word, help_text in difficult_word_to_help.items()
    if word and help_text
}

# Splits markup into alternating segments: text at even indices, tags at odd ones.
# NOTE: this is a simple tag split; text inside <script>/<style> elements or CDATA
# sections is still treated as ordinary text.
TAG_SPLIT_PATTERN = re.compile(r'(<[^>]*>)')


def replacer(match):
    """Return the matched word followed by its bracketed help text.

    Args:
        match: ``re.Match`` produced by ``pattern`` for one word occurrence.

    Returns:
        ``"<word> [<help>]"``, or the word unchanged when it has no help text or
        is already directly followed by that exact annotation.
    """
    original_word = match.group(0)
    help_text = annotations.get(original_word.lower(), '')
    if not help_text:
        return original_word
    annotation = f" [{help_text}]"
    # Skip words that already carry this annotation so re-running the cell does
    # not stack "[help] [help]" on files that were processed before.
    if match.string.startswith(annotation, match.end()):
        return original_word
    return original_word + annotation


if annotations:
    # Build the pattern once (it does not depend on the file). Longer entries come
    # first so an entry is never shadowed by a shorter entry that is its prefix.
    pattern = re.compile(
        r'\b(' + '|'.join(
            re.escape(word) for word in sorted(annotations, key=len, reverse=True)
        ) + r')\b',
        flags=re.IGNORECASE
    )

    for filename in os.listdir(folder_path):
        if not filename.endswith('.xhtml'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Annotate text between tags only; tag names and attribute values are left
        # untouched so the markup stays intact.
        segments = TAG_SPLIT_PATTERN.split(content)
        segments[::2] = [pattern.sub(replacer, segment) for segment in segments[::2]]
        new_content = ''.join(segments)

        # Overwrite the file only when something actually changed.
        if new_content != content:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(new_content)