In [None]:
import html
import json
import os
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from groq import Groq
from tqdm import tqdm

# --- Configuration ---
# Directory that holds the .xhtml / .htm files to scan.
folder_path = 'data/100_clean'
# A word counts as "difficult" when its frequency count is below this value.
FREQUENCY_THRESHOLD = 100
# Seconds handed to time.sleep() between iterations of the processing loops.
throttle_delay = 1
# JSON file mapping each difficult word to its short help text.
dict_file = 'difficult_word_to_help.json'
# CSV with the columns 'Word' and 'Freq_count'.
freq_file = 'data/es_frequencies.csv'
# JSON file receiving the list of difficult words found by the scan.
difficult_words_file = 'difficult_words.json'

# --- Frequency table: lower-cased word -> frequency count ---
freq_df = pd.read_csv(freq_file)
# Keys are lower-cased so lookups can be done on normalized words.
freq_dict = dict(
    zip(
        (str(entry).lower() for entry in freq_df['Word']),
        freq_df['Freq_count'],
    )
)


def is_difficult(word):
    """Tell whether ``word`` is known to the frequency table and rarer than the threshold.

    A word that is missing from ``freq_dict`` is never considered difficult.
    """
    count = freq_dict.get(word)
    if count is None:
        return False
    return count < FREQUENCY_THRESHOLD

# --- Scan the XHTML files and collect every difficult word they contain ---
difficult_words = set()
# os.listdir() returns entries in arbitrary order; sort for a reproducible run.
file_list = sorted(f for f in os.listdir(folder_path) if f.endswith(('.xhtml', '.htm')))
start_time = time.time()
for filename in tqdm(file_list, desc="Processing files"):
    file_path = os.path.join(folder_path, filename)
    # BeautifulSoup consumes the whole file while parsing, so the handle can be
    # closed before the words are extracted.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    text = soup.get_text()
    words = re.findall(r'\b\w+\b', text, flags=re.UNICODE)
    # Normalize to lower case because the frequency table uses lower-case keys.
    difficult_words.update(
        lower_word
        for lower_word in (word.lower() for word in words)
        if is_difficult(lower_word)
    )
    # NOTE(review): this throttles purely local file reads; it looks unnecessary
    # here and could probably be dropped - confirm before removing.
    time.sleep(throttle_delay)
elapsed = time.time() - start_time
print(f"Processed {len(file_list)} files in {elapsed:.2f} seconds.")
print(f"Found {len(difficult_words)} difficult words.")

# Persist the scan result. Sorting keeps the file stable between runs, since
# set iteration order for strings changes from one interpreter process to the next.
with open(difficult_words_file, 'w', encoding='utf-8') as f:
    json.dump(sorted(difficult_words), f, ensure_ascii=False, indent=2)

# --- Load the existing word -> help dictionary if it exists ---
difficult_word_to_help = {}
if os.path.exists(dict_file):
    with open(dict_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if isinstance(data, dict):
        difficult_word_to_help = data
    else:
        # Anything other than a JSON object cannot be used as a lookup table.
        # Say so explicitly: the file is overwritten when the dictionary is saved.
        print(
            f"Warning: {dict_file} does not contain a JSON object "
            f"(found {type(data).__name__}); starting from an empty dictionary."
        )

# Only words without a stored help text still need to be looked up.
remaining_words = difficult_words - set(difficult_word_to_help)
# Sorted so that batches are composed identically on every run.
difficult_words_list = sorted(remaining_words)

In [4]:
# Step 2: Ask the LLM for a short English help text for every new difficult word.
if not difficult_words_list:
    print("No new difficult words to process; no requests will be sent.")

# Read the API key from the environment instead of embedding it in the notebook.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked.
groq_api_key = os.environ.get("GROQ_API_KEY")
if difficult_words_list and not groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this cell.")

# Initialize Groq client (left as None when there is no key and nothing to request).
client = Groq(api_key=groq_api_key) if groq_api_key else None

# Models are tried in this order; the second one is the fallback.
models = ("llama-3.3-70b-versatile", "gemma2-9b-it")

# Determine total batches (each batch of 100 words).
batch_size = 100
total_batches = (len(difficult_words_list) + batch_size - 1) // batch_size


def request_definitions(prompt, model):
    """Send ``prompt`` to ``model`` and return the parsed ``{word: help}`` mapping.

    The reply must be a JSON array of objects with the keys 'word' and 'help';
    a Markdown code fence around the JSON is tolerated. Words are lower-cased
    to match the keys used by the help dictionary.

    Raises:
        json.JSONDecodeError: the reply is not valid JSON (the raw reply is
            printed first to help with debugging).
        KeyError, TypeError: the JSON does not have the expected shape.
        Exception: whatever the API client raises for a failed request.
    """
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        stream=False,
    )
    response_text = chat_completion.choices[0].message.content.strip()

    # Unwrap a ```json ... ``` fence if the model added one despite the prompt.
    fenced = re.match(r'^```(?:json)?\s*(.*?)\s*```$', response_text, flags=re.DOTALL)
    if fenced:
        response_text = fenced.group(1)

    try:
        definitions = json.loads(response_text)
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from {model}. Response was:")
        print(response_text)
        raise

    # Build the full mapping first so a malformed entry cannot leave a
    # half-applied batch behind.
    return {entry['word'].lower(): entry['help'] for entry in definitions}


# Process difficult words in batches. The finally clause writes the dictionary
# even if the loop is interrupted, so completed batches are never lost.
try:
    for batch_index in range(total_batches):
        start = batch_index * batch_size
        batch = difficult_words_list[start:start + batch_size]
        print(f"Processing batch {batch_index + 1} of {total_batches} (words {start + 1} to {start + len(batch)})")

        words_str = ", ".join(batch)
        prompt = (
            f"Provide a defining translation in English for each of the following words: {words_str}. "
            "Use no more than three words per defining translation. "
            "Return the result as a JSON array where each element is an object with the keys 'word' and 'help'. "
            "Do not include any extra text outside the JSON."
        )

        for model in models:
            try:
                new_entries = request_definitions(prompt, model)
            except Exception as error:
                # Best effort: report the failure and fall through to the next model.
                print(f"  {model} failed: {type(error).__name__}: {error}")
                continue
            difficult_word_to_help.update(new_entries)
            break
        else:
            # Every model failed; skip this batch but keep going with the others.
            print(f"No definitions obtained for batch: {batch}")

        # Optional throttle delay.
        time.sleep(throttle_delay)
finally:
    # Save the updated dictionary locally.
    with open(dict_file, 'w', encoding='utf-8') as f:
        json.dump(difficult_word_to_help, f, ensure_ascii=False, indent=2)

Processing batch 1 of 2 (words 1 to 100)
Processing batch 2 of 2 (words 101 to 127)


In [7]:
# Step 3: Process each XHTML file and replace difficult words with "original_word [help]".

# Matches one markup tag. Splitting on it (with the capturing group) yields a
# list that alternates text, tag, text, ... so only the text pieces get annotated.
tag_pattern = re.compile(r'(<[^>]*>)')


def annotate_markup(content, word_pattern, help_lookup):
    """Append "[help]" after difficult words in the text portions of ``content``.

    Args:
        content: Full markup of one file.
        word_pattern: Compiled regex matching the words to annotate.
        help_lookup: Mapping from lower-cased word to its help text.

    Returns:
        A ``(new_content, changes)`` tuple, where ``changes`` is the number of
        annotations that were inserted.

    Tags themselves are never modified, so tag names and attribute values that
    happen to contain a difficult word stay intact. A word that is already
    followed by its exact annotation is skipped, which makes it safe to run
    the annotation more than once over the same file.
    """
    changes = 0

    def replacer(match):
        nonlocal changes
        original_word = match.group(0)
        # Lookup using the dictionary (normalized to lowercase).
        help_text = help_lookup.get(original_word.lower(), '')
        if not help_text:
            return original_word
        # Escape &, < and > so the inserted text cannot break the markup.
        annotation = f" [{html.escape(str(help_text), quote=False)}]"
        if match.string.startswith(annotation, match.end()):
            return original_word  # already annotated by an earlier run
        changes += 1
        return original_word + annotation

    pieces = tag_pattern.split(content)
    # Even indices hold the text between tags; odd indices hold the tags.
    for index in range(0, len(pieces), 2):
        pieces[index] = word_pattern.sub(replacer, pieces[index])
    return ''.join(pieces), changes


total_changes = 0  # Track total changes across all files

# With no words the alternation would be empty and match at every word
# boundary, so there is nothing useful to do in that case.
if difficult_words_list:
    # Build the regex once: it does not depend on the file being processed.
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(word) for word in difficult_words_list) + r')\b',
        flags=re.IGNORECASE
    )

    for filename in os.listdir(folder_path):
        if not filename.endswith(('.xhtml', '.htm')):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        new_content, file_changes = annotate_markup(content, pattern, difficult_word_to_help)
        total_changes += file_changes

        # Only rewrite files that actually received annotations.
        if file_changes:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(new_content)

# Print the total number of changes
print(f"Total changes made across all files: {total_changes}")


Total changes made across all files: 157
