# In [None]:
from bs4 import BeautifulSoup
import re
import concurrent.futures
import pandas as pd

# Patterns are compiled once at import time and reused by every call.
_SCRIPT_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.DOTALL | re.IGNORECASE)
_STYLE_RE = re.compile(r'<style\b[^>]*>.*?</style\s*>', re.DOTALL | re.IGNORECASE)
_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
_IX_RE = re.compile(r'<ix:[^>]*>.*?</ix:[^>]*>', re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')


def fast_parse(html):
    """Extract plain ASCII text from an HTML string using regexes only.

    Processing order:

    1. ``<script>`` and ``<style>`` elements are removed together with their
       content (case-insensitively, tolerating whitespace in the closing tag).
    2. HTML comments are removed.
    3. Everything from an opening ``<ix:...>`` tag up to the nearest closing
       ``</ix:...>`` tag is removed.
    4. All remaining tags are replaced by a space.
    5. Character references (``&amp;``, ``&nbsp;``, ``&#160;`` ...) are decoded.
    6. Runs of whitespace are collapsed to one space and the result is stripped.
    7. Non-ASCII characters are dropped.

    Args:
        html: The markup to clean. Anything that is not a non-empty ``str``
            (``None``, NaN, ``pd.NA``, numbers, ...) yields ``""``.

    Returns:
        The extracted text, ``""`` for empty or non-string input, or a string
        starting with ``"Error parsing: "`` if an exception is raised while
        cleaning.
    """
    # Local import: the parameter is named ``html``, which would shadow the
    # module inside this function if it were imported under its own name.
    from html import unescape

    # Check the type before the truthiness: values such as ``pd.NA`` or arrays
    # raise when evaluated in a boolean context.
    if not isinstance(html, str) or not html:
        return ""

    try:
        # Drop script/style first: their bodies may contain "<!--" or ">"
        # that would confuse the later, simpler patterns.
        html = _SCRIPT_RE.sub(' ', html)
        html = _STYLE_RE.sub(' ', html)

        # Comments may contain ">" and would otherwise leak past the generic
        # tag stripper below.
        html = _COMMENT_RE.sub(' ', html)

        # NOTE(review): this removes the *content* of ix: elements too, and the
        # non-greedy match stops at the first closing ix: tag, so nested or
        # self-closing ix: tags are only approximately handled. Confirm this
        # is the intended treatment before relying on it.
        html = _IX_RE.sub(' ', html)

        # Strip remaining HTML tags.
        text = _TAG_RE.sub(' ', html)

        # Decode entities only after tags are gone, so an escaped "&lt;b&gt;"
        # survives as literal text instead of being stripped as a tag.
        text = unescape(text)

        # Collapse whitespace (this also covers the non-breaking spaces that
        # "&nbsp;" decodes to).
        text = _WHITESPACE_RE.sub(' ', text).strip()

        # Drop anything that cannot be represented in ASCII.
        return text.encode("ascii", "ignore").decode()
    except Exception as e:
        # Best effort: report the failure in-band rather than aborting a
        # whole batch of documents.
        return f"Error parsing: {str(e)[:50]}"

# Process in parallel for speed
def parallel_parse(df, column='Contents', workers=4):
    """Apply ``fast_parse`` to every value of ``df[column]`` using a thread pool.

    Args:
        df: DataFrame holding the raw markup.
        column: Name of the column whose values are parsed.
        workers: Maximum number of worker threads.

    Returns:
        A ``pd.Series`` of parsed strings in the same order as ``df[column]``
        and carrying ``df``'s index, so that
        ``df['parsed_text'] = parallel_parse(df)`` lines up row by row even
        when ``df`` has been filtered, sorted or otherwise re-indexed.
    """
    # NOTE(review): the work done per item is pure-Python regex processing, so
    # threads may give little speed-up under the GIL - measure before tuning
    # ``workers``.
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # ``executor.map`` yields results in input order, whatever order the
        # workers finish in.
        results = list(executor.map(fast_parse, df[column].tolist()))

    # Reuse the frame's index: a default RangeIndex would misalign on
    # assignment back into ``df`` and silently produce NaN rows.
    return pd.Series(results, index=df.index)

# Usage:
# df['parsed_text'] = parallel_parse(df)