In [2]:
from pyspark.sql import SparkSession

# Spark settings applied to the notebook session, in the order they are set.
session_options = {
    "spark.driver.memory": "100g",
    "spark.executor.memory": "100g",
    "spark.sql.orc.enableVectorizedReader": "false",
    "spark.sql.parquet.columnarReaderBatchSize": "256",
    "spark.sql.orc.columnarReaderBatchSize": "256",
}

# Build (or reuse) the session named "test_pyspark" with the options above.
session_builder = SparkSession.builder.appName("test_pyspark")
for option_name, option_value in session_options.items():
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

# Only surface errors from Spark's own logging in the notebook output.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/29 12:44:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/29 12:44:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/29 12:44:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Spellcheck Text Renderings (OCRs)

In [None]:
import os
import requests
from symspellpy import SymSpell, Verbosity
from pyspark.sql.types import FloatType
from pyspark.sql import functions as F
import re
import pandas as pd # For Pandas UDF type hint

# Define paths for dictionary files
data_dir = "../data"
dictionary_path = os.path.join(data_dir, "tmp", "frequency_dictionary_en_82_765.txt")

# Create the directory the dictionary actually lives in (data_dir/tmp), not
# just data_dir; otherwise the first download has no directory to write into.
os.makedirs(os.path.dirname(dictionary_path), exist_ok=True)

# URL for the dictionary
dict_url = "https://raw.githubusercontent.com/wolfgarbe/SymSpell/refs/heads/master/SymSpell/frequency_dictionary_en_82_765.txt"

def download_file(url, dest_path, timeout=60):
    """Download ``url`` to ``dest_path`` unless ``dest_path`` already exists.

    The payload is streamed into ``<dest_path>.part`` and only renamed to
    ``dest_path`` once the whole body has been written, so an interrupted
    download never leaves a truncated file that a later run would mistake
    for a complete one.

    Args:
        url: HTTP(S) location of the file to fetch.
        dest_path: Local path the file should end up at. Missing parent
            directories are created.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an HTTP error status (re-raised after being
            reported on stdout).
    """
    if os.path.exists(dest_path):
        print(f"{dest_path} already exists.")
        return

    # open() does not create missing directories, so make sure the target
    # directory is there before writing.
    parent_dir = os.path.dirname(dest_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = f"{dest_path}.part"
    print(f"Downloading {url} to {dest_path}...")
    try:
        # The context manager releases the connection even if streaming fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for HTTP errors
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file only after the full body has been written.
        os.replace(partial_path, dest_path)
        print("Download complete.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        raise
    finally:
        # Remove any leftover partial file from a failed attempt.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Fetch the dictionary, register it with Spark and verify it loads.
# Later cells depend on every step here, so a failure is reported and then
# re-raised instead of letting the notebook continue without a dictionary.
try:
    download_file(dict_url, dictionary_path)

    # Add dictionary file to Spark context so it's available on workers
    spark.sparkContext.addFile(dictionary_path)
    print(f"Added {os.path.basename(dictionary_path)} to Spark context.")

    # Driver-side smoke test: confirm the file parses as a SymSpell dictionary.
    sym_spell_test = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    if not sym_spell_test.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise RuntimeError("Failed to load SymSpell dictionary for testing on driver.")
    print("SymSpell dictionary loaded successfully for testing on driver.")

except Exception as e:
    print(f"An error occurred during dictionary setup: {e}")
    # Stop here: the spellcheck UDF cannot produce ratios without the dictionary.
    raise

../data/frequency_dictionary_en_82_765.txt already exists.
Added frequency_dictionary_en_82_765.txt to Spark context.
SymSpell dictionary loaded successfully for testing on driver.
SymSpell dictionary loaded successfully for testing on driver.


In [8]:
import os # For basename in UDF
import re
from typing import Iterator # For the iterator-style Pandas UDF type hints

import pandas as pd # For type hint pd.Series
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import FloatType
from symspellpy import SymSpell, Verbosity # Ensure it's available for UDF context

# The UDF looks the dictionary up through SparkFiles by file name, so keep only
# the last path component of the driver-side path here.
# `dictionary_path` is defined in the dictionary-setup cell; run that cell first
# when executing cells individually.
dictionary_filename = os.path.split(dictionary_path)[1]

def _misspelled_ratio(text, sym_spell):
    """Return the share of words in ``text`` that the dictionary does not recognise.

    Args:
        text: One value from the text column; may be None/NaN or a non-string.
        sym_spell: A SymSpell instance with its dictionary already loaded.

    Returns:
        A float in [0, 1]. Missing, non-string, blank or letter-free text
        yields 0.0 (treated as having no misspellings).
    """
    if pd.isna(text) or not isinstance(text, str) or not text.strip():
        return 0.0

    # Preprocessing: lowercase, keep only English letters and whitespace, then split.
    words = re.sub(r'[^a-z\s]', '', text.lower()).split()
    if not words:
        return 0.0

    misspelled_count = 0
    for word in words:
        # Verbosity.TOP returns only the best suggestion within 1 edit;
        # include_unknown=True returns the word itself with count 0 when
        # nothing matches.
        suggestions = sym_spell.lookup(
            word, Verbosity.TOP, max_edit_distance=1, include_unknown=True)

        # Misspelled when there is no suggestion, the best suggestion is a
        # different term, or the term is echoed back with count 0 (unknown).
        if (not suggestions
                or suggestions[0].term != word
                or suggestions[0].count == 0):
            misspelled_count += 1

    return misspelled_count / len(words)


@pandas_udf(FloatType())
def calculate_misspelled_ratio(texts: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Pandas UDF: per-row ratio of misspelled words in a text column.

    Written as an iterator-of-Series UDF so the SymSpell dictionary is loaded
    once per invocation of this function instead of once for every batch.

    Args:
        texts: Iterator over batches of the text column.

    Yields:
        One float Series per input batch, the same length as the batch. Every
        value is NaN when the dictionary file cannot be found or loaded.
    """
    # Import SparkFiles inside the UDF to resolve the path on the worker.
    from pyspark import SparkFiles  # Lazy import for worker context
    local_dictionary_path = SparkFiles.get(dictionary_filename)

    # max_dictionary_edit_distance must be >= max_edit_distance used in lookup.
    sym_spell = SymSpell(max_dictionary_edit_distance=1, prefix_length=7)
    dictionary_loaded = os.path.exists(local_dictionary_path) and \
        sym_spell.load_dictionary(local_dictionary_path, term_index=0, count_index=1)

    for batch in texts:
        if dictionary_loaded:
            ratios = [_misspelled_ratio(text, sym_spell) for text in batch]
        else:
            # Dictionary unavailable on this worker: flag every row as NaN.
            ratios = [float('nan')] * len(batch)
        # Explicit dtype keeps empty batches as float Series too.
        yield pd.Series(ratios, dtype="float64")

print("Pandas UDF 'calculate_misspelled_ratio' defined.")

Pandas UDF 'calculate_misspelled_ratio' defined.


In [None]:
from wc_simd.utility import spark_path

# Fraction of the plain-text renderings to keep. Spellchecking can be
# computationally intensive, so start with 0.0005 (0.05%) and increase it for
# a more comprehensive analysis if performance allows.
sample_fraction = 0.0005

# Sample the text renderings without replacement, with a fixed seed.
text_df = spark.read.parquet(spark_path(
    "../data/plain_text_rendering.parquet")).sample(False, sample_fraction, seed=42)
works_df = spark.table("works")

# Keep only works that have a sampled text rendering.
works_in_text_df = (
    works_df
    .join(
        text_df,
        works_df.id == text_df.id,
        "inner"
    )
    .select(works_df["id"], works_df["languages"], text_df["text"])
)

# Extract the first language from the languages array
works_in_text_df_first_lang = works_in_text_df.withColumn(
    "first_language", F.col("languages")[0]["id"])

# Restrict the sample to works whose first listed language is English.
sampled_text_df = works_in_text_df_first_lang.where(F.col("first_language") == "eng")

# Evaluate the count outside the f-string: a replacement field that spans
# several lines inside a single-quoted f-string is a SyntaxError before
# Python 3.12.
record_count = sampled_text_df.count()
print(f"Processing approximately {record_count} records for spellcheck analysis.")



Processing approximately 96 records for spellcheck analysis.


                                                                                



In [None]:
# Store the filtered sample as the table "sampled_text_df", replacing any
# earlier copy, so it can be read back with spark.table in a later cell.
sampled_text_df.write.mode("overwrite").saveAsTable("sampled_text_df")



                                                                                

In [3]:
# Read the saved sample back from the "sampled_text_df" table.
sampled_text_df = spark.read.table("sampled_text_df")

In [9]:
# Apply the UDF. It handles None/NaN text itself, so no dropna is needed first.
# The result is cached because several actions read it (show, count, toPandas);
# without the cache each of those actions can re-run the spellcheck.
df_with_spellcheck = sampled_text_df.withColumn(
    "misspelled_ratio", calculate_misspelled_ratio(F.col("text"))
).cache()

# Show some results, including cases that might be problematic
print("Sample results with misspelled_ratio:")
df_with_spellcheck.select("id", "text", "misspelled_ratio").show(20, truncate=50)

# count() scans every row, which computes the remaining ratios and fills the cache.
print(f"Total records to process: {df_with_spellcheck.count()}")

Sample results with misspelled_ratio:


                                                                                

+--------+--------------------------------------------------+----------------+
|      id|                                              text|misspelled_ratio|
+--------+--------------------------------------------------+----------------+
|qh3zjc7x|6 ANNUAL REPORTS of the medical officer of heal...|       0.1433239|
|jb5napff|EX BIBLIOTHECA CAR. I. TAB ORI S MINOR SURGERY ...|      0.16810748|
|r5yjxxbr|DEPARTMENT OF OBSTETRICS & GYNECOLOGY YALE UNIV...|      0.16791707|
|ab8ud2qv|HOMCEOPATHY UNVEJLED OR, OBSERVATIONS ON HAHNEM...|      0.11810287|
|dn2qn93b|THE SIXTY-SEVENTH REPORT OF THE LONDON FEVER HO...|      0.29506704|
|am6mzvqk|Egkm Urban Sanitary District. REPORT TO THE URB...|      0.17860554|
|tc2z7q2e|) ' I I r r THE SCIENCE AND ART OF MIDWIFEEY. B...|      0.21040253|
|sgu2zcbh|SUTTON-IN-ASHFIELD URBAN DISTRICT COUNCIL Chair...|      0.17836677|
|mhx6vvxq|, > . M'2 ' ; /: . . - V . Â«. * .y"S . a w : '...|      0.10701774|
|zzxfabmk|BOROUGH OF REIGATE. ANNUAL REPORT OF THE M

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd # Ensure pandas is imported for .toPandas()

# Plot and summarise the per-document misspelled ratio, provided the
# spellcheck DataFrame from the previous cell is available.
if 'df_with_spellcheck' not in locals() or df_with_spellcheck is None:
    print("df_with_spellcheck is not defined or is None. Please run the previous cell.")
else:
    # Bring the ratios to the driver; dropna() discards rows without a usable
    # ratio (the UDF yields NaN when it cannot load its dictionary).
    misspelled_ratios_pd = df_with_spellcheck.select("misspelled_ratio").dropna().toPandas()

    if misspelled_ratios_pd.empty:
        print("No valid misspelled_ratio data to plot or describe. Check UDF execution, data sampling, or filtering.")
    else:
        ratio_values = misspelled_ratios_pd["misspelled_ratio"]

        # Histogram with a KDE overlay for a smooth view of the distribution.
        fig, ax = plt.subplots(figsize=(12, 7))
        sns.histplot(ratio_values, bins=30, kde=True, ax=ax)
        ax.set_title("Histogram of Misspelled Word Ratio in Sampled OCR Texts")
        ax.set_xlabel("Misspelled Word Ratio (0 = perfect, 1 = all misspelled)")
        ax.set_ylabel("Number of Documents")
        ax.grid(axis='y', alpha=0.75)
        plt.show()

        # Summary statistics, including the tail percentiles.
        print("\nDescriptive statistics for misspelled_ratio:")
        print(ratio_values.describe(percentiles=[.01, .05, .25, .5, .75, .95, .99]))


In [None]:
# Collect the whole Spark DataFrame (all columns, including misspelled_ratio)
# into a pandas DataFrame on the driver.
df_with_spellcheck_pd = df_with_spellcheck.toPandas()

In [None]:
# Export the collected spellcheck results to CSV without the pandas index.
spellcheck_csv_path = "../data/tmp/text_with_spellcheck_eng_sample.csv"

# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(spellcheck_csv_path), exist_ok=True)

df_with_spellcheck_pd.to_csv(spellcheck_csv_path, index=False)

In [None]:
import pandas as pd

# Reload the exported spellcheck sample from its CSV file.
spellcheck_csv_path = "../data/tmp/text_with_spellcheck_eng_sample.csv"
df_with_spellcheck_pd = pd.read_csv(spellcheck_csv_path)

In [None]:
# Bare last expression: the notebook renders the reloaded DataFrame as the cell output.
df_with_spellcheck_pd