In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Make `workspace.med` the default catalog/schema for unqualified table names.
#
# This cell runs before the one that assigns `spark`, so a fresh
# "Restart & Run All" would raise NameError on any runtime that does not
# inject a `spark` global. getOrCreate() returns the already-active session
# when there is one, so on runtimes that pre-define `spark` this just reuses it.
spark = SparkSession.builder.getOrCreate()
spark.sql("USE CATALOG workspace")
spark.sql("USE SCHEMA med")

In [0]:
# Get (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.getOrCreate()

# Load the raw documents; the fully qualified name is used so the read does
# not depend on the current catalog/schema.
raw_df = spark.read.table("workspace.med.raw_data")

In [0]:
# Build the text-cleaning column expression. Nothing is computed here; the
# expression is evaluated once it is attached to a DataFrame.
raw = F.col("raw_text").cast("string")  # treat the raw text as a string

# Step 1: decode the few HTML entities handled here. "&amp;" is decoded after
# "&lt;"/"&gt;", so a double-escaped "&amp;lt;" ends up as the literal text
# "&lt;" rather than "<".
txt = raw
for entity, literal in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&"), ("&quot;", "\"")):
    txt = F.regexp_replace(txt, entity, literal)

# Step 2: lower-case everything.
txt = F.lower(txt)

# Step 3: replace unwanted spans with a single space, in this order.
for pattern in (
    "<[^>]+>",  # HTML tags (including any "<...>" produced by step 1)
    "[\\r\\n\\t]+",  # runs of line breaks and tabs
    "[^\\p{L}\\p{N}\\p{P}\\s]",  # anything that is not a Unicode letter, number, punctuation or whitespace
    "\\s+",  # collapse whitespace runs left over from the steps above
):
    txt = F.regexp_replace(txt, pattern, " ")

# Step 4: strip leading/trailing spaces and publish the final expression.
txt = F.trim(txt)
clean_expr = txt

In [0]:
# Attach the cleaned text and drop documents that are empty or too short.
has_clean_text = F.col("text_clean").isNotNull()
is_long_enough = F.length("text_clean") >= 50  # minimum of 50 characters after cleaning

df_with_clean = (
    raw_df
      .withColumn("text_clean", clean_expr)
      .filter(has_clean_text & is_long_enough)
)

In [0]:
# Split the cleaned text into sentences.
# The lookbehind matches a whitespace run that directly follows ".", "!" or
# "?", so the punctuation stays attached to the sentence it ends.
sentence_split_pattern = "(?<=[\\.\\!\\?])\\s+"
sentences_col = F.split(F.col("text_clean"), sentence_split_pattern)

df_with_sentences = df_with_clean.withColumn("sentences", sentences_col)

In [0]:
# Keep a single row per (source, doc_id): the one with the most recent
# `ingested_at`.
# NOTE(review): rows that tie on `ingested_at` are ranked arbitrarily, so the
# surviving row is not deterministic in that case — confirm whether a
# tie-breaker column is needed.
# NOTE(review): this runs on the already length-filtered rows, so an older
# version of a doc can survive when its newest version was filtered out —
# confirm this is intended.
w = (
    Window
      .partitionBy("source", "doc_id")
      .orderBy(F.col("ingested_at").desc())
)

# Rank rows inside each group (1 = newest), keep rank 1, then drop the helper column.
ranked = df_with_sentences.withColumn("rn", F.row_number().over(w))
df_dedup = ranked.filter(F.col("rn") == 1).drop("rn")

In [0]:
# Columns published to the cleaned-documents table, in output order.
# `ingested_at` is exposed under the name `snapshot_ts`.
output_columns = [
    "doc_id",
    "source",
    "category",
    "title",
    "synonyms",
    "url",
    "text_clean",
    F.col("ingested_at").alias("snapshot_ts"),
    "sentences",
]
df_docs_clean = df_dedup.select(*output_columns)

# Sanity check: print the resulting schema and show a five-row sample.
df_docs_clean.printSchema()
display(df_docs_clean.limit(5))

In [0]:
# Persist the cleaned documents as a Delta table, replacing both the existing
# data and the existing schema of `workspace.med.docs_clean`.
docs_writer = df_docs_clean.write.format("delta")
docs_writer = docs_writer.mode("overwrite").option("overwriteSchema", "true")
docs_writer.saveAsTable("workspace.med.docs_clean")