In [0]:
import io, json
import uuid
from datetime import datetime, timezone

import pandas as pd
from pyspark.sql import functions as F, types as T

# Parse newly landed SharePoint workbooks into one JSON row per worksheet row.
#
# Flow: find files of the selected source system that have no entry in the
# audit table, load their bytes, parse them on the executors, append the data
# rows to the raw rows table, then append one audit row per file with the
# file's real outcome (SUCCESS or FAILED + error text).

dbutils.widgets.text("source_system", "SHAREPOINT_HIST")
SOURCE_SYSTEM = dbutils.widgets.get("source_system")

# One id / timestamp for the whole notebook run, created on the driver so that
# every row of this run shares them and Spark task retries cannot change them.
RUN_ID = str(uuid.uuid4())
LOAD_TS = datetime.now(timezone.utc).replace(tzinfo=None)  # naive UTC, like utcnow()

files = (
    spark.table("tp_finance.raw.sharepoint_workbook_files")
      .where(F.col("source_system") == SOURCE_SYSTEM)
      .select("source_system", "source_path", "file_sha256", "source_modified_ts", "load_date")
)

# Any audit row (whatever its status) counts as "already handled"; delete the
# audit row of a FAILED file to have it picked up again.
parsed = (
    spark.table("tp_finance.audit.parsed_files")
      .where(F.col("source_system") == SOURCE_SYSTEM)
      .select("file_sha256").distinct()
)

to_parse = files.join(parsed, on="file_sha256", how="left_anti")

# Column layout appended to tp_finance.raw.sharepoint_sheet_rows.
schema = T.StructType([
  T.StructField("source_system", T.StringType()),
  T.StructField("source_path", T.StringType()),
  T.StructField("file_sha256", T.StringType()),
  T.StructField("sheet_name", T.StringType()),
  T.StructField("row_num", T.IntegerType()),
  T.StructField("row_json", T.StringType()),
  T.StructField("load_date", T.DateType()),
  T.StructField("source_modified_ts", T.TimestampType()),
  T.StructField("ingestion_run_id", T.StringType()),
  T.StructField("load_ts", T.TimestampType()),
])

# Parser output = table layout + two bookkeeping columns. Data rows leave them
# NULL; exactly one extra "status row" per input file fills them in.
parse_schema = T.StructType(schema.fields + [
  T.StructField("parse_status", T.StringType()),
  T.StructField("parse_error", T.StringType()),
])


def parse_excel_batches(iterator):
    """Parse workbook bytes into sheet rows plus one status row per file.

    Intended for ``DataFrame.mapInPandas`` with ``parse_schema``.

    Args:
        iterator: iterator of pandas DataFrames with the columns
            source_system, source_path, file_sha256, load_date,
            source_modified_ts and content_bytes (one row per file).

    Yields:
        One pandas DataFrame per input batch with the ``parse_schema`` columns:
        * a data row for every worksheet row below the header row
          (``row_num`` starts at 2; ``row_json`` maps header -> cell value);
        * one status row per file with ``parse_status`` set to ``"SUCCESS"`` or
          ``"FAILED"`` (``parse_error`` holds the exception text), ``row_num``
          0 and NULL ``sheet_name`` / ``row_json``.

    A file that raises while being parsed contributes no data rows at all, so
    the target table never receives a half-parsed workbook.
    """
    # Imported here so only the executors need openpyxl installed.
    from openpyxl import load_workbook

    columns = [f.name for f in parse_schema.fields]

    def workbook_rows(content):
        """Yield (sheet_name, row_num, row_json) for every data row in a workbook."""
        wb = load_workbook(filename=io.BytesIO(content), data_only=True, read_only=True)
        try:
            # .worksheets skips chart sheets, which have no iter_rows().
            for ws in wb.worksheets:
                cells = ws.iter_rows(values_only=True)

                header = next(cells, None)
                if not header:
                    continue
                header = [str(h).strip() if h is not None else "" for h in header]

                # The header is row 1, so data rows are numbered from 2.
                for row_num, vals in enumerate(cells, start=2):
                    record = {
                        key: (val.isoformat() if hasattr(val, "isoformat") else val)
                        for key, val in zip(header, vals)
                        if key  # columns without a header name are dropped
                    }
                    yield ws.title, row_num, json.dumps(record, default=str)
        finally:
            # Read-only workbooks keep their source open until closed explicitly.
            wb.close()

    for pdf in iterator:
        out_rows = []

        for row in pdf.itertuples(index=False):
            head = (row.source_system, row.source_path, row.file_sha256)
            tail = (row.load_date, row.source_modified_ts, RUN_ID, LOAD_TS)

            status, error = "SUCCESS", None
            try:
                if row.content_bytes is None:
                    raise FileNotFoundError("no binary content loaded for source_path")
                # Materialise the whole file before keeping any of it.
                file_rows = [
                    head + parsed_row + tail + (None, None)
                    for parsed_row in workbook_rows(row.content_bytes)
                ]
            except Exception as exc:
                # Best effort: one bad workbook must not fail the batch, but the
                # failure is recorded instead of being silently dropped.
                status, error = "FAILED", f"{type(exc).__name__}: {exc}"
                file_rows = []

            out_rows.extend(file_rows)
            out_rows.append(head + (None, 0, None) + tail + (status, error))

        yield pd.DataFrame(out_rows, columns=columns)


n_to_parse = to_parse.count()  # counted once; reused for the branch below
print("Files to parse:", n_to_parse)

if n_to_parse == 0:
    print("No new files to parse. Exiting.")
else:
    paths = [r["source_path"] for r in to_parse.select("source_path").distinct().collect()]

    binary = (
        spark.read.format("binaryFile")
          .load(paths)
          .select(F.col("path").alias("source_path"), F.col("content").alias("content_bytes"))
    )

    # Left join: a file whose bytes could not be matched by path still reaches
    # the parser (content_bytes NULL) and is reported as FAILED, not lost.
    work = (to_parse.join(binary, on="source_path", how="left")
                  .select("source_system","source_path","file_sha256","load_date","source_modified_ts","content_bytes"))

    # Persist so the two writes below share a single parse of the workbooks.
    results = work.mapInPandas(parse_excel_batches, schema=parse_schema).persist()
    try:
        # Write rows (don't count() first; the write triggers execution).
        (
          results.where(F.col("parse_status").isNull())
            .select(*[f.name for f in schema.fields])
            .write.mode("append")
            .saveAsTable("tp_finance.raw.sharepoint_sheet_rows")
        )

        # Record per-file outcomes only after the row write has succeeded.
        file_status = results.where(F.col("parse_status").isNotNull())
        (
          file_status.select(
              F.lit(SOURCE_SYSTEM).alias("source_system"),
              "file_sha256",
              "source_path",
              F.current_timestamp().alias("parsed_ts"),
              F.col("parse_status").alias("status"),
              F.col("parse_error").alias("error")
          )
          .write.mode("append")
          .saveAsTable("tp_finance.audit.parsed_files")
        )

        n_failed = file_status.where(F.col("parse_status") == "FAILED").count()
    finally:
        results.unpersist()

    print("Files failed:", n_failed)
    print("Parsing + write complete.")

In [0]:
from pyspark.sql import functions as F

# Diagnostic cell: recompute the parse backlog for one source system and print
# each intermediate count, plus the path/hash of the files involved.
SOURCE_SYSTEM = "SHAREPOINT_HIST"

# Every landed workbook file for the source system.
files = (
    spark.table("tp_finance.raw.sharepoint_workbook_files")
         .filter(F.col("source_system") == SOURCE_SYSTEM)
)
print("files:", files.count())
files.select("source_path", "file_sha256").show(truncate=False)

# Every audit entry for the source system.
parsed = (
    spark.table("tp_finance.audit.parsed_files")
         .filter(F.col("source_system") == SOURCE_SYSTEM)
)
print("parsed:", parsed.count())

# Files whose hash has no audit entry yet.
to_parse = files.join(
    parsed.select("file_sha256").dropDuplicates(),
    on="file_sha256",
    how="left_anti",
)
print("to_parse:", to_parse.count())
to_parse.select("source_path", "file_sha256").show(truncate=False)