#### Cell 1 — Imports + config + list files

In [0]:
import os
import re
import pandas as pd
from datetime import datetime

# Folders scanned below: one for workbooks (.xlsx), one for DUA documents (.pdf).
dataVolumePath = "/Volumes/cornell_catalog/cornell_schema/blob_gems_data"
duaVolumePath = "/Volumes/cornell_catalog/cornell_schema/blob_gems_dua"

# MVP scope: the workbook names that the "Target workbooks found" report looks for.
targetWorkbookFiles = ["012_PuchunNiu_1.xlsx", "013_PuchunNiu_2.xlsx"]

# Directory listings, filtered by extension (case-insensitive) and sorted by name.
workbookFiles = sorted(
    entry
    for entry in os.listdir(dataVolumePath)
    if entry.lower().endswith(".xlsx")
)
duaFiles = sorted(
    entry
    for entry in os.listdir(duaVolumePath)
    if entry.lower().endswith(".pdf")
)

# Quick inventory so a missing target workbook or DUA is visible right away.
print("Workbook count:", len(workbookFiles))
print("DUA count:", len(duaFiles))
print(
    "Target workbooks found:",
    [name for name in workbookFiles if name in targetWorkbookFiles],
)
print("DUA files:", duaFiles)

Workbook count: 21
DUA count: 6
Target workbooks found: ['012_PuchunNiu_1.xlsx', '013_PuchunNiu_2.xlsx']
DUA files: ['BjoernKuhla.pdf', 'Diogo Costa.pdf', 'GeorgettePyoos.pdf', 'LucaCattaneo.pdf', 'MazdakSalavati.pdf', 'PuchunNiu.pdf']


#### Cell 2 — Helper functions (filename parse + checklist gate)

In [0]:
def parseWorkbookFilename(workbookFileName: str):
    """
    Split a workbook file name into its three underscore-separated parts.

    Expected shape (matched case-insensitively; each separator may be one
    or more underscores):

        <digits>_<contractName>_<digits>.xlsx

    Example:
        012_PuchunNiu_1.xlsx -> ("012", "PuchunNiu", "1")

    Returns:
        A (studyId, contractName, sequence) tuple of strings, or None when
        the file name does not follow the pattern.
    """
    filenamePattern = r"^(?P<studyId>\d+)_+(?P<contractName>.+?)_+(?P<sequence>\d+)\.xlsx$"

    found = re.match(filenamePattern, workbookFileName, re.IGNORECASE)
    if found is None:
        return None

    # group() with several names returns the captures as one tuple.
    return found.group("studyId", "contractName", "sequence")


def isChecklistApproved(checklistDf: pd.DataFrame) -> bool:
    """
    MVP rule: approve if the Checklist sheet contains either:
      - 'Completed'
      - 'Not relevant' (also accepted when written 'Not-relevant')
    anywhere in the sheet (case-insensitive).

    Args:
        checklistDf: The Checklist sheet as loaded into a DataFrame.

    Returns:
        True when one of the markers occurs in the sheet text, else False
        (an empty sheet is never approved).

    Caveat:
        This is a substring test over the whole sheet, so a cell such as
        'Not completed' also satisfies it.
    """
    # Flatten every cell into one lowercase text blob. Missing cells are
    # mapped to "" explicitly: str(NaN) would otherwise inject the literal
    # text "nan". Iterating the raw values also avoids DataFrame.applymap,
    # which is deprecated in recent pandas releases.
    cells = checklistDf.to_numpy(dtype=object).ravel()
    textBlob = " ".join(
        "" if pd.isna(cell) else str(cell).strip().lower()
        for cell in cells
    )

    approvalMarkers = ("completed", "not relevant", "not-relevant")
    return any(marker in textBlob for marker in approvalMarkers)

#### Cell 3 — Build gateResults (DUA match + checklist ok)

In [0]:
# ---------------------------------------------
# Purpose:
#   - Loop over selected workbook files
#   - Parse studyId / contractName / sequence from filename
#   - Check that a matching DUA PDF exists
#   - Read the Checklist sheet
#   - Approve ONLY if the Status row shows:
#       all non-empty entries are either
#       "Completed" or "Not relevant"
#   - Fail closed if anything cannot be read
#
# Output:
#   - gateDf: one row per processed workbook
# ---------------------------------------------


def _normalizeChecklistCell(value):
    """Return "" for a missing cell, otherwise the cell text trimmed and lowercased."""
    # Test for missing BEFORE str(): str(NaN) is "nan", which would be
    # mistaken for a real (and disallowed) status value instead of a blank.
    return "" if pd.isna(value) else str(value).strip().lower()


# Loop-invariant: the only Status values that pass the gate
# (compared after normalization to lowercase).
allowedValues = {"completed", "not relevant"}

gateRows = []

for workbookFileName in workbookFiles:

    # MVP: only process the two example workbooks
    if workbookFileName not in targetWorkbookFiles:
        continue

    workbookPath = os.path.join(dataVolumePath, workbookFileName)

    # -------------------------------
    # Parse filename
    # -------------------------------
    parsed = parseWorkbookFilename(workbookFileName)

    if not parsed:
        # Unrecognized name: record a failed row so the file still shows up
        # in the results instead of silently disappearing.
        gateRows.append({
            "studyId": None,
            "contractName": None,
            "sequence": None,
            "workbookFile": workbookFileName,
            "workbookPath": workbookPath,
            "duaFile": None,
            "duaPath": None,
            "duaExists": False,
            "checklistOk": False,
            "checklistError": "Filename pattern not recognized"
        })
        continue

    studyId, contractName, sequence = parsed

    # -------------------------------
    # DUA matching
    # -------------------------------
    # Exact, case-sensitive file-name match against the DUA listing.
    # NOTE(review): a DUA stored under a differently spelled name
    # (e.g. with a space, or a ".PDF" extension) will not match - confirm
    # that exact matching is the intended rule.
    duaFile = f"{contractName}.pdf"
    duaPath = os.path.join(duaVolumePath, duaFile)
    duaExists = duaFile in duaFiles

    # -------------------------------
    # Checklist gate (fail closed)
    # -------------------------------
    # Start from "not approved"; only the explicit check below can flip it.
    checklistOk = False
    checklistError = None

    try:
        # Read Checklist sheet.
        # keep_default_na=False: do not let pandas turn text such as "N/A"
        # or "NA" into missing values - those must stay visible as
        # (disallowed) status text rather than be skipped as blanks.
        # NOTE(review): the first sheet row is consumed as the header, so a
        # Status row located in row 1 would not be found - confirm layout.
        checklistDf = pd.read_excel(
            workbookPath,
            sheet_name="Checklist",
            engine="openpyxl",
            keep_default_na=False
        )

        # Normalize all cells to lowercase strings; missing cells become "".
        # (Column-wise Series.map replaces the deprecated DataFrame.applymap.)
        normalized = checklistDf.apply(
            lambda column: column.map(_normalizeChecklistCell)
        )

        # Find the row where column A == "status"
        firstCol = normalized.iloc[:, 0]
        statusRowIdx = firstCol[firstCol == "status"].index

        if len(statusRowIdx) == 0:
            checklistError = "Status row not found in Checklist sheet"
        else:
            # If several rows are labelled "status", the first one is used.
            idx = statusRowIdx[0]
            statusRow = normalized.loc[idx, :].tolist()

            # Ignore first cell ("status") and ignore blanks
            statusValues = [v for v in statusRow[1:] if v != ""]

            if len(statusValues) == 0:
                checklistError = "Status row contains no values"
            else:
                checklistOk = all(v in allowedValues for v in statusValues)

                if not checklistOk:
                    checklistError = (
                        "One or more Status values are not "
                        "'Completed' or 'Not relevant'"
                    )

    except Exception as e:
        # Any failure to read the checklist fails the gate
        checklistOk = False
        checklistError = str(e)

    # -------------------------------
    # Append gate result row
    # -------------------------------
    gateRows.append({
        "studyId": studyId,
        "contractName": contractName,
        "sequence": sequence,
        "workbookFile": workbookFileName,
        "workbookPath": workbookPath,
        "duaFile": duaFile,
        "duaPath": duaPath,
        "duaExists": duaExists,
        "checklistOk": checklistOk,
        "checklistError": checklistError
    })

# Convert to DataFrame
gateDf = pd.DataFrame(gateRows)
gateDf


  .applymap(lambda x: x.strip().lower())
  .applymap(lambda x: x.strip().lower())


Unnamed: 0,studyId,contractName,sequence,workbookFile,workbookPath,duaFile,duaPath,duaExists,checklistOk,checklistError
0,12,PuchunNiu,1,012_PuchunNiu_1.xlsx,/Volumes/cornell_catalog/cornell_schema/blob_g...,PuchunNiu.pdf,/Volumes/cornell_catalog/cornell_schema/blob_g...,True,True,
1,13,PuchunNiu,2,013_PuchunNiu_2.xlsx,/Volumes/cornell_catalog/cornell_schema/blob_g...,PuchunNiu.pdf,/Volumes/cornell_catalog/cornell_schema/blob_g...,True,True,


#### Cell 4 — Write gateDf to cornell_catalog.cornell_schema.gemsGateResults (with runId)

In [0]:
# ---------------------------------------------
# Purpose:
#   - Stamp the gate results with a run identifier
#   - Append them to the gate results table
#
# Behavior:
#   - checklistError remains NULL when no error
#   - New runs append new rows
# ---------------------------------------------

from pyspark.sql import functions as F
from datetime import datetime, timezone

gateResultsTable = "cornell_catalog.cornell_schema.gemsGateResults"

# Nothing to save (e.g. no target workbook was found): stop with a clear
# message instead of failing inside the Spark conversion below.
if gateDf.empty:
    raise ValueError("gateDf is empty: no workbooks were processed, nothing to save")

# 1) Unique run identifier: UTC timestamp with second resolution.
#    datetime.now(timezone.utc) replaces the deprecated datetime.utcnow();
#    the formatted string is identical.
runId = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

# 2) Convert pandas -> spark and add runId
#    The explicit cast gives checklistError a string type even when every
#    value in this run is NULL.
sparkGateDf = (
    spark.createDataFrame(gateDf)
         .withColumn("runId", F.lit(runId))
         .withColumn("checklistError", F.col("checklistError").cast("string"))
)

# 3) Write results (creates table if missing, appends otherwise)
(
    sparkGateDf.write
        .mode("append")
        .option("mergeSchema", "true")
        .saveAsTable(gateResultsTable)
)

# 4) Display + confirmation
display(sparkGateDf)
print(
    f"Saved to {gateResultsTable} "
    f"with runId: {runId}"
)

  runId = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")


studyId,contractName,sequence,workbookFile,workbookPath,duaFile,duaPath,duaExists,checklistOk,checklistError,runId
12,PuchunNiu,1,012_PuchunNiu_1.xlsx,/Volumes/cornell_catalog/cornell_schema/blob_gems_data/012_PuchunNiu_1.xlsx,PuchunNiu.pdf,/Volumes/cornell_catalog/cornell_schema/blob_gems_dua/PuchunNiu.pdf,True,True,,20260212T154946Z
13,PuchunNiu,2,013_PuchunNiu_2.xlsx,/Volumes/cornell_catalog/cornell_schema/blob_gems_data/013_PuchunNiu_2.xlsx,PuchunNiu.pdf,/Volumes/cornell_catalog/cornell_schema/blob_gems_dua/PuchunNiu.pdf,True,True,,20260212T154946Z


Saved to cornell_catalog.cornell_schema.gemsGateResults with runId: 20260212T154946Z


#### Cell 5 — Verify latest run only

In [0]:
# ---------------------------------------------
# Purpose:
#   - Find the largest runId in the gate results table
#     (runIds are fixed-width UTC timestamps, so the
#      largest one belongs to the latest run)
#   - Show only the rows written by that run
# ---------------------------------------------

gateResultsDf = spark.table("cornell_catalog.cornell_schema.gemsGateResults")

# max() over an empty table is NULL, which arrives here as None.
latestRunId = gateResultsDf.agg({"runId": "max"}).first()[0]

if latestRunId is None:
    # Say so explicitly rather than filtering on a meaningless value.
    print("No runs found in cornell_catalog.cornell_schema.gemsGateResults")
else:
    # Column comparison instead of splicing latestRunId into SQL text:
    # the value is passed as data and never parsed as part of the query.
    display(
        gateResultsDf
            .where(gateResultsDf["runId"] == latestRunId)
            .select(
                "studyId", "contractName", "sequence", "workbookFile", "workbookPath",
                "duaFile", "duaPath", "duaExists", "checklistOk", "runId"
            )
            .orderBy("studyId")
    )


studyId,contractName,sequence,workbookFile,workbookPath,duaFile,duaPath,duaExists,checklistOk,runId
12,PuchunNiu,1,012_PuchunNiu_1.xlsx,/Volumes/cornell_catalog/cornell_schema/blob_gems_data/012_PuchunNiu_1.xlsx,PuchunNiu.pdf,/Volumes/cornell_catalog/cornell_schema/blob_gems_dua/PuchunNiu.pdf,True,True,20260212T154946Z
13,PuchunNiu,2,013_PuchunNiu_2.xlsx,/Volumes/cornell_catalog/cornell_schema/blob_gems_data/013_PuchunNiu_2.xlsx,PuchunNiu.pdf,/Volumes/cornell_catalog/cornell_schema/blob_gems_dua/PuchunNiu.pdf,True,True,20260212T154946Z
