In [0]:
# Databricks notebook source
# Script 4 — 04_per_institution_bronze_ingest
#
# Purpose:
#   Consume institution_ingest_plan (created by Script 3), and for each (file × institution):
#     - get bearer token from SST staging using X-API-KEY (from Databricks secrets)
#     - call /api/v1/institutions/pdp-id/{pdp_id} to resolve institution name
#     - map name -> schema prefix via databricksify_inst_name()
#     - locate <prefix>_bronze schema in staging_sst_01
#     - choose a volume in that schema containing "bronze"
#     - filter rows by institution id (exactly like current script)
#     - write to bronze volume using helper.process_and_save_file (exact same ingestion method)
#   After all institutions for a file are processed, update ingestion_manifest:
#     - BRONZE_WRITTEN if all institution ingests succeeded (or were already present)
#     - FAILED if any error occurred for that file (store error_message)
#
# Constraints:
#   - NO SFTP connection (uses staged local files from Script 1/3)
#   - Uses existing ingestion function + behavior from current script


In [0]:
%pip install pandas python-box pyyaml requests paramiko
%restart_python

In [0]:
import logging
import os
import yaml

import pandas as pd
from box import Box
from databricks.connect import DatabricksSession

from pyspark.sql import functions as F

from edvise.utils.api_requests import (
    EdviseAPIClient,
    databricksify_inst_name,
    fetch_institution_by_pdp_id,
)
from edvise.utils.data_cleaning import convert_to_snake_case
from edvise.ingestion.nsc_sftp_helpers import (
    find_bronze_schema,
    find_bronze_volume_name,
    output_file_name_from_sftp,
    process_and_save_file,
    update_manifest,
)
from edvise.ingestion.constants import (
    CATALOG,
    PLAN_TABLE_PATH,
    MANIFEST_TABLE_PATH,
    SST_BASE_URL,
    SST_TOKEN_ENDPOINT,
    INSTITUTION_LOOKUP_PATH,
    SST_API_KEY_SECRET_KEY,
    COLUMN_RENAMES,
)

# Runtime shims: let this notebook's code be imported/executed where the
# notebook-provided globals `dbutils` and `display` are not defined.
# Each probe simply references the name; a NameError means it is missing.
try:
    dbutils  # noqa: F821
except NameError:
    from unittest.mock import MagicMock

    # Stand-in so attribute access and calls on `dbutils` do not raise.
    # Note that with this mock, `dbutils.secrets.get(...)` returns another mock
    # (not a real secret) and `dbutils.notebook.exit(...)` does NOT stop execution.
    dbutils = MagicMock()

try:
    display  # noqa: F821
except NameError:

    def display(x):
        """Fallback for the notebook `display` helper: return `x` unchanged."""
        return x


# Obtain (or reuse) the Spark session via Databricks Connect.
spark = DatabricksSession.builder.getOrCreate()

In [0]:
# Configure root logging once for the notebook and create a module-level logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# COMMAND ----------

# ---------------------------
# Config + constants
# ---------------------------
# Load the YAML config and wrap it in a Box so nested keys can be read as attributes.
with open("gcp_config.yaml", "rb") as f:
    cfg = Box(yaml.safe_load(f))

# NOTE(review): CATALOG, SST_BASE_URL, SST_TOKEN_ENDPOINT, INSTITUTION_LOOKUP_PATH and
# SST_API_KEY_SECRET_KEY are also imported from edvise.ingestion.constants at the top of
# this notebook; the assignments below shadow those imports. Confirm the values agree and
# drop the local copies so there is a single source of truth.
CATALOG = "staging_sst_01"
DEFAULT_SCHEMA = "default"

# The guard checks in this notebook read the plan/manifest through the imported
# PLAN_TABLE_PATH / MANIFEST_TABLE_PATH. Alias the local names to those same paths
# (instead of rebuilding them from string literals) so that manifest status updates
# can never target a different table than the one the work list was read from.
PLAN_TABLE = PLAN_TABLE_PATH
MANIFEST_TABLE = MANIFEST_TABLE_PATH

SST_BASE_URL = "https://staging-sst.datakind.org"
SST_TOKEN_ENDPOINT = f"{SST_BASE_URL}/api/v1/token-from-api-key"
INSTITUTION_LOOKUP_PATH = "/api/v1/institutions/pdp-id/{pdp_id}"

# IMPORTANT: set these two to your actual secret scope + key name(s)
SST_SECRET_SCOPE = cfg.institution.secure_assets["scope"]
SST_API_KEY_SECRET_KEY = (
    "sst_staging_api_key"  # <-- update if your secret key is named differently
)

# Read the API key from Databricks secrets; strip stray whitespace/newlines and
# fail fast if the secret is empty rather than sending unauthenticated requests.
SST_API_KEY = dbutils.secrets.get(
    scope=SST_SECRET_SCOPE, key=SST_API_KEY_SECRET_KEY
).strip()
if not SST_API_KEY:
    raise RuntimeError(
        f"Empty SST API key from secrets: scope={SST_SECRET_SCOPE} key={SST_API_KEY_SECRET_KEY}"
    )

# Client used later to resolve institution metadata from a PDP id.
api_client = EdviseAPIClient(
    api_key=SST_API_KEY,
    base_url=SST_BASE_URL,
    token_endpoint=SST_TOKEN_ENDPOINT,
    institution_lookup_path=INSTITUTION_LOOKUP_PATH,
)

In [0]:
# ---------------------------
# Work-list discovery
# ---------------------------
# Without a plan table there is nothing to do; a missing manifest, however, is an error
# because per-file status could not be recorded.
plan_table_exists = spark.catalog.tableExists(PLAN_TABLE_PATH)
if not plan_table_exists:
    logger.info(f"Plan table not found: {PLAN_TABLE_PATH}. Exiting (no-op).")
    dbutils.notebook.exit("NO_PLAN_TABLE")

manifest_table_exists = spark.catalog.tableExists(MANIFEST_TABLE_PATH)
if not manifest_table_exists:
    raise RuntimeError(f"Manifest table missing: {MANIFEST_TABLE_PATH}")

# Fetching at most one row is enough to tell whether the plan has any work items.
plan_df = spark.table(PLAN_TABLE_PATH)
if not plan_df.take(1):
    logger.info("institution_ingest_plan is empty. Exiting (no-op).")
    dbutils.notebook.exit("NO_WORK_ITEMS")

# Keep only plan rows whose file is still marked NEW in the manifest.
manifest_df = spark.table(MANIFEST_TABLE_PATH).select("file_fingerprint", "status")
plan_with_status_df = plan_df.join(manifest_df, on="file_fingerprint", how="inner")
plan_new_df = plan_with_status_df.filter(F.col("status") == "NEW")
display(plan_new_df)
if not plan_new_df.take(1):
    logger.info("No planned work items where manifest status=NEW. Exiting (no-op).")
    dbutils.notebook.exit("NO_NEW_TO_INGEST")

# Collect file groups: one row per distinct file, independent of how many
# institutions the plan lists for it.
file_group_columns = [
    "file_fingerprint",
    "file_name",
    "local_path",
    "inst_col",
    "file_size",
    "file_modified_time",
]
file_groups = plan_new_df.select(*file_group_columns).distinct().collect()

logger.info(f"Preparing to ingest {len(file_groups)} NEW file(s).")

In [0]:
# ---------------------------
# Main per-file ingest loop
# ---------------------------
# For every NEW file: load the staged CSV, normalize its column names, split it by
# institution id, and write each institution's rows into that institution's bronze
# volume. The manifest row for the file is then set to BRONZE_WRITTEN (all institutions
# succeeded, were skipped, or already existed) or FAILED (any error; message stored).

# Upper bound on the error text stored in the manifest's error_message column.
MAX_ERROR_MESSAGE_LEN = 8000

processed_files = 0
failed_files = 0
skipped_files = 0

for fg in file_groups:
    fp = fg["file_fingerprint"]
    sftp_file_name = fg["file_name"]
    local_path = fg["local_path"]
    inst_col = fg["inst_col"]

    # The file must already be staged locally; this script never talks to SFTP.
    if not local_path or not os.path.exists(local_path):
        err = f"Staged local file missing for fp={fp}: {local_path}"
        logger.error(err)
        update_manifest(
            spark,
            MANIFEST_TABLE_PATH,
            fp,
            status="FAILED",
            error_message=err[:MAX_ERROR_MESSAGE_LEN],
        )
        failed_files += 1
        continue

    try:
        df_full = pd.read_csv(local_path, on_bad_lines="warn")
        # Normalize headers to snake_case first, then apply the project-wide renames.
        df_full = df_full.rename(
            columns={c: convert_to_snake_case(c) for c in df_full.columns}
        )
        # Uses the imported COLUMN_RENAMES mapping. (The previous reference to an
        # undefined `RENAMES` raised NameError, which the outer handler turned into
        # a FAILED status for every file.)
        df_full = df_full.rename(columns=COLUMN_RENAMES)

        if inst_col not in df_full.columns:
            err = f"Expected institution column '{inst_col}' not found after normalization/renames for file={sftp_file_name} fp={fp}"
            logger.error(err)
            update_manifest(
                spark,
                MANIFEST_TABLE_PATH,
                fp,
                status="FAILED",
                error_message=err[:MAX_ERROR_MESSAGE_LEN],
            )
            failed_files += 1
            continue

        # Only cast institution ID column to string (leave other columns as inferred)
        # NOTE(review): if pandas inferred this column as float (e.g. because of missing
        # values), str() yields values like "123.0" — confirm this matches how
        # institution_id is stored in the plan table.
        df_full[inst_col] = df_full[inst_col].astype(str)

        # Institutions planned for this specific file.
        inst_rows = (
            plan_new_df.where(F.col("file_fingerprint") == fp)
            .select("institution_id")
            .distinct()
            .collect()
        )
        inst_ids = [r["institution_id"] for r in inst_rows]

        if not inst_ids:
            logger.info(
                f"No institution_ids in plan for file={sftp_file_name} fp={fp}. Marking BRONZE_WRITTEN (no-op)."
            )
            update_manifest(
                spark,
                MANIFEST_TABLE_PATH,
                fp,
                status="BRONZE_WRITTEN",
                error_message=None,
            )
            skipped_files += 1
            continue

        # Aggregate errors at file-level: one institution failing must not stop the
        # others, but it does mark the whole file FAILED afterwards.
        file_errors = []

        for inst_id in inst_ids:
            try:
                target_inst_id = str(inst_id)
                filtered_df = df_full[df_full[inst_col] == target_inst_id].reset_index(
                    drop=True
                )

                if filtered_df.empty:
                    logger.info(
                        f"file={sftp_file_name} fp={fp}: institution {inst_id} has 0 rows; skipping."
                    )
                    continue

                # Resolve institution -> name
                inst_info = fetch_institution_by_pdp_id(api_client, inst_id)
                inst_name = inst_info.get("name")
                if not inst_name:
                    raise ValueError(
                        f"SST API returned no 'name' for pdp_id={inst_id}. Response={inst_info}"
                    )

                inst_prefix = databricksify_inst_name(inst_name)

                # Find bronze schema + volume
                bronze_schema = find_bronze_schema(spark, CATALOG, inst_prefix)
                bronze_volume_name = find_bronze_volume_name(
                    spark, CATALOG, bronze_schema
                )
                volume_dir = f"/Volumes/{CATALOG}/{bronze_schema}/{bronze_volume_name}"

                # Output naming rule (same as current script)
                out_file_name = output_file_name_from_sftp(sftp_file_name)
                full_path = os.path.join(volume_dir, out_file_name)

                # Idempotency check: a previous run may already have written this file.
                if os.path.exists(full_path):
                    logger.info(
                        f"file={sftp_file_name} inst={inst_id}: already exists in {volume_dir}; skipping write."
                    )
                    continue

                logger.info(
                    f"file={sftp_file_name} inst={inst_id}: writing to {volume_dir} as {out_file_name}"
                )
                process_and_save_file(
                    volume_dir=volume_dir, file_name=out_file_name, df=filtered_df
                )
                logger.info(f"file={sftp_file_name} inst={inst_id}: write complete.")

            except Exception as e:
                # Best-effort per institution: record the failure and keep going.
                msg = f"inst_ingest_failed file={sftp_file_name} fp={fp} inst={inst_id}: {e}"
                logger.exception(msg)
                file_errors.append(msg)

        if file_errors:
            err = " | ".join(file_errors)[:MAX_ERROR_MESSAGE_LEN]
            update_manifest(
                spark, MANIFEST_TABLE_PATH, fp, status="FAILED", error_message=err
            )
            failed_files += 1
        else:
            update_manifest(
                spark,
                MANIFEST_TABLE_PATH,
                fp,
                status="BRONZE_WRITTEN",
                error_message=None,
            )
            processed_files += 1

    except Exception as e:
        # Anything outside the per-institution handler (CSV parsing, Spark lookups, ...)
        # fails the whole file but not the remaining files.
        msg = f"fatal_file_error file={sftp_file_name} fp={fp}: {e}"
        logger.exception(msg)
        update_manifest(
            spark,
            MANIFEST_TABLE_PATH,
            fp,
            status="FAILED",
            error_message=msg[:MAX_ERROR_MESSAGE_LEN],
        )
        failed_files += 1

logger.info(
    f"Done. processed_files={processed_files}, failed_files={failed_files}, skipped_files={skipped_files}"
)
dbutils.notebook.exit(
    f"PROCESSED={processed_files};FAILED={failed_files};SKIPPED={skipped_files}"
)