In [0]:
# 1. Connect to SFTP, scan the receive folder, and keep only the two requested files (fail if either is missing).
# 2. Upsert unseen files (among the requested ones) into `ingestion_manifest` with status=NEW.
# 3. Download and stage NEW + unqueued files locally and upsert them into `pending_ingest_queue`.

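# Recent refactor:
# - SFTP helpers moved out of this notebook into library modules (`connect_sftp` and `list_receive_files` are
#   imported from `edvise.utils.sftp`; `download_sftp_atomic` is no longer called directly from this notebook).
# - `list_receive_files` now takes `source_system` explicitly (no hidden notebook globals).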

# Constraints:
# - SFTP connection required
# - NO API calls
# - Stages files locally (TMP_DIR) + writes to Delta tables only

# Inputs:
# - SFTP folder: `./receive`
# - Required workflow parameters (exact SFTP file names):
#   - `cohort_file_name`
#   - `course_file_name`

# Outputs:
# - `staging_sst_01.default.ingestion_manifest`
# - `staging_sst_01.default.pending_ingest_queue`
# - Staged files written to: `/tmp/pdp_sftp_stage`


In [0]:
%pip install paramiko python-box pyyaml
%pip install git+https://github.com/datakind/edvise.git@Automated_Ingestion_Workflow

In [0]:
%restart_python

In [0]:
import logging
from databricks.connect import DatabricksSession

from edvise.utils.sftp import connect_sftp, list_receive_files
from edvise.ingestion.constants import (
    QUEUE_TABLE_PATH,
    SFTP_REMOTE_FOLDER,
    SFTP_SOURCE_SYSTEM,
)
from edvise.ingestion.nsc_sftp_helpers import (
    build_listing_df,
    download_new_files_and_queue,
    ensure_manifest_and_queue_tables,
    get_files_to_queue,
    upsert_new_to_manifest,
)
from edvise import utils
# `dbutils` is never imported in this notebook; it is expected to be injected
# by the hosting runtime (presumably Databricks — confirm). Probe for the name
# and, when it is undefined, substitute a MagicMock so later `dbutils.*`
# references do not raise NameError.
# NOTE(review): with the mock in place, `dbutils.secrets.get(...)` returns
# MagicMock objects rather than real credentials, and
# `dbutils.notebook.exit(...)` does not stop execution.
try:
    dbutils  # noqa: F821
except NameError:
    from unittest.mock import MagicMock

    dbutils = MagicMock()
# Obtain the Spark session through Databricks Connect's builder (`getOrCreate`).
spark = DatabricksSession.builder.getOrCreate()


In [0]:
# Notebook-wide logging: INFO level with timestamp, logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Secret scope that holds the SFTP connection credentials.
asset_scope = "nsc-sftp-asset"

host = dbutils.secrets.get(scope=asset_scope, key="nsc-sftp-host")
user = dbutils.secrets.get(scope=asset_scope, key="nsc-sftp-user")
password = dbutils.secrets.get(scope=asset_scope, key="nsc-sftp-password")

# Required workflow parameters: the exact SFTP file names to ingest.
cohort_file_name = utils.databricks.get_db_widget_param("cohort_file_name")
course_file_name = utils.databricks.get_db_widget_param("course_file_name")
if not cohort_file_name or not course_file_name:
    raise ValueError(
        "Both 'cohort_file_name' and 'course_file_name' must be provided as widget parameters."
    )
# The ingest cell looks the two names up as a set, so identical values would
# collapse into a single requested file and the run would stage only one file
# without any error. Reject that misconfiguration here, next to the other
# parameter validation.
if cohort_file_name == course_file_name:
    raise ValueError(
        "'cohort_file_name' and 'course_file_name' must refer to different files; "
        f"both were set to {cohort_file_name!r}."
    )
logger.info(
    "Manual file selection enabled: "
    f"cohort_file_name={cohort_file_name}, course_file_name={course_file_name}"
)

logger.info("SFTP secured assets loaded successfully.")

In [0]:
# Scan SFTP for the two requested files, register them in the manifest, then
# download and queue whichever of them still need processing. Every successful
# path ends in `dbutils.notebook.exit(...)` with a short status string; a
# requested file that is absent from SFTP raises FileNotFoundError.

# Handles are initialised up front so the `finally` clause can tell whether a
# connection was ever opened.
transport = None
sftp = None

try:
    ensure_manifest_and_queue_tables(spark)

    transport, sftp = connect_sftp(host, user, password)
    logger.info(
        f"Connected to SFTP host={host} and scanning folder={SFTP_REMOTE_FOLDER}"
    )

    file_rows_all = list_receive_files(sftp, SFTP_REMOTE_FOLDER, SFTP_SOURCE_SYSTEM)
    if not file_rows_all:
        logger.info(
            f"No files found in SFTP folder: {SFTP_REMOTE_FOLDER}. Exiting (no-op)."
        )
        dbutils.notebook.exit("NO_FILES")

    # Restrict the listing to the two files named by the workflow parameters.
    requested_names = {cohort_file_name, course_file_name}
    file_rows = [r for r in file_rows_all if r.get("file_name") in requested_names]

    found_names = {r.get("file_name") for r in file_rows}
    missing_names = sorted(requested_names - found_names)
    if missing_names:
        # `.get` yields None for a row without "file_name"; drop those so that
        # sorted() cannot fail on a None/str comparison and mask the real
        # FileNotFoundError below.
        available = sorted(
            {
                name
                for name in (r.get("file_name") for r in file_rows_all)
                if name is not None
            }
        )
        preview = available[:25]
        raise FileNotFoundError(
            f"Requested file(s) not found on SFTP in folder '{SFTP_REMOTE_FOLDER}': {missing_names}. "
            f"Available file count={len(available)}; first 25={preview}"
        )

    df_listing = build_listing_df(spark, file_rows)

    # 1) Ensure each requested file is at least represented in manifest as NEW
    #    (only the filtered listing is passed, not everything on SFTP).
    upsert_new_to_manifest(spark, df_listing)

    # 2) Queue any requested file that is still NEW and not already queued
    df_to_queue = get_files_to_queue(spark, df_listing)

    to_queue_count = df_to_queue.count()
    if to_queue_count == 0:
        logger.info(
            "No files to queue: either nothing is NEW, or NEW files are already queued. Exiting (no-op)."
        )
        dbutils.notebook.exit("QUEUED_FILES=0")

    logger.info(
        f"Queuing {to_queue_count} NEW-unqueued file(s) to {QUEUE_TABLE_PATH} and staging locally."
    )
    queued_count = download_new_files_and_queue(spark, sftp, df_to_queue, logger)

    logger.info(
        f"Queued {queued_count} file(s) for downstream processing in {QUEUE_TABLE_PATH}."
    )
    dbutils.notebook.exit(f"QUEUED_FILES={queued_count}")

finally:
    # Best-effort cleanup: close the SFTP client first, then its transport.
    # A failure to close must not replace the notebook's real outcome, so it
    # is logged rather than raised (previously it was swallowed silently).
    for label, resource in (("SFTP client", sftp), ("SFTP transport", transport)):
        if resource is None:
            continue
        try:
            resource.close()
        except Exception as exc:
            logger.warning(f"Failed to close {label} cleanly: {exc!r}")