In [0]:
from pyspark.sql.functions import current_timestamp, lit,col,broadcast
from datetime import datetime, timedelta

In [0]:
def get_last_seven_days_unprocessed_files(base_path: str, raw_table_name: str) -> list:
    """Return paths of CSV files from the last 7 days not yet in the raw table.

    Looks at the dated folders ``<base_path>/YYYY/MM/DD`` for today and the
    six preceding days, lists every file found under the folders that exist,
    keeps the ones whose path ends with ``.csv`` and drops those whose path
    already appears in the ``file_path`` column of ``raw_table_name``.

    Args:
        base_path: Root folder that contains the ``YYYY/MM/DD`` sub-folders.
        raw_table_name: Name of the table whose ``file_path`` column records
            the files that were already processed. If the table does not
            exist, every discovered CSV file is considered unprocessed.

    Returns:
        A list of file path strings (not a DataFrame). Empty when none of
        the dated folders exist or no unprocessed CSV file is found.
    """
    today = datetime.today()

    # One dated folder per day, newest first: <base_path>/YYYY/MM/DD
    paths_to_check = [
        f"{base_path}/{(today - timedelta(days=offset)).strftime('%Y/%m/%d')}"
        for offset in range(7)
    ]

    existing_paths = []
    for path in paths_to_check:
        try:
            # Only non-empty folders are kept; an empty listing is falsy.
            if dbutils.fs.ls(path):
                existing_paths.append(path)
        except Exception:
            # Best-effort: a failed listing is treated as a missing folder,
            # so the day is skipped instead of aborting the whole scan.
            print(f"Path not found: {path}. Skipping...")

    # Nothing to read: avoid calling load() with an empty list of paths.
    if not existing_paths:
        return []

    # Read once, after every candidate folder has been checked. Only the
    # "path" column of the binaryFile source is used below.
    files_df = (spark.read
        .format("binaryFile")
        .load(existing_paths))

    # Filter for CSV files only
    csv_files_df = files_df.filter(col("path").endswith(".csv"))

    # Paths already recorded in the raw table (empty frame if no table yet)
    if spark.catalog.tableExists(raw_table_name):
        processed_files_df = spark.table(raw_table_name).select("file_path").distinct()
    else:
        processed_files_df = spark.createDataFrame([], "file_path string")

    # left_anti keeps only the CSV files with no match among processed paths
    new_files_df = csv_files_df.join(
        broadcast(processed_files_df),
        csv_files_df.path == processed_files_df.file_path,
        "left_anti"
    ).select(csv_files_df.path)

    # Materialize the result on the driver as a plain list of path strings
    return [row['path'] for row in new_files_df.collect()]