In [0]:
# Import necessary PySpark data types for schema definition
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Import PySpark SQL functions for DataFrame operations
from pyspark.sql.functions import current_timestamp, lit, col, lower, broadcast

# Import Python datetime utilities for date calculations
from datetime import datetime, timedelta

In [0]:
%run ../utils/helpers

In [0]:
%run ../utils/config

In [0]:
# Resolve the source location and the fully qualified raw table name, then ask
# the helper which files still need to be loaded.
# NOTE(review): `raw_customers`, `raw_uk_schema` and `raw_customers_table` are not
# defined in this notebook; presumably they come from ../utils/config — confirm.
raw_customers_path = raw_customers
table_name = f"{raw_uk_schema}.{raw_customers_table}"

# NOTE(review): the result is later passed to len() and to spark.read...load(),
# so it is presumably a list of file paths rather than a DataFrame — confirm
# against the helper's definition.
new_files_list = get_last_seven_days_unprocessed_files(
    base_path=raw_customers_path,
    raw_table_name=table_name,
)

In [0]:
# Load every pending customer file into the raw table in one batch, or report
# that there is nothing to do.
if not new_files_list:
    # Truthiness covers both an empty list and None, so a helper that returns
    # nothing does not fail here with a TypeError from len().
    print("No new files found to process.")
else:
    # Read all pending CSV files in a single pass and attach the metadata
    # columns. Column types are inferred from the data, so they can vary with
    # the content of each batch.
    df_raw = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(new_files_list)
        # Load timestamp and the originating file of each row.
        .withColumn("created_at", current_timestamp())
        .withColumn("file_path", col("_metadata.file_path"))
        # Set to None for testing scenario.
        .withColumn("source_system", lit(None).cast("string"))
        .withColumn("system_of_record", lit(None).cast("string"))
    )

    # Append to the same table that was passed to the unprocessed-file lookup
    # (`table_name`, built in the previous cell) so the lookup and the write
    # target cannot drift apart.
    # NOTE(review): "mergeSchema" presumably targets a Delta table so that new
    # columns in incoming files are added rather than rejected — confirm.
    df_raw.write.mode("append").option("mergeSchema", "true").saveAsTable(table_name)
    print("File processed and appended successfully.")