In [0]:
# Create a managed volume
#spark.sql("CREATE VOLUME medisure_jen.bronze.landing_zone")

# Create a landing zone directory if it doesn't exist
#dbutils.fs.mkdirs("/Volumes/medisure_jen/bronze/landing_zone/")

### **Handle Batch processing**

In [0]:

# Landing-zone directory that is scanned for BATCH input files
source_data_path = "/Volumes/medisure_jen/bronze/landing_zone/"

# Enumerate every entry currently present under the landing zone
file_list = dbutils.fs.ls(source_data_path)

# Keep everything except the streaming file, which is handled separately
batch_files = list(
    filter(lambda entry: "claims_stream.json" not in entry.name, file_list)
)

print("Processing BATCH files found in landing zone:")
for batch_file in batch_files:
    print(f" - {batch_file.name}")

import re

# Hoisted out of the loop so the import runs once per cell execution.
# `input_file_name` is no longer called here; it stays in the import list only
# so that any later cell relying on the name being in scope keeps working.
from pyspark.sql.functions import col, current_timestamp, input_file_name

# Ingest every batch file into its own Bronze Delta table.
# For each entry in `batch_files` this loop:
#   1. derives a table name from the part of the file name before the first '.',
#   2. reads the file as CSV or JSON (anything else is reported and skipped),
#   3. adds `_source_file` and `_ingestion_timestamp` metadata columns,
#   4. overwrites the table medisure_jen.bronze.<table name> with the result.
for file_info in batch_files:
    file_path = file_info.path
    file_name = file_info.name
    # Replace characters that are not valid in an unquoted table identifier
    # (e.g. '-' or spaces) so saveAsTable does not fail on the derived name.
    base_table_name = re.sub(r"[^0-9A-Za-z_]", "_", file_name.split('.')[0])

    print(f"\nStarting BATCH ingestion for: {file_name}")

    # A name such as ".csv" leaves nothing to use as a table name.
    if not base_table_name:
        print(f"Cannot derive a table name from {file_name}. Skipping.")
        continue

    # Determine the file format and set read options accordingly
    if file_name.endswith('.csv'):
        df = (spark.read
              .format("csv")
              .option("header", "true")
              .option("inferSchema", "true")
              # Quotes inside quoted fields are escaped by doubling them ("")
              .option("escape", '"')
              .option("quote", '"')
              .load(file_path)
             )

    elif file_name.endswith('.json'):
        # multiLine: parse records that span several lines of the file
        df = (spark.read
              .format("json")
              .option("multiLine", "true")
              .load(file_path)
             )
    else:
        print(f"Unsupported file format for {file_name}. Skipping.")
        continue

    # Add metadata columns.
    # `_metadata.file_path` is the file-source metadata column; it replaces
    # input_file_name(), which is deprecated on recent Databricks runtimes and
    # not supported on Unity Catalog standard (shared) access mode.
    df_with_metadata = (
        df.withColumn("_source_file", col("_metadata.file_path"))
          .withColumn("_ingestion_timestamp", current_timestamp())
    )

    full_target_table_name = f"medisure_jen.bronze.{base_table_name}"

    # Write the DataFrame to the Bronze Delta Table
    # Use 'overwrite' for initial load. For ongoing, consider 'append' with merge/duplicate handling.
    (df_with_metadata.write
     .format("delta")
     .mode("overwrite")
     .option("overwriteSchema", "true")  # let the table schema follow the file
     .saveAsTable(full_target_table_name)
    )

    print(f"Successfully ingested {file_name} into Delta table: {full_target_table_name}")

    # Count from the table that was just overwritten instead of df.count(),
    # which would read and parse the source file a second time.
    records_written = spark.table(full_target_table_name).count()
    print(f"Number of records written: {records_written}")

print("\nAll BATCH files processed.")