In [0]:
%sql
-- Reset step: remove the volume (if present) so the notebook starts from a clean slate.
-- The source files, schema location, checkpoint and bad-records paths used by the
-- streaming cells all live under /Volumes/sivaadbuc/default/batch16v1, so dropping the
-- volume also discards any previous stream progress stored there.
-- NOTE(review): presumably a managed volume (it is created without a LOCATION) — confirm
-- that deleting its files on every run is intended.
DROP VOLUME IF EXISTS sivaadbuc.default.batch16v1;


In [0]:
%sql
-- Create the volume that serves as the Auto Loader landing directory.
-- NOTE(review): the volume is empty right after creation; input CSV files presumably
-- have to be uploaded before the streaming cells run, since schema inference needs
-- at least one file — confirm the intended workflow.
CREATE VOLUME sivaadbuc.default.batch16v1;


In [0]:
%sql
-- Reset step: drop the bronze target table so the streaming write below recreates it.
drop table if exists sivaadbuc.default.transactions_bronze

In [0]:
# Auto Loader reader configuration, grouped by purpose.
# All values are passed to the reader unchanged via .options(**...).
autoloader_options = {
    # --- File format ---
    "cloudFiles.format": "csv",              # Format of the incoming files (csv, json, parquet, avro, ...)
    "header": "true",                        # CSV only: first row holds the column names
    "delimiter": ",",                        # CSV field separator (',' is also the default)

    # --- Schema ---
    # Directory where Auto Loader persists the inferred schema and its later versions.
    "cloudFiles.schemaLocation": "/Volumes/sivaadbuc/default/batch16v1/_schema",
    "cloudFiles.inferColumnTypes": "true",   # Infer real column types instead of reading everything as string
    # On a new column the stream stops, the stored schema is extended, and the
    # next start picks the new column up.
    "cloudFiles.schemaEvolutionMode": "addNewColumns",
    # Pin the types of these columns regardless of what inference decides.
    "cloudFiles.schemaHints": "amount DOUBLE, transaction_date TIMESTAMP",

    # --- File detection ---
    # File notification mode instead of directory listing.
    # NOTE(review): needs cloud notification resources/permissions; confirm this
    # mode is available for a Unity Catalog volume path before relying on it.
    "cloudFiles.useNotifications": "true",
    "cloudFiles.includeExistingFiles": "true",  # Also ingest files already present when the stream first starts
    "cloudFiles.maxFilesPerTrigger": "1000",    # Upper bound on new files consumed per micro-batch
    # Re-process a file whose content changes after it was ingested.
    # NOTE(review): this can load the same records twice; confirm the target tolerates duplicates.
    "cloudFiles.allowOverwrites": "true",

    # --- Data quality ---
    # Location where details about corrupt/unreadable records are written.
    "badRecordsPath": "/Volumes/sivaadbuc/default/batch16v1/_badrecords",
    # Apply the schema to the files by position and ignore the CSV header names;
    # this does NOT make the read fail on a header/schema mismatch.
    "enforceSchema": "true",
}

df = (
    spark.readStream
    .format("cloudFiles")                    # Required: selects Auto Loader as the streaming source
    .options(**autoloader_options)
    .load("/Volumes/sivaadbuc/default/batch16v1")  # Source directory (volume path)
)


In [0]:
# Stream the Auto Loader DataFrame `df` (built in the reader cell) into the bronze
# Delta table. The query handle is kept so the cell can wait for the run to finish.
bronze_query = (
    df.writeStream
    .format("delta")                                        # Write as Delta Lake format

    # Checkpointing (mandatory for streaming): records progress so a restart resumes
    # where the previous run stopped
    .option("checkpointLocation", "/Volumes/sivaadbuc/default/batch16v1/_checkpoints")

    # Output mode
    .outputMode("append")                                   # append | complete | update

    # Trigger options
    # availableNow processes everything currently available (honouring the reader's
    # cloudFiles.maxFilesPerTrigger rate limit) and then stops.
    .trigger(availableNow=True)
    # .trigger(processingTime="1 minute")                   # Start a micro-batch every minute (never stops)

    # Partitioning
    # NOTE(review): transaction_date is hinted as TIMESTAMP in the reader; if values carry
    # a time-of-day component this creates one partition per distinct timestamp (many tiny
    # files). Consider partitioning by a derived date column, or not at all — confirm.
    .partitionBy("transaction_date")

    # Schema handling
    .option("mergeSchema", "true")                          # Allow schema evolution on write

    # Target table
    .toTable("sivaadbuc.default.transactions_bronze")
)

# toTable() only starts the query and returns immediately. Block until the
# availableNow run completes so that later cells see the ingested rows.
# (With a processingTime trigger this call would block until the query is stopped.)
bronze_query.awaitTermination()



In [0]:
%sql
-- Sanity check: number of rows ingested into the bronze table.
-- NOTE(review): the result is only meaningful once the streaming write has finished;
-- verify the write cell has completed before running this.
select count(*) from sivaadbuc.default.transactions_bronze