## Bronze Table Creation

## Sample Code Used


```python
spark_bronze_df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "parquet")
    .option("cloudFiles.schemaLocation", "/Volumes/demo-taxi/bronze-layer/taxi_landing/schema/bronze/")
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    .option("cloudFiles.inferColumnTypes", "true")
    .load("/Volumes/demo-taxi/bronze-layer/taxi_landing/taxi-raw/jan-2025/yellow_*.parquet")
)
```



In [0]:

def process_yellow_bronze(month_folder):
    """Load one month of yellow taxi parquet files into the yellow bronze table.

    Reads ``yellow_*.parquet`` from the given month folder of the taxi landing
    volume as a ``cloudFiles`` stream, appends it to
    ``demo-taxi.bronze-layer.yellow_bronze`` and blocks until the streaming
    query terminates.

    Args:
        month_folder: Name of the sub-folder under ``taxi-raw`` to read from,
            e.g. ``"jan-2025"``.
    """
    # Source side: parquet input with inferred column types; newly seen
    # columns are added to the schema tracked at the schema location.
    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "parquet")
    reader = reader.option("cloudFiles.schemaLocation", "/Volumes/demo-taxi/bronze-layer/taxi_landing/schema/bronze/")
    reader = reader.option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    reader = reader.option("cloudFiles.inferColumnTypes", "true")
    raw_stream = reader.load(f"/Volumes/demo-taxi/bronze-layer/taxi_landing/taxi-raw/{month_folder}/yellow_*.parquet")

    # Sink side: append-only write with its own checkpoint; mergeSchema lets
    # the target table pick up columns added by schema evolution.
    writer = raw_stream.writeStream.outputMode("append")
    writer = writer.option("checkpointLocation", "/Volumes/demo-taxi/bronze-layer/taxi_landing/checkpoint/bronze/yellow/")
    writer = writer.option("mergeSchema", "true")
    writer = writer.trigger(availableNow=True)

    # toTable() starts the query; wait for it so the caller runs loads serially.
    query = writer.toTable("`demo-taxi`.`bronze-layer`.yellow_bronze")
    query.awaitTermination()




In [0]:
%sql
-- Row count of the yellow taxi bronze table.
select COUNT(*) from `demo-taxi`.`bronze-layer`.yellow_bronze 

In [0]:
def process_green_bronze(month_folder):
    """Load one month of green taxi parquet files into the green bronze table.

    Reads ``green_*.parquet`` from the given month folder of the taxi landing
    volume as a ``cloudFiles`` stream, appends it to
    ``demo-taxi.bronze-layer.green_bronze`` and blocks until the streaming
    query terminates.

    Args:
        month_folder: Name of the sub-folder under ``taxi-raw`` to read from,
            e.g. ``"jan-2025"``.
    """
    # Source side: parquet input with inferred column types; newly seen
    # columns are added to the schema tracked at the schema location.
    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "parquet")
    reader = reader.option("cloudFiles.schemaLocation", "/Volumes/demo-taxi/bronze-layer/taxi_landing/schema/bronze_green/")
    reader = reader.option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    reader = reader.option("cloudFiles.inferColumnTypes", "true")
    raw_stream = reader.load(f"/Volumes/demo-taxi/bronze-layer/taxi_landing/taxi-raw/{month_folder}/green_*.parquet")

    # Sink side: append-only write with its own checkpoint; mergeSchema lets
    # the target table pick up columns added by schema evolution.
    writer = raw_stream.writeStream.outputMode("append")
    writer = writer.option("checkpointLocation", "/Volumes/demo-taxi/bronze-layer/taxi_landing/checkpoint/bronze/green/")
    writer = writer.option("mergeSchema", "true")
    writer = writer.trigger(availableNow=True)

    # toTable() starts the query; wait for it so the caller runs loads serially.
    query = writer.toTable("`demo-taxi`.`bronze-layer`.green_bronze")
    query.awaitTermination()
    


In [0]:
def process_fhv_bronze(month_folder):
    """Load one month of FHV parquet files into the FHV bronze table.

    Reads ``fhv_*.parquet`` from the given month folder of the taxi landing
    volume as a ``cloudFiles`` stream, appends it to
    ``demo-taxi.bronze-layer.fhv_bronze`` and blocks until the streaming
    query terminates.

    Args:
        month_folder: Name of the sub-folder under ``taxi-raw`` to read from,
            e.g. ``"jan-2025"``.
    """
    # Source side: parquet input with inferred column types; newly seen
    # columns are added to the schema tracked at the schema location.
    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "parquet")
    reader = reader.option("cloudFiles.schemaLocation", "/Volumes/demo-taxi/bronze-layer/taxi_landing/schema/bronze_fhv/")
    reader = reader.option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    reader = reader.option("cloudFiles.inferColumnTypes", "true")
    raw_stream = reader.load(f"/Volumes/demo-taxi/bronze-layer/taxi_landing/taxi-raw/{month_folder}/fhv_*.parquet")

    # Sink side: append-only write with its own checkpoint; mergeSchema lets
    # the target table pick up columns added by schema evolution.
    writer = raw_stream.writeStream.outputMode("append")
    writer = writer.option("checkpointLocation", "/Volumes/demo-taxi/bronze-layer/taxi_landing/checkpoint/bronze/fhv/")
    writer = writer.option("mergeSchema", "true")
    writer = writer.trigger(availableNow=True)

    # toTable() starts the query; wait for it so the caller runs loads serially.
    query = writer.toTable("`demo-taxi`.`bronze-layer`.fhv_bronze")
    query.awaitTermination()

In [0]:
def process_bronze(month_folder):
    """Run every bronze loader for the given month folder.

    The yellow, green and FHV loaders are invoked one after another, each
    receiving the same ``month_folder`` value.

    Args:
        month_folder: Name of the month sub-folder passed through to each
            loader, e.g. ``"jan-2025"``.
    """
    bronze_loaders = (
        process_yellow_bronze,
        process_green_bronze,
        process_fhv_bronze,
    )
    for load_bronze in bronze_loaders:
        load_bronze(month_folder)

In [0]:
# Resolve the month folder to load: prefer the "month" notebook widget and
# fall back to a default when the widget is unavailable or empty.
# NOTE(review): the two fallbacks differ ("jan-2025" for an empty widget,
# "feb-2025" when the widget lookup fails) — confirm whether this is intended
# or whether both should share a single default.
try:
    month_folder = dbutils.widgets.get("month")
except Exception:
    # Deliberately not a bare `except:` so KeyboardInterrupt / SystemExit
    # (e.g. a cancelled run) still propagate instead of being swallowed.
    month_folder = "feb-2025"
else:
    if not month_folder:
        month_folder = "jan-2025"

process_bronze(month_folder)