In [0]:
# ingest_bronze // NOT FOR S3 BUCKETS
import yaml
from pyspark.sql import SparkSession

# Every unqualified schema/table reference below resolves inside the `bronze` catalog.
spark.sql("use catalog bronze")

# Pipeline settings are kept in a YAML file in the workspace.
config_path = "/Workspace/Users/clarkscoberly@gmail.com/config.yaml"
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# Create the bronze, silver and gold schemas (no-op for any that already exist).
for _schema_name in ("bronze", "silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {_schema_name}")

# Optional settings: `s3_bucket` is None when absent; `bronze_path` defaults to "bronze/".
S3 = config.get("s3_bucket")
BRONZE = config.get("bronze_path", "bronze/")

# Read the four CSV exports from the landing volume; the first row of each
# file supplies the column names.
_LANDING_DIR = "/Volumes/landing/default/purchase"

bronze_consumer = spark.read.csv(
    f"{_LANDING_DIR}/Tendo Exercise Data - consumer.csv", header=True
)
bronze_purchase = spark.read.csv(
    f"{_LANDING_DIR}/Tendo Exercise Data - purchase.csv", header=True
)
bronze_fertilizer = spark.read.csv(
    f"{_LANDING_DIR}/Tendo Exercise Data - fertilizer.csv", header=True
)
bronze_avocado = spark.read.csv(
    f"{_LANDING_DIR}/Tendo Exercise Data - avocado.csv", header=True
)

# Make column names safe for Delta tables.
# Delta Lake rejects spaces and the characters , ; { } ( ) newline, tab and =
# in column names (unless column mapping is enabled), so each one is replaced
# with an underscore. Headers that only contain spaces come out exactly as before.
_DELTA_INVALID_COLUMN_CHARS = " ,;{}()\n\t="
_DELTA_COLUMN_FIXES = str.maketrans(
    {char: "_" for char in _DELTA_INVALID_COLUMN_CHARS}
)


def _with_delta_safe_columns(df):
    """Return `df` with every Delta-invalid character in its column names replaced by "_".

    Args:
        df: DataFrame whose column names come straight from a CSV header row.

    Returns:
        A DataFrame with the same data and column order, with sanitised names.
    """
    return df.toDF(*[name.translate(_DELTA_COLUMN_FIXES) for name in df.columns])


bronze_purchase = _with_delta_safe_columns(bronze_purchase)
bronze_consumer = _with_delta_safe_columns(bronze_consumer)
bronze_fertilizer = _with_delta_safe_columns(bronze_fertilizer)
bronze_avocado = _with_delta_safe_columns(bronze_avocado)

# Save each frame as a Delta table in the `bronze` schema, overwriting
# whatever the table held before.
_bronze_frames = {
    "consumer": bronze_consumer,
    "purchase": bronze_purchase,
    "fertilizer": bronze_fertilizer,
    "avocado": bronze_avocado,
}
for _table_name, _frame in _bronze_frames.items():
    _frame.write.format("delta").mode("overwrite").saveAsTable(f"bronze.{_table_name}")

In [0]:
# ingest_bronze_autoloader.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import yaml

# The streaming tables are written to the `bronze` database; create it if needed.
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

# Read pipeline settings from the workspace YAML file.
config_path = "/Workspace/Users/clarkscoberly@gmail.com/config.yaml"
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# `s3_bucket` is mandatory here (KeyError when missing);
# `bronze_path` falls back to "bronze/" when not configured.
S3_BUCKET = config["s3_bucket"]
BRONZE_PATH = config.get("bronze_path", "bronze/")

# AutoLoader options
def autoload_delta(path, table_name):
    """Start an Auto Loader stream that appends CSV files under `path` to `bronze.<table_name>`.

    Args:
        path: Directory to watch for incoming CSV files (a header row is expected).
        table_name: Target table name inside the `bronze` schema; also used to
            build the per-table checkpoint directory under `BRONZE_PATH`.

    Returns:
        The `StreamingQuery` handle of the started stream, so callers can
        monitor it, wait for it or stop it. Callers that ignore the return
        value behave exactly as before.
    """
    # One directory per table holds both the streaming checkpoint and the
    # schema Auto Loader infers and tracks.
    checkpoint_path = f"{BRONZE_PATH}/{table_name}/_checkpoints"

    # Delta Lake rejects these characters in column names (unless column
    # mapping is enabled); each one is replaced with an underscore.
    column_fixes = str.maketrans({char: "_" for char in " ,;{}()\n\t="})

    df = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Auto Loader requires a schema location whenever the schema is
        # inferred rather than supplied; without it the stream cannot start.
        .option("cloudFiles.schemaLocation", checkpoint_path)
        .option("header", True)
        # NOTE(review): Auto Loader's type inference is controlled by
        # `cloudFiles.inferColumnTypes`; with only `inferSchema` set, CSV
        # columns are presumably still loaded as strings. Left unchanged so
        # the column types of existing bronze tables do not shift; confirm intent.
        .option("inferSchema", True)
        .load(path)
    )
    # Sanitise column names so the Delta sink accepts them.
    df = df.toDF(*[name.translate(column_fixes) for name in df.columns])

    return (
        df.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", checkpoint_path)
        .toTable(f"bronze.{table_name}")
    )

# Start one incremental ingest stream per source folder under the S3 location.
for _source_name in ("consumer", "purchase", "avocado", "fertilizer"):
    autoload_delta(f"{S3_BUCKET}/{_source_name}/", _source_name)
