In [1]:
import os, sys
import logging
# Root logging: INFO and above, rendered as "<timestamp> - <level> - <message>".
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Notebook-level logger; its own threshold is pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Emit a first record so the logging setup is visible in the cell output.
logger.info("Application started successfully.")

2025-06-30 06:01:17,072 - INFO - Application started successfully.


In [2]:
# Connection settings are taken from the process environment.
# Subscript access is deliberate: a missing variable raises KeyError here
# rather than surfacing later as an obscure connection failure.
_env = os.environ

AWS_ACCESS_KEY = _env["AWS_ACCESS_KEY_ID"]
AWS_SECRET_KEY = _env["AWS_SECRET_ACCESS_KEY"]
AWS_S3_ENDPOINT = _env["AWS_S3_ENDPOINT"]
FULL_PATH_TO_WAREHOUSE = _env["WAREHOUSE"]


In [3]:
# Build the Spark configuration for this notebook.
from pyspark import SparkConf

# The chain is wrapped in parentheses instead of backslash continuations, so a
# trailing blank line or comment can no longer break (or silently end) the
# expression.
conf = (
    SparkConf()
    .setAppName("read_bronze_adventure_works")
    .setMaster("local[*]")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "2g")
    # MinIO / S3A connector settings.
    # Hadoop properties placed on a SparkConf are only copied into the Hadoop
    # configuration when they carry the "spark.hadoop." prefix; bare "fs.s3a.*"
    # keys are stored on the conf but never reach the S3A filesystem.
    .set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
    .set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
    .set("spark.hadoop.fs.s3a.endpoint", AWS_S3_ENDPOINT)
    # NOTE(review): SSL is disabled, i.e. the connector talks plain HTTP.
    # Assumes the MinIO endpoint is not served over TLS - confirm.
    .set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Path-style addressing (endpoint/bucket/key) instead of virtual-host style.
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
)

<pyspark.conf.SparkConf at 0x7fc32f19d720>

In [4]:
from pyspark.sql import SparkSession

# Attach `conf` to the builder, then reuse the active session or start a new one.
# If a session already exists in this kernel, Spark keeps it and only applies
# runtime SQL settings from `conf` (it logs a warning saying so).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()
spark

25/06/30 06:01:18 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# read config file
import toml

# Path of the staging config, resolved against the kernel's current working
# directory.
config_file = "notebooks/staging/staging_adventure_works.toml"
filepath = os.path.join(os.getcwd(), config_file)

# TOML documents are UTF-8 by specification; pin the encoding instead of
# relying on the platform/locale default used by open().
with open(filepath, 'r', encoding="utf-8") as f:
    config = toml.load(f)

# Access values
source_bucket = config['minio_source']['s3_bucket']  # bucket URI the sources are read from
target_bucket = config['minio_target']['s3_bucket']  # bucket URI the output is written to
option = config['read_options']                      # reader options ("sep", "inferSchema", "header")


In [7]:
from pyspark.sql.functions import input_file_name, regexp_extract, current_timestamp, to_date, lit

import posixpath
from datetime import datetime

# One run date for the whole batch, computed once so every source lands in the
# same ingest_date partition even if the loop crosses midnight.
target_date = datetime.now().strftime("%Y-%m-%d")

# Only sources flagged as enabled in the config are processed.
enabled_sources = [src for src in config["source"].values() if src["enabled"]]

for item in enabled_sources:
    # Object-store URIs always use "/" separators, so join with posixpath
    # rather than the OS-specific os.path.
    source_path = posixpath.join(source_bucket, item["source_path"])
    target_path = posixpath.join(target_bucket, item["target_path"])

    logger.info(f"Processing source path: {source_path}")

    # Reader options come from the config file. The header flag is set once,
    # from the config (a hard-coded "true" was previously set first and then
    # overridden by this value).
    df = (
        spark.read
        .option("delimiter", option["sep"])
        .option("inferSchema", option["inferSchema"])
        .option("header", option["header"])
        .csv(source_path)
    )

    # Lineage columns: originating file, load timestamp, and the run date used
    # as the partition key.
    df = (
        df.withColumn("file_path", input_file_name())
        .withColumn("ingest_timestamp", current_timestamp())
        .withColumn("ingest_date", lit(target_date))
    )

    # With the default static mode, mode("overwrite") deletes the entire target
    # path, i.e. every earlier ingest_date partition. Dynamic mode replaces only
    # the partitions present in this DataFrame (today's), so a re-run
    # reprocesses the current date and keeps the history.
    # NOTE(review): dynamic overwrite relies on the file-based output committer;
    # confirm before switching to an S3A-specific committer.
    (
        df.write.mode("overwrite")
        .option("partitionOverwriteMode", "dynamic")
        .option("header", "true")
        .partitionBy("ingest_date")
        .csv(target_path)
    )

    logger.info(f"Reprocessed data for {target_date} written to {target_path}")

2025-06-30 06:01:53,151 - INFO - Processing source path: s3a://seed/adventure_works/sales/*.csv
25/06/30 06:01:55 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/30 06:01:55 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/30 06:01:55 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/30 06:01:55 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/30 06:01:55 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/30 06:01:55 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/30 06:01:55 WARN AbstractS3ACommitterFactory: Using standa

In [None]:
# Stop the Spark session created earlier in this notebook.
spark.stop()