In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# MinIO connection settings are read from the environment so that credentials
# are never stored in the notebook, its rendered outputs, or version control.
# The endpoint is not a secret, so it keeps the previous value as its default.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://host.docker.internal:9000")
# Indexing (rather than .get) raises KeyError immediately when a credential is
# missing, instead of failing later with an opaque S3A authentication error.
MINIO_ACCESS_KEY = os.environ["MINIO_ACCESS_KEY"]
MINIO_SECRET_KEY = os.environ["MINIO_SECRET_KEY"]

# Local Spark session configured to reach MinIO through the S3A connector.
# NOTE(review): the hadoop-aws version should match the Hadoop version bundled
# with the installed Spark build — confirm 3.3.1 is correct for this environment.
spark = (
    SparkSession.builder
    .appName("MinIO")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    # Path-style access addresses buckets as <endpoint>/<bucket> rather than
    # as a virtual-host subdomain of the endpoint.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell output to confirm it was created.
spark

In [4]:
# Load the raw client records from the landing-zone bucket.
# The JSON source always infers the schema when none is supplied; the former
# `inferSchema` option belongs to the CSV reader and had no effect here, so it
# has been removed to avoid suggesting otherwise.
clients_json_df = spark.read.json("s3a://landing-zone/dataway/sap/clients")

In [5]:
# Preview the raw records without truncating long cell values.
# n=20 is show()'s default row count, spelled out here for clarity.
clients_json_df.show(n=20, truncate=False)

+-----------------------------------------------------------+---------------------------+-----------------+---------------------+
|address                                                    |email                      |name             |phone_number         |
+-----------------------------------------------------------+---------------------------+-----------------+---------------------+
|4759 William Haven Apt. 194\nWest Corey, CA 90152          |thomas15@example.com       |Norma Fisher     |993-787-7840x80160   |
|28711 Golden Overpass\nWest Andreaville, OH 44115          |tamaramorrison@example.net |Frederick Harrell|671-996-5934x2320    |
|86848 Melissa Springs\nRileymouth, NM 87040                |christopher91@example.com  |Michael Anderson |001-453-730-4135     |
|98910 Cynthia Rest\nSouth Christopherville, AZ 79865       |leetara@example.net        |Antonio Logan    |673-300-8691         |
|91634 Strong Mountains Apt. 302\nWest Alyssa, DE 78723     |awade@example.com          |M

In [6]:
# Bronze enrichment via the DataFrame API: tag every row with its source
# system and the timestamp at which it was processed.
bronze_metadata_columns = {
    'system': F.lit('sap'),
    'processed_at': F.current_timestamp(),
}
clients_json_df_bronze_df_api = clients_json_df.withColumns(bronze_metadata_columns)

In [8]:
# Preview the first five enriched rows without truncating cell values.
clients_json_df_bronze_df_api.show(n=5, truncate=False)

+------------------------------------------------------+--------------------------+-----------------+---------------------+------+--------------------------+
|address                                               |email                     |name             |phone_number         |system|processed_at              |
+------------------------------------------------------+--------------------------+-----------------+---------------------+------+--------------------------+
|4759 William Haven Apt. 194\nWest Corey, CA 90152     |thomas15@example.com      |Norma Fisher     |993-787-7840x80160   |sap   |2025-01-11 19:42:49.768583|
|28711 Golden Overpass\nWest Andreaville, OH 44115     |tamaramorrison@example.net|Frederick Harrell|671-996-5934x2320    |sap   |2025-01-11 19:42:49.768583|
|86848 Melissa Springs\nRileymouth, NM 87040           |christopher91@example.com |Michael Anderson |001-453-730-4135     |sap   |2025-01-11 19:42:49.768583|
|98910 Cynthia Rest\nSouth Christopherville, AZ 7986

In [16]:
# Same bronze enrichment as the DataFrame-API cell, expressed in SQL.
# `{clients_json_df}` is a placeholder that spark.sql() fills from the
# keyword argument of the same name.
bronze_sql_text = """
        SELECT *, 
               'sap' as system,
               current_timestamp() as processed_at
        FROM {clients_json_df}
    """
clients_json_df_bronze_sql = spark.sql(bronze_sql_text, clients_json_df=clients_json_df)

In [17]:
# Preview the first five rows of the SQL-built result without truncation.
clients_json_df_bronze_sql.show(n=5, truncate=False)

+------------------------------------------------------+--------------------------+-----------------+---------------------+------+--------------------------+
|address                                               |email                     |name             |phone_number         |system|processed_at              |
+------------------------------------------------------+--------------------------+-----------------+---------------------+------+--------------------------+
|4759 William Haven Apt. 194\nWest Corey, CA 90152     |thomas15@example.com      |Norma Fisher     |993-787-7840x80160   |sap   |2025-01-11 19:45:09.801978|
|28711 Golden Overpass\nWest Andreaville, OH 44115     |tamaramorrison@example.net|Frederick Harrell|671-996-5934x2320    |sap   |2025-01-11 19:45:09.801978|
|86848 Melissa Springs\nRileymouth, NM 87040           |christopher91@example.com |Michael Anderson |001-453-730-4135     |sap   |2025-01-11 19:45:09.801978|
|98910 Cynthia Rest\nSouth Christopherville, AZ 7986