In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Connection settings for the MinIO (S3-compatible) object store.
# They are read from the environment so that real credentials never have to be
# committed with the notebook. The fallbacks are the values this notebook used
# before and are meant for the local development MinIO instance only.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://host.docker.internal:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "MinioAdmin123")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "MinioAdmin123")

# Build (or reuse, via getOrCreate) a local-mode SparkSession wired to MinIO
# through the Hadoop S3A connector.
spark = (
    SparkSession.builder
    .appName("MinIO")
    .master("local[*]")
    # Package that ships org.apache.hadoop.fs.s3a.S3AFileSystem.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1")
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    # Address buckets as <endpoint>/<bucket> instead of <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [3]:
# Echo the session object so the notebook renders its repr as the cell output.
spark

In [4]:
# Read the client JSON files from the landing-zone bucket.
# No schema is supplied, so the JSON reader derives one from the data itself.
# The previous "inferSchema" option is a CSV reader setting that the JSON
# source does not recognise, so it had no effect and has been removed.
clients_json_df = (
    spark
    .read
    .json("s3a://landing-zone/dataway/sap/clients")
)

In [5]:
# Preview the first 20 rows (show's default) with full, untruncated cell values.
clients_json_df.show(n=20, truncate=False)

+------------------------------------------------------------+----------------------------+----------------+---------------------+
|address                                                     |email                       |name            |phone_number         |
+------------------------------------------------------------+----------------------------+----------------+---------------------+
|6351 Travis Streets Suite 163\nChoibury, MI 54268           |ecook@example.com           |Mark Malone PhD |+1-772-334-7852x342  |
|20780 Brown Circle Apt. 437\nSouth Shannonview, CO 50610    |ymorris@example.net         |Tina Cabrera    |304.681.4095x1771    |
|07502 Martinez Squares Apt. 052\nRamirezborough, VT 60381   |cynthiaperez@example.net    |Monique Simon   |+1-380-504-3760x0173 |
|847 Sanders Falls\nPort Pamelamouth, NM 41906               |jessica85@example.net       |Donald Blake    |588.451.5239x92873   |
|0721 Wanda Green\nWest Christina, MP 22049                  |wilkersonjohn@example

In [6]:
# Bronze enrichment via the DataFrame API: tag every row with its source
# system and the timestamp at which it was processed.
clients_json_df_bronze_df_api = clients_json_df.withColumns(
    {
        "system": F.lit("sap"),
        "processed_at": F.current_timestamp(),
    }
)

In [8]:
# Preview the first five enriched rows (long values are truncated by default).
clients_json_df_bronze_df_api.show(n=5)

+--------------------+--------------------+---------------+--------------------+------+--------------------+
|             address|               email|           name|        phone_number|system|        processed_at|
+--------------------+--------------------+---------------+--------------------+------+--------------------+
|6351 Travis Stree...|   ecook@example.com|Mark Malone PhD| +1-772-334-7852x342|   sap|2024-12-22 18:10:...|
|20780 Brown Circl...| ymorris@example.net|   Tina Cabrera|   304.681.4095x1771|   sap|2024-12-22 18:10:...|
|07502 Martinez Sq...|cynthiaperez@exam...|  Monique Simon|+1-380-504-3760x0173|   sap|2024-12-22 18:10:...|
|847 Sanders Falls...|jessica85@example...|   Donald Blake|  588.451.5239x92873|   sap|2024-12-22 18:10:...|
|0721 Wanda Green\...|wilkersonjohn@exa...| Robin Jennings|001-738-415-8625x382|   sap|2024-12-22 18:10:...|
+--------------------+--------------------+---------------+--------------------+------+--------------------+
only showing top 5 

In [9]:
# Same bronze enrichment expressed in SQL. The {clients_json_df} placeholder is
# filled in by spark.sql from the keyword argument of the same name, so the
# DataFrame can be queried without registering a temp view first.
bronze_enrichment_query = """
        SELECT *, 
               'sap' as system,
               current_timestamp() as processed_at
        FROM {clients_json_df}
    """

clients_json_df_bronze_sql = spark.sql(
    bronze_enrichment_query,
    clients_json_df=clients_json_df,
)

In [10]:
# Preview the first five rows of the SQL-built result for comparison with the
# DataFrame-API version.
clients_json_df_bronze_sql.show(n=5)

+--------------------+--------------------+---------------+--------------------+------+--------------------+
|             address|               email|           name|        phone_number|system|        processed_at|
+--------------------+--------------------+---------------+--------------------+------+--------------------+
|6351 Travis Stree...|   ecook@example.com|Mark Malone PhD| +1-772-334-7852x342|   sap|2024-12-22 18:10:...|
|20780 Brown Circl...| ymorris@example.net|   Tina Cabrera|   304.681.4095x1771|   sap|2024-12-22 18:10:...|
|07502 Martinez Sq...|cynthiaperez@exam...|  Monique Simon|+1-380-504-3760x0173|   sap|2024-12-22 18:10:...|
|847 Sanders Falls...|jessica85@example...|   Donald Blake|  588.451.5239x92873|   sap|2024-12-22 18:10:...|
|0721 Wanda Green\...|wilkersonjohn@exa...| Robin Jennings|001-738-415-8625x382|   sap|2024-12-22 18:10:...|
+--------------------+--------------------+---------------+--------------------+------+--------------------+
only showing top 5 