In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Connection settings are read from the environment so that credentials are
# never stored in the notebook itself. MINIO_ACCESS_KEY and MINIO_SECRET_KEY
# are required (a KeyError is raised here if either is unset); MINIO_ENDPOINT
# is optional and falls back to the address this notebook originally used.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://host.docker.internal:9000")
minio_access_key = os.environ["MINIO_ACCESS_KEY"]
minio_secret_key = os.environ["MINIO_SECRET_KEY"]

spark = (
    SparkSession.builder
    .appName("MinIO")
    .master("local[*]")
    # Fetches the S3A connector when the session starts.
    # NOTE(review): the hadoop-aws version presumably has to match the Hadoop
    # build bundled with the installed Spark; confirm before upgrading either.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1")
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    # Address buckets as <endpoint>/<bucket> instead of <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [3]:
# Bare last expression: the notebook renders the active session as this cell's output.
spark

# CLOUD X

In [4]:
# Load the Cloud X client records from the landing zone as a DataFrame.
clients_parquet_df = (
    spark.read
    .format("parquet")
    .load("s3a://landing-zone/dataway/cloud_x/clients")
)

# Preview the loaded rows with show()'s defaults (long values are truncated).
clients_parquet_df.show()

+----------------+--------------------+--------------------+--------------------+
|            name|             address|               email|        phone_number|
+----------------+--------------------+--------------------+--------------------+
|Nicholas Schmidt|17259 Reynolds St...|williamfaulkner@e...|001-750-587-9560x...|
|  David Martinez|4363 Alicia Forge...| xbrewer@example.com|001-386-794-5054x...|
|       Peter Lee|3076 Smith Ranch ...|cheryldiaz@exampl...|        345-694-9912|
| Linda Rasmussen|431 John Mountain...|jasminharris@exam...| +1-276-656-7694x067|
|   Rachel Willis|6007 Burch Square...|christopheranders...|  787-658-8779x90447|
|    Wesley James|0333 Abigail Cros...| karen24@example.org|    451.935.8241x763|
|   Karen Jenkins|26177 Chan Missio...|  ryan26@example.com|  512-831-4421x95023|
|      Paul Scott|01414 Lori Forge ...|juliehart@example...|+1-725-384-5288x8...|
|  Erica Robinson|PSC 8644, Box 690...|fmorales@example.org|       (975)816-8187|
| Michael Collin

In [5]:
# Bronze enrichment: keep every landing column and add two audit columns,
# a constant `system` tag and a `processed_at` timestamp.
clients_parquet_df_bronze = clients_parquet_df \
    .withColumn('system', F.lit('cloud_x')) \
    .withColumn('processed_at', F.current_timestamp())

In [8]:
# Preview the enriched rows without truncating long values (n=20 is show()'s default).
clients_parquet_df_bronze.show(n=20, truncate=False)

+----------------+------------------------------------------------------------+-------------------------------+---------------------+-------+--------------------------+
|name            |address                                                     |email                          |phone_number         |system |processed_at              |
+----------------+------------------------------------------------------------+-------------------------------+---------------------+-------+--------------------------+
|Nicholas Schmidt|17259 Reynolds Stravenue Apt. 667\nAdamburgh, LA 47025      |williamfaulkner@example.net    |001-750-587-9560x7334|cloud_x|2024-12-22 17:52:44.939143|
|David Martinez  |4363 Alicia Forge\nEast Michelleburgh, ND 47615             |xbrewer@example.com            |001-386-794-5054x7856|cloud_x|2024-12-22 17:52:44.939143|
|Peter Lee       |3076 Smith Ranch Suite 961\nReynoldsland, LA 06612          |cheryldiaz@example.net         |345-694-9912         |cloud_x|2024-12-22 17:

In [7]:
# Persist the enriched clients to the bronze zone as Parquet, replacing
# whatever is already stored at the target path.
clients_parquet_df_bronze.write.parquet(
    "s3a://bronze-zone/dataway/cloud_x/clients",
    mode='overwrite',
)