In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder \
    .appName("MinIO") \
    .master("local[*]") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://host.docker.internal:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "MinioAdmin123") \
    .config("spark.hadoop.fs.s3a.secret.key", "MinioAdmin123") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

In [3]:
spark

In [4]:
clients_csv_df = (
    spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .csv("s3a://landing-zone/dataway/protheus/clients")
)

In [5]:
clients_csv_df.show(truncate=False)

+----------------------+-------------------------------+--------------------------+---------------------+
|name                  |address                        |email                     |phone_number         |
+----------------------+-------------------------------+--------------------------+---------------------+
|Norma Fisher          |4759 William Haven Apt. 194    |NULL                      |NULL                 |
|West Corey            | CA 90152"                     |thomas15@example.com      |993-787-7840x80160   |
|Frederick Harrell     |28711 Golden Overpass          |NULL                      |NULL                 |
|West Andreaville      | OH 44115"                     |tamaramorrison@example.net|671-996-5934x2320    |
|Michael Anderson      |86848 Melissa Springs          |NULL                      |NULL                 |
|Rileymouth            | NM 87040"                     |christopher91@example.com |001-453-730-4135     |
|Antonio Logan         |98910 Cynthia Rest    

In [6]:
clients_csv_df_bronze = (
    clients_csv_df
    .withColumn('system', F.lit('protheus'))
    .withColumn('processed_at', F.current_timestamp())
)

In [7]:
clients_csv_df_bronze.show(truncate=False)

+----------------------+-------------------------------+--------------------------+---------------------+--------+--------------------------+
|name                  |address                        |email                     |phone_number         |system  |processed_at              |
+----------------------+-------------------------------+--------------------------+---------------------+--------+--------------------------+
|Norma Fisher          |4759 William Haven Apt. 194    |NULL                      |NULL                 |protheus|2024-12-22 17:59:29.071918|
|West Corey            | CA 90152"                     |thomas15@example.com      |993-787-7840x80160   |protheus|2024-12-22 17:59:29.071918|
|Frederick Harrell     |28711 Golden Overpass          |NULL                      |NULL                 |protheus|2024-12-22 17:59:29.071918|
|West Andreaville      | OH 44115"                     |tamaramorrison@example.net|671-996-5934x2320    |protheus|2024-12-22 17:59:29.071918|
|Micha

In [8]:
( 
    clients_csv_df_bronze
    .write
    .mode('overwrite')
    .parquet("s3a://bronze-zone/dataway/protheus/clients")
)