In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Connection settings for the S3-compatible object store used through s3a://.
# Each value can be overridden with an environment variable so that secrets do
# not have to be edited into the notebook; the fallbacks are the values that
# were previously hard-coded in this cell.
# NOTE(review): the fallback keys look like local-development credentials —
# confirm they are never valid against a shared environment.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://host.docker.internal:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "MinioAdmin123")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "MinioAdmin123")

# Build (or reuse) the Spark session, running locally on all available cores.
spark = (
    SparkSession.builder
    .appName("MinIO")
    .master("local[*]")
    # hadoop-aws supplies the S3A filesystem implementation referenced below.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    # Address buckets as <endpoint>/<bucket> instead of <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [3]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [4]:
# Load the raw client records from the landing zone.
# No "inferSchema" option is set: that option belongs to the CSV reader, and
# the JSON reader already infers the schema from the data when no explicit
# schema is supplied.
clients_json_df = spark.read.json("s3a://landing-zone/dataway/sap/clients")

In [5]:
# Seed for the synthetic `nationality` column. Without a seed, F.rand() is
# drawn again on every action, so the rows displayed by show() and the rows
# written to storage could carry different labels.
NATIONALITY_SEED = 42

# Share of rows labelled 'BRAZILIAN'. The previous expression,
# round(rand() * 2 + 1) == 1, is true exactly when rand() < 0.25, so the same
# probability is stated directly here.
# NOTE(review): if an even split was intended, this should be 0.5 — confirm.
BRAZILIAN_SHARE = 0.25

clients_json_df_bronze = (
    clients_json_df
    # Tag every record with the system it came from.
    .withColumn('system', F.lit('sap'))
    # Timestamp taken when the query is evaluated.
    .withColumn('processed_at', F.current_timestamp())
    # Randomly assigned label, later used as the partition column.
    .withColumn(
        'nationality',
        F.when(F.rand(seed=NATIONALITY_SEED) < BRAZILIAN_SHARE, F.lit('BRAZILIAN'))
        .otherwise(F.lit('NOT_BRAZILIAN')),
    )
)

In [6]:
# Print the first 10 rows of the bronze frame for a quick visual check.
clients_json_df_bronze.show(n=10)

+--------------------+--------------------+-----------------+--------------------+------+--------------------+-------------+
|             address|               email|             name|        phone_number|system|        processed_at|  nationality|
+--------------------+--------------------+-----------------+--------------------+------+--------------------+-------------+
|4759 William Have...|thomas15@example.com|     Norma Fisher|  993-787-7840x80160|   sap|2025-01-11 19:30:...|NOT_BRAZILIAN|
|28711 Golden Over...|tamaramorrison@ex...|Frederick Harrell|   671-996-5934x2320|   sap|2025-01-11 19:30:...|NOT_BRAZILIAN|
|86848 Melissa Spr...|christopher91@exa...| Michael Anderson|    001-453-730-4135|   sap|2025-01-11 19:30:...|NOT_BRAZILIAN|
|98910 Cynthia Res...| leetara@example.net|    Antonio Logan|        673-300-8691|   sap|2025-01-11 19:30:...|NOT_BRAZILIAN|
|91634 Strong Moun...|   awade@example.com|     Matthew Bell|+1-269-684-5642x8...|   sap|2025-01-11 19:30:...|NOT_BRAZILIAN|


In [7]:
# Persist the bronze frame as Parquet, partitioned by `nationality`,
# overwriting whatever is already stored at the destination.
bronze_writer = clients_json_df_bronze.write.partitionBy('nationality').mode('overwrite')
bronze_writer.parquet("s3a://bronze-zone/dataway/sap/clients")

In [8]:
# Number of partitions of the RDD backing the in-memory bronze frame.
clients_json_df_bronze.rdd.getNumPartitions()

1

In [9]:
# Read back the Parquet dataset that was written to the bronze zone.
df = spark.read.parquet("s3a://bronze-zone/dataway/sap/clients")

In [10]:
# Number of partitions Spark created when reading the Parquet dataset.
df.rdd.getNumPartitions()

2

# Coalesce vs Repartition

In [None]:
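# Coalesce: always use it to reduce the number of partitions (it merges existing partitions and avoids a full shuffle).
# Repartition: always use it to increase the number of partitions (it performs a full shuffle to redistribute the rows).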

In [11]:
# Collapse the frame into a single partition.
df = df.coalesce(numPartitions=1)

In [12]:
# Partition count after the coalesce step.
df.rdd.getNumPartitions()

1

In [13]:
# Redistribute the rows across five partitions.
df = df.repartition(numPartitions=5)

In [14]:
# Partition count after the repartition step.
df.rdd.getNumPartitions()

5

In [15]:
# Write the repartitioned frame to its own location.
# `df` is read lazily from "s3a://bronze-zone/dataway/sap/clients", so
# overwriting that same path would replace the input while it is still the
# source of this write. Spark normally rejects that with
# "Cannot overwrite a path that is also being read from", and where it does
# not, the input files could be deleted before they are read. Writing to a
# separate path leaves the original bronze dataset intact.
(
    df
    .write
    .mode('overwrite')
    .partitionBy('nationality')
    .parquet("s3a://bronze-zone/dataway/sap/clients_repartitioned")
)