In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Build (or reuse) a local Spark session that talks to MinIO through the S3A connector.
# Credentials are read from the environment so they are never stored in the notebook:
# export MINIO_ACCESS_KEY and MINIO_SECRET_KEY before starting the kernel (a missing
# variable fails fast with KeyError). MINIO_ENDPOINT is optional and defaults to the
# Docker-host address used so far.
spark = (
    SparkSession.builder
    .appName("MinIO")
    .master("local[*]")
    # NOTE(review): the hadoop-aws version should match the Hadoop build bundled with Spark — confirm.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1")
    .config(
        "spark.hadoop.fs.s3a.endpoint",
        os.environ.get("MINIO_ENDPOINT", "http://host.docker.internal:9000"),
    )
    .config("spark.hadoop.fs.s3a.access.key", os.environ["MINIO_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["MINIO_SECRET_KEY"])
    # Path-style access (endpoint/bucket/key) instead of virtual-host-style bucket URLs.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell output to confirm it was created.
spark

In [4]:
# Load the raw SAP client records from the landing zone.
# No schema is supplied, so Spark's JSON reader infers one by scanning the input;
# the former `inferSchema` option belongs to the CSV reader and had no effect here.
clients_json_df = (
    spark
    .read
    .json("s3a://landing-zone/dataway/sap/clients")
)

In [5]:
# Fixed seed for the synthetic nationality flag. Without it, rand() is re-drawn on
# every action, so show() and the partitioned write would disagree about which
# rows are 'BRAZILIAN'. (Determinism still assumes the input partitioning is unchanged.)
NATIONALITY_SEED = 42

# Bronze enrichment: tag the source system, stamp the processing time, and attach
# a synthetic nationality used later as the partition column.
clients_json_df_bronze = (
    clients_json_df
    .withColumn('system', F.lit('sap'))
    .withColumn('processed_at', F.current_timestamp())
    .withColumn(
        'nationality',
        # Same distribution as the previous round(rand * 2 + 1) == 1 test:
        # that expression is true exactly when rand < 0.25, i.e. ~25% of rows.
        F.when(F.rand(seed=NATIONALITY_SEED) < 0.25, F.lit('BRAZILIAN'))
        .otherwise(F.lit('NOT_BRAZILIAN')),
    )
)

In [6]:
# Preview the first 10 enriched rows (long values are truncated by show()).
clients_json_df_bronze.show(n=10)

+--------------------+--------------------+----------------+--------------------+------+--------------------+-------------+
|             address|               email|            name|        phone_number|system|        processed_at|  nationality|
+--------------------+--------------------+----------------+--------------------+------+--------------------+-------------+
|6351 Travis Stree...|   ecook@example.com| Mark Malone PhD| +1-772-334-7852x342|   sap|2024-12-22 18:30:...|NOT_BRAZILIAN|
|20780 Brown Circl...| ymorris@example.net|    Tina Cabrera|   304.681.4095x1771|   sap|2024-12-22 18:30:...|NOT_BRAZILIAN|
|07502 Martinez Sq...|cynthiaperez@exam...|   Monique Simon|+1-380-504-3760x0173|   sap|2024-12-22 18:30:...|    BRAZILIAN|
|847 Sanders Falls...|jessica85@example...|    Donald Blake|  588.451.5239x92873|   sap|2024-12-22 18:30:...|NOT_BRAZILIAN|
|0721 Wanda Green\...|wilkersonjohn@exa...|  Robin Jennings|001-738-415-8625x382|   sap|2024-12-22 18:30:...|NOT_BRAZILIAN|
|68494 K

In [7]:
# Write the bronze DataFrame as Parquet, replacing any previous output and
# creating one sub-directory per `nationality` value.
clients_json_df_bronze.write.parquet(
    "s3a://bronze-zone/dataway/sap/clients",
    mode='overwrite',
    partitionBy='nationality',
)

In [8]:
# Number of RDD partitions behind the bronze DataFrame built from the JSON input.
clients_json_df_bronze.rdd.getNumPartitions()

1

In [9]:
# Read the bronze Parquet dataset back from MinIO.
df = spark.read.parquet("s3a://bronze-zone/dataway/sap/clients")

In [10]:
# Partition count of the DataFrame as read back from Parquet.
df.rdd.getNumPartitions()

2

# Coalesce vs Repartition

In [11]:
# Coalesce: always use it to reduce the number of partitions (it merges existing partitions and avoids a full shuffle).
# Repartition: always use it to increase the number of partitions (it performs a full shuffle).

In [12]:
# Collapse the DataFrame down to a single partition.
df = df.coalesce(numPartitions=1)

In [13]:
# Partition count after coalesce(1).
df.rdd.getNumPartitions()

1

In [14]:
# Redistribute the rows across five partitions.
df = df.repartition(numPartitions=5)

In [15]:
# Partition count after repartition(5).
df.rdd.getNumPartitions()

5

In [16]:
# Persist the repartitioned DataFrame, one sub-directory per `nationality` value.
# `df` is lazily read from s3a://bronze-zone/dataway/sap/clients, so overwriting that
# same location would delete the input while it is still being read (Spark either
# rejects the write or the source files disappear mid-job). Write to a sibling
# path instead so the source dataset stays intact.
(
    df
    .write
    .mode('overwrite')
    .partitionBy('nationality')
    .parquet("s3a://bronze-zone/dataway/sap/clients_repartitioned")
)