In [132]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Stop any session already bound to the name `spark` before building a new one.
if 'spark' in globals() or 'spark' in locals():
    spark.stop()

# Build (or fetch) the session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Testando")
    .getOrCreate()
)

# Restrict Spark logging to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: display the session object.
spark

In [133]:
# Read the session's current partition-overwrite mode.
overwrite_mode_key = "spark.sql.sources.partitionOverwriteMode"
spark.conf.get(overwrite_mode_key)

'STATIC'

In [139]:
# Switch partition overwrites to dynamic mode, then read the setting back.
overwrite_mode_key = "spark.sql.sources.partitionOverwriteMode"
spark.conf.set(overwrite_mode_key, "dynamic")
spark.conf.get(overwrite_mode_key)

'dynamic'

In [141]:
# Recreate the example table from scratch. IF EXISTS keeps this cell runnable
# when the table has not been created yet (first run / fresh metastore);
# a plain DROP TABLE raises in that case.
spark.sql("DROP TABLE IF EXISTS default.exemplo")

# Parquet table partitioned by `anomesdia`, stored at an explicit S3 location.
DDL = """
CREATE EXTERNAL TABLE IF NOT EXISTS default.exemplo (
    col_1 string,
    col_2 int
)
USING PARQUET
PARTITIONED BY (anomesdia int)
LOCATION 's3a://datalake/exemplo/'
TBLPROPERTIES ('parquet.compression'='SNAPPY')
"""

spark.sql(DDL)

# List the tables of the `default` database to confirm the table is registered.
spark.sql("show tables in default").show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|  default|       exemplo|      false|
|  default|tb_sales_final|      false|
|  default| tb_sales_test|      false|
|  default| tbl_employees|      false|
+---------+--------------+-----------+



In [142]:
# Print the DDL that Spark reports for the table: take the first row of the
# SHOW CREATE TABLE result and read its `createtab_stmt` column by subscript
# (instead of calling the dunder method __getitem__ directly).
create_stmt_row = spark.sql('SHOW CREATE TABLE default.exemplo').collect()[0]
print(create_stmt_row['createtab_stmt'])

CREATE TABLE default.exemplo (
  col_1 STRING,
  col_2 INT,
  anomesdia INT)
USING PARQUET
PARTITIONED BY (anomesdia)
LOCATION 's3a://datalake/exemplo'
TBLPROPERTIES (
  'bucketing_version' = '2',
  'parquet.compression' = 'SNAPPY')



In [148]:
# Sample rows for partition anomesdia=20230412, as (col_1, col_2, anomesdia) tuples.
dataDict = [
    ("banana", 200001, 20230412),
    ("abobrinha", 200002, 20230412),
    ("tomate", 200003, 20230412),
    ("laranja2", 200004, 20230412),
    ("batata", 200005, 20230412),
]

# Explicit DDL-style schema: with column names only, PySpark infers Python ints
# as bigint, which does not match the `int` columns declared for default.exemplo.
df_12 = spark.createDataFrame(
    data=dataDict,
    schema="col_1 string, col_2 int, anomesdia int",
)
df_12.show(5)

+---------+------+---------+
|    col_1| col_2|anomesdia|
+---------+------+---------+
|   banana|200001| 20230412|
|abobrinha|200002| 20230412|
|   tomate|200003| 20230412|
| laranja2|200004| 20230412|
|   batata|200005| 20230412|
+---------+------+---------+



In [149]:
# Overwrite only the partitions present in df_12, keeping the table as declared.
#
# saveAsTable(mode='overwrite') drops and recreates the table from the
# DataFrame's schema, discarding the column types and properties declared in
# the CREATE TABLE statement. insertInto writes into the existing table instead.
#
# Notes:
# - The table default.exemplo must already exist (created by the DDL cell).
# - insertInto resolves columns by position, so df_12 must keep the table's
#   column order (col_1, col_2, anomesdia); it cannot be combined with
#   partitionBy().
# - Only the partitions present in df_12 are replaced when
#   spark.sql.sources.partitionOverwriteMode is "dynamic"; in static mode
#   the overwrite replaces every partition of the table.
df_12.write.insertInto('default.exemplo', overwrite=True)

# Show the resulting partitions and table contents.
spark.sql('SHOW PARTITIONS default.exemplo').show()
spark.sql("SELECT * FROM default.exemplo").show()

                                                                                

+------------------+
|         partition|
+------------------+
|anomesdia=20230412|
|anomesdia=20230413|
+------------------+

+---------+------+---------+
|    col_1| col_2|anomesdia|
+---------+------+---------+
|abobrinha|200002| 20230412|
|abobrinha|200002| 20230413|
| laranja2|200004| 20230412|
|  laranja|200004| 20230413|
|   banana|200001| 20230412|
|   tomate|200003| 20230412|
|   batata|200005| 20230412|
|   banana|200001| 20230413|
|   tomate|200003| 20230413|
|   batata|200005| 20230413|
+---------+------+---------+



In [146]:
# Sample rows for partition anomesdia=20230413, as (col_1, col_2, anomesdia) tuples.
dataDict = [
    ("banana", 200001, 20230413),
    ("abobrinha", 200002, 20230413),
    ("tomate", 200003, 20230413),
    ("laranja", 200004, 20230413),
    ("batata", 200005, 20230413),
]

# Explicit DDL-style schema: with column names only, PySpark infers Python ints
# as bigint, which does not match the `int` columns declared for default.exemplo.
df_13 = spark.createDataFrame(
    data=dataDict,
    schema="col_1 string, col_2 int, anomesdia int",
)
df_13.show(5)

+---------+------+---------+
|    col_1| col_2|anomesdia|
+---------+------+---------+
|   banana|200001| 20230413|
|abobrinha|200002| 20230413|
|   tomate|200003| 20230413|
|  laranja|200004| 20230413|
|   batata|200005| 20230413|
+---------+------+---------+



In [168]:
from pyspark.sql.functions import expr

# Add a `data` column holding the first 5 characters of `anomesdia`.
# SQL SUBSTR positions are 1-based, so start at 1 rather than 0, and cast the
# numeric column explicitly instead of relying on implicit string coercion.
# NOTE(review): this keeps the original 5-character prefix (e.g. "20230");
# if year+month was intended the length should be 6 — confirm.
df_13.withColumn('data', expr('SUBSTR(CAST(anomesdia AS STRING), 1, 5)')).show()

+---------+------+---------+-----+
|    col_1| col_2|anomesdia| data|
+---------+------+---------+-----+
|   banana|200001| 20230413|20230|
|abobrinha|200002| 20230413|20230|
|   tomate|200003| 20230413|20230|
|  laranja|200004| 20230413|20230|
|   batata|200005| 20230413|20230|
+---------+------+---------+-----+



In [147]:
# Overwrite only the partitions present in df_13, keeping the table as declared.
#
# saveAsTable(mode='overwrite') drops and recreates the table from the
# DataFrame's schema, discarding the column types and properties declared in
# the CREATE TABLE statement. insertInto writes into the existing table instead.
#
# Notes:
# - The table default.exemplo must already exist (created by the DDL cell).
# - insertInto resolves columns by position, so df_13 must keep the table's
#   column order (col_1, col_2, anomesdia); it cannot be combined with
#   partitionBy().
# - Only the partitions present in df_13 are replaced when
#   spark.sql.sources.partitionOverwriteMode is "dynamic"; in static mode
#   the overwrite replaces every partition of the table.
df_13.write.insertInto('default.exemplo', overwrite=True)

# Show the resulting partitions and table contents.
spark.sql('SHOW PARTITIONS default.exemplo').show()
spark.sql("SELECT * FROM default.exemplo").show()

                                                                                

+------------------+
|         partition|
+------------------+
|anomesdia=20230412|
|anomesdia=20230413|
+------------------+

+---------+------+---------+
|    col_1| col_2|anomesdia|
+---------+------+---------+
|abobrinha|200002| 20230412|
|abobrinha|200002| 20230413|
|  laranja|200004| 20230412|
|  laranja|200004| 20230413|
|   banana|200001| 20230412|
|   tomate|200003| 20230412|
|   batata|200005| 20230412|
|   banana|200001| 20230413|
|   tomate|200003| 20230413|
|   batata|200005| 20230413|
+---------+------+---------+



In [26]:
# Remove the 20230412 partition from the table's metadata. IF EXISTS makes the
# cell safe to re-run once the partition is already gone.
spark.sql("ALTER TABLE default.exemplo DROP IF EXISTS PARTITION (anomesdia=20230412)")

# List the partitions that remain registered for the table.
spark.sql('SHOW PARTITIONS default.exemplo').show()

+---------+
|partition|
+---------+
+---------+



In [104]:
# Print the DDL that Spark reports for the table: take the first row of the
# SHOW CREATE TABLE result and read its `createtab_stmt` column by subscript
# (instead of calling the dunder method __getitem__ directly).
create_stmt_row = spark.sql('SHOW create table default.exemplo').collect()[0]
print(create_stmt_row['createtab_stmt'])

CREATE TABLE default.exemplo (
  col_1 STRING,
  col_2 INT,
  anomesdia INT)
USING parquet
PARTITIONED BY (anomesdia)
LOCATION 's3a://datalake/exemplo'
TBLPROPERTIES (
  'bucketing_version' = '2',
  'parquet.compression' = 'SNAPPY',
  'transient_lastDdlTime' = '1681303603')



In [28]:
# Set dynamic partition overwrite first, then read it back as the cell's last
# expression so the effective value is displayed. (Originally the get ran
# before the set and its result was discarded, so the cell showed nothing.)
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
spark.conf.get("spark.sql.sources.partitionOverwriteMode")

In [155]:
# Load the catalog table default.exemplo as a DataFrame.
table_name = "default.exemplo"
df = spark.read.table(table_name)

In [163]:
# Show the distinct values of the `anomesdia` column found in the table.
partition_values = df.select('anomesdia').distinct()
partition_values.show()

+---------+
|anomesdia|
+---------+
| 20230412|
| 20230413|
+---------+

