# Hive-style partitioning in Delta Lake

In [17]:
import pyspark
import pyspark.sql.functions as F
from delta import configure_spark_with_delta_pip
import delta

In [4]:
# Start from a plain SparkSession builder, then register the two settings
# that wire Delta into Spark SQL: the session extension and the catalog class.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [5]:
# Let the Delta pip helper finish configuring the builder, then start (or
# reuse) the session. The Ivy log printed below shows delta-core being resolved.
spark = (
    configure_spark_with_delta_pip(builder)
    .getOrCreate()
)

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-210/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-58eb9658-8fc1-4f65-a365-d3f8c61d733b;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 317ms :: artifacts dl 19ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

22/12/16 13:22:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [7]:
# Sample rows: four people across three countries (two of them share "China").
df = spark.createDataFrame(
    data=[
        ("Ernesto", "Guevara", "Argentina"),
        ("Maria", "Sharapova", "Russia"),
        ("Bruce", "Lee", "China"),
        ("Jack", "Ma", "China"),
    ],
    # Column names are supplied directly; types are still inferred from the data.
    schema=["first_name", "last_name", "country"],
)

In [11]:
# Create the partitioned Delta table.
# - repartition(country) groups rows by country before writing; the listing
#   below shows a single parquet file in each country=<value> directory.
# - mode("overwrite") keeps this cell re-runnable: without an explicit mode the
#   writer raises if country_people already exists, which breaks
#   "Restart Kernel -> Run All". On a first run the result is the same.
(
    df.repartition(F.col("country"))
    .write.format("delta")
    .mode("overwrite")
    .partitionBy("country")
    .saveAsTable("country_people")
)

                                                                                

In [12]:
# Shell out to `tree` (must be installed) to list the table directory,
# using a path relative to the notebook's working directory.
!tree spark-warehouse/country_people

spark-warehouse/country_people
├── _delta_log
│   └── 00000000000000000000.json
├── country=Argentina
│   └── part-00000-3ecf97b7-79e6-460d-b1b8-a5081e75ad9a.c000.snappy.parquet
├── country=China
│   └── part-00000-b2fba73f-3ff7-4fe0-9272-9a8873098338.c000.snappy.parquet
└── country=Russia
    └── part-00000-902db841-9614-436b-9964-f0419f2138aa.c000.snappy.parquet

4 directories, 4 files


## Add a partition to the Delta table

In [13]:
# Two rows for a country ("Colombia") that was not in the first batch.
df = spark.createDataFrame(
    data=[
        ("Orlando", "Cabrera", "Colombia"),
        ("Carlos", "Vives", "Colombia"),
    ],
    # Column names are supplied directly; types are still inferred from the data.
    schema=["first_name", "last_name", "country"],
)

In [14]:
# Append the new rows to the existing table. The listing that follows shows a
# new country=Colombia directory and a second commit file in _delta_log.
(
    df.repartition(F.col("country"))
    .write.format("delta")
    .partitionBy("country")
    .mode("append")
    .saveAsTable("country_people")
)

                                                                                

In [15]:
# Re-list the table directory to see what the append changed on disk.
!tree spark-warehouse/country_people

spark-warehouse/country_people
├── _delta_log
│   ├── 00000000000000000000.json
│   └── 00000000000000000001.json
├── country=Argentina
│   └── part-00000-3ecf97b7-79e6-460d-b1b8-a5081e75ad9a.c000.snappy.parquet
├── country=China
│   └── part-00000-b2fba73f-3ff7-4fe0-9272-9a8873098338.c000.snappy.parquet
├── country=Colombia
│   └── part-00000-39c4b107-1928-498b-a028-e1a018572b02.c000.snappy.parquet
└── country=Russia
    └── part-00000-902db841-9614-436b-9964-f0419f2138aa.c000.snappy.parquet

5 directories, 6 files


## Remove a partition from the Delta table

In [18]:
# Get a DeltaTable handle for the catalog table so it can be modified in place.
dt = delta.DeltaTable.forName(
    sparkSession=spark,
    tableOrViewName="country_people",
)

In [19]:
# Delete every row in the Argentina partition. This is a logical delete: the
# listing that follows shows a new commit file in _delta_log, while the
# Argentina parquet file is still present on disk.
dt.delete(
    condition=F.col("country") == "Argentina",
)

                                                                                

In [23]:
# Re-list the table directory to see what the delete changed on disk.
!tree spark-warehouse/country_people

spark-warehouse/country_people
├── _delta_log
│   ├── 00000000000000000000.json
│   ├── 00000000000000000001.json
│   └── 00000000000000000002.json
├── country=Argentina
│   └── part-00000-3ecf97b7-79e6-460d-b1b8-a5081e75ad9a.c000.snappy.parquet
├── country=China
│   └── part-00000-b2fba73f-3ff7-4fe0-9272-9a8873098338.c000.snappy.parquet
├── country=Colombia
│   └── part-00000-39c4b107-1928-498b-a028-e1a018572b02.c000.snappy.parquet
└── country=Russia
    └── part-00000-902db841-9614-436b-9964-f0419f2138aa.c000.snappy.parquet

5 directories, 7 files


In [21]:
# Look the table up again by name before reading it back.
# NOTE(review): this repeats an earlier identical assignment to `dt`; it is
# probably redundant - confirm whether a fresh handle is needed here.
dt = delta.DeltaTable.forName(
    sparkSession=spark,
    tableOrViewName="country_people",
)

In [22]:
# Read the table back; the Argentina row no longer appears in the result.
dt.toDF().show(n=20, truncate=True)

+----------+---------+--------+
|first_name|last_name| country|
+----------+---------+--------+
|     Maria|Sharapova|  Russia|
|   Orlando|  Cabrera|Colombia|
|    Carlos|    Vives|Colombia|
|     Bruce|      Lee|   China|
|      Jack|       Ma|   China|
+----------+---------+--------+



In [24]:
# Turn off Delta's retention-duration safety check for this session so that a
# VACUUM with a very short retention (RETAIN 0 HOURS) is accepted.
# Demo only: with the check disabled, VACUUM can remove files that time travel
# or concurrent readers may still need.
spark.conf.set(
    key="spark.databricks.delta.retentionDurationCheck.enabled",
    value="false",
)

In [26]:
# Physically remove data files the table no longer references. RETAIN 0 HOURS
# asks for no grace period, so the file left behind by the delete is removed
# immediately (the output reports "Deleted 1 files and directories").
spark.sql(
    "VACUUM country_people RETAIN 0 HOURS"
).show(truncate=False)

                                                                                

Deleted 1 files and directories in a total of 5 directories.
+-----------------------------------------------------------------------------------------------------------------+
|path                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------+
|file:/Users/matthew.powers/Documents/code/my_apps/delta-examples/notebooks/pyspark/spark-warehouse/country_people|
+-----------------------------------------------------------------------------------------------------------------+



In [27]:
# Re-list the table directory to see what VACUUM removed from disk.
!tree spark-warehouse/country_people

spark-warehouse/country_people
├── _delta_log
│   ├── 00000000000000000000.json
│   ├── 00000000000000000001.json
│   └── 00000000000000000002.json
├── country=Argentina
├── country=China
│   └── part-00000-b2fba73f-3ff7-4fe0-9272-9a8873098338.c000.snappy.parquet
├── country=Colombia
│   └── part-00000-39c4b107-1928-498b-a028-e1a018572b02.c000.snappy.parquet
└── country=Russia
    └── part-00000-902db841-9614-436b-9964-f0419f2138aa.c000.snappy.parquet

5 directories, 6 files
