## Drop column from Delta Lake table

This notebook demonstrates how to drop a column from a Delta Lake table.

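It also shows how the column mapping functionality that was added in Delta 1.2 makes this operation a lot more efficient: the column is removed from the table metadata only, so none of the existing data files have to be rewritten.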

In [42]:
import pyspark
# Import only the helper this notebook uses instead of a wildcard, so it is
# clear where the name comes from and the namespace stays clean.
from delta import configure_spark_with_delta_pip

In [43]:
# Spark settings that register the Delta SQL extension and the Delta catalog.
delta_session_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Start from a named session builder and apply each setting in order.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
for conf_key, conf_value in delta_session_conf.items():
    builder = builder.config(conf_key, conf_value)

In [44]:
# Let the Delta helper adjust the builder first, then start (or reuse) the session.
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

## Create Delta Lake table

In [45]:
# Remove any table left over from a previous run so the notebook can be re-run.
drop_existing_stmt = "drop table if exists `my_cool_table`"
spark.sql(drop_existing_stmt)

DataFrame[]

In [46]:
# Column names and the three sample rows for the first table.
columns = ["language", "num_speakers"]
data = [
    ("English", "1.5"),
    ("Mandarin", "1.1"),
    ("Hindi", "0.6"),
]

# Turn the local rows into an RDD, then name the columns to get a DataFrame.
df = spark.sparkContext.parallelize(data).toDF(columns)

In [47]:
# Persist the DataFrame as a managed Delta table.
delta_writer = df.write.format("delta")
delta_writer.saveAsTable("default.my_cool_table")

In [48]:
# Read the table back through SQL and print its rows.
my_cool_rows = spark.sql("select * from `my_cool_table`")
my_cool_rows.show()

+--------+------------+
|language|num_speakers|
+--------+------------+
|Mandarin|         1.1|
| English|         1.5|
|   Hindi|         0.6|
+--------+------------+



In [49]:
# List the table's storage directory: the Parquet data files and the
# _delta_log directory, as they look before any column is dropped.
%ls -l ./spark-warehouse/my_cool_table/

total 32
drwxr-xr-x  4 matthew.powers  staff  128 Aug  7 11:35 _delta_log/
-rw-r--r--  1 matthew.powers  staff  415 Aug  7 11:35 part-00000-37799dc8-4639-4c57-b31b-f93640e459ff-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  751 Aug  7 11:35 part-00003-3a33dd46-1310-472f-bca4-f1a305c7871f-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  759 Aug  7 11:35 part-00006-0e644e16-76e2-4304-beb2-2c25790d06d6-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  738 Aug  7 11:35 part-00009-f75076d6-f254-4514-ba4b-6534f1f01d34-c000.snappy.parquet


## Drop column from Delta Lake table using column mapping

In [50]:
# Switch the table to name-based column mapping and set the reader/writer
# protocol versions in the same statement.
enable_column_mapping_stmt = """ALTER TABLE `my_cool_table` SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')"""
spark.sql(enable_column_mapping_stmt)

                                                                                

DataFrame[]

In [51]:
# With column mapping enabled, drop the column with a plain ALTER TABLE.
drop_column_stmt = "alter table `my_cool_table` drop column language"
spark.sql(drop_column_stmt)

DataFrame[]

In [52]:
# Query the table again to confirm which columns remain.
rows_after_drop = spark.sql("select * from `my_cool_table`")
rows_after_drop.show()

+------------+
|num_speakers|
+------------+
|         1.1|
|         1.5|
|         0.6|
+------------+



In [53]:
# Re-list the table directory so it can be compared with the listing taken
# before the column was dropped.
%ls -l ./spark-warehouse/my_cool_table/

total 32
drwxr-xr-x  8 matthew.powers  staff  256 Aug  7 11:35 _delta_log/
-rw-r--r--  1 matthew.powers  staff  415 Aug  7 11:35 part-00000-37799dc8-4639-4c57-b31b-f93640e459ff-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  751 Aug  7 11:35 part-00003-3a33dd46-1310-472f-bca4-f1a305c7871f-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  759 Aug  7 11:35 part-00006-0e644e16-76e2-4304-beb2-2c25790d06d6-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  738 Aug  7 11:35 part-00009-f75076d6-f254-4514-ba4b-6534f1f01d34-c000.snappy.parquet


## Drop column from Delta Lake table pre Delta 1.2 (without column mapping)

In [54]:
# Clear out any earlier copy of the second table before recreating it.
drop_previous_stmt = "drop table if exists `another_cool_table`"
spark.sql(drop_previous_stmt)

DataFrame[]

In [56]:
# Column names and the three sample rows for the second table.
columns = ["language", "num_speakers"]
data = [
    ("Spanish", "0.5"),
    ("French", "0.3"),
    ("Arabic", "0.3"),
]

# Turn the local rows into an RDD, then name the columns to get a DataFrame.
df = spark.sparkContext.parallelize(data).toDF(columns)

In [57]:
# Persist the second DataFrame as a managed Delta table.
another_writer = df.write.format("delta")
another_writer.saveAsTable("default.another_cool_table")

                                                                                

In [58]:
# Load the stored table back into a DataFrame; later cells modify and rewrite it.
select_all_stmt = "select * from another_cool_table"
df = spark.sql(select_all_stmt)

In [59]:
# Print the rows read from another_cool_table (both columns still present).
df.show()

+--------+------------+
|language|num_speakers|
+--------+------------+
| Spanish|         0.5|
|  Arabic|         0.3|
|  French|         0.3|
+--------+------------+



In [60]:
# List the second table's storage directory before the table is rewritten.
%ls -l ./spark-warehouse/another_cool_table/

total 32
drwxr-xr-x  4 matthew.powers  staff  128 Aug  7 11:54 _delta_log/
-rw-r--r--  1 matthew.powers  staff  415 Aug  7 11:54 part-00000-47ce65fc-44a8-4099-a7a2-e6697a1b4842-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  752 Aug  7 11:54 part-00003-3586c1ea-0e01-4ea3-ae0d-79bc43e5c0d5-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  745 Aug  7 11:54 part-00006-f60f37e4-7913-4948-aeeb-46feec516bbd-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  745 Aug  7 11:54 part-00009-f41a2472-2678-473c-92de-7b02d0ff56c9-c000.snappy.parquet


In [63]:
# Without column mapping, the column is removed from the in-memory DataFrame
# first; the stored table is not changed until it is overwritten below.
# NOTE: this rebinds `df`, so later cells see the one-column DataFrame.
df = df.drop("num_speakers")

In [64]:
# Print the DataFrame to confirm that only the language column is left.
df.show()

+--------+
|language|
+--------+
| Spanish|
|  Arabic|
|  French|
+--------+



In [65]:
# Replace the stored table with the reduced DataFrame. overwriteSchema lets
# the overwrite change the table's schema along with its data.
(
    df.write.format("delta")
    .mode("OVERWRITE")
    .option("overwriteSchema", "true")
    .saveAsTable("default.another_cool_table")
)

                                                                                

In [66]:
# Query the rewritten table to confirm the column is gone.
rewritten_rows = spark.sql("select * from another_cool_table")
rewritten_rows.show()

+--------+
|language|
+--------+
| Spanish|
|  French|
|  Arabic|
+--------+



In [67]:
# List the directory after the overwrite: the newly written Parquet files
# appear alongside the original ones.
%ls -l ./spark-warehouse/another_cool_table/

total 56
drwxr-xr-x  6 matthew.powers  staff  192 Aug  7 11:58 _delta_log/
-rw-r--r--  1 matthew.powers  staff  501 Aug  7 11:58 part-00000-14ad4a76-98fd-4e32-9b76-f54bcc7d5cb0-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  415 Aug  7 11:54 part-00000-47ce65fc-44a8-4099-a7a2-e6697a1b4842-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  494 Aug  7 11:58 part-00001-b6977598-cc3d-4064-8be6-0cc3baef0112-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  494 Aug  7 11:58 part-00002-54c84cf0-7a42-4ef9-a660-429f663de904-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  752 Aug  7 11:54 part-00003-3586c1ea-0e01-4ea3-ae0d-79bc43e5c0d5-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  745 Aug  7 11:54 part-00006-f60f37e4-7913-4948-aeeb-46feec516bbd-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  745 Aug  7 11:54 part-00009-f41a2472-2678-473c-92de-7b02d0ff56c9-c000.snappy.parquet


## Cleanup

In [64]:
# Cleanup: remove the first demo table.
cleanup_first_stmt = "drop table if exists `my_cool_table`"
spark.sql(cleanup_first_stmt)

DataFrame[]

In [None]:
# Cleanup: remove the second demo table.
cleanup_second_stmt = "drop table if exists `another_cool_table`"
spark.sql(cleanup_second_stmt)