## Drop column from Delta Lake table

This notebook demonstrates how to drop a column from a Delta Lake table.

It shows how the column mapping functionality that was added in Delta 1.2 makes this operation a lot more efficient: the column is dropped as a metadata change, so the existing Parquet data files are not rewritten.

In [1]:
import pyspark
from delta import *

In [2]:
# Session settings that register the Delta SQL extension and make the Delta
# catalog the implementation behind `spark_catalog`.
delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Apply the settings one by one, in the order they are listed above.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
for setting_name, setting_value in delta_settings.items():
    builder = builder.config(setting_name, setting_value)

In [3]:
# Let the delta helper finish configuring the builder, then start the session
# (or reuse one that is already running).
delta_builder = configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-300-delta-200/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-99917b62-3055-4427-8b4a-7283d4ec6ef6;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 327ms :: artifacts dl 26ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

## Create Delta Lake table

In [4]:
# Remove any table left over from a previous run so the create step below
# starts from a clean slate.
drop_my_cool_table_sql = "drop table if exists `my_cool_table`"
spark.sql(drop_my_cool_table_sql)

DataFrame[]

In [5]:
# Sample rows for the demo table. `num_speakers` values are written as
# strings, so the resulting column is a string column.
columns = ["language", "num_speakers"]
data = [("English", "1.5"), ("Mandarin", "1.1"), ("Hindi", "0.6")]

# Build the DataFrame straight from the local rows; there is no need to go
# through an intermediate RDD first.
df = spark.createDataFrame(data, columns)

                                                                                

In [6]:
# Save the DataFrame in Delta format as the table `default.my_cool_table`.
delta_writer = df.write.format("delta")
delta_writer.saveAsTable("default.my_cool_table")

                                                                                

In [7]:
# Read the table back and display its rows to confirm both columns exist.
my_cool_table_rows = spark.sql("select * from `my_cool_table`")
my_cool_table_rows.show()

+--------+------------+
|language|num_speakers|
+--------+------------+
|Mandarin|         1.1|
| English|         1.5|
|   Hindi|         0.6|
+--------+------------+



In [8]:
# List the table's storage directory (data files and the `_delta_log/` folder)
# before the column is dropped.
%ls -l ./spark-warehouse/my_cool_table/

total 32
drwxr-xr-x  4 matthew.powers  staff  128 Aug  9 17:42 _delta_log/
-rw-r--r--  1 matthew.powers  staff  415 Aug  9 17:42 part-00000-20190e69-4f80-4707-94ac-21930bdd0f92-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  751 Aug  9 17:42 part-00003-7dc949c7-f8d5-4cad-ae6f-aa930811f63f-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  759 Aug  9 17:42 part-00006-5d124e5a-68c7-484e-997a-64bdc826a0fd-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  738 Aug  9 17:42 part-00009-c2c56321-09cf-4545-b753-d53d45461db5-c000.snappy.parquet


In [9]:
# Print the table schema before any column is dropped.
my_cool_table_df = spark.sql("select * from `my_cool_table`")
my_cool_table_df.printSchema()

root
 |-- language: string (nullable = true)
 |-- num_speakers: string (nullable = true)



## Drop column from Delta Lake table

In [10]:
# Switch the table to name-based column mapping and set the reader/writer
# protocol versions alongside it, ahead of the column drop below.
enable_column_mapping_sql = """ALTER TABLE `my_cool_table` SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')"""
spark.sql(enable_column_mapping_sql)

                                                                                

DataFrame[]

In [11]:
# Drop the `language` column with a single ALTER TABLE statement.
drop_language_column_sql = "alter table `my_cool_table` drop column language"
spark.sql(drop_language_column_sql)

                                                                                

DataFrame[]

In [12]:
# Display the table again; only `num_speakers` should remain.
rows_after_drop = spark.sql("select * from `my_cool_table`")
rows_after_drop.show()

+------------+
|num_speakers|
+------------+
|         1.1|
|         1.5|
|         0.6|
+------------+



In [13]:
# List the storage directory again after the drop, to compare the data files
# with the listing taken before the column was removed.
%ls -l ./spark-warehouse/my_cool_table/

total 32
drwxr-xr-x  8 matthew.powers  staff  256 Aug  9 17:46 _delta_log/
-rw-r--r--  1 matthew.powers  staff  415 Aug  9 17:42 part-00000-20190e69-4f80-4707-94ac-21930bdd0f92-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  751 Aug  9 17:42 part-00003-7dc949c7-f8d5-4cad-ae6f-aa930811f63f-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  759 Aug  9 17:42 part-00006-5d124e5a-68c7-484e-997a-64bdc826a0fd-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  738 Aug  9 17:42 part-00009-c2c56321-09cf-4545-b753-d53d45461db5-c000.snappy.parquet


In [14]:
# Print the schema after the drop to confirm `language` is gone.
schema_check_df = spark.sql("select * from `my_cool_table`")
schema_check_df.printSchema()

root
 |-- num_speakers: string (nullable = true)



## Drop column from Delta Lake table pre Delta 1.2

In [54]:
# Remove any leftover copy of the second demo table before recreating it.
drop_another_cool_table_sql = "drop table if exists `another_cool_table`"
spark.sql(drop_another_cool_table_sql)

DataFrame[]

In [56]:
# Sample rows for the second demo table. `num_speakers` values are written as
# strings, so the resulting column is a string column.
columns = ["language", "num_speakers"]
data = [("Spanish", "0.5"), ("French", "0.3"), ("Arabic", "0.3")]

# Build the DataFrame straight from the local rows; there is no need to go
# through an intermediate RDD first.
df = spark.createDataFrame(data, columns)

In [57]:
# Save the DataFrame in Delta format as the table `default.another_cool_table`.
delta_writer = df.write.format("delta")
delta_writer.saveAsTable("default.another_cool_table")

                                                                                

In [58]:
# Load the saved table into a DataFrame; the column is removed from this
# DataFrame in a later cell.
select_all_sql = "select * from another_cool_table"
df = spark.sql(select_all_sql)

In [59]:
# Display the rows currently held in `df` (both columns are still present).
df.show()

+--------+------------+
|language|num_speakers|
+--------+------------+
| Spanish|         0.5|
|  Arabic|         0.3|
|  French|         0.3|
+--------+------------+



In [60]:
# List the table's storage directory before the table is overwritten.
%ls -l ./spark-warehouse/another_cool_table/

total 32
drwxr-xr-x  4 matthew.powers  staff  128 Aug  7 11:54 _delta_log/
-rw-r--r--  1 matthew.powers  staff  415 Aug  7 11:54 part-00000-47ce65fc-44a8-4099-a7a2-e6697a1b4842-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  752 Aug  7 11:54 part-00003-3586c1ea-0e01-4ea3-ae0d-79bc43e5c0d5-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  745 Aug  7 11:54 part-00006-f60f37e4-7913-4948-aeeb-46feec516bbd-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  745 Aug  7 11:54 part-00009-f41a2472-2678-473c-92de-7b02d0ff56c9-c000.snappy.parquet


In [63]:
# Remove the column from the in-memory DataFrame only; the stored table is
# not changed until the DataFrame is written back.
column_to_remove = "num_speakers"
df = df.drop(column_to_remove)

In [64]:
# Display `df` after the drop; only `language` remains.
df.show()

+--------+
|language|
+--------+
| Spanish|
|  Arabic|
|  French|
+--------+



In [65]:
# Write the reduced DataFrame back over the existing table. `overwriteSchema`
# is set so the table schema is replaced along with the data.
overwrite_writer = df.write.format("delta").mode("OVERWRITE")
overwrite_writer = overwrite_writer.option("overwriteSchema", "true")
overwrite_writer.saveAsTable("default.another_cool_table")

                                                                                

In [66]:
# Read the overwritten table back to confirm only `language` is left.
rows_after_overwrite = spark.sql("select * from another_cool_table")
rows_after_overwrite.show()

+--------+
|language|
+--------+
| Spanish|
|  French|
|  Arabic|
+--------+



In [67]:
# List the storage directory after the overwrite, to compare the data files
# with the listing taken before the table was rewritten.
%ls -l ./spark-warehouse/another_cool_table/

total 56
drwxr-xr-x  6 matthew.powers  staff  192 Aug  7 11:58 _delta_log/
-rw-r--r--  1 matthew.powers  staff  501 Aug  7 11:58 part-00000-14ad4a76-98fd-4e32-9b76-f54bcc7d5cb0-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  415 Aug  7 11:54 part-00000-47ce65fc-44a8-4099-a7a2-e6697a1b4842-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  494 Aug  7 11:58 part-00001-b6977598-cc3d-4064-8be6-0cc3baef0112-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  494 Aug  7 11:58 part-00002-54c84cf0-7a42-4ef9-a660-429f663de904-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  752 Aug  7 11:54 part-00003-3586c1ea-0e01-4ea3-ae0d-79bc43e5c0d5-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  745 Aug  7 11:54 part-00006-f60f37e4-7913-4948-aeeb-46feec516bbd-c000.snappy.parquet
-rw-r--r--  1 matthew.powers  staff  745 Aug  7 11:54 part-00009-f41a2472-2678-473c-92de-7b02d0ff56c9-c000.snappy.parquet


## Cleanup

In [64]:
# Cleanup: remove the first demo table.
cleanup_my_cool_table_sql = "drop table if exists `my_cool_table`"
spark.sql(cleanup_my_cool_table_sql)

DataFrame[]

In [None]:
# Cleanup: remove the second demo table.
cleanup_another_cool_table_sql = "drop table if exists `another_cool_table`"
spark.sql(cleanup_another_cool_table_sql)