# Delta Lake Managed vs External Tables

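This notebook demonstrates how to create managed and external tables with Delta Lake. For a managed table, Spark chooses where the data is stored (under the `spark-warehouse` directory in this notebook); for an external table, the data is stored at a path you specify.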

In [1]:
import pyspark
from delta import *
from pyspark.sql.types import *

In [2]:
# Configure (without starting) a Spark session builder that registers the
# Delta Lake SQL extension and uses the Delta catalog as spark_catalog.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Let the delta helper add the Delta Lake package to the builder, then start
# the session (or reuse one that already exists).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-210/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c78bb9b5-aa4b-49a2-89fd-d3ade02e74a1;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 304ms :: artifacts dl 23ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

22/10/11 16:46:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Create Delta Lake table with save

In [5]:
# Sample data: three (movie, release_date) rows, turned into a DataFrame by
# parallelizing the Python list and naming the columns.
data = [
    ("The Godfather", 1972),
    ("Detective Pikachu", 2019),
    ("Donny Darko", 2001),
]
columns = ["movie", "release_date"]
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(columns)

In [6]:
# Write df in Delta format to a filesystem path; no table name is given here.
df.write.format("delta").save("tmp/some_delta_lake")

                                                                                

In [8]:
# List the files that the save() call wrote under the target directory.
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [00mpart-00000-43141f85-4de4-45d6-b17b-7b329382628b-c000.snappy.parquet[0m
├── [00mpart-00003-14759774-2b5e-444f-a53f-7a26ccd7c51a-c000.snappy.parquet[0m
├── [00mpart-00006-83a16b8f-3a18-43da-b2fc-39f50677c942-c000.snappy.parquet[0m
└── [00mpart-00009-a7f2c1e6-c6e0-4098-9342-597ecba0e035-c000.snappy.parquet[0m

1 directory, 5 files


In [9]:
# Read the Delta files back by path and display the rows.
spark.read.format("delta").load("tmp/some_delta_lake").show()

+-----------------+------------+
|            movie|release_date|
+-----------------+------------+
|Detective Pikachu|        2019|
|    The Godfather|        1972|
|      Donny Darko|        2001|
+-----------------+------------+



In [10]:
# Expose df to SQL under the name some_view (replacing any existing view of that name).
df.createOrReplaceTempView("some_view")

In [13]:
# Query the temporary view with SQL and display the rows.
spark.sql("SELECT * FROM some_view").show()

+-----------------+------------+
|            movie|release_date|
+-----------------+------------+
|    The Godfather|        1972|
|Detective Pikachu|        2019|
|      Donny Darko|        2001|
+-----------------+------------+



In [11]:
# Describe the view; in the recorded output only column names and types are listed.
spark.sql("DESCRIBE TABLE EXTENDED some_view").show(truncate=False)

+------------+---------+-------+
|col_name    |data_type|comment|
+------------+---------+-------+
|movie       |string   |null   |
|release_date|bigint   |null   |
+------------+---------+-------+



In [7]:
# Clean up the files written by the save() example.
%rm -rf tmp/some_delta_lake

## Create external Delta Lake table with saveAsTable

In [15]:
# The explicit "path" option places the table's data at tmp/some_external_table;
# saveAsTable then registers it under the name default.my_external_table.
df.write.format("delta").option("path", "tmp/some_external_table").saveAsTable(
    "default.my_external_table"
)

                                                                                

In [16]:
# The external table can be queried by name with SQL.
spark.sql("select * from my_external_table").show()

+-----------------+------------+
|            movie|release_date|
+-----------------+------------+
|Detective Pikachu|        2019|
|    The Godfather|        1972|
|      Donny Darko|        2001|
+-----------------+------------+



In [17]:
# Same rows, fetched through the DataFrame API by table name.
spark.table("my_external_table").show()

+-----------------+------------+
|            movie|release_date|
+-----------------+------------+
|Detective Pikachu|        2019|
|    The Godfather|        1972|
|      Donny Darko|        2001|
+-----------------+------------+



In [19]:
# Basic description: columns plus partitioning information.
spark.sql("DESCRIBE TABLE my_external_table").show(truncate=False)

+---------------+---------+-------+
|col_name       |data_type|comment|
+---------------+---------+-------+
|movie          |string   |       |
|release_date   |bigint   |       |
|               |         |       |
|# Partitioning |         |       |
|Not partitioned|         |       |
+---------------+---------+-------+



In [18]:
# Extended description of the external table, shown without truncating cell values.
spark.sql("DESCRIBE TABLE EXTENDED my_external_table").show(truncate=False)

+----------------------------+--------------------------------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                                                 |comment|
+----------------------------+--------------------------------------------------------------------------------------------------------------------------+-------+
|movie                       |string                                                                                                                    |       |
|release_date                |bigint                                                                                                                    |       |
|                            |                                                                                                                          |       |
|# Partitioning             

In [15]:
%rm -rf some_external_table

## Create managed Delta Lake table with saveAsTable

In [20]:
# No "path" option is supplied, so Spark decides where the data lives
# (spark-warehouse/some_managed_table in the recorded run). Overwrite mode
# replaces the table's contents if it already exists.
df.write.format("delta").mode("overwrite").saveAsTable("some_managed_table")

                                                                                

In [26]:
# Fetch the managed table by name through the DataFrame API.
spark.table("some_managed_table").show()

+-----------------+------------+
|            movie|release_date|
+-----------------+------------+
|Detective Pikachu|        2019|
|    The Godfather|        1972|
|      Donny Darko|        2001|
+-----------------+------------+



In [21]:
# List the files Spark wrote for the managed table under spark-warehouse/.
!tree spark-warehouse/some_managed_table

[01;34mspark-warehouse/some_managed_table[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [00mpart-00000-3d163841-6018-4dea-b8ac-40ffdb0d9641-c000.snappy.parquet[0m
├── [00mpart-00003-ec50c15d-26f5-4dce-a09b-93ada38f75be-c000.snappy.parquet[0m
├── [00mpart-00006-55b380fd-acbc-4298-ba42-0938909a524c-c000.snappy.parquet[0m
└── [00mpart-00009-6668ddd5-f0b8-410f-97a5-d9fa9c224978-c000.snappy.parquet[0m

1 directory, 5 files


In [22]:
# The managed table can be queried by name with SQL.
spark.sql("select * from some_managed_table").show()

+-----------------+------------+
|            movie|release_date|
+-----------------+------------+
|Detective Pikachu|        2019|
|    The Godfather|        1972|
|      Donny Darko|        2001|
+-----------------+------------+



In [25]:
# Extended description of the managed table, shown without truncating cell values.
spark.sql("DESCRIBE TABLE EXTENDED some_managed_table").show(truncate=False)

+----------------------------+---------------------------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                                            |comment|
+----------------------------+---------------------------------------------------------------------------------------------------------------------+-------+
|movie                       |string                                                                                                               |       |
|release_date                |bigint                                                                                                               |       |
|                            |                                                                                                                     |       |
|# Partitioning              |                            

In [27]:
# Same DESCRIBE query, rendered as a pandas DataFrame instead of show()'s text table.
spark.sql("DESCRIBE TABLE EXTENDED some_managed_table").toPandas()

                                                                                

Unnamed: 0,col_name,data_type,comment
0,movie,string,
1,release_date,bigint,
2,,,
3,# Partitioning,,
4,Not partitioned,,
5,,,
6,# Detailed Table Information,,
7,Name,default.some_managed_table,
8,Location,file:/Users/matthew.powers/Documents/code/my_a...,
9,Provider,delta,


In [31]:
# Pull a single cell: row position 10, column position 1 (data_type).
# In the recorded run this cell holds the Delta table properties.
# One positional lookup is used instead of chaining .iloc[10][1], whose
# trailing [1] relies on integer-as-position fallback on a string-labelled
# Series (deprecated in recent pandas).
# NOTE(review): the row position depends on DESCRIBE's output layout — confirm
# it is stable across Spark/Delta versions.
spark.sql("DESCRIBE TABLE EXTENDED some_managed_table").toPandas().iloc[10, 1]

'[delta.minReaderVersion=1,delta.minWriterVersion=2]'