# Create a Delta Lake table

In [2]:
import pyspark
from delta import *

In [3]:
# Session builder for an application named "MyApp". Two settings are applied:
# the Delta SQL session extension, and the Delta catalog registered under the
# name spark_catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

In [4]:
# Pass the builder through delta's configure_spark_with_delta_pip helper, then
# get or create the SparkSession that the rest of the notebook uses as `spark`.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-210/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c14fab4a-bc3e-4db1-9e77-531ac05201ec;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 337ms :: artifacts dl 12ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

22/10/06 20:30:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/06 20:30:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Create Table for Delta Lake from DataFrame

In [5]:
# Drop table1 if it is already present; table1 is (re)created further down
# with saveAsTable. The statement returns no rows, so show() prints an empty frame.
spark.sql("DROP TABLE IF EXISTS table1").show()

++
||
++
++



In [6]:
# Column names and three (character, franchise) rows for a small example DataFrame.
columns = ["character", "franchise"]
data = [
    ("link", "zelda"),
    ("king k rool", "donkey kong"),
    ("samus", "metroid"),
]

# Distribute the local rows as an RDD, then name the columns to get a DataFrame.
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(columns)

                                                                                

In [7]:
# Print the DataFrame to check its columns and rows.
df.show()

+-----------+-----------+
|  character|  franchise|
+-----------+-----------+
|       link|      zelda|
|king k rool|donkey kong|
|      samus|    metroid|
+-----------+-----------+



In [8]:
# Save the DataFrame as a table named table1, stored in Delta format.
df.write.format("delta").saveAsTable("table1")

                                                                                

In [9]:
# Query the history of table1 through SQL, keeping only the version and
# timestamp columns; truncate=False prints the values at full width.
spark.sql("DESCRIBE HISTORY table1").select("version", "timestamp").show(truncate=False)

+-------+-----------------------+
|version|timestamp              |
+-------+-----------------------+
|0      |2022-10-06 20:31:10.015|
+-------+-----------------------+



In [10]:
# Look up table1 by name and keep a DeltaTable handle to it.
dt = DeltaTable.forName(spark, "table1")

In [11]:
# Same version/timestamp view of the history as the DESCRIBE HISTORY query,
# this time obtained from the DeltaTable handle.
dt.history().select("version", "timestamp").show(truncate=False)

+-------+-----------------------+
|version|timestamp              |
+-------+-----------------------+
|0      |2022-10-06 20:31:10.015|
+-------+-----------------------+



In [12]:
# Read every row of table1 back with a SQL query.
spark.sql("SELECT * FROM table1").show(truncate=False)

+-----------+-----------+
|character  |franchise  |
+-----------+-----------+
|king k rool|donkey kong|
|samus      |metroid    |
|link       |zelda      |
+-----------+-----------+



In [16]:
# Read the same table through spark.table() instead of a SQL string.
spark.table("table1").show()

+-----------+-----------+
|  character|  franchise|
+-----------+-----------+
|king k rool|donkey kong|
|      samus|    metroid|
|       link|      zelda|
+-----------+-----------+



In [13]:
# Check whether the directory spark-warehouse/table1 holds a Delta table.
# NOTE(review): the path is relative - presumably it resolves against the
# working directory the session was started from; confirm before reusing elsewhere.
DeltaTable.isDeltaTable(spark, "spark-warehouse/table1")

True

In [14]:
# Save the same DataFrame as a second table, this time in Parquet format
# rather than Delta.
df.write.format("parquet").saveAsTable("table1_as_parquet")

In [15]:
# Query the Parquet-format table.
spark.sql("SELECT * FROM table1_as_parquet").show()

+-----------+-----------+
|  character|  franchise|
+-----------+-----------+
|king k rool|donkey kong|
|      samus|    metroid|
|       link|      zelda|
+-----------+-----------+



## Create Delta Lake table with SQL

In [17]:
# Create table2 with two STRING columns (country, continent); USING delta
# selects the Delta format. The call is the cell's last expression, so the
# returned DataFrame is displayed.
spark.sql(
    """
  CREATE TABLE table2 (country STRING, continent STRING) USING delta
"""
)

                                                                                

DataFrame[]

In [18]:
# Insert two (country, continent) rows into table2.
spark.sql(
    """
  INSERT INTO table2 VALUES
      ('china', 'asia'),
      ('argentina', 'south america')
"""
)

                                                                                

DataFrame[]

In [19]:
# Show the rows now stored in table2.
spark.sql("SELECT * FROM table2").show()

+---------+-------------+
|  country|    continent|
+---------+-------------+
|argentina|south america|
|    china|         asia|
+---------+-------------+



In [20]:
# Check whether the directory spark-warehouse/table2 holds a Delta table
# (relative path, as in the table1 check).
DeltaTable.isDeltaTable(spark, "spark-warehouse/table2")

True

In [22]:
# Run DESCRIBE DETAIL on table2 and show only its format column.
spark.sql("DESCRIBE DETAIL table2").select("format").show()

+------+
|format|
+------+
| delta|
+------+



## Create table with Delta Lake Python API

In [1]:
from pyspark.sql.types import *

In [7]:
# Create testTable1 through the DeltaTable builder API:
#   c1 - "INT", declared with nullable=False
#   c2 - IntegerType(), generated from the expression "c1 + 1"
# The table is partitioned by c1. This cell uses create(); the testTable2
# cell uses createIfNotExists() with otherwise the same definition.
dt1 = (
    DeltaTable.create(spark)
    .tableName("testTable1")
    .addColumn("c1", dataType="INT", nullable=False)
    .addColumn("c2", dataType=IntegerType(), generatedAlwaysAs="c1 + 1")
    .partitionedBy("c1")
    .execute()
)

                                                                                

In [8]:
# Evaluate dt1 so the notebook displays the object returned by execute().
dt1

<delta.tables.DeltaTable at 0x11026f550>

In [12]:
# Describe testTable1 to see its columns, their types, and the partitioning.
spark.sql("DESCRIBE TABLE testTable1").show()

+--------------+---------+-------+
|      col_name|data_type|comment|
+--------------+---------+-------+
|            c1|      int|       |
|            c2|      int|       |
|              |         |       |
|# Partitioning|         |       |
|        Part 0|       c1|       |
+--------------+---------+-------+



In [10]:
# Create testTable2 with the same columns and partitioning as testTable1
# (c1 "INT" non-nullable, c2 IntegerType generated as "c1 + 1", partitioned
# by c1), but starting from createIfNotExists() instead of create().
dt2 = (
    DeltaTable.createIfNotExists(spark)
    .tableName("testTable2")
    .addColumn("c1", dataType="INT", nullable=False)
    .addColumn("c2", dataType=IntegerType(), generatedAlwaysAs="c1 + 1")
    .partitionedBy("c1")
    .execute()
)

                                                                                

In [11]:
# Evaluate dt2 so the notebook displays the object returned by execute().
dt2

<delta.tables.DeltaTable at 0x110454880>

## Create Delta Lake table from CSV

In [23]:
# Read the students CSV with the "header" option enabled.
# Note: this rebinds `df`, which earlier cells used for a different DataFrame.
# NOTE(review): the path is relative to the notebook's location - confirm the
# data directory exists at ../../data/students before running.
df = spark.read.option("header", True).csv("../../data/students/students1.csv")

In [24]:
# Print the DataFrame loaded from the CSV file.
df.show()

+------------+---------------+-------+
|student_name|graduation_year|  major|
+------------+---------------+-------+
|someXXperson|           2023|   math|
|     liXXyao|           2025|physics|
+------------+---------------+-------+



In [25]:
# Save the CSV-derived DataFrame as a table named students, stored in Delta format.
df.write.format("delta").saveAsTable("students")

                                                                                

In [26]:
# Read the students table back with a SQL query.
spark.sql("SELECT * from students").show()

+------------+---------------+-------+
|student_name|graduation_year|  major|
+------------+---------------+-------+
|someXXperson|           2023|   math|
|     liXXyao|           2025|physics|
+------------+---------------+-------+



In [27]:
# Check whether the directory spark-warehouse/students holds a Delta table
# (relative path, as in the earlier isDeltaTable checks).
DeltaTable.isDeltaTable(spark, "spark-warehouse/students")

True

## Create Delta Lake table from Parquet files

In [4]:
# Column names and three (letter, number) rows for the Parquet example.
columns = ["letter", "number"]
data = [
    ("a", 1),
    ("b", 2),
    ("c", 3),
]

# Distribute the local rows as an RDD, then name the columns to get a DataFrame.
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(columns)

                                                                                

In [5]:
# Write the DataFrame as Parquet files under the path tmp/lake1. This uses
# save() with a path, not saveAsTable(), so no table name is involved.
df.write.format("parquet").save("tmp/lake1")

                                                                                

In [6]:
# List the files in tmp/lake1 before the conversion (shell command).
!tree tmp/lake1

tmp/lake1
├── _SUCCESS
├── part-00000-1f1cc136-76ea-4185-84d6-54f7e758bfb7-c000.snappy.parquet
├── part-00003-1f1cc136-76ea-4185-84d6-54f7e758bfb7-c000.snappy.parquet
├── part-00006-1f1cc136-76ea-4185-84d6-54f7e758bfb7-c000.snappy.parquet
└── part-00009-1f1cc136-76ea-4185-84d6-54f7e758bfb7-c000.snappy.parquet

0 directories, 5 files


In [7]:
# Convert the Parquet directory tmp/lake1 to a Delta table. The identifier is
# written as parquet.`<path>`. The returned DeltaTable is displayed because the
# call is the cell's last expression.
DeltaTable.convertToDelta(spark, "parquet.`tmp/lake1`")

                                                                                

<delta.tables.DeltaTable at 0x11041de50>

In [8]:
# List tmp/lake1 again after the conversion; the listing now includes a
# _delta_log directory next to the original Parquet files.
!tree tmp/lake1

tmp/lake1
├── _SUCCESS
├── _delta_log
│   ├── 00000000000000000000.checkpoint.parquet
│   ├── 00000000000000000000.json
│   └── _last_checkpoint
├── part-00000-1f1cc136-76ea-4185-84d6-54f7e758bfb7-c000.snappy.parquet
├── part-00003-1f1cc136-76ea-4185-84d6-54f7e758bfb7-c000.snappy.parquet
├── part-00006-1f1cc136-76ea-4185-84d6-54f7e758bfb7-c000.snappy.parquet
└── part-00009-1f1cc136-76ea-4185-84d6-54f7e758bfb7-c000.snappy.parquet

1 directory, 8 files


## Cleanup

In [11]:
!rm -rf spark-warehouse