# Convert Parquet to Delta Lake

In [41]:
import pyspark
from delta import *
from pyspark.sql.utils import AnalysisException

In [42]:
# Build the session configuration step by step: name the application, then
# register the Delta SQL extension and the Delta catalog as `spark_catalog`.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [43]:
# Pass the builder through the delta helper, then create the session
# (or reuse one that is already running).
spark = (
    configure_spark_with_delta_pip(builder)
    .getOrCreate()
)

In [44]:
# Small sample dataset: one (language, num_speakers) row per language.
# Both fields are stored as strings.
columns = ["language", "num_speakers"]
data = [
    ("English", "1.5"),
    ("Mandarin", "1.1"),
    ("Hindi", "0.6"),
]

# Distribute the rows as an RDD, then attach the column names to get a DataFrame.
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(columns)

## Unpartitioned Parquet to Delta Lake

In [5]:
# Write the DataFrame as plain (unpartitioned) Parquet files under tmp/lake1.
df.write.parquet("tmp/lake1")

                                                                                

In [8]:
# Show the directory layout of tmp/lake1 after the Parquet write.
!tree tmp/lake1

tmp/lake1
├── _SUCCESS
├── part-00000-b84573b6-b805-4162-9143-9c598b80c289-c000.snappy.parquet
├── part-00003-b84573b6-b805-4162-9143-9c598b80c289-c000.snappy.parquet
├── part-00006-b84573b6-b805-4162-9143-9c598b80c289-c000.snappy.parquet
└── part-00009-b84573b6-b805-4162-9143-9c598b80c289-c000.snappy.parquet

0 directories, 5 files


In [10]:
# Convert the Parquet directory tmp/lake1 to a Delta table and keep the
# returned table handle.
deltaTable = DeltaTable.convertToDelta(
    spark,
    "parquet.`tmp/lake1`",
)

                                                                                

In [11]:
# Show the directory layout of tmp/lake1 after the convertToDelta call.
!tree tmp/lake1

tmp/lake1
├── _SUCCESS
├── _delta_log
│   ├── 00000000000000000000.checkpoint.parquet
│   ├── 00000000000000000000.json
│   └── _last_checkpoint
├── part-00000-b84573b6-b805-4162-9143-9c598b80c289-c000.snappy.parquet
├── part-00003-b84573b6-b805-4162-9143-9c598b80c289-c000.snappy.parquet
├── part-00006-b84573b6-b805-4162-9143-9c598b80c289-c000.snappy.parquet
└── part-00009-b84573b6-b805-4162-9143-9c598b80c289-c000.snappy.parquet

1 directory, 8 files


## Partitioned Parquet to Delta Lake

In [45]:
# Write the DataFrame as Parquet under tmp/lake2, with one sub-directory
# per distinct `language` value.
df.write.parquet("tmp/lake2", partitionBy="language")

                                                                                

In [46]:
# Show the directory layout of tmp/lake2 after the partitioned Parquet write.
!tree tmp/lake2

tmp/lake2
├── _SUCCESS
├── language=English
│   └── part-00003-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet
├── language=Hindi
│   └── part-00009-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet
└── language=Mandarin
    └── part-00006-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet

3 directories, 4 files


In [47]:
# Deliberate failure: tmp/lake2 is partitioned by `language`, but no partition
# schema is passed here, so the conversion raises AnalysisException.
# The exception is caught and printed so the error is still shown without
# halting a "Restart Kernel -> Run All".
try:
    deltaTable = DeltaTable.convertToDelta(spark, "parquet.`tmp/lake2`")
except AnalysisException as err:
    print(f"{type(err).__name__}: {err}")

AnalysisException: Expecting 0 partition column(s): [], but found 1 partition column(s): [`language`] from parsing the file name: file:/Users/matthew.powers/Documents/code/my_apps/delta-examples/notebooks/pyspark/tmp/lake2/language=English/part-00003-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet

In [48]:
# Show the directory layout of tmp/lake2 after the failed conversion attempt.
!tree tmp/lake2

tmp/lake2
├── _SUCCESS
├── _delta_log
├── language=English
│   └── part-00003-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet
├── language=Hindi
│   └── part-00009-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet
└── language=Mandarin
    └── part-00006-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet

4 directories, 4 files


In [49]:
# Convert the partitioned Parquet directory tmp/lake2, this time declaring
# the partition column and its type.
deltaTable = DeltaTable.convertToDelta(
    spark,
    "parquet.`tmp/lake2`",
    "language STRING",
)

                                                                                

In [50]:
# Show the directory layout of tmp/lake2 after converting with the partition schema.
!tree tmp/lake2

tmp/lake2
├── _SUCCESS
├── _delta_log
│   ├── 00000000000000000000.checkpoint.parquet
│   ├── 00000000000000000000.json
│   └── _last_checkpoint
├── language=English
│   └── part-00003-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet
├── language=Hindi
│   └── part-00009-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet
└── language=Mandarin
    └── part-00006-fa662100-1eff-4609-a0dd-794b5eec991a.c000.snappy.parquet

4 directories, 7 files


## Convert Parquet Lake with Multiple Appends

In [None]:
# First write to tmp/lake3: store the current DataFrame as Parquet files.
df.write.parquet("tmp/lake3")

In [51]:
# Second batch of rows with the same two string columns; `df` is rebound to it.
columns = ["language", "num_speakers"]
data = [
    ("Spanish", "0.5"),
    ("French", "0.3"),
    ("Arabic", "0.3"),
]

# Distribute the rows as an RDD, then attach the column names to get a DataFrame.
rdd = spark.sparkContext.parallelize(data)
df = rdd.toDF(columns)

In [52]:
# Append the current DataFrame to the Parquet files already under tmp/lake3.
df.write.parquet("tmp/lake3", mode="append")

                                                                                

In [53]:
# Convert the Parquet directory tmp/lake3 to a Delta table and keep the
# returned table handle.
deltaTable = DeltaTable.convertToDelta(
    spark,
    "parquet.`tmp/lake3`",
)

                                                                                

In [56]:
# DeltaTable has no `version` attribute, so calling it raises AttributeError.
# Read the table as of version 0 through the DataFrame reader's time-travel
# option instead, and display the rows.
spark.read.format("delta").option("versionAsOf", 0).load("tmp/lake3").show()

## Cleanup

In [40]:
# Recursively delete the tmp directory and everything written under it.
%rm -rf tmp