# Why deltalake for pandas?

This notebook demonstrates why Delta Lake is the best storage format for pandas analyses.

In [16]:
import itertools
import os
import pathlib
from datetime import datetime, timedelta

import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
from deltalake import DeltaTable, write_deltalake

## Reason 1: File skipping makes queries run faster

Let's compare the runtime for a filter/aggregation query on data stored in a CSV file vs the same data in a Delta Lake table.

We will see that the query runs much faster when the data is stored in a Delta table.

In [9]:
pd.read_csv("~/data/G1_1e8_1e2_0_0.csv")

Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id016,id046,id0000109363,88,13,146094,4,6,18.837686
1,id039,id087,id0000466766,14,30,111330,4,14,46.797328
2,id047,id098,id0000307804,85,23,187639,3,5,47.577311
3,id043,id017,id0000344864,87,76,256509,2,5,80.462924
4,id054,id027,id0000433679,99,67,32736,1,7,15.796662
...,...,...,...,...,...,...,...,...,...
99999995,id080,id025,id0000598386,43,72,56728,3,9,27.479070
99999996,id064,id012,id0000844471,19,33,203895,4,5,5.323666
99999997,id046,id053,id0000544024,31,71,711000,5,3,27.827385
99999998,id081,id090,id0000802094,53,60,57466,1,15,23.319917


In [10]:
%%time

# Baseline: parse the CSV (restricted to the three columns the query touches),
# then keep the rows where id1 == 'id016' and sum v1 per id2 in pandas.
(
    pd.read_csv("~/data/G1_1e8_1e2_0_0.csv", usecols=["id1", "id2", "v1"])
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 18.6 s, sys: 2.01 s, total: 20.6 s
Wall time: 20.6 s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,29918
id002,30343
id003,30180
id004,30581
id005,30769
...,...
id096,30011
id097,29728
id098,30131
id099,30141


In [11]:
%%time

dt = DeltaTable(f"{pathlib.Path.home()}/data/delta/G1_1e8_1e2_0_0")
dataset = dt.to_pyarrow_dataset()
# The predicate is handed to the dataset scan, so the returned table already
# contains only the rows where id1 == 'id016'.
condition = ds.field("id1") == "id016"
# No additional pandas-side .query() is needed after to_table(filter=...):
# repeating the same filter in pandas would only add redundant work.
(
    dataset.to_table(filter=condition, columns=["id1", "id2", "v1"])
    .to_pandas()
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 2.12 s, sys: 241 ms, total: 2.36 s
Wall time: 1.02 s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,29918
id002,30343
id003,30180
id004,30581
id005,30769
...,...
id096,30011
id097,29728
id098,30131
id099,30141


In [12]:
!tree ~/data/delta/G1_1e8_1e2_0_0

[01;34m/Users/matthew.powers/data/delta/G1_1e8_1e2_0_0[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
├── [00mpart-00000-963f5914-a816-443c-bc30-cfc59c545830-c000.snappy.parquet[0m
├── [00mpart-00000-ec2ee652-7ad5-4ee2-a991-86c7e166dbc7-c000.snappy.parquet[0m
├── [00mpart-00001-12860cab-145a-4e74-8a0c-b5ad846b4f7b-c000.snappy.parquet[0m
├── [00mpart-00001-bb6e847f-2f99-4594-8282-383a944de4e0-c000.snappy.parquet[0m
├── [00mpart-00002-02303d25-73ae-412c-b7c1-963f5ab2cf56-c000.snappy.parquet[0m
├── [00mpart-00003-2fe74bfc-6422-4911-8858-00ea4318f7af-c000.snappy.parquet[0m
├── [00mpart-00004-a9a3a44e-751e-444a-ba34-528fd07ce880-c000.snappy.parquet[0m
├── [00mpart-00005-426c8fae-4d6c-4101-bff8-d5594fdccbfe-c000.snappy.parquet[0m
├── [00mpart-00006-4f506872-d826-447a-9b09-532927108965-c000.snappy.parquet[0m
├── [00mpart-00007-8afb5a64-9633-4374-a47a-1866d5c136ae-c000.snappy.parquet[0m
├── [00mpart-000

## Reason 2: Time travel / versioned data

Delta Lake versions your data table, so you can time travel between different versions. This is great for advanced analytics, maintaining data quality, and undoing mistakes.

Let's create a Delta table with three different versions to illustrate.

### Create Delta Lake

In [13]:
df = pd.DataFrame({"x": [1, 2, 3]})

In [14]:
df

Unnamed: 0,x
0,1
1,2
2,3


In [17]:
os.makedirs("tmp/some_delta_lake", exist_ok=True)

In [18]:
write_deltalake("tmp/some_delta_lake", df)

In [19]:
dt = DeltaTable("tmp/some_delta_lake")

In [20]:
dt.to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [21]:
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-19bd224f-e590-4917-a5b6-ac7110a4aabf-0.parquet[0m
└── [01;34m_delta_log[0m
    └── [00m00000000000000000000.json[0m

1 directory, 2 files


In [22]:
!jq . tmp/some_delta_lake/_delta_log/00000000000000000000.json

[1;39m{
  [0m[34;1m"commitInfo"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"delta-rs"[0m[1;39m: [0m[0;32m"0.8.0"[0m[1;39m,
    [0m[34;1m"timestamp"[0m[1;39m: [0m[0;39m1684252189986[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"protocol"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"minReaderVersion"[0m[1;39m: [0m[0;39m1[0m[1;39m,
    [0m[34;1m"minWriterVersion"[0m[1;39m: [0m[0;39m1[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"metaData"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"id"[0m[1;39m: [0m[0;32m"8e24978e-9957-4c6b-806c-1d2bf90a1e5b"[0m[1;39m,
    [0m[34;1m"name"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"description"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"format"[0m[1;39m: [0m[1;39m{
      [0m[34;1m"provider"[0m[1;39m: [0m[0;32m"parquet"[0m[1;39m,
      [0m[34;1m"options"[0m[1;39m: [0m[1;39m{}[0m[1;39m
    [1;39m}[0m[1;39m,
    [0m[34;1m"schemaString"[0m

### Append to Delta Lake

In [23]:
df2 = pd.DataFrame({"x": [8, 9, 10]})

In [24]:
write_deltalake("tmp/some_delta_lake", df2, mode="append")

In [25]:
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-19bd224f-e590-4917-a5b6-ac7110a4aabf-0.parquet[0m
├── [00m1-b94f89f4-3ca1-47d7-86a6-4d73f56c1d61-0.parquet[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.json[0m
    └── [00m00000000000000000001.json[0m

1 directory, 4 files


In [26]:
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,8
4,9
5,10


### Overwrite Delta Lake

In [27]:
df3 = pd.DataFrame({"x": [55, 66, 77]})

In [28]:
df3

Unnamed: 0,x
0,55
1,66
2,77


In [29]:
write_deltalake("tmp/some_delta_lake", df3, mode="overwrite")

In [30]:
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-19bd224f-e590-4917-a5b6-ac7110a4aabf-0.parquet[0m
├── [00m1-b94f89f4-3ca1-47d7-86a6-4d73f56c1d61-0.parquet[0m
├── [00m2-1c22d1ec-db38-4679-b19e-1c05cd682a7b-0.parquet[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.json[0m
    ├── [00m00000000000000000001.json[0m
    └── [00m00000000000000000002.json[0m

1 directory, 6 files


In [31]:
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,55
1,66
2,77


In [32]:
!jq . tmp/some_delta_lake/_delta_log/00000000000000000002.json

[1;39m{
  [0m[34;1m"add"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"2-1c22d1ec-db38-4679-b19e-1c05cd682a7b-0.parquet"[0m[1;39m,
    [0m[34;1m"size"[0m[1;39m: [0m[0;39m1654[0m[1;39m,
    [0m[34;1m"partitionValues"[0m[1;39m: [0m[1;39m{}[0m[1;39m,
    [0m[34;1m"modificationTime"[0m[1;39m: [0m[0;39m1684252211992[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;39mtrue[0m[1;39m,
    [0m[34;1m"stats"[0m[1;39m: [0m[0;32m"{\"numRecords\": 3, \"minValues\": {\"x\": 55}, \"maxValues\": {\"x\": 77}, \"nullCount\": {\"x\": 0}}"[0m[1;39m,
    [0m[34;1m"tags"[0m[1;39m: [0m[1;30mnull[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"remove"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"0-19bd224f-e590-4917-a5b6-ac7110a4aabf-0.parquet"[0m[1;39m,
    [0m[34;1m"deletionTimestamp"[0m[1;39m: [0m[0;39m1684252211992[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;

### Confirm other versions are still accessible

In [33]:
DeltaTable("tmp/some_delta_lake", version=0).to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [34]:
DeltaTable("tmp/some_delta_lake", version=1).to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,8
4,9
5,10


In [35]:
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,55
1,66
2,77


In [36]:
DeltaTable("tmp/some_delta_lake", version=2).to_pandas()

Unnamed: 0,x
0,55
1,66
2,77


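## Reason 3: Schema enforcement prevents bad appends

The table only has an `x` column, so appending a DataFrame whose only column is `y` is rejected with an error instead of being silently written.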

In [37]:
df4 = pd.DataFrame({"y": [111, 222]})

In [38]:
# df4 only has a `y` column while the table only has `x`, so this append is
# expected to fail. Catch the error and display it so that the demonstration
# does not abort a "Restart & Run All" execution of the notebook.
try:
    write_deltalake("tmp/some_delta_lake", df4, mode="append")
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Schema of data does not match table schema
Table schema:
y: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 358
Data Schema:
x: int64

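## Reason 4: Better partition management (adding & removing partitions)

Partitioning by `country` stores each country's rows in its own directory, and an overwrite with `partition_filters` replaces a single partition while leaving the others untouched.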

In [39]:
df = pd.DataFrame(
    {"name": ["li", "xi", "sally", "fred"], "country": ["china", "china", "us", "us"]}
)

In [40]:
df

Unnamed: 0,name,country
0,li,china
1,xi,china
2,sally,us
3,fred,us


In [41]:
write_deltalake(
    "tmp/some_people",
    df,
    partition_by=["country"],
)

In [42]:
!tree tmp/some_people

[01;34mtmp/some_people[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [01;34mcountry=china[0m
│   └── [00m0-5d6a45de-7ccd-4107-91a2-d2239142a608-0.parquet[0m
└── [01;34mcountry=us[0m
    └── [00m0-5d6a45de-7ccd-4107-91a2-d2239142a608-0.parquet[0m

3 directories, 3 files


In [43]:
DeltaTable("tmp/some_people").to_pandas()

Unnamed: 0,name,country
0,sally,us
1,fred,us
2,li,china
3,xi,china


In [44]:
df = pd.DataFrame(
    {"name": ["jack", "bruce", "yao"], "country": ["china", "china", "china"]}
)

In [45]:
df

Unnamed: 0,name,country
0,jack,china
1,bruce,china
2,yao,china


In [46]:
# Overwrite restricted by a partition filter on country == "china"; the `df`
# built above contains only china rows, and the `us` rows written earlier are
# not part of this overwrite.
write_deltalake(
    "tmp/some_people",
    df,
    mode="overwrite",
    partition_filters=[("country", "=", "china")],
)

In [47]:
DeltaTable("tmp/some_people").to_pandas()

Unnamed: 0,name,country
0,sally,us
1,fred,us
2,jack,china
3,bruce,china
4,yao,china


In [48]:
DeltaTable("tmp/some_people", version=0).to_pandas()

Unnamed: 0,name,country
0,sally,us
1,fred,us
2,li,china
3,xi,china


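## Reason 5: Small file compaction

Frequent small appends leave a table with many small files. Compaction rewrites them into fewer, larger files.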

In [49]:
def record_observations(date: datetime) -> pa.Table:
    """Build a synthetic batch of 1000 observations stamped with ``date``.

    Every row carries the same calendar day (``date`` column) and the same
    full datetime (``timestamp`` column); the ``value`` column is filled by
    ``pc.random``.
    """
    nrows = 1000
    day = date.date()
    day_column = pa.array([day for _ in range(nrows)])
    timestamp_column = pa.array([date for _ in range(nrows)])
    random_values = pc.random(nrows)
    return pa.table(
        {"date": day_column, "timestamp": timestamp_column, "value": random_values}
    )


# Example of output
record_observations(datetime(2021, 1, 1, 12)).to_pandas()

Unnamed: 0,date,timestamp,value
0,2021-01-01,2021-01-01 12:00:00,0.052734
1,2021-01-01,2021-01-01 12:00:00,0.043073
2,2021-01-01,2021-01-01 12:00:00,0.834527
3,2021-01-01,2021-01-01 12:00:00,0.741786
4,2021-01-01,2021-01-01 12:00:00,0.217829
...,...,...,...
995,2021-01-01,2021-01-01 12:00:00,0.118247
996,2021-01-01,2021-01-01 12:00:00,0.856370
997,2021-01-01,2021-01-01 12:00:00,0.489655
998,2021-01-01,2021-01-01 12:00:00,0.168350


In [50]:
# Append one batch per hour for 100 consecutive hours, beginning at midnight
# on 2021-01-01. Each iteration is a separate append to the table.
first_hour = datetime(2021, 1, 1)
for hour_offset in range(100):
    write_deltalake(
        "observation_data",
        record_observations(first_hour + timedelta(hours=hour_offset)),
        partition_by=["date"],
        mode="append",
    )

In [51]:
dt = DeltaTable("observation_data")
# We now have 100 files in our table
len(dt.files())

100

In [52]:
# But there are only 5 unique partitions
dt.get_add_actions(flatten=True).column("partition.date").unique().sort()

<pyarrow.lib.Date32Array object at 0x15a8a5460>
[
  2021-01-01,
  2021-01-02,
  2021-01-03,
  2021-01-04,
  2021-01-05
]

In [53]:
# Compact the many small files of each partition into fewer files.
# NOTE: this needs a deltalake release that provides `DeltaTable.optimize`;
# the 0.8.0 release recorded in the commit log earlier in this notebook does
# not, which is why a bare `dt.optimize()` fails with AttributeError there.
dt.optimize.compact()

AttributeError: 'DeltaTable' object has no attribute 'optimize'

In [None]:
# After running optimize, the table should have the same number of files as
# partitions
len(dt.files())

In [None]:
# Vacuum with a zero-hour retention window. enforce_retention_duration=False
# switches off the retention-duration check, and dry_run=False requests a real
# run rather than a dry run.
# NOTE(review): presumably this deletes the small files that compaction made
# obsolete, so older versions relying on them can no longer be read — confirm.
dt.vacuum(retention_hours=0, enforce_retention_duration=False, dry_run=False)

## Cleanup

In [7]:
!rm -rf tmp observation_data