# Version pandas with deltalake

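This notebook works with the `pandas-deltalake` environment. It creates a Delta table from a pandas DataFrame, appends to it, overwrites it, reads earlier versions back (time travel), and shows that an append with a mismatched schema is rejected. The shell cells additionally need the `tree` and `jq` command-line tools.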

In [25]:
import os

import pandas as pd
from deltalake import DeltaTable
from deltalake.writer import write_deltalake

## Create Delta Lake

In [26]:
# Starting data: a single integer column "x" with three rows.
df = pd.DataFrame(data={"x": [1, 2, 3]})

In [27]:
# Show the frame that is about to be written to the Delta table.
df

Unnamed: 0,x
0,1
1,2
2,3


In [28]:
# Make sure the table directory exists; exist_ok=True keeps this cell from failing on a re-run.
os.makedirs(name="tmp/some_delta_lake", exist_ok=True)

In [29]:
# First write: no `mode` is passed, so the library default applies.
# The directory listing a few cells below shows the result: one parquet data file
# plus `_delta_log/00000000000000000000.json`.
write_deltalake("tmp/some_delta_lake", df)

In [30]:
# Open the table without a `version` argument, i.e. at its current state.
dt = DeltaTable("tmp/some_delta_lake")

In [31]:
# Read the table back into pandas; the rows should match `df`.
dt.to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [32]:
# List the files that make up the table (requires the `tree` command-line tool).
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-7aa8fde5-a55b-434a-b436-a8c8180f446e-0.parquet[0m
└── [01;34m_delta_log[0m
    └── [00m00000000000000000000.json[0m

1 directory, 2 files


In [33]:
# Pretty-print the log file produced by the first write (requires the `jq` command-line tool).
!jq . tmp/some_delta_lake/_delta_log/00000000000000000000.json

[1;39m{
  [0m[34;1m"commitInfo"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"delta-rs"[0m[1;39m: [0m[0;32m"0.4.1"[0m[1;39m,
    [0m[34;1m"timestamp"[0m[1;39m: [0m[0;39m1680730058520[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"protocol"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"minReaderVersion"[0m[1;39m: [0m[0;39m1[0m[1;39m,
    [0m[34;1m"minWriterVersion"[0m[1;39m: [0m[0;39m1[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"metaData"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"id"[0m[1;39m: [0m[0;32m"bf7e6bcd-37de-4a39-aa49-82d211b92611"[0m[1;39m,
    [0m[34;1m"name"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"description"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"format"[0m[1;39m: [0m[1;39m{
      [0m[34;1m"provider"[0m[1;39m: [0m[0;32m"parquet"[0m[1;39m,
      [0m[34;1m"options"[0m[1;39m: [0m[1;39m{}[0m[1;39m
    [1;39m}[0m[1;39m,
    [0m[34;1m"schemaString"[0m

## Append to Delta Lake

In [34]:
# Rows to append; same single column "x" as the existing table.
df2 = pd.DataFrame(data={"x": [8, 9, 10]})

In [35]:
# mode="append" adds the new rows on top of the existing data; the first parquet file
# is left in place and a second one appears next to it (see the listing below).
write_deltalake("tmp/some_delta_lake", df2, mode="append")

In [36]:
# After the append: expect a second parquet file and a second `_delta_log` entry.
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-7aa8fde5-a55b-434a-b436-a8c8180f446e-0.parquet[0m
├── [00m1-f416093b-56cf-4991-b54b-919721a0ced4-0.parquet[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.json[0m
    └── [00m00000000000000000001.json[0m

1 directory, 4 files


In [37]:
# Read the current table: the original rows followed by the appended ones.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,8
4,9
5,10


## pandas time travel

In [38]:
# Time travel: version=0 pins the table to its first commit, before the append.
# Note that this rebinds `dt`, which previously pointed at the unversioned table.
dt = DeltaTable("tmp/some_delta_lake", version=0)

In [39]:
# Only the three original rows come back for version 0.
dt.to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [40]:
# Without `version`, the latest state (including the appended rows) is still what gets read.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,8
4,9
5,10


## Overwrite Delta Lake

In [41]:
# Replacement data for the overwrite; still a single column "x".
df3 = pd.DataFrame(data={"x": [55, 66, 77]})

In [42]:
# Show the frame that will replace the table contents.
df3

Unnamed: 0,x
0,55
1,66
2,77


In [43]:
# mode="overwrite" replaces what the current table returns, but the older parquet files
# stay on disk (see the listing below), which is what keeps earlier versions readable.
write_deltalake("tmp/some_delta_lake", df3, mode="overwrite")

In [44]:
# After the overwrite: expect a third parquet file and a third `_delta_log` entry.
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-7aa8fde5-a55b-434a-b436-a8c8180f446e-0.parquet[0m
├── [00m1-f416093b-56cf-4991-b54b-919721a0ced4-0.parquet[0m
├── [00m2-cb17f8fb-a680-46a5-a55f-f3da52ecf7df-0.parquet[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.json[0m
    ├── [00m00000000000000000001.json[0m
    └── [00m00000000000000000002.json[0m

1 directory, 6 files


In [45]:
# The current table now returns only the rows from `df3`.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,55
1,66
2,77


## Confirm other versions are still accessible

In [46]:
# Version 0 (the initial write) is still readable after the overwrite.
DeltaTable("tmp/some_delta_lake", version=0).to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [50]:
# Version 1 (initial rows plus the append) is still readable as well.
DeltaTable("tmp/some_delta_lake", version=1).to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,8
4,9
5,10


## Schema enforcement prevents bad appends

In [48]:
# Deliberately mismatched frame: column "y" instead of the table's "x".
df4 = pd.DataFrame(data={"y": [111, 222]})

In [49]:
# This append is expected to fail: df4 has column "y" while the table was written with column "x".
# The error is caught and printed so that "Restart & Run All" continues to the cleanup cell
# instead of stopping here and leaving tmp/ behind.
# NOTE(review): in the recorded output the "Table schema" / "Data Schema" labels look swapped
# (the table has "x", df4 has "y") - presumably a quirk of this deltalake version's message.
try:
    write_deltalake("tmp/some_delta_lake", df4, mode="append")
except ValueError as err:
    # Same "ExceptionType: message" form as the uncaught error would have shown.
    print(f"{type(err).__name__}: {err}")
else:
    # If the write went through, schema enforcement did not do its job; fail loudly.
    raise AssertionError("expected the mismatched append to be rejected with ValueError")

ValueError: Schema of data does not match table schema
Table schema:
y: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 357
Data Schema:
x: int64

## Cleanup

In [None]:
# Remove the scratch directory created by this notebook, including the Delta table inside it,
# so the next run starts from an empty location.
!rm -rf tmp