# Create Delta table with pandas

In [1]:
import os
import shutil

import pandas as pd
from deltalake import DeltaTable, write_deltalake

## Create a Delta table

In [2]:
# Three-row frame with a single integer column "x"; this is the data for the initial write.
df = pd.DataFrame([1, 2, 3], columns=["x"])

In [3]:
# Ensure the target directory (including the parent "tmp") exists; harmless if it already does.
os.makedirs(name="tmp/some_delta_lake", exist_ok=True)

In [4]:
# Write df to tmp/some_delta_lake as a Delta table. No mode is passed, so the
# library's default write mode applies.
write_deltalake("tmp/some_delta_lake", df)

In [5]:
# Load the table (no version given) and show it as a pandas DataFrame.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [6]:
# Copy the table contents to the system clipboard.
# NOTE(review): looks like an authoring convenience; presumably needs a clipboard
# backend and may fail on headless machines — confirm or drop.
DeltaTable("tmp/some_delta_lake").to_pandas().to_clipboard()

## Append to the Delta table

In [10]:
# Second batch of rows, same single integer column "x", to be appended.
df2 = pd.DataFrame([9, 8, 10], columns=["x"])

In [11]:
# Add df2's rows to the existing table instead of replacing it.
write_deltalake("tmp/some_delta_lake", df2, mode="append")

In [12]:
# Read the table again: the original rows are followed by the appended ones.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,9
4,8
5,10


In [13]:
# Copy the appended table to the system clipboard.
# NOTE(review): authoring convenience; presumably requires a clipboard backend — confirm or drop.
DeltaTable("tmp/some_delta_lake").to_pandas().to_clipboard()

## pandas time travel

In [10]:
# Open the table pinned to version 0, i.e. its state right after the initial write.
dt = DeltaTable("tmp/some_delta_lake", version=0)

In [11]:
# Show the version-0 snapshot: only the three original rows.
dt.to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [12]:
# For comparison, read without a version argument: the appended rows are included.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,9
4,8
5,10


## Overwrite the Delta table

In [18]:
# Replacement data for the overwrite step, same single integer column "x".
df3 = pd.DataFrame([55, 66, 77], columns=["x"])

In [19]:
# Replace the table's current contents with df3.
write_deltalake("tmp/some_delta_lake", df3, mode="overwrite")

In [20]:
# Read the table after the overwrite: only df3's rows remain.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,55
1,66
2,77


In [22]:
# Copy the overwritten table to the system clipboard.
# NOTE(review): authoring convenience; presumably requires a clipboard backend — confirm or drop.
DeltaTable("tmp/some_delta_lake").to_pandas().to_clipboard()

## Confirm other versions are still accessible

In [16]:
# Version 0 is still readable after the overwrite: the three original rows.
DeltaTable("tmp/some_delta_lake", version=0).to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [17]:
# Version 1 is the state after the append: original rows plus df2's rows.
DeltaTable("tmp/some_delta_lake", version=1).to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,9
4,8
5,10


## Schema enforcement prevents bad appends

In [18]:
# Frame whose only column is "y" — deliberately different from the table's "x" column.
df4 = pd.DataFrame([111, 222], columns=["y"])

In [19]:
# Display the mismatched frame before attempting the append.
df4

Unnamed: 0,y
0,111
1,222


In [20]:
# df4 only has a "y" column while the table's column is "x", so this append is
# expected to be rejected. The error is caught and printed so that it remains
# visible as the demonstration without aborting a Restart & Run All.
try:
    write_deltalake("tmp/some_delta_lake", df4, mode="append")
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Schema of data does not match table schema
Table schema:
y: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 357
Data Schema:
x: int64

## Overwrite partition of Delta table

In [36]:
# Ensure the directory for the partitioned table exists; harmless if it already does.
os.makedirs(name="tmp/some_people", exist_ok=True)

In [37]:
# Four people across two countries; "country" is the column the table is partitioned by.
df = pd.DataFrame(
    dict(
        name=["li", "xi", "sally", "fred"],
        country=["china", "china", "us", "us"],
    )
)

In [38]:
# Display the people frame before writing it.
df

Unnamed: 0,name,country
0,li,china
1,xi,china
2,sally,us
3,fred,us


In [39]:
# Write the people frame as a Delta table partitioned by "country"; the directory
# listing that follows shows one sub-directory per distinct country value.
write_deltalake(
    "tmp/some_people",
    df,
    partition_by=["country"],
)

In [40]:
# Show the on-disk layout of the table (shell escape; needs the `tree` command-line tool).
!tree tmp/some_people

[01;34mtmp/some_people[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [01;34mcountry=china[0m
│   └── [00m0-dd1deda9-b862-47fb-8ffd-4c91d410ad31-0.parquet[0m
└── [01;34mcountry=us[0m
    └── [00m0-dd1deda9-b862-47fb-8ffd-4c91d410ad31-0.parquet[0m

3 directories, 3 files


In [41]:
# Read the partitioned table back into a pandas DataFrame.
DeltaTable("tmp/some_people").to_pandas()

Unnamed: 0,name,country
0,li,china
1,xi,china
2,sally,us
3,fred,us


In [43]:
# Copy the people table to the system clipboard.
# NOTE(review): authoring convenience; presumably requires a clipboard backend — confirm or drop.
DeltaTable("tmp/some_people").to_pandas().to_clipboard()

In [45]:
# Replacement rows: every record belongs to the "china" partition.
df = pd.DataFrame(
    dict(
        name=["jack", "bruce", "yao"],
        country=["china"] * 3,
    )
)

In [46]:
# Overwrite only the rows where country = "china"; the read-back below shows the
# "us" rows are left untouched.
write_deltalake(
    "tmp/some_people",
    df,
    mode="overwrite",
    partition_filters=[("country", "=", "china")],
)

In [47]:
# List the layout again (needs `tree`): a second log entry and a second parquet file
# appear under country=china, while the earlier file is still on disk.
!tree tmp/some_people

[01;34mtmp/some_people[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   └── [00m00000000000000000001.json[0m
├── [01;34mcountry=china[0m
│   ├── [00m0-dd1deda9-b862-47fb-8ffd-4c91d410ad31-0.parquet[0m
│   └── [00m1-45cf731b-382f-4244-b156-d1f009f02a80-0.parquet[0m
└── [01;34mcountry=us[0m
    └── [00m0-dd1deda9-b862-47fb-8ffd-4c91d410ad31-0.parquet[0m

3 directories, 5 files


In [48]:
# Read the table after the partition overwrite: "us" rows plus the new "china" rows.
DeltaTable("tmp/some_people").to_pandas()

Unnamed: 0,name,country
0,sally,us
1,fred,us
2,jack,china
3,bruce,china
4,yao,china


In [49]:
# Copy the updated people table to the system clipboard.
# NOTE(review): authoring convenience; presumably requires a clipboard backend — confirm or drop.
DeltaTable("tmp/some_people").to_pandas().to_clipboard()

In [51]:
# Version 0 of the people table still returns the original four rows.
DeltaTable("tmp/some_people", version=0).to_pandas()

Unnamed: 0,name,country
0,li,china
1,xi,china
2,sally,us
3,fred,us


In [52]:
# Copy the version-0 snapshot to the system clipboard.
# NOTE(review): authoring convenience; presumably requires a clipboard backend — confirm or drop.
DeltaTable("tmp/some_people", version=0).to_pandas().to_clipboard()

In [34]:
# Pretty-print the commit log entry for version 1 (shell escape; needs the `jq` tool).
# The entry records the newly added file and the removed file for country=china.
!jq . tmp/some_people/_delta_log/00000000000000000001.json

[1;39m{
  [0m[34;1m"add"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"country=china/1-41f18aa2-9707-4716-b5ae-4089cf778756-0.parquet"[0m[1;39m,
    [0m[34;1m"size"[0m[1;39m: [0m[0;39m1859[0m[1;39m,
    [0m[34;1m"partitionValues"[0m[1;39m: [0m[1;39m{
      [0m[34;1m"country"[0m[1;39m: [0m[0;32m"china"[0m[1;39m
    [1;39m}[0m[1;39m,
    [0m[34;1m"modificationTime"[0m[1;39m: [0m[0;39m1679455801261[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;39mtrue[0m[1;39m,
    [0m[34;1m"stats"[0m[1;39m: [0m[0;32m"{\"numRecords\": 3, \"minValues\": {\"name\": \"bruce\"}, \"maxValues\": {\"name\": \"yao\"}, \"nullCount\": {\"name\": 0}}"[0m[1;39m,
    [0m[34;1m"tags"[0m[1;39m: [0m[1;30mnull[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"remove"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"country=china/0-7220ecd3-1497-485d-9b85-583cf4fd6be7-0.parquet"[0m[1;39m,
  

## Cleanup

In [53]:
# Remove everything the tutorial wrote under "tmp". Done in Python rather than
# through a shell escape so it also works where `rm` is unavailable;
# ignore_errors=True keeps it a no-op when the directory does not exist.
shutil.rmtree("tmp", ignore_errors=True)

In [1]:
import pandas as pd
from deltalake import DeltaTable, write_deltalake

In [2]:
# Minimal example: build a one-column frame and write it as the Delta table "my_table".
df = pd.DataFrame({"x": [1, 2, 3]})
# mode="overwrite" keeps this cell re-runnable: "my_table" is not removed by the
# cleanup step, so a write that refuses to touch an existing table would fail on
# the second run of the notebook.
write_deltalake("my_table", df, mode="overwrite")

In [3]:
# Read "my_table" back into a pandas DataFrame to confirm the write.
DeltaTable("my_table").to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
