# Python deltalake documentation

This notebook contains the code snippets used in the Python deltalake documentation.

In [24]:
import pathlib

import deltalake as dl
import pandas as pd
from deltalake import DeltaTable, write_deltalake
from tabulate import tabulate

In [27]:
# NOTE(review): this import lives outside the notebook's top import cell.
import polars as pl

# Build a three-row polars frame and write it out as a Delta table.
df = pl.DataFrame({"num": [1, 2, 3], "letter": ["a", "b", "c"]})
# NOTE(review): the target path starts with "~". No error is recorded for this
# cell, whereas the write_deltalake call on the same path further down records
# an OSError about "~" -- presumably polars expands the home directory; confirm.
df.write_delta("~/tmp/cool/some-table")

## Create Delta table

In [25]:
# Three-row sample frame used to create the Delta table below.
df = pd.DataFrame(
    {
        "num": [1, 2, 3],
        "letter": ["a", "b", "c"],
    }
)

In [26]:
# write_deltalake takes the path literally: "~" is not expanded to the home
# directory, so this call raises OSError (illegal character sequence "~").
# The failure is what this cell shows, so catch and print it instead of letting
# the exception abort a "Restart Kernel -> Run All".
try:
    write_deltalake("~/tmp/cool/some-table", df)
except OSError as err:
    print(f"OSError: {err}")

OSError: Encountered object with invalid path: Error parsing Path "/Users/matthew.powers/Documents/code/delta/delta-examples/notebooks/python-deltalake/~/tmp/cool/some-table": Encountered illegal character sequence "~" whilst parsing path segment "~"

In [4]:
# Create the Delta table at the relative path tmp/some-table from the pandas frame.
write_deltalake("tmp/some-table", df)

In [5]:
# Open the table that was just written; with no version given this is the latest one.
dt = DeltaTable("tmp/some-table")

In [6]:
# Render the table contents as a psql-style text grid, without the pandas index.
print(tabulate(dt.to_pandas(), headers="keys", tablefmt="psql", showindex=False))

+-------+----------+
|   num | letter   |
|-------+----------|
|     1 | a        |
|     2 | b        |
|     3 | c        |
+-------+----------+


In [7]:
# List the table directory: one parquet data file plus the _delta_log folder.
!tree tmp/some-table

[01;34mtmp/some-table[0m
├── [00m0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet[0m
└── [01;34m_delta_log[0m
    └── [00m00000000000000000000.json[0m

1 directory, 2 files


In [8]:
# Pretty-print the first commit file of the transaction log (the output shows
# the protocol and metaData entries).
!jq . tmp/some-table/_delta_log/00000000000000000000.json

[1;39m{
  [0m[34;1m"protocol"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"minReaderVersion"[0m[1;39m: [0m[0;39m1[0m[1;39m,
    [0m[34;1m"minWriterVersion"[0m[1;39m: [0m[0;39m1[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"metaData"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"id"[0m[1;39m: [0m[0;32m"b96ea1a2-1830-4da2-8827-5334cc6104ed"[0m[1;39m,
    [0m[34;1m"name"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"description"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"format"[0m[1;39m: [0m[1;39m{
      [0m[34;1m"provider"[0m[1;39m: [0m[0;32m"parquet"[0m[1;39m,
      [0m[34;1m"options"[0m[1;39m: [0m[1;39m{}[0m[1;39m
    [1;39m}[0m[1;39m,
    [0m[34;1m"schemaString"[0m[1;39m: [0m[0;32m"{\"type\":\"struct\",\"fields\":[{\"name\":\"num\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"letter\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}"[0m[1;39m,
    [0m[34;1m"partiti

## Append

In [13]:
# Two extra rows, same columns as the table, to be appended below.
df = pd.DataFrame(
    {
        "num": [8, 9],
        "letter": ["dd", "ee"],
    }
)

In [10]:
# Append the rows in `df` to the existing table instead of creating a new one.
write_deltalake("tmp/some-table", df, mode="append")

In [14]:
# Re-open the table so `dt` points at the latest version, which now includes the append.
dt = DeltaTable("tmp/some-table")

In [15]:
# Show the table after the append: the original three rows plus the two new ones.
print(tabulate(dt.to_pandas(), headers="keys", tablefmt="psql", showindex=False))

+-------+----------+
|   num | letter   |
|-------+----------|
|     1 | a        |
|     2 | b        |
|     3 | c        |
|     8 | dd       |
|     9 | ee       |
+-------+----------+


In [16]:
# List the table directory again: the append added a second parquet file and a
# second commit file under _delta_log.
!tree tmp/some-table

[01;34mtmp/some-table[0m
├── [00m0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet[0m
├── [00m1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.json[0m
    └── [00m00000000000000000001.json[0m

1 directory, 4 files


In [17]:
# Pretty-print commit 1, written by the append (the output shows an `add`
# action for the new parquet file followed by commitInfo).
!jq . tmp/some-table/_delta_log/00000000000000000001.json

[1;39m{
  [0m[34;1m"add"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet"[0m[1;39m,
    [0m[34;1m"size"[0m[1;39m: [0m[0;39m2204[0m[1;39m,
    [0m[34;1m"partitionValues"[0m[1;39m: [0m[1;39m{}[0m[1;39m,
    [0m[34;1m"modificationTime"[0m[1;39m: [0m[0;39m1701740386169[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;39mtrue[0m[1;39m,
    [0m[34;1m"stats"[0m[1;39m: [0m[0;32m"{\"numRecords\": 2, \"minValues\": {\"num\": 8, \"letter\": \"dd\"}, \"maxValues\": {\"num\": 9, \"letter\": \"ee\"}, \"nullCount\": {\"num\": 0, \"letter\": 0}}"[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"commitInfo"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"timestamp"[0m[1;39m: [0m[0;39m1701740386169[0m[1;39m,
    [0m[34;1m"operation"[0m[1;39m: [0m[0;32m"WRITE"[0m[1;39m,
    [0m[34;1m"operationParameters"[0m[1;39m: [0m[1;39m{
      [0m[34;1m"partitionBy"

## Overwrite

In [18]:
# Replacement rows that will overwrite the current table contents.
df = pd.DataFrame(
    {
        "num": [11, 22],
        "letter": ["aa", "bb"],
    }
)

In [19]:
# Overwrite the table contents with `df`; earlier versions stay reachable via
# time travel (see the version=0 / version=1 reads later in this notebook).
write_deltalake("tmp/some-table", df, mode="overwrite")

In [20]:
# Re-open the table to pick up the version written by the overwrite.
# Uses the directly imported DeltaTable, like the other cells of this notebook.
dt = DeltaTable("tmp/some-table")

In [21]:
# Show the table after the overwrite: only the two replacement rows remain.
print(tabulate(dt.to_pandas(), headers="keys", tablefmt="psql", showindex=False))

+-------+----------+
|   num | letter   |
|-------+----------|
|    11 | aa       |
|    22 | bb       |
+-------+----------+


In [22]:
# List the table directory: the two earlier parquet files are still on disk
# next to the new one, and _delta_log has a third commit file.
!tree tmp/some-table

[01;34mtmp/some-table[0m
├── [00m0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet[0m
├── [00m1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet[0m
├── [00m2-95ef2108-480c-4b89-96f0-ff9185dab9ad-0.parquet[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.json[0m
    ├── [00m00000000000000000001.json[0m
    └── [00m00000000000000000002.json[0m

1 directory, 6 files


In [23]:
# Pretty-print commit 2, written by the overwrite (the output shows an `add`
# action for the new file and a `remove` action for an older file).
!jq . tmp/some-table/_delta_log/00000000000000000002.json

[1;39m{
  [0m[34;1m"add"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"2-95ef2108-480c-4b89-96f0-ff9185dab9ad-0.parquet"[0m[1;39m,
    [0m[34;1m"size"[0m[1;39m: [0m[0;39m2204[0m[1;39m,
    [0m[34;1m"partitionValues"[0m[1;39m: [0m[1;39m{}[0m[1;39m,
    [0m[34;1m"modificationTime"[0m[1;39m: [0m[0;39m1701740465102[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;39mtrue[0m[1;39m,
    [0m[34;1m"stats"[0m[1;39m: [0m[0;32m"{\"numRecords\": 2, \"minValues\": {\"num\": 11, \"letter\": \"aa\"}, \"maxValues\": {\"num\": 22, \"letter\": \"bb\"}, \"nullCount\": {\"num\": 0, \"letter\": 0}}"[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"remove"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet"[0m[1;39m,
    [0m[34;1m"deletionTimestamp"[0m[1;39m: [0m[0;39m1701740465102[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;3

## Time travel to previous version

In [14]:
# Time travel: load the table as of version 0 (the initial write) rather than the latest version.
dt = DeltaTable("tmp/some-table", version=0)

In [15]:
# Show version 0: the three rows from the first write.
print(tabulate(dt.to_pandas(), headers="keys", tablefmt="psql", showindex=False))

+-------+----------+
|   num | letter   |
|-------+----------|
|     1 | a        |
|     2 | b        |
|     3 | c        |
+-------+----------+


In [16]:
# Time travel to version 1 (the state after the append). The bare last
# expression lets the notebook render the resulting pandas DataFrame.

DeltaTable("tmp/some-table", version=1).to_pandas()

Unnamed: 0,num,letter
0,1,a
1,2,b
2,3,c
3,8,dd
4,9,ee


In [17]:
# With no `version` argument DeltaTable reads the latest version, which here is
# the overwritten table.
DeltaTable("tmp/some-table").to_pandas()

Unnamed: 0,num,letter
0,11,aa
1,22,bb


## Schema enforcement

In [18]:
# Frame whose second column is `animal`, not the table's `letter` column.
df = pd.DataFrame(
    {
        "num": [5, 6],
        "animal": ["cat", "dog"],
    }
)

In [19]:
# Schema enforcement: `df` carries an `animal` column where the table has
# `letter`, so the write is rejected with ValueError.
# The rejection is the demonstration, so catch and print it instead of letting
# the exception abort a "Restart Kernel -> Run All".
try:
    write_deltalake("tmp/some-table", df)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Schema of data does not match table schema
Data schema:
num: int64
animal: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 474
Table Schema:
num: int64
letter: string

## Overwriting schema

In [21]:
# Replace both the data and the schema of the existing table:
# mode="overwrite" combined with overwrite_schema=True lets the frame with the
# `animal` column through.
# NOTE(review): newer deltalake releases may spell this schema_mode="overwrite"
# -- confirm against the installed version.
write_deltalake("tmp/some-table", df, mode="overwrite", overwrite_schema=True)

In [23]:
# Re-open the table to pick up the version written with the new schema.
dt = DeltaTable("tmp/some-table")

In [24]:
# Show the table after the schema overwrite: the columns are now `num` and `animal`.
print(tabulate(dt.to_pandas(), headers="keys", tablefmt="psql", showindex=False))

+-------+----------+
|   num | animal   |
|-------+----------|
|     5 | cat      |
|     6 | dog      |
+-------+----------+


## Merge

_TODO: add a merge example — this section currently has no cells._

## Delete rows

In [29]:
# Four-row sample frame for the delete example.
df = pd.DataFrame(
    {
        "num": [1, 2, 3, 4],
        "letter": ["a", "b", "c", "d"],
    }
)

In [31]:
# Create a separate table (tmp/my-table) for the delete example.
# Calls the directly imported write_deltalake, like the other cells of this
# notebook, rather than reaching through the `dl.writer` submodule.
write_deltalake("tmp/my-table", df)

In [32]:
# Open the table used by the delete example.
# Uses the directly imported DeltaTable, like the other cells of this notebook.
dt = DeltaTable("tmp/my-table")

In [33]:
# Show the table before deleting anything: all four rows.
print(tabulate(dt.to_pandas(), headers="keys", tablefmt="psql", showindex=False))

+-------+----------+
|   num | letter   |
|-------+----------|
|     1 | a        |
|     2 | b        |
|     3 | c        |
|     4 | d        |
+-------+----------+


In [35]:
# Delete every row matching the predicate `num > 2`. The call returns a dict of
# metrics (files added/removed, rows deleted/copied, timings), which the
# notebook displays as the cell result.
dt.delete("num > 2")

{'num_added_files': 1,
 'num_removed_files': 1,
 'num_deleted_rows': 2,
 'num_copied_rows': 2,
 'execution_time_ms': 8936,
 'scan_time_ms': 6637,
 'rewrite_time_ms': 2}

In [36]:
# Print the table again through the same `dt` handle: only the rows with num <= 2 remain.
print(tabulate(dt.to_pandas(), headers="keys", tablefmt="psql", showindex=False))

+-------+----------+
|   num | letter   |
|-------+----------|
|     1 | a        |
|     2 | b        |
+-------+----------+
