# Mage + Delta Lake

This notebook highlights Delta Lake features that are useful for Mage users.

In [1]:
import shutil

import pandas as pd
from deltalake import DeltaTable
from deltalake.writer import write_deltalake

In [2]:
# Load the battle-history sample dataset from the Mage datasets repository.
battle_history_url = "https://raw.githubusercontent.com/mage-ai/datasets/master/battle_history.csv"
df = pd.read_csv(battle_history_url)

In [3]:
# "\u200e" is U+200E (LEFT-TO-RIGHT MARK), an invisible character. Two planet
# names are spelled with a trailing pair of them, so the suffix is built once
# here instead of being hidden inside the literals.
LRM_PAIR = "\u200e" * 2

# Order matters: the planets are written to the Delta table in this order.
planets = ["Gaia", f"Kamigawa{LRM_PAIR}", "Eos", f"Ravnica{LRM_PAIR}", "Aiur", "Korhal"]

In [4]:
# Select the Kamigawa rows; the name must include the two trailing (invisible)
# U+200E characters to match the values in the `planet` column.
df[df["planet"] == "Kamigawa\u200e\u200e"]

Unnamed: 0,enemy_unit_type,odds_of_victory,planet,population,size_of_force,universe
2,drone,0.600266,Kamigawa‎‎,3.4,61000,Dominaria‎‎
6,dreadnought,0.868513,Kamigawa‎‎,3.4,94000,Dominaria‎‎
7,tank,0.709135,Kamigawa‎‎,3.4,70000,Dominaria‎‎
10,scout,0.207542,Kamigawa‎‎,3.4,29000,Dominaria‎‎
19,battleship,0.235602,Kamigawa‎‎,3.4,81000,Dominaria‎‎
...,...,...,...,...,...,...
9968,drone,0.340780,Kamigawa‎‎,3.4,78000,Dominaria‎‎
9975,zealot,0.569718,Kamigawa‎‎,3.4,65000,Dominaria‎‎
9983,zealot,0.007990,Kamigawa‎‎,3.4,2000,Dominaria‎‎
9984,overlord,0.174568,Kamigawa‎‎,3.4,91000,Dominaria‎‎


In [5]:
# Start from an empty table so that re-running this cell (or Restart & Run All)
# does not append the same rows a second time, and version 0 is always the
# first planet in `planets`.
table_path = "tmp/battle_history"
try:
    shutil.rmtree(table_path)
except FileNotFoundError:
    pass  # first run: there is no previous table to remove

# Append the planets one at a time; each append becomes its own table version,
# which the time-travel section below relies on.
for planet in planets:
    planet_df = df.query(f"`planet` == '{planet}'")
    write_deltalake(
        table_path,
        data=planet_df,
        mode="append",
    )

In [6]:
# Open the Delta table written by the append loop above (no version given).
dt = DeltaTable("tmp/battle_history")

In [7]:
# Read the table back into pandas. Note: this rebinds `df`, replacing the
# frame that was loaded from the CSV.
df = dt.to_pandas()

In [8]:
# Peek at the first rows of the table that was read back from Delta.
df.head()

Unnamed: 0,enemy_unit_type,odds_of_victory,planet,population,size_of_force,universe,__index_level_0__
0,drone,0.957991,Gaia,6.7,1000,Mirrodin,0
1,zealot,0.581632,Gaia,6.7,15000,Mirrodin,1
2,scout,0.050675,Gaia,6.7,10000,Mirrodin,15
3,zealot,0.422055,Gaia,6.7,75000,Mirrodin,16
4,battleship,0.609103,Gaia,6.7,44000,Mirrodin,18


In [9]:
# Display the whole frame (pandas elides the middle rows) to see first and last planets.
df

Unnamed: 0,enemy_unit_type,odds_of_victory,planet,population,size_of_force,universe,__index_level_0__
0,drone,0.957991,Gaia,6.7,1000,Mirrodin,0
1,zealot,0.581632,Gaia,6.7,15000,Mirrodin,1
2,scout,0.050675,Gaia,6.7,10000,Mirrodin,15
3,zealot,0.422055,Gaia,6.7,75000,Mirrodin,16
4,battleship,0.609103,Gaia,6.7,44000,Mirrodin,18
...,...,...,...,...,...,...,...
9995,mecha,0.366808,Korhal,6.7,87000,Zendikar‎‎,9966
9996,mecha,0.030586,Korhal,6.7,37000,Zendikar‎‎,9969
9997,mecha,0.857502,Korhal,6.7,89000,Zendikar‎‎,9971
9998,battleship,0.264858,Korhal,6.7,94000,Zendikar‎‎,9974


## Time travel with versioned data

In [10]:
# Time travel: pin the table to version 0, the first append, which holds only
# the Gaia rows.
first_version = DeltaTable("tmp/battle_history", version=0)
first_version.to_pandas()

Unnamed: 0,enemy_unit_type,odds_of_victory,planet,population,size_of_force,universe,__index_level_0__
0,drone,0.957991,Gaia,6.7,1000,Mirrodin,0
1,zealot,0.581632,Gaia,6.7,15000,Mirrodin,1
2,scout,0.050675,Gaia,6.7,10000,Mirrodin,15
3,zealot,0.422055,Gaia,6.7,75000,Mirrodin,16
4,battleship,0.609103,Gaia,6.7,44000,Mirrodin,18
...,...,...,...,...,...,...,...
1728,mecha,0.842349,Gaia,6.7,49000,Mirrodin,9958
1729,battleship,0.997295,Gaia,6.7,42000,Mirrodin,9982
1730,zealot,0.226190,Gaia,6.7,84000,Mirrodin,9991
1731,overlord,0.320292,Gaia,6.7,65000,Mirrodin,9994


In [11]:
# Without a `version` argument the latest version is read, which contains the
# rows for every planet.
latest_table = DeltaTable("tmp/battle_history")
latest_table.to_pandas()

Unnamed: 0,enemy_unit_type,odds_of_victory,planet,population,size_of_force,universe,__index_level_0__
0,drone,0.957991,Gaia,6.7,1000,Mirrodin,0
1,zealot,0.581632,Gaia,6.7,15000,Mirrodin,1
2,scout,0.050675,Gaia,6.7,10000,Mirrodin,15
3,zealot,0.422055,Gaia,6.7,75000,Mirrodin,16
4,battleship,0.609103,Gaia,6.7,44000,Mirrodin,18
...,...,...,...,...,...,...,...
9995,mecha,0.366808,Korhal,6.7,87000,Zendikar‎‎,9966
9996,mecha,0.030586,Korhal,6.7,37000,Zendikar‎‎,9969
9997,mecha,0.857502,Korhal,6.7,89000,Zendikar‎‎,9971
9998,battleship,0.264858,Korhal,6.7,94000,Zendikar‎‎,9974


## Make a partitioned Delta table

In [12]:
# `df` was rebound to the Delta read-back earlier, so load the original CSV
# again before building the partitioned table.
source_csv_url = "https://raw.githubusercontent.com/mage-ai/datasets/master/battle_history.csv"
df = pd.read_csv(source_csv_url)

In [14]:
# Start from an empty table so that re-running this cell (or Restart & Run All)
# does not append every planet a second time and the directory listing below
# stays at one commit and one data file per planet.
partitioned_path = "tmp/battle_history_partitioned"
try:
    shutil.rmtree(partitioned_path)
except FileNotFoundError:
    pass  # first run: there is no previous table to remove

# Same per-planet append as before, but partitioned on the `planet` column so
# each planet's rows land in their own `planet=<name>` directory.
for planet in planets:
    planet_df = df.query(f"`planet` == '{planet}'")
    write_deltalake(
        partitioned_path,
        data=planet_df,
        mode="append",
        partition_by=["planet"],
    )

In [15]:
# Show the on-disk layout: a `_delta_log` directory plus one `planet=<name>`
# directory per partition value. Names containing U+200E appear percent-encoded.
!tree tmp/battle_history_partitioned

[01;34mtmp/battle_history_partitioned[0m
├── [01;34m_delta_log[0m
│   ├── [00m00000000000000000000.json[0m
│   ├── [00m00000000000000000001.json[0m
│   ├── [00m00000000000000000002.json[0m
│   ├── [00m00000000000000000003.json[0m
│   ├── [00m00000000000000000004.json[0m
│   └── [00m00000000000000000005.json[0m
├── [01;34mplanet=Aiur[0m
│   └── [00m4-67b6d6f5-9753-4438-b9cf-dd3f84dee467-0.parquet[0m
├── [01;34mplanet=Eos[0m
│   └── [00m2-887d613f-974a-444a-a327-374296a6c70c-0.parquet[0m
├── [01;34mplanet=Gaia[0m
│   └── [00m0-34797246-8733-446f-b54e-16b7034abd1e-0.parquet[0m
├── [01;34mplanet=Kamigawa%25E2%2580%258E%25E2%2580%258E[0m
│   └── [00m1-d406fd6f-f852-4f34-8307-14307b7642f8-0.parquet[0m
├── [01;34mplanet=Korhal[0m
│   └── [00m5-545dfd2d-8ac7-43a4-9da7-1c6451593225-0.parquet[0m
└── [01;34mplanet=Ravnica%25E2%2580%258E%25E2%2580%258E[0m
    └── [00m3-0c33c5fe-054f-47dc-a3f4-37a7859e4504-0.parquet[0m

7 directories, 12 files


## Filtering on the partition column enables file skipping, which makes reads faster

In [23]:
%%time

# Baseline: load the whole unpartitioned table into pandas, then filter to
# Gaia in memory. Note that this rebinds `df`.
df = DeltaTable("tmp/battle_history").to_pandas()
df.query("planet == 'Gaia'")["enemy_unit_type"].unique()

CPU times: user 19.1 ms, sys: 11.7 ms, total: 30.8 ms
Wall time: 19.6 ms


array(['drone', 'zealot', 'scout', 'battleship', 'dreadnought',
       'overlord', 'mecha', 'negator', 'tank'], dtype=object)

In [24]:
%%time

# Same question against the partitioned table: the `partitions` filter limits
# the read to planet == "Gaia" before anything reaches pandas.
df = DeltaTable("tmp/battle_history_partitioned").to_pandas(partitions=[("planet", "=", "Gaia")])
# NOTE(review): this query looks redundant after the partition filter;
# presumably kept so both timed cells do the same pandas work — confirm.
df.query("planet == 'Gaia'")["enemy_unit_type"].unique()

CPU times: user 9.5 ms, sys: 6.73 ms, total: 16.2 ms
Wall time: 12.5 ms


array(['drone', 'zealot', 'scout', 'battleship', 'dreadnought',
       'overlord', 'mecha', 'negator', 'tank'], dtype=object)

## Schema enforcement

By default, you cannot append a DataFrame whose schema differs from the table's schema. This protects the table from being corrupted by mismatched data.

In [11]:
# Toy frame whose columns (`num`, `letter`) do not match the battle-history
# table; it is used to trigger schema enforcement below.
rows = [(1, "a"), (2, "b"), (3, "c")]
df = pd.DataFrame(rows, columns=["num", "letter"])

In [12]:
# Show the small frame with the mismatched schema.
df

Unnamed: 0,num,letter
0,1,a
1,2,b
2,3,c


In [13]:
# This append is expected to fail: `df` has columns (`num`, `letter`) that do
# not match the existing table. The error is caught and printed so the failure
# is still shown without stopping a Restart & Run All.
try:
    write_deltalake("tmp/battle_history", df, mode="append")
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Schema of data does not match table schema
Table schema:
num: int64
letter: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 474
Data Schema:
enemy_unit_type: string
odds_of_victory: double
planet: string
population: double
size_of_force: int64
universe: string

## Other features

* small file compaction with OPTIMIZE
* VACUUM command

In [None]:
# NOTE(review): unexecuted scratch cell. `planet_df` here is whatever the last
# iteration of the write loop left behind.
# NOTE(review): confirm that `allow_schema_evolution` is a keyword accepted by
# the installed `write_deltalake` — TODO verify against the deltalake docs.
write_deltalake(
    "tmp/battle_history_partitioned",
    data=planet_df,
    mode="append",
    allow_schema_evolution=True,
)

# NOTE(review): `deltaTable` is not defined anywhere above this line, and the
# `gender` / `birthDate` columns do not appear in the battle-history data shown
# earlier — TODO confirm which table these two calls are meant for.
deltaTable.update(condition="gender = 'F'", set={"gender": "'Female'"})

deltaTable.delete("birthDate < '1955-01-01'")

In [None]:
# NOTE(review): unexecuted scratch cell. `spark` and `DateType` are not defined
# or imported in the cells shown in this notebook, and `DeltaTable.create(spark)`
# / `DeltaTable.convertToDelta` look like the Spark Delta Lake API rather than
# the `deltalake` package imported at the top — confirm before running.
(
    DeltaTable.create(spark)
    .tableName("default.people10m")
    .addColumn("id", "INT")
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("dateOfBirth", DateType(), generatedAlwaysAs="CAST(birthDate AS DATE)")
    .partitionedBy("gender")
    .execute()
)

deltaTable = DeltaTable.convertToDelta(spark, "parquet.`<path-to-table>`")

# SQL syntax template (not Python), kept as a comment so this cell parses:
# ALTER TABLE table_name ALTER [COLUMN] col_name (COMMENT col_comment | FIRST | AFTER colA_name)