In [2]:
import pathlib

import pyarrow.dataset as ds
from datafusion import SessionContext, col, lit
from deltalake import DeltaTable

# Reading a Delta table with DataFusion

In [3]:
# Open the Delta table stored under the current user's home directory.
delta_table_path = f"{pathlib.Path.home()}/data/delta/G1_1e8_1e2_0_0"
table = DeltaTable(delta_table_path)

In [4]:
# Single DataFusion session shared by every cell below; datasets are registered
# on it and queries / DataFrames are created from it.
ctx = SessionContext()

### With PyArrow dataset

In [5]:
%%time
ctx.register_dataset("my_dataset", table.to_pyarrow_dataset())

CPU times: user 3.82 ms, sys: 4.53 ms, total: 8.34 ms
Wall time: 8.49 ms


In [6]:
%%time
ctx.sql("select * from my_dataset where v2 > 5")

CPU times: user 1.36 ms, sys: 5.43 ms, total: 6.79 ms
Wall time: 9.67 ms


DataFrame()
+-------+-------+--------------+-----+-----+--------+----+----+-----------+
| id1   | id2   | id3          | id4 | id5 | id6    | v1 | v2 | v3        |
+-------+-------+--------------+-----+-----+--------+----+----+-----------+
| id082 | id049 | id0000022715 | 97  | 55  | 756924 | 2  | 11 | 74.161136 |
| id053 | id052 | id0000113549 | 19  | 56  | 139048 | 1  | 10 | 95.178444 |
| id090 | id043 | id0000637409 | 94  | 50  | 12448  | 3  | 12 | 60.21896  |
| id071 | id092 | id0000537978 | 52  | 51  | 638154 | 2  | 11 | 41.181681 |
| id099 | id017 | id0000324904 | 100 | 98  | 650130 | 5  | 6  | 53.967191 |
| id068 | id010 | id0000645245 | 85  | 15  | 392202 | 5  | 15 | 47.74931  |
| id080 | id005 | id0000722692 | 54  | 90  | 737105 | 4  | 9  | 37.710627 |
| id060 | id084 | id0000269745 | 70  | 93  | 496965 | 2  | 10 | 45.513606 |
| id085 | id004 | id0000020734 | 65  | 10  | 861078 | 3  | 9  | 35.692509 |
| id068 | id095 | id0000888093 | 37  | 3   | 395630 | 5  | 11 | 11.12151  |


### With PyArrow table

In [7]:
# Passing a pyarrow.Table straight to `ctx.register_table` fails with
# "TypeError: argument 'table': 'Table' object cannot be converted to 'Table'".
# Register the table's record batches instead: `register_record_batches` takes
# a list of partitions, each a list of batches, so the whole table is passed
# as a single partition.
ctx.register_record_batches("my_table", [table.to_pyarrow_table().to_batches()])

### With batches

In [8]:
%%time
df = ctx.create_dataframe([table.to_pyarrow_table().to_batches()])

CPU times: user 4.99 s, sys: 1.14 s, total: 6.13 s
Wall time: 1.28 s


In [9]:
%%time
df.filter(col("v2") > lit(5))

CPU times: user 144 µs, sys: 325 µs, total: 469 µs
Wall time: 1.83 ms


DataFrame()
+-------+-------+--------------+-----+-----+--------+----+----+-----------+
| id1   | id2   | id3          | id4 | id5 | id6    | v1 | v2 | v3        |
+-------+-------+--------------+-----+-----+--------+----+----+-----------+
| id007 | id096 | id0000445327 | 76  | 60  | 566466 | 2  | 11 | 82.691753 |
| id018 | id078 | id0000576456 | 52  | 92  | 193437 | 3  | 8  | 32.02124  |
| id019 | id049 | id0000790237 | 11  | 85  | 979819 | 4  | 13 | 17.001827 |
| id052 | id058 | id0000041914 | 42  | 12  | 198611 | 5  | 13 | 66.46749  |
| id010 | id016 | id0000921752 | 11  | 99  | 166079 | 1  | 7  | 61.637281 |
| id033 | id027 | id0000054357 | 100 | 79  | 793235 | 1  | 13 | 65.844356 |
| id049 | id031 | id0000237885 | 21  | 96  | 849387 | 3  | 7  | 40.847662 |
| id036 | id100 | id0000840597 | 26  | 92  | 51226  | 2  | 8  | 38.039983 |
| id010 | id033 | id0000128874 | 33  | 18  | 493856 | 3  | 15 | 91.906635 |
| id026 | id045 | id0000045901 | 35  | 94  | 785336 | 4  | 13 | 11.126579 |
