In [1]:
import pathlib

import duckdb
import lance
import pyarrow.dataset as ds
from datafusion import SessionContext, col, lit
from deltalake import DeltaTable
import pyarrow as pa

## 1e9 duckdb + Delta Lake

In [7]:
%%time
delta_1e9_v1 = DeltaTable(
    f"{pathlib.Path.home()}/data/deltalake/G1_1e9_1e2_0_0", version=1
).to_pyarrow_dataset()
duckdb.query("select * from delta_1e9_v1 where v3 = 75.21594").to_df()

CPU times: user 56 s, sys: 17 s, total: 1min 13s
Wall time: 24.2 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


In [8]:
%%time
delta_1e9_v1 = DeltaTable(
    f"{pathlib.Path.home()}/data/deltalake/G1_1e9_1e2_0_0", version=2
).to_pyarrow_dataset()
duckdb.query("select * from delta_1e9_v1 where v3 = 75.21594").to_df()

CPU times: user 55.9 s, sys: 17.4 s, total: 1min 13s
Wall time: 24.6 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


In [9]:
%%time
delta_1e9_v1 = DeltaTable(
    f"{pathlib.Path.home()}/data/deltalake/G1_1e9_1e2_0_0", version=3
).to_pyarrow_dataset()
duckdb.query("select * from delta_1e9_v1 where v3 = 75.21594").to_df()

CPU times: user 2.66 s, sys: 819 ms, total: 3.48 s
Wall time: 1.32 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


## 1e9 duckdb + Parquet

In [12]:
%%time
parquet_1e9_path = f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.parquet"
parquet_1e9 = pa.dataset.dataset(parquet_1e9_path, format="parquet")
duckdb.query("select * from parquet_1e9 where v3 = 75.21594").to_df()

CPU times: user 1min 30s, sys: 13.1 s, total: 1min 43s
Wall time: 17.9 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


In [14]:
import pyarrow.parquet as pq
parquet_file = pq.ParquetFile(parquet_1e9_path)
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x12967d1d0>
  created_by: parquet-cpp-arrow version 11.0.0
  num_columns: 9
  num_rows: 1000000000
  num_row_groups: 50467
  format_version: 2.6
  serialized_size: 47438984

## Convert 1e8 to lance

In [6]:
import duckdb
import lance
import pyarrow as pa
import pyarrow.dataset

In [4]:
path = f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.parquet"

In [5]:
%%time
parquet = pa.dataset.dataset(path, format="parquet")
lance.write_dataset(parquet, f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.lance")

CPU times: user 13.6 s, sys: 12.2 s, total: 25.8 s
Wall time: 17.7 s


<lance.dataset.LanceDataset at 0x12221cd00>

In [14]:
%%time
dataset = lance.dataset(f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.lance")
duckdb.query("select * from dataset where v3 = 75.21594").to_df()

CPU times: user 1.83 s, sys: 2.6 s, total: 4.43 s
Wall time: 4.92 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id002,id041,id0000451508,86,81,364984,3,9,75.21594


In [12]:
duckdb.query(
    "select v3, count(*) as count from dataset group by v3 having count = 1 limit 3"
)

┌───────────┬───────┐
│    v3     │ count │
│  double   │ int64 │
├───────────┼───────┤
│ 88.294451 │     1 │
│ 66.172531 │     1 │
│ 99.491438 │     1 │
└───────────┴───────┘

## Convert 1e9 to lance

In [9]:
path = f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.parquet"

In [10]:
%%time
parquet = pa.dataset.dataset(path, format="parquet")
lance.write_dataset(parquet, f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.lance")

CPU times: user 2min 18s, sys: 2min 4s, total: 4min 23s
Wall time: 3min 3s


<lance.dataset.LanceDataset at 0x12229f220>

In [13]:
%%time
dataset_1e9 = lance.dataset(f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.lance")
duckdb.query("select * from dataset_1e9 where v3 = 75.21594").to_df()

CPU times: user 18.3 s, sys: 26.5 s, total: 44.9 s
Wall time: 49 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


In [13]:
duckdb.query(
    "select v3, count(*) as count from dataset_1e9 group by v3 having count = 1 limit 3"
)

┌───────────┬───────┐
│    v3     │ count │
│  double   │ int64 │
├───────────┼───────┤
│  75.21594 │     1 │
│ 54.307981 │     1 │
│ 55.345451 │     1 │
└───────────┴───────┘

In [2]:
dataset = lance.dataset(f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.lance")

In [4]:
%%time
dataset.take([1550])

CPU times: user 2.29 ms, sys: 5.77 ms, total: 8.05 ms
Wall time: 13.2 ms


pyarrow.Table
id1: string
id2: string
id3: string
id4: int64
id5: int64
id6: int64
v1: int64
v2: int64
v3: double
----
id1: [["id042"]]
id2: [["id033"]]
id3: [["id0002281669"]]
id4: [[81]]
id5: [[91]]
id6: [[1160598]]
v1: [[1]]
v2: [[8]]
v3: [[76.588427]]

In [4]:
import numpy as np
import time

In [9]:
ii = np.random.choice(1, 1000)
t0 = time.time()
for i in ii:
    dataset.take([i])
t = time.time() - t0
t

0.396435022354126

In [None]:
ii = np.random.choice(1, 1000)
t0 = time.time()
for i in range(dataset.num_samples):
    dataset.take([i])
t = time.time() - t0
t

In [6]:
t0 = time.time()
for i in range(1_000_000):
    dataset.take([i])
t = time.time() - t0
t

380.55812191963196

## 1e9 DataFusion + Delta Lake

In [24]:
table_1e9 = DeltaTable(f"{pathlib.Path.home()}/data/delta/G1_1e9_1e2_0_0")

In [25]:
%%time
ctx.register_dataset("my_dataset_1e9", table.to_pyarrow_dataset())

CPU times: user 4.16 ms, sys: 3.43 ms, total: 7.58 ms
Wall time: 5.07 ms


In [26]:
%%time
res = ctx.sql("select * from my_dataset_1e9 where v3 = 75.21594")
print(res)

DataFrame()
+-------+-------+--------------+-----+-----+--------+----+----+----------+
| id1   | id2   | id3          | id4 | id5 | id6    | v1 | v2 | v3       |
+-------+-------+--------------+-----+-----+--------+----+----+----------+
| id002 | id041 | id0000451508 | 86  | 81  | 364984 | 3  | 9  | 75.21594 |
+-------+-------+--------------+-----+-----+--------+----+----+----------+
CPU times: user 9.47 s, sys: 1.93 s, total: 11.4 s
Wall time: 2.03 s


## MDS 1e8

In [1]:
import pathlib
from streaming.base.converters import dataframeToMDS
from streaming import MDSWriter, StreamingDataset
from pyspark.sql import SparkSession
import pyspark

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.executor.memory", "10G")
    .config("spark.driver.memory", "25G")
)

spark = builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/11 11:37:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark

In [6]:
parquet_1e8_path = f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.parquet"
pdf = spark.read.parquet(parquet_1e8_path)

                                                                                

In [4]:
out_path = f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.mds"

In [8]:
mds_kwargs = {'out': out_path}

In [9]:
%%time
dataframeToMDS(pdf, merge_index=True, mds_kwargs=mds_kwargs)

User's discretion required: columns arg is missing from mds_kwargs. Will be auto inferred
Auto inferred schema: {'id1': 'str', 'id2': 'str', 'id3': 'str', 'id4': 'int64', 'id5': 'int64', 'id6': 'int64', 'v1': 'int64', 'v2': 'int64', 'v3': 'float64'}

CPU times: user 68.8 ms, sys: 28.2 ms, total: 97 ms
Wall time: 3min 38s


                                                                                

('/Users/matthew.powers/data/G1_1e8_1e2_0_0.mds', 0)

In [5]:
%%time
dataset = StreamingDataset(local=f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.mds")

CPU times: user 129 ms, sys: 76.1 ms, total: 205 ms
Wall time: 205 ms


In [6]:
%%time
sample = dataset[1337]
sample

CPU times: user 1.18 ms, sys: 1.86 ms, total: 3.04 ms
Wall time: 1.76 ms


{'id1': 'id089',
 'id2': 'id080',
 'id3': 'id0000880705',
 'id4': 3,
 'id5': 49,
 'id6': 781230,
 'v1': 3,
 'v2': 8,
 'v3': 67.077797}

In [8]:
import numpy as np
import time

In [8]:
time.time()

1696972498.07495

In [9]:
ii = np.random.choice(1, 1000)
t0 = time.time()
for i in ii:
    dataset[i]
t = time.time() - t0

In [10]:
t

0.0715641975402832

## MDS 1e9

In [4]:
parquet_1e9_path = f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.parquet"
pdf = spark.read.parquet(parquet_1e9_path)

In [5]:
pdf.rdd.getNumPartitions()

229

In [12]:
pdf.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-----+-----+------------+---+---+-------+---+---+---------+
|  id1|  id2|         id3|id4|id5|    id6| v1| v2|       v3|
+-----+-----+------------+---+---+-------+---+---+---------+
|id016|id059|id0009584273| 31| 54|7579268|  5|  2|92.709317|
|id039|id028|id0008226858| 32| 73|1462759|  3| 14|23.308717|
|id047|id073|id0004357983| 52| 71| 354157|  2| 15|98.462728|
|id043|id069|id0006903604| 37| 35| 372382|  5|  5|32.566149|
|id054|id095|id0005719264| 94| 99|6957127|  5| 11| 97.89284|
|id029|id027|id0007119528| 11| 41|6768037|  2|  7|26.394021|
|id047|id053|id0003186028| 93| 64|3300443|  3| 14|79.319642|
|id091|id097|id0007718026| 22| 50|3609381|  5| 15|94.510853|
|id090|id033|id0007857423|  5| 65|3618630|  1|  5| 1.579951|
|id070|id062|id0001399833| 90| 99|6131090|  5| 14|24.892749|
|id039|id030|id0000654974| 22| 18|1298417|  1| 10|15.321252|
|id023|id095|id0005131426| 52| 38|7811474|  4| 12| 25.65414|
|id070|id013|id0009420524| 14| 33|7075062|  5| 14|43.468912|
|id022|id026|id000519192

                                                                                

In [13]:
out_path = f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.mds"

In [14]:
mds_kwargs = {'out': out_path}

In [15]:
%%time
dataframeToMDS(pdf, merge_index=True, mds_kwargs=mds_kwargs)

User's discretion required: columns arg is missing from mds_kwargs. Will be auto inferred
Auto inferred schema: {'id1': 'str', 'id2': 'str', 'id3': 'str', 'id4': 'int64', 'id5': 'int64', 'id6': 'int64', 'v1': 'int64', 'v2': 'int64', 'v3': 'float64'}

CPU times: user 493 ms, sys: 180 ms, total: 673 ms
Wall time: 28min 40s


                                                                                

('/Users/matthew.powers/data/G1_1e9_1e2_0_0.mds', 0)

In [9]:
%%time
dataset = StreamingDataset(local=f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.mds")

CPU times: user 1.27 s, sys: 785 ms, total: 2.06 s
Wall time: 2.08 s


In [17]:
%%time
sample = dataset[1337]
sample

CPU times: user 1.38 ms, sys: 2.16 ms, total: 3.55 ms
Wall time: 2.12 ms


{'id1': 'id089',
 'id2': 'id044',
 'id3': 'id0005002667',
 'id4': 35,
 'id5': 86,
 'id6': 4105338,
 'v1': 3,
 'v2': 8,
 'v3': 46.657833}

In [18]:
ii = np.random.choice(1, 1000)
t0 = time.time()
for i in ii:
    dataset[i]
t = time.time() - t0
t

0.08199691772460938

23/10/10 17:46:36 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /private/var/folders/19/_52w4zps3xjc6plz_f63j8sh0000gp/T/blockmgr-f9ff5e70-56c5-4b72-ad92-ec4912968530. Falling back to Java IO way
java.io.IOException: Failed to delete: /private/var/folders/19/_52w4zps3xjc6plz_f63j8sh0000gp/T/blockmgr-f9ff5e70-56c5-4b72-ad92-ec4912968530
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:177)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:113)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:94)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1231)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:364)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.Indexe

In [4]:
dataset.num_samples

1000000000

In [13]:
t0 = time.time()
for i in range(1_000_000):
    dataset[i]
t = time.time() - t0
t

53.09250783920288