In [11]:
import pathlib

import duckdb
import lance
import pyarrow.dataset as ds
from datafusion import SessionContext, col, lit
from deltalake import DeltaTable
import pyarrow as pa

## 1e9 duckdb + Delta Lake

In [7]:
%%time
delta_1e9_v1 = DeltaTable(
    f"{pathlib.Path.home()}/data/deltalake/G1_1e9_1e2_0_0", version=1
).to_pyarrow_dataset()
duckdb.query("select * from delta_1e9_v1 where v3 = 75.21594").to_df()

CPU times: user 56 s, sys: 17 s, total: 1min 13s
Wall time: 24.2 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


In [8]:
%%time
delta_1e9_v1 = DeltaTable(
    f"{pathlib.Path.home()}/data/deltalake/G1_1e9_1e2_0_0", version=2
).to_pyarrow_dataset()
duckdb.query("select * from delta_1e9_v1 where v3 = 75.21594").to_df()

CPU times: user 55.9 s, sys: 17.4 s, total: 1min 13s
Wall time: 24.6 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


In [9]:
%%time
delta_1e9_v1 = DeltaTable(
    f"{pathlib.Path.home()}/data/deltalake/G1_1e9_1e2_0_0", version=3
).to_pyarrow_dataset()
duckdb.query("select * from delta_1e9_v1 where v3 = 75.21594").to_df()

CPU times: user 2.66 s, sys: 819 ms, total: 3.48 s
Wall time: 1.32 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


## 1e9 duckdb + Parquet

In [12]:
%%time
parquet_1e9_path = f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.parquet"
parquet_1e9 = pa.dataset.dataset(parquet_1e9_path, format="parquet")
duckdb.query("select * from parquet_1e9 where v3 = 75.21594").to_df()

CPU times: user 1min 30s, sys: 13.1 s, total: 1min 43s
Wall time: 17.9 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


In [14]:
import pyarrow.parquet as pq
parquet_file = pq.ParquetFile(parquet_1e9_path)
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x12967d1d0>
  created_by: parquet-cpp-arrow version 11.0.0
  num_columns: 9
  num_rows: 1000000000
  num_row_groups: 50467
  format_version: 2.6
  serialized_size: 47438984

## Convert 1e8 to lance

In [6]:
import duckdb
import lance
import pyarrow as pa
import pyarrow.dataset

In [4]:
path = f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.parquet"

In [5]:
%%time
parquet = pa.dataset.dataset(path, format="parquet")
lance.write_dataset(parquet, f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.lance")

CPU times: user 13.6 s, sys: 12.2 s, total: 25.8 s
Wall time: 17.7 s


<lance.dataset.LanceDataset at 0x12221cd00>

In [14]:
%%time
dataset = lance.dataset(f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.lance")
duckdb.query("select * from dataset where v3 = 75.21594").to_df()

CPU times: user 1.83 s, sys: 2.6 s, total: 4.43 s
Wall time: 4.92 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id002,id041,id0000451508,86,81,364984,3,9,75.21594


In [12]:
duckdb.query(
    "select v3, count(*) as count from dataset group by v3 having count = 1 limit 3"
)

┌───────────┬───────┐
│    v3     │ count │
│  double   │ int64 │
├───────────┼───────┤
│ 88.294451 │     1 │
│ 66.172531 │     1 │
│ 99.491438 │     1 │
└───────────┴───────┘

## Convert 1e9 to lance

In [9]:
path = f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.parquet"

In [10]:
%%time
parquet = pa.dataset.dataset(path, format="parquet")
lance.write_dataset(parquet, f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.lance")

CPU times: user 2min 18s, sys: 2min 4s, total: 4min 23s
Wall time: 3min 3s


<lance.dataset.LanceDataset at 0x12229f220>

In [13]:
%%time
dataset_1e9 = lance.dataset(f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.lance")
duckdb.query("select * from dataset_1e9 where v3 = 75.21594").to_df()

CPU times: user 18.3 s, sys: 26.5 s, total: 44.9 s
Wall time: 49 s


Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id043,id035,id0004875485,8,18,733428,4,4,75.21594


In [13]:
duckdb.query(
    "select v3, count(*) as count from dataset_1e9 group by v3 having count = 1 limit 3"
)

┌───────────┬───────┐
│    v3     │ count │
│  double   │ int64 │
├───────────┼───────┤
│  75.21594 │     1 │
│ 54.307981 │     1 │
│ 55.345451 │     1 │
└───────────┴───────┘

## 1e9 DataFusion + Delta Lake

In [24]:
table_1e9 = DeltaTable(f"{pathlib.Path.home()}/data/delta/G1_1e9_1e2_0_0")

In [25]:
%%time
ctx.register_dataset("my_dataset_1e9", table.to_pyarrow_dataset())

CPU times: user 4.16 ms, sys: 3.43 ms, total: 7.58 ms
Wall time: 5.07 ms


In [26]:
%%time
res = ctx.sql("select * from my_dataset_1e9 where v3 = 75.21594")
print(res)

DataFrame()
+-------+-------+--------------+-----+-----+--------+----+----+----------+
| id1   | id2   | id3          | id4 | id5 | id6    | v1 | v2 | v3       |
+-------+-------+--------------+-----+-----+--------+----+----+----------+
| id002 | id041 | id0000451508 | 86  | 81  | 364984 | 3  | 9  | 75.21594 |
+-------+-------+--------------+-----+-----+--------+----+----+----------+
CPU times: user 9.47 s, sys: 1.93 s, total: 11.4 s
Wall time: 2.03 s


## MDS 1e8

In [1]:
import pathlib
from streaming.base.converters import dataframeToMDS
from streaming import MDSWriter, StreamingDataset
from pyspark.sql import SparkSession

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/10 08:31:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/10 08:31:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
parquet_1e8_path = f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.parquet"
pdf = spark.read.parquet(parquet_1e8_path)

                                                                                

In [7]:
out_path = f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.mds"

In [8]:
mds_kwargs = {'out': out_path}

In [9]:
%%time
dataframeToMDS(pdf, merge_index=True, mds_kwargs=mds_kwargs)

User's discretion required: columns arg is missing from mds_kwargs. Will be auto inferred
Auto inferred schema: {'id1': 'str', 'id2': 'str', 'id3': 'str', 'id4': 'int64', 'id5': 'int64', 'id6': 'int64', 'v1': 'int64', 'v2': 'int64', 'v3': 'float64'}

CPU times: user 68.8 ms, sys: 28.2 ms, total: 97 ms
Wall time: 3min 38s


                                                                                

('/Users/matthew.powers/data/G1_1e8_1e2_0_0.mds', 0)

In [3]:
%%time
dataset = StreamingDataset(local=f"{pathlib.Path.home()}/data/G1_1e8_1e2_0_0.mds")

CPU times: user 136 ms, sys: 71 ms, total: 207 ms
Wall time: 224 ms


In [5]:
%%time
sample = dataset[1337]
sample

CPU times: user 707 µs, sys: 643 µs, total: 1.35 ms
Wall time: 1.02 ms


{'id1': 'id089',
 'id2': 'id080',
 'id3': 'id0000880705',
 'id4': 3,
 'id5': 49,
 'id6': 781230,
 'v1': 3,
 'v2': 8,
 'v3': 67.077797}

## MDS 1e9

In [11]:
parquet_1e9_path = f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.parquet"
pdf = spark.read.parquet(parquet_1e9_path)

In [12]:
out_path = f"{pathlib.Path.home()}/data/G1_1e9_1e2_0_0.mds"

In [13]:
mds_kwargs = {'out': out_path}

In [14]:
%%time
dataframeToMDS(pdf, merge_index=True, mds_kwargs=mds_kwargs)

User's discretion required: columns arg is missing from mds_kwargs. Will be auto inferred
Auto inferred schema: {'id1': 'str', 'id2': 'str', 'id3': 'str', 'id4': 'int64', 'id5': 'int64', 'id6': 'int64', 'v1': 'int64', 'v2': 'int64', 'v3': 'float64'}
23/10/10 00:49:55 ERROR Utils: Uncaught exception in thread stdout writer for python3
java.lang.OutOfMemoryError: GC overhead limit exceeded
	at shaded.parquet.org.apache.thrift.protocol.TCompactProtocol.readBinary(TCompactProtocol.java:708)
	at org.apache.parquet.format.InterningProtocol.readBinary(InterningProtocol.java:220)
	at shaded.parquet.org.apache.thrift.protocol.TProtocolUtil.skip(TProtocolUtil.java:102)
	at shaded.parquet.org.apache.thrift.protocol.TProtocolUtil.skip(TProtocolUtil.java:138)
	at shaded.parquet.org.apache.thrift.protocol.TProtocolUtil.skip(TProtocolUtil.java:112)
	at shaded.parquet.org.apache.thrift.protocol.TProtocolUtil.skip(TProtocolUtil.java:112)
	at shaded.parquet.org.apache.thrift.protocol.TProtocolUtil.skip(

KeyboardInterrupt: 