# CSV vs Parquet vs Delta Lake

In [1]:
from deltalake import DeltaTable
import pandas as pd
from pathlib import Path

In [None]:
path = f"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0"

## CSV

In [4]:
%%time
(
    pd.read_csv(f"{Path.home()}/data/G1_1e9_1e2_0_0.csv")
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

KeyboardInterrupt: 

## CSV with usecols

## CSV => Parquet

In [6]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv

In [6]:
in_path = f"{Path.home()}/data/G1_1e8_1e2_0_0.csv"
out_path = f"{Path.home()}/data/G1_1e8_1e2_0_0.parquet"

In [7]:
# convert_options = pyarrow.csv.ConvertOptions()
# convert_options.column_types = {
#     'rate_code': pa.utf8(),
#     'store_and_fwd_flag': pa.utf8()
# }

writer = None
with pyarrow.csv.open_csv(in_path) as reader:
    for next_chunk in reader:
        if next_chunk is None:
            break
        if writer is None:
            writer = pq.ParquetWriter(out_path, next_chunk.schema)
        next_table = pa.Table.from_batches([next_chunk])
        writer.write_table(next_table)
writer.close()

In [3]:
%%time
(
    pd.read_parquet(f"{Path.home()}/data/G1_1e8_1e2_0_0.parquet", columns=["id1", "id2", "v1"])
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 7.58 s, sys: 1.09 s, total: 8.67 s
Wall time: 4.68 s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,29918
id002,30343
id003,30180
id004,30581
id005,30769
...,...
id096,30011
id097,29728
id098,30131
id099,30141


In [4]:
in_path = f"{Path.home()}/data/G1_1e9_1e2_0_0.csv"
out_path = f"{Path.home()}/data/G1_1e9_1e2_0_0.parquet"

In [7]:
writer = None
with pyarrow.csv.open_csv(in_path) as reader:
    for next_chunk in reader:
        if next_chunk is None:
            break
        if writer is None:
            writer = pq.ParquetWriter(out_path, next_chunk.schema)
        next_table = pa.Table.from_batches([next_chunk])
        writer.write_table(next_table)
writer.close()

In [8]:
%%time
(
    pd.read_parquet(f"{Path.home()}/data/G1_1e9_1e2_0_0.parquet", columns=["id1", "id2", "v1"])
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 1min 15s, sys: 31.5 s, total: 1min 47s
Wall time: 1min 27s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,301302
id002,299602
id003,300751
id004,300182
id005,298422
...,...
id096,299284
id097,300429
id098,301122
id099,298842


## Delta Lake

In [10]:
%%time
dt = DeltaTable(f"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0")
(
    dt.to_pandas(filters=[("id1", "==", "id016")], columns=["id1", "id2", "v1"])
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 1min 7s, sys: 3.33 s, total: 1min 10s
Wall time: 1min 6s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,5413334115411131444111211215135424141111135122...
id002,3513425325315321123254523434122334525144115542...
id003,3331513322443552124514455231251533425214413135...
id004,3554452514412353423231533425541345311225345542...
id005,2534353242344422134143544543222423415313131335...
...,...
id096,5135221131514421342325135152131515241145235435...
id097,1144141245311113545155444421332544134544542445...
id098,5342321411412142311452342514212524224215222534...
id099,5152244212344324112521514243524452543334311541...


### Switch to PySpark to compact and Z Order the data

In [9]:
import delta
from delta import configure_spark_with_delta_pip
import pyspark
from pathlib import Path

In [5]:
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.executor.memory", '10G')
    .config("spark.driver.memory", '25G')
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-eb5cf25a-4ac4-4b1d-a3a2-6a9c52822e2a;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 105ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default    

In [10]:
delta_table = delta.DeltaTable.forPath(spark, f"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0")

In [11]:
%%time
delta_table.optimize().executeCompaction()

23/06/14 12:25:38 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

CPU times: user 2.76 ms, sys: 2.53 ms, total: 5.29 ms
Wall time: 2.85 s


DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

In [12]:
%%time
delta_table.optimize().executeZOrderBy("id1")

                                                                                

CPU times: user 455 ms, sys: 155 ms, total: 611 ms
Wall time: 10min 48s


DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

In [14]:
delta_table.history().show()

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      2|2023-06-14 12:36:...|  null|    null| OPTIMIZE|{predicate -> [],...|null|    null|     null|          1|SnapshotIsolation|        false|{numRemovedFiles ...|        null|Apache-Spark/3.4....|
|      1|2023-05-29 09:10:...|  null|    null| OPTIMIZE|{predicate -> [],...|null|    null|     null|          0|SnapshotIsolation|        false|{numRemovedFiles ...|        null|Apache-Spark/3.4.

### Switch back to deltalake to query the Delta table

In [1]:
from deltalake import DeltaTable
import pandas as pd
from pathlib import Path

In [2]:
%%time
dt = DeltaTable(f"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0", version=1)
(
    dt.to_pandas(filters=[("id1", "==", "id016")], columns=["id1", "id2", "v1"])
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 1min 6s, sys: 1.11 s, total: 1min 7s
Wall time: 1min 5s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,5413334115411131444111211215135424141111135122...
id002,3513425325315321123254523434122334525144115542...
id003,3331513322443552124514455231251533425214413135...
id004,3554452514412353423231533425541345311225345542...
id005,2534353242344422134143544543222423415313131335...
...,...
id096,5135221131514421342325135152131515241145235435...
id097,1144141245311113545155444421332544134544542445...
id098,5342321411412142311452342514212524224215222534...
id099,5152244212344324112521514243524452543334311541...


In [3]:
%%time
dt = DeltaTable(f"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0", version=2)
(
    dt.to_pandas(filters=[("id1", "==", "id016")], columns=["id1", "id2", "v1"])
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 1min 6s, sys: 1.02 s, total: 1min 7s
Wall time: 1min 6s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,5413334115411131444111211215135424141111135122...
id002,3513425325315321123254523434122334525144115542...
id003,3331513322443552124514455231251533425214413135...
id004,3554452514412353423231533425541345311225345542...
id005,2534353242344422134143544543222423415313131335...
...,...
id096,5135221131514421342325135152131515241145235435...
id097,1144141245311113545155444421332544134544542445...
id098,5342321411412142311452342514212524224215222534...
id099,5152244212344324112521514243524452543334311541...


In [4]:
import pyarrow.dataset as ds

In [5]:
%%time

dt = DeltaTable(f"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0", version=2)
dataset = dt.to_pyarrow_dataset()
condition = ds.field("id1") == "id016"
(
    dataset.to_table(filter=condition, columns=["id1", "id2", "v1"])
    .to_pandas()
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 1min 9s, sys: 1.64 s, total: 1min 11s
Wall time: 1min 9s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,5413334115411131444111211215135424141111135122...
id002,3513425325315321123254523434122334525144115542...
id003,3331513322443552124514455231251533425214413135...
id004,3554452514412353423231533425541345311225345542...
id005,2534353242344422134143544543222423415313131335...
...,...
id096,5135221131514421342325135152131515241145235435...
id097,1144141245311113545155444421332544134544542445...
id098,5342321411412142311452342514212524224215222534...
id099,5152244212344324112521514243524452543334311541...


In [6]:
import levi

ModuleNotFoundError: No module named 'levi'