In [1]:
import json
import os
import pickle
from pathlib import Path
from time import perf_counter, time
from typing import List

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Base name (without extension) shared by every benchmark output file.
FNAME: str = "test_data"
# File extensions of the serialisation formats under comparison.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
F_ENDINGS: List[str] = [
    "csv",
    "json",
    "parquet",
    "pickle",
]

In [None]:
# Fixed seed so every run of the notebook benchmarks exactly the same data.
SEED: int = 42
# Shape of the benchmark matrix. With the generator's default float64 output,
# 10000 x 10000 values occupy roughly 800 MB of memory.
N_ROWS: int = 10_000
N_COLS: int = 10_000

RNG = np.random.default_rng(SEED)
# np.ndarray is the array *type*; np.array is only the factory function.
DATA_NP: np.ndarray = RNG.random((N_ROWS, N_COLS))
DATA_NP

In [None]:
# Wrap the random matrix in a DataFrame for the pandas-based writers below;
# no index or column labels are passed, so pandas assigns its defaults.
DATA_DF: pd.DataFrame = pd.DataFrame(data=DATA_NP)
DATA_DF

In [None]:
# Convert the DataFrame to an Arrow table up front so the parquet benchmark
# times only the file write/read, not the pandas -> Arrow conversion.
DATA_PQ = pa.Table.from_pandas(df=DATA_DF)
DATA_PQ

In [None]:
# Benchmark: NumPy's binary format (np.save / np.load) through an open file handle.
f_path = Path(f"{FNAME}.bin")

try:
    # perf_counter() is the clock intended for measuring short durations;
    # wall-clock time() can jump if the system clock is adjusted.
    tic = perf_counter()
    with open(f_path, "wb") as f:
        np.save(f, DATA_NP)
    print(f"Write NPY: {perf_counter()-tic:.3f} s")

    tic = perf_counter()
    with open(f_path, "rb") as f:
        np.load(f)  # result is discarded; only the load time is of interest
    print(f"Read NPY: {perf_counter()-tic:.3f} s")

    # Size is computed in binary units (bytes / 1024**3).
    print("File Size is :", f"{os.path.getsize(f_path)/1024/1024/1024:.2f}", "GB")
finally:
    # Always remove the benchmark file, even when a step above raised, so a
    # failed run does not leave a very large file on disk.
    if f_path.exists():
        f_path.unlink()

In [None]:
# Benchmark: CSV via pandas (DataFrame.to_csv / pd.read_csv).
f_path = Path(f"{FNAME}.csv")

try:
    # perf_counter() is the clock intended for measuring short durations;
    # wall-clock time() can jump if the system clock is adjusted.
    tic = perf_counter()
    DATA_DF.to_csv(f_path)
    print(f"Write CSV: {perf_counter()-tic:.3f} s")

    tic = perf_counter()
    # to_csv writes the row index as the first column; index_col=0 reads it
    # back as the index instead of as a spurious extra data column, so the
    # loaded frame has the same shape as the one that was written.
    pd.read_csv(f_path, index_col=0)
    print(f"Read CSV: {perf_counter()-tic:.3f} s")

    # Size is computed in binary units (bytes / 1024**3).
    print("File Size is :", f"{os.path.getsize(f_path)/1024/1024/1024:.2f}", "GB")
finally:
    # Always remove the benchmark file, even when a step above raised, so a
    # failed run does not leave a very large file on disk.
    if f_path.exists():
        f_path.unlink()

In [None]:
# Benchmark: JSON, written by pandas (DataFrame.to_json) and read back with
# the standard-library json module (yields plain Python objects, not a DataFrame).
f_path = Path(f"{FNAME}.json")

try:
    # perf_counter() is the clock intended for measuring short durations;
    # wall-clock time() can jump if the system clock is adjusted.
    tic = perf_counter()
    DATA_DF.to_json(f_path)
    print(f"Write JSON: {perf_counter()-tic:.3f} s")

    tic = perf_counter()
    # Explicit encoding so the read does not depend on the platform's locale default.
    with open(f_path, encoding="utf-8") as f:
        json.load(f)  # result is discarded; only the load time is of interest
    print(f"Read JSON: {perf_counter()-tic:.3f} s")

    # Size is computed in binary units (bytes / 1024**3).
    print("File Size is :", f"{os.path.getsize(f_path)/1024/1024/1024:.2f}", "GB")
finally:
    # Always remove the benchmark file, even when a step above raised, so a
    # failed run does not leave a very large file on disk.
    if f_path.exists():
        f_path.unlink()

In [None]:
# Benchmark: pickle, written by pandas (DataFrame.to_pickle) and read back
# with the standard-library pickle module.
f_path = Path(f"{FNAME}.pickle")

try:
    # perf_counter() is the clock intended for measuring short durations;
    # wall-clock time() can jump if the system clock is adjusted.
    tic = perf_counter()
    DATA_DF.to_pickle(f_path)
    print(f"Write PICKLE: {perf_counter()-tic:.3f} s")

    tic = perf_counter()
    with open(f_path, "rb") as f:
        # Safe only because this cell has just written the file itself;
        # never pickle.load data from an untrusted source.
        pickle.load(f)
    print(f"Read PICKLE: {perf_counter()-tic:.3f} s")

    # Size is computed in binary units (bytes / 1024**3).
    print("File Size is :", f"{os.path.getsize(f_path)/1024/1024/1024:.2f}", "GB")
finally:
    # Always remove the benchmark file, even when a step above raised, so a
    # failed run does not leave a very large file on disk.
    if f_path.exists():
        f_path.unlink()

In [None]:
# Benchmark: parquet via pyarrow (pq.write_table / pq.read_table), using the
# Arrow table prepared earlier so only file I/O is timed.
f_path = Path(f"{FNAME}.parquet")

try:
    # perf_counter() is the clock intended for measuring short durations;
    # wall-clock time() can jump if the system clock is adjusted.
    tic = perf_counter()
    pq.write_table(DATA_PQ, f_path)
    print(f"Write parquet: {perf_counter()-tic:.3f} s")

    tic = perf_counter()
    pq.read_table(f_path)  # result is discarded; only the load time is of interest
    print(f"Read parquet: {perf_counter()-tic:.3f} s")

    # Size is computed in binary units (bytes / 1024**3).
    print("File Size is :", f"{os.path.getsize(f_path)/1024/1024/1024:.2f}", "GB")
finally:
    # Always remove the benchmark file, even when a step above raised, so a
    # failed run does not leave a very large file on disk.
    if f_path.exists():
        f_path.unlink()