In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## csv, xls, hdf5, ncdf, json, pkl, pyarrow and parquet, gzip,  etc

csv is plain text format (not binary), data saved as plain text separated by commas

## (?) pandas .memory_usage() vs space in file on disk for different formats

## import pandas as pd
import time
from pathlib import Path

## COMMAND_START_TIME = None

def tic():
    global COMMAND_START_TIME
    COMMAND_START_TIME = time.time()

def toc(name='Execution time'):
    global COMMAND_START_TIME
    print(f'\n ---------- {name}:  {time.time() - COMMAND_START_TIME} seconds \n')
    COMMAND_START_TIME = None

In [4]:
from numpy.random import default_rng
rng = default_rng()

In [32]:
nb_rows = 1000000
data = pd.DataFrame(rng.standard_normal(size=(nb_rows, 10)))
data.memory_usage(0).sum()/1e6
data.index = pd.date_range('2010-01-01', periods=nb_rows, freq='5min')
data.columns = list('ABCDEFJHKL')
data

80.0

Unnamed: 0,A,B,C,D,E,F,J,H,K,L
2010-01-01 00:00:00,-1.003622,-0.049134,-0.326186,0.062108,-0.057368,-1.368083,-0.780119,0.914958,1.676413,1.831727
2010-01-01 00:05:00,-0.118157,1.130713,0.004049,1.269853,0.488006,-0.210061,0.555004,0.114502,0.084040,0.460183
2010-01-01 00:10:00,-1.048491,0.303878,-1.520755,1.217532,-1.851124,-0.155474,-0.438002,-0.092491,0.549864,-0.812130
2010-01-01 00:15:00,-1.323914,-0.759893,0.059542,-0.372727,2.328100,-1.248215,1.750482,-0.954200,-1.900064,0.817214
2010-01-01 00:20:00,1.187695,-0.041976,1.172901,-0.385393,-0.151192,0.148803,-0.492217,0.323543,0.682622,-0.013872
...,...,...,...,...,...,...,...,...,...,...
2019-07-05 04:55:00,0.953457,0.082534,-1.343335,0.410405,1.584895,0.128722,-0.381379,-0.553342,-0.545461,-0.900247
2019-07-05 05:00:00,2.117599,-0.219048,-0.346014,1.657786,-1.994065,0.633052,0.533565,0.027153,-0.167526,1.809655
2019-07-05 05:05:00,0.297714,-0.905382,0.521154,-1.162442,0.824354,1.444755,-0.591865,0.164547,1.696901,0.839050
2019-07-05 05:10:00,-1.156262,1.021879,1.277241,0.541207,-2.058688,2.813037,-1.353650,0.506960,-0.817149,0.321601


In [33]:
data_root = Path('/home/vlad/tmp')

In [34]:
tic()
data.to_csv(data_root/'data.csv')
toc('Save to csv')


 ---------- Save to csv:  10.68740701675415 seconds 



In [35]:
tic()
data.to_pickle(data_root/'data.pkl')
toc('Pickle time')


 ---------- Pickle time:  0.46996498107910156 seconds 



In [None]:
which format can save dtype, other metadata? provide custom access to a part of data (filtering before loading to disk)? (parquet)

which formats are more and less stable in terms of python/package versions (pickle depends on python exact version etc)

using buffers instead of files, any use cases?

In [36]:
tic()
data.to_hdf(data_root/'data.h5', key='data')
toc('HDF5 time')


 ---------- HDF5 time:  0.03911566734313965 seconds 



In [37]:
tic()
data.to_parquet(data_root/'data.parquet', engine='pyarrow')
toc('Parquet')


 ---------- Parquet:  0.3030660152435303 seconds 



In [38]:
import xarray as xr
xrvar = xr.DataArray(data)
tic()
xrvar.to_netcdf(data_root/'data.ncdf')
toc('xarray NetCDF')


 ---------- xarray NetCDF:  0.25287628173828125 seconds 



In [46]:
tic()
pd.read_parquet(data_root/'data.parquet', columns=list('ABF'), filters=[('index', '=', '2010')])
toc('parquet load')

ArrowInvalid: Field named 'index' not found or not unique in the schema.

## HDF 5 format

In [47]:
s = pd.HDFStore(data_root/'store.h5')

In [48]:
tic()
s['data'] = data
toc()

sqr_data = data**2
tic()
s['sqr'] = sqr_data
toc()


 ---------- Execution time:  0.04439902305603027 seconds 


 ---------- Execution time:  0.03567767143249512 seconds 



In [52]:
import numpy as np
data = data.astype(np.float32)

In [53]:
data

Unnamed: 0,A,B,C,D,E,F,J,H,K,L
2010-01-01 00:00:00,-1.003622,-0.049134,-0.326186,0.062108,-0.057368,-1.368083,-0.780119,0.914958,1.676413,1.831728
2010-01-01 00:05:00,-0.118157,1.130713,0.004049,1.269853,0.488006,-0.210061,0.555004,0.114502,0.084040,0.460183
2010-01-01 00:10:00,-1.048491,0.303878,-1.520755,1.217532,-1.851124,-0.155474,-0.438002,-0.092491,0.549864,-0.812130
2010-01-01 00:15:00,-1.323914,-0.759893,0.059542,-0.372727,2.328100,-1.248215,1.750482,-0.954199,-1.900064,0.817214
2010-01-01 00:20:00,1.187695,-0.041976,1.172901,-0.385393,-0.151192,0.148803,-0.492217,0.323543,0.682622,-0.013872
...,...,...,...,...,...,...,...,...,...,...
2019-07-05 04:55:00,0.953457,0.082534,-1.343335,0.410405,1.584895,0.128722,-0.381379,-0.553342,-0.545461,-0.900247
2019-07-05 05:00:00,2.117599,-0.219048,-0.346014,1.657786,-1.994065,0.633052,0.533565,0.027153,-0.167526,1.809655
2019-07-05 05:05:00,0.297714,-0.905382,0.521154,-1.162442,0.824354,1.444755,-0.591865,0.164547,1.696901,0.839050
2019-07-05 05:10:00,-1.156262,1.021879,1.277241,0.541207,-2.058688,2.813037,-1.353650,0.506960,-0.817149,0.321601


In [55]:
data.to_hdf(data_root/'data.h5', 'df')
df_read = pd.read_hdf(data_root/'data.h5', 'df')

In [61]:
s = pd.HDFStore(data_root/'data.h5')
s

<class 'pandas.io.pytables.HDFStore'>
File path: /home/vlad/tmp/data.h5

In [64]:
s

<class 'pandas.io.pytables.HDFStore'>
File path: /home/vlad/tmp/data.h5

pandas categorical variables (different in term of their storage)

what is partitioning? how to use it with spark/parquet?

which databases/storages support verion control (similar to different dated folders at GSA)

Is dask useful at all, should I make a notebook on it?


"I have some experience with Parquet, some experience with how not to use it. I have two massive problems with it.

I use it for a system where I only need a small slice of data at a time. Such as, amongst the last 10 years, only give me 2 days of data. This is a horrible usecase for parquet, this is what an index is for, the sort of thing you get with a database.

I use the parquet to mirror a database which is constantly mutated. Again, horrible situation. You can't edit a record in parquet, it is append only. If I were using it correctly I would have mechanisms to allow for adding new modified records that will take precedence over the original documents."


For SE even databases is super naive solution; read briefly about Kafka, etc, to understand the problematics

databases with revision control (DB tables and list of revisions with old value, new value and timestamp)