Compare CSV, HDF5, and Castra speeds with artificial data
=============================

In this notebook we create a fake dataset, store it in CSV, HDF5, and Castra, and then compare the read and write performance of the three formats.

### Artificial dataset

In [1]:
import dask.dataframe as dd

# Build the synthetic time series used by every benchmark below:
# four float columns, two integer columns and one string column,
# with one row every 10 seconds between 2000 and 2015.
df = dd.demo.make_timeseries(
    '2000', '2015',
    {
        'A': float,
        'B': float,
        'C': float,
        'D': float,
        'volume': int,
        'id': int,
        'name': str,
    },
    freq='10s',
    partition_freq='3M',
)

# Preview the first few rows.
df.head()

Unnamed: 0,A,B,C,D,id,name,volume
2000-01-31 00:00:00,0.897725,-0.543282,-0.618194,-0.542592,1061,Victor,964
2000-01-31 00:00:10,-0.710055,-0.575592,-0.530583,-0.617038,1011,Ursula,973
2000-01-31 00:00:20,0.561353,-0.602298,-0.71285,0.331397,989,Sarah,996
2000-01-31 00:00:30,-0.320206,0.525118,0.936817,0.903456,1013,Wendy,974
2000-01-31 00:00:40,0.524895,-0.952178,0.464947,-0.430365,1057,Bob,965


### Set up ProgressBar

This lets us time operations and watch our progress.

In [2]:
from dask.diagnostics import ProgressBar
# Register a single progress bar for the whole session. The
# "[####...] | 100% Completed | <elapsed>" lines printed under the later
# cells come from it and serve as this notebook's timing measurements.
pbar = ProgressBar()
pbar.register()      # turn it on globally

In [3]:
# Uncomment the next line to stop the globally registered progress bar:
# pbar.unregister()  # turn it off

### Write speeds

In [4]:
!rm -f myfile.csv
!rm -f myfile.hdf5
!rm -rf myfile.castra
!rm -rf myfile-categories.castra
!ls

In [6]:
# Write benchmark: store the whole frame as CSV (elapsed time is reported by the progress bar).
df.to_csv('myfile.csv')

[########################################] | 100% Completed |  4min 15.5s


In [7]:
# Write benchmark: store the whole frame in an HDF5 file under the key '/data'.
df.to_hdf('myfile.hdf5', '/data')

[########################################] | 100% Completed | 47.5s


In [8]:
# Write benchmark: store the whole frame as a Castra store.
# The returned Castra object is echoed as the cell output.
df.to_castra('myfile.castra')

[########################################] | 100% Completed | 29.5s


<castra.core.Castra at 0x7fa4e0ee8610>

In [9]:
# On-disk size of the CSV output, in human-readable units.
!du -h myfile.csv

4.3G	myfile.csv


In [10]:
# On-disk size of the HDF5 output, in human-readable units.
!du -h myfile.hdf5

2.9G	myfile.hdf5


In [11]:
# On-disk size of the Castra output; -s prints one summarised total
# instead of one line per entry inside the store.
!du -hs myfile.castra

1.8G	myfile.castra


### Read speeds: full table read

In [12]:
# Open each stored copy as a dask dataframe; all read benchmarks below
# run against these three handles.
csv = dd.read_csv('myfile.csv')
hdf = dd.read_hdf('myfile.hdf5', '/data')
castra = dd.from_castra('myfile.castra')

In [13]:
# Full-table read benchmark: row count of the CSV-backed frame.
len(csv)

[########################################] | 100% Completed |  1min  5.1s


46543739

In [14]:
# Full-table read benchmark: row count of the HDF5-backed frame.
len(hdf)

[########################################] | 100% Completed | 15.1s


46543739

In [15]:
# Full-table read benchmark: row count of the Castra-backed frame.
len(castra)

[########################################] | 100% Completed | 45.8s


46543739

### Read speeds: single column and computation

In [16]:
# Single-column benchmark: mean of `volume`, read from the CSV copy.
csv.volume.mean().compute()

[########################################] | 100% Completed |  1min 12.5s


999.99482512567374

In [17]:
# Single-column benchmark: mean of `volume`, read from the HDF5 copy.
hdf.volume.mean().compute()

[########################################] | 100% Completed | 15.9s


999.99482512567374

In [18]:
# Single-column benchmark: mean of `volume`, read from the Castra copy.
castra.volume.mean().compute()

[########################################] | 100% Completed |  0.8s


999.99482512567374

### Groupby cost

In [19]:
# Groupby benchmark on the CSV copy: count `id` values per `name`,
# then show the first rows of the computed result.
csv.groupby('name').id.count().compute().head()

[########################################] | 100% Completed |  1min 12.2s


name
Alice      1791624
Bob        1790435
Charlie    1790024
Dan        1791553
Edith      1790820
Name: id, dtype: int64

In [20]:
# Groupby benchmark on the HDF5 copy: count `id` values per `name`,
# then show the first rows of the computed result.
hdf.groupby('name').id.count().compute().head()

[########################################] | 100% Completed | 19.8s


name
Alice      1791624
Bob        1790435
Charlie    1790024
Dan        1791553
Edith      1790820
Name: id, dtype: int64

In [21]:
# Groupby benchmark on the Castra copy: count `id` values per `name`,
# then show the first rows of the computed result.
castra.groupby('name').id.count().compute().head()

[########################################] | 100% Completed | 50.6s


name
Alice      1791624
Bob        1790435
Charlie    1790024
Dan        1791553
Edith      1790820
Name: id, dtype: int64