In [1]:
"""
This notebook explores the file storage formats used with dask (CSV, HDF5, and Parquet) and then compares
their performance.
"""


# Be sure to shut down other kernels running distributed clients before starting this one.
from dask.distributed import Client
client = Client()

In [2]:

from prep import accounts_csvs
# Generate the account CSV files that the rest of this notebook reads.
# NOTE(review): the meaning of the three arguments is defined in prep.py —
# presumably file count, rows per file, and number of distinct ids; confirm.
accounts_csvs(3, 1000000, 500)

Create CSV accounts for dataframe exercise


In [33]:
import os

# Glob pattern that matches every generated accounts CSV file.
csv_pattern = 'accounts.*.csv'
filename = os.path.join('../data', csv_pattern)
filename

'../data/accounts.*.csv'

In [34]:
import dask.dataframe as dd
# Read every file matching the glob pattern into one dask dataframe and preview its first rows.
df_csv = dd.read_csv(filename)
df_csv.head()

Unnamed: 0,id,names,amount
0,381,Alice,38
1,177,Zelda,3114
2,98,Yvonne,1466
3,289,Quinn,21
4,152,Charlie,-815


In [35]:

# Destination path of the HDF5 file that the CSV data is converted into.
hdf_name = 'accounts.h5'
target = os.path.join('../data', hdf_name)
target

'../data/accounts.h5'

In [36]:
# Convert the CSV-backed dataframe to HDF5 and time the write.
# The second argument is the key (node name) inside the HDF5 store, not a filesystem path.
# NOTE(review): '../data' is an unusual key ('/data' would be more conventional) — confirm
# before changing it, because the cell that reads the store back must use the same key.
%time df_csv.to_hdf(target, '../data')

CPU times: user 128 ms, sys: 24 ms, total: 152 ms
Wall time: 3.79 s


[None]

In [38]:
# Read the HDF5 data back using the same key it was written under, and preview the first rows.
df_hdf = dd.read_hdf(target, '../data')
df_hdf.head()

Unnamed: 0,id,names,amount
0,381,Alice,38
1,177,Zelda,3114
2,98,Yvonne,1466
3,289,Quinn,21
4,152,Charlie,-815


In [39]:
%time df_csv.amount.sum().compute()

CPU times: user 76 ms, sys: 8 ms, total: 84 ms
Wall time: 1.12 s


2699093237

In [40]:
%time df_hdf.amount.sum().compute()


CPU times: user 156 ms, sys: 8 ms, total: 164 ms
Wall time: 2.87 s


2699093237

In [41]:
# Convert the `names` column to a categorical, then store the result in the
# same HDF5 file under a second key ('../data2') and time the whole operation.
%time df_hdf.categorize(columns=['names']).to_hdf(target, '../data2')

CPU times: user 416 ms, sys: 48 ms, total: 464 ms
Wall time: 8.18 s


[None]

In [42]:

# Re-read from the categorized key; the first rows look the same as before.
# Note that this rebinds df_hdf, so later cells operate on the categorized data.
df_hdf = dd.read_hdf(target, '../data2')
df_hdf.head()

Unnamed: 0,id,names,amount
0,381,Alice,38
1,177,Zelda,3114
2,98,Yvonne,1466
3,289,Quinn,21
4,152,Charlie,-815


In [43]:
# But loads more quickly
%time df_hdf.amount.sum().compute()

CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 344 ms


2699093237

In [44]:
# Write the categorized data as a Parquet dataset.
# Note that `target` is rebound here: it now points at the Parquet directory, not the HDF5 file.
target = os.path.join('../data', 'accounts.parquet')
# NOTE(review): has_nulls=False presumably tells the writer that no column contains
# nulls — confirm against the Parquet engine's documentation.
df_csv.categorize(columns=['names']).to_parquet(target, has_nulls=False)

In [46]:
ls -l ../data/accounts.parquet/

total 49820
-rw-rw-r-- 1 tom tom      491 Sep 25 23:30 _common_metadata
-rw-rw-r-- 1 tom tom     1337 Sep 25 23:30 _metadata
-rw-rw-r-- 1 tom tom 17000945 Sep 25 23:30 part.0.parquet
-rw-rw-r-- 1 tom tom 17000945 Sep 25 23:30 part.1.parquet
-rw-rw-r-- 1 tom tom 17000945 Sep 25 23:30 part.2.parquet


In [47]:
df_p = dd.read_parquet(target)
# Note that the dtypes show how each column was stored: `names` comes back as a
# categorical column.
# NOTE(review): read_parquet can presumably be told whether or not to load it as
# a categorical — confirm against the dask documentation.
df_p.dtypes

id           int64
names     category
amount       int64
dtype: object

In [60]:
%time df_hdf.amount.sum().compute()

CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 429 ms


2699093237

In [69]:
%time print(df_csv.loc[100])

%time print(df_hdf.loc[100])

%time print(df_p.loc[100])

# was getting a weird error with this for some reason
df_new = df_p.set_index('id').to_parquet(target)
%time print(df_new.loc[100])

Dask DataFrame Structure:
                  id   names amount
npartitions=3                      
               int64  object  int64
                 ...     ...    ...
                 ...     ...    ...
                 ...     ...    ...
Dask Name: try_loc, 12 tasks
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 12.4 ms
Dask DataFrame Structure:
                  id            names amount
npartitions=3                               
               int64  category[known]  int64
                 ...              ...    ...
                 ...              ...    ...
                 ...              ...    ...
Dask Name: try_loc, 6 tasks
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 9.51 ms
Dask DataFrame Structure:
                  id              names amount
npartitions=3                                 
               int64  category[unknown]  int64
                 ...                ...    ...
                 ...                ...    ...
                

AssertionError: found 8311676 raw bytes (expected None)

In [67]:
# NOTE(review): "Unknown protocol s3" suggests that the optional s3fs package, which
# dask relies on for s3:// URLs, is not installed in this environment — confirm and
# install it before re-running this cell.
taxi = dd.read_csv('s3://nyc-tlc/trip data/yellow_tripdata_2015-*.csv')

NotImplementedError: Unknown protocol s3 (s3://nyc-tlc/trip data/yellow_tripdata_2015-*.csv)