In [1]:
import dask_cudf
import cudf
import pandas as pd

In [2]:
# Directory that the cudf writer is pointed at later in the notebook
output_dir = 'ts.parquet'
# Single parquet file written by pandas and read back by cudf / dask_cudf
output_file = 'timestamp.parquet'

In [3]:
# Clean up artifacts from any previous run so the demo starts from a clean slate.
# -f keeps rm quiet when the paths do not exist yet (e.g. on a fresh checkout);
# the options come before the operands so this also works with non-GNU rm.
!rm -rf {output_dir}
!rm -f {output_file}

In [4]:
# Build a one-row pandas frame holding a single timestamp, indexed by 1.
# pd.Timestamp is used instead of pd.datetime, which was deprecated in pandas 1.0
# and removed in later releases.
df = pd.DataFrame({'timestamp': pd.Timestamp(2019, 10, 21)}, index=[1])
df

Unnamed: 0,timestamp
1,2019-10-21


In [5]:
# Use pandas to write the frame to a single parquet file (output_file)
df.to_parquet(output_file)

In [6]:
# Use cudf to read the parquet file back. This works fine; `df` now refers to the
# result of cudf.read_parquet rather than the pandas frame.
df = cudf.read_parquet(output_file)
df

Unnamed: 0,timestamp
1,2019-10-21


In [8]:
# Read the single parquet file through dask_cudf and materialize it with compute();
# the output below shows the same row, so the single-file case works here too.
df = dask_cudf.read_parquet(output_file)
df.compute()

Unnamed: 0,timestamp
1,2019-10-21


In [9]:
# Try to write a parquet file from the cudf dataframe. This raises an error, apparently because
# the cudf implementation tries to write a directory with a partition inside of it. I would
# expect this to write a single file called "timestamp.parquet" instead.
# NOTE(review): the path passed here is output_dir ('ts.parquet'), not output_file
# ('timestamp.parquet') -- confirm which one was intended.
df = cudf.read_parquet(output_file)
df.to_parquet(output_dir)

  "Using CPU via PyArrow to write Parquet dataset, this will "


In [9]:
# Create the target directory up front so that cudf has somewhere to write.
# -p makes the cell safe to re-run when the directory already exists.
!mkdir -p {output_dir}

In [10]:
# Now that the directory exists, saving to it appears to write the output correctly
df.to_parquet(output_dir)

In [11]:
# List the directory to see what to_parquet produced
!ls {output_dir}

434f443d352c449bb0835c52d8db24f4.parquet


In [12]:
# Write the same dataframe to that directory a second time
df.to_parquet(output_dir)

In [13]:
# List the directory again to compare against the previous listing
!ls {output_dir}

434f443d352c449bb0835c52d8db24f4.parquet
73dc745063994495bfe444ea1b1cdf0b.parquet


In [14]:
# It seems a new parquet file is written out on every call to to_parquet, which is somewhat
# unintuitive. Pull up the signature, docstring and source (`??`) to check whether this is
# documented; the to_parquet docs don't seem to mention it.
df.to_parquet??

[0;31mSignature:[0m [0mdf[0m[0;34m.[0m[0mto_parquet[0m[0;34m([0m[0mpath[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Write a DataFrame to the parquet format.

Parameters
----------
path : str
    File path or Root Directory path. Will be used as Root Directory path
    while writing a partitioned dataset.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
    Name of the compression to use. Use ``None`` for no compression.
index : bool, default None
    If ``True``, include the dataframe's index(es) in the file output. If
    ``False``, they will not be written to the file. If ``None``, the
    engine's default behavior will be used.
partition_cols : list, optional, default None
    Column names by which to partition the dataset
    Columns are partitioned in the order they are given

See Also
--------
cudf.io.parquet.read_parquet
cudf.io.orc.read_orc
[0;31mSource

# Read in with cudf

In [10]:
# Passing the bare directory path to cudf.read_parquet does not work:
# it fails with the FileNotFoundError shown below
df = cudf.read_parquet(output_dir)
df

FileNotFoundError: [Errno 2] No such file or directory: '/home/ericdill/dev/pydatanyc2019/RAPIDS/notebooks/ts.parquet'

In [16]:
# Double-check what output_dir is actually set to
output_dir

'ts.parquet'

In [17]:
# Add a trailing slash so the path is unambiguously a directory;
# this still fails with the same FileNotFoundError
cudf.read_parquet(f'{output_dir}/')

FileNotFoundError: [Errno 2] No such file or directory: '/home/ericdill/dev/pydatanyc2019/RAPIDS/notebooks/ts.parquet'

In [18]:
# Ah, that's the right syntax: cudf needs a glob pattern covering the files in that directory.
# NOTE(review): the output shows a single row although the directory listing shows two
# files -- confirm whether every matching file is actually read.
cudf.read_parquet(f'{output_dir}/*')

Unnamed: 0,timestamp
1,2019-10-21


In [19]:
# Reading the bare directory path with pandas works just fine, but the result
# now has two rows instead of one
pd.read_parquet(output_dir)

Unnamed: 0,timestamp
1,2019-10-21
1,2019-10-21


# Read in with dask_cudf

In [20]:
# dask_cudf accepts a bare directory path (no extra /* needed, unlike cudf), but the read
# fails with a TypeError: the message shows it cannot convert the datetime column
# to datetime64[ms]
dask_cudf.read_parquet(output_dir)

TypeError: cannot convert datetimelike to dtype [datetime64[ms]]

In [21]:
# Same TypeError when the directory path has a trailing slash
dask_cudf.read_parquet(f'{output_dir}/')

TypeError: cannot convert datetimelike to dtype [datetime64[ms]]

In [22]:
# Same TypeError when a glob pattern over the directory is used
dask_cudf.read_parquet(f'{output_dir}/*')

TypeError: cannot convert datetimelike to dtype [datetime64[ms]]

In [23]:
# NOTE(review): 'tmp.parquet' is not created anywhere in the visible notebook, so this cell
# depends on a file left over from elsewhere -- confirm which file was intended.
# The error below mentions datetime64[us] rather than the datetime64[ms] seen when
# reading output_dir.
df = dask_cudf.read_parquet('tmp.parquet')
df.dtypes

TypeError: cannot convert datetimelike to dtype [datetime64[us]]