# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [25]:
import cudf
import numpy as np
import pandas as pd

### Sample DataFrame

In [2]:
# Sample frame used throughout the notebook: ten rows, one column per kind of
# data (integer, float, datetime, timedelta, single character, categorical,
# short word, long free text). The 'datetime', 'timedelta', 'word' and 'string'
# columns each contain a None so that missing values appear in the examples.
df = cudf.DataFrame(
    [
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            # The embedded newline and the 17 spaces after it are part of the
            # sample text and are kept exactly as they were.
            'If your workflow is fast enough on a single GPU or your data comfortably fits in memory on \n'
            '                 a single GPU, you would want to use cuDF.',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            # Same as above: newline plus 17 spaces are part of the value.
            'If you want to distribute your workflow across multiple GPUs or have more data than you can fit \n'
            '                 in memory on a single GPU you would want to use Dask-cuDF',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
)
# Re-encode the 'category' column (values 'B', 'C', 'D') with the categorical dtype.
df['category'] = df['category'].astype('category')

---

# Object creation

---

In [14]:
# Build a one-column DataFrame from a flat list; `columns` supplies the column name.
cudf.DataFrame([1,2,3,4], columns=['ints'])

Unnamed: 0,ints
0,1
1,2
2,3
3,4


In [21]:
# Build a DataFrame from a dict of column name -> list of values.
# The None in 'strings' shows up as an empty cell in the output below.
cudf.DataFrame({'ints': [1,2,3,4], 'strings': ['a','b','c',None]})

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


In [22]:
# Start from an empty DataFrame and add the columns one at a time.
df_sample = cudf.DataFrame()
df_sample['ints'] = [1,2,3,4]
df_sample['strings'] = ['a','b','c',None]
# Bare last expression so the notebook renders the resulting frame.
df_sample

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


In [23]:
# Build a DataFrame from a list of row tuples; `columns` names the tuple
# positions in order.
cudf.DataFrame(
    [
        (1, 'a'),
        (2, 'b'),
        (3, 'c'),
        (4, None),
    ],
    columns=['ints', 'strings'],
)

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


## <span style="color:blue">DataFrame</span>

#### cudf.core.dataframe.DataFrame.as_gpu_matrix()

In [5]:
# as_gpu_matrix() only supports numeric columns at the moment, so select the
# numeric columns before converting.
df[['num', 'float']].as_gpu_matrix()

<numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7f9b45db2d50>

#### cudf.core.dataframe.DataFrame.as_matrix()

In [7]:
# as_matrix() only supports numeric columns, so select them before converting.
# The integer 'num' values appear as floats in the resulting array (see output).
df[['num', 'float']].as_matrix()

array([[39.  ,  6.88],
       [11.  ,  4.21],
       [31.  ,  4.71],
       [40.  ,  0.93],
       [33.  ,  9.26],
       [42.  ,  4.21],
       [36.  ,  3.01],
       [38.  ,  6.44],
       [17.  ,  5.28],
       [10.  ,  8.28]])

#### cudf.core.dataframe.DataFrame.from_arrow()

In [10]:
# Round trip: export the frame with to_arrow(), then rebuild a cuDF DataFrame
# from that Arrow object with from_arrow().
cudf.DataFrame.from_arrow(df.to_arrow())

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


#### cudf.core.dataframe.DataFrame.from_gpu_matrix()

In [12]:
# NOTE(review): DataFrame.from_gpu_matrix() appears to be deprecated, so this
# cell passes the device matrix straight to the DataFrame constructor instead.
# TODO: confirm the deprecation and consider removing this section.
# The resulting columns are labelled 0 and 1 (see output).
cudf.DataFrame(df[['num', 'float']].as_gpu_matrix())

Unnamed: 0,0,1
0,39.0,6.88
1,11.0,4.21
2,31.0,4.71
3,40.0,0.93
4,33.0,9.26
5,42.0,4.21
6,36.0,3.01
7,38.0,6.44
8,17.0,5.28
9,10.0,8.28


#### cudf.core.dataframe.DataFrame.from_pandas()

In [27]:
# Convert a pandas DataFrame into a cuDF DataFrame.
cudf.DataFrame.from_pandas(pd.DataFrame([1,2,3,4], columns=['ints']))

Unnamed: 0,ints
0,1
1,2
2,3
3,4


#### cudf.core.dataframe.DataFrame.from_records()

In [32]:
# Round trip through to_records(); the original index comes back as an extra
# 'index' column (see output).
cudf.DataFrame.from_records(df[['num', 'float']].to_records())

Unnamed: 0,index,num,float
0,0,39,6.88
1,1,11,4.21
2,2,31,4.71
3,3,40,0.93
4,4,33,9.26
5,5,42,4.21
6,6,36,3.01
7,7,38,6.44
8,8,17,5.28
9,9,10,8.28


#### cudf.core.dataframe.DataFrame.to_arrow()

In [33]:
# Export the frame as a pyarrow Table; the repr below lists the Arrow type of
# each column.
df.to_arrow()

pyarrow.Table
num: int64
float: double
datetime: timestamp[s]
timedelta: duration[s]
char: string
category: dictionary<values=string, indices=int8, ordered=0>
word: string
string: string

#### cudf.core.dataframe.DataFrame.to_csv()

In [36]:
# CSV export with default options; this file is read back in the read_csv()
# examples, where the saved index appears as an unnamed leading column.
df.to_csv('../results/df_with_index.csv')

In [37]:
# Write the data rows only: no index column (index=False) and no header line
# (header=False).
df.to_csv('../results/df_no_index_no_header.csv', index=False, header=False)

In [44]:
# Use a tab as the field separator instead of the default.
df.to_csv('../results/df_tab_sep.tsv', sep='\t')

In [46]:
# to_csv() is given an already-open text file object here instead of a path;
# the `with` block closes the file once the write finishes.
with open('../results/df_buffer.csv', 'w') as f:
    df.to_csv(f)

#### cudf.core.dataframe.DataFrame.to_dlpack()

In [51]:
# Export a one-column numeric DataFrame through DLPack; the result is a
# "dltensor" capsule object (see the repr below).
df[['num']].to_dlpack()

<capsule object "dltensor" at 0x7f9b45570c30>

#### cudf.core.dataframe.DataFrame.to_feather()

#### cudf.core.dataframe.DataFrame.to_gpu_matrix()

#### cudf.core.dataframe.DataFrame.to_hdf()

#### cudf.core.dataframe.DataFrame.to_json()

In [110]:
# JSON export with default options.
df.to_json('../results/df_default.json')

In [111]:
# Record-oriented, line-delimited JSON (orient='records' with lines=True).
df.to_json('../results/df_records.json', orient='records', lines=True)

In [56]:
# Write datetimes as ISO-formatted strings (date_format='iso').
df.to_json('../results/df_iso_dttm.json', date_format='iso')

#### cudf.core.dataframe.DataFrame.to_orc()

#### cudf.core.dataframe.DataFrame.to_pandas()

In [57]:
# Convert the cuDF DataFrame to a pandas DataFrame; missing datetime/timedelta
# values are shown as NaT in the output.
df.to_pandas()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
,,,,,,,,
0.0,39.0,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1.0,11.0,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2.0,31.0,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3.0,40.0,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4.0,33.0,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5.0,42.0,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6.0,36.0,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7.0,38.0,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8.0,17.0,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...


#### cudf.core.dataframe.DataFrame.to_parquet()

In [62]:
# Prepare a frame for the Parquet examples with 'category' stored as plain
# strings (presumably because the writer did not accept the categorical
# column - TODO confirm).
# Work on a copy: `df_parquet = df` would only bind a second name to the same
# object, so the assignment below would also change `df['category']` for every
# other cell that uses the shared sample frame.
df_parquet = df.copy()
df_parquet['category'] = df_parquet['category'].astype('str')

In [63]:
# Parquet export with default options.
df_parquet.to_parquet('../results/df_default.parquet')

In [67]:
# Partitioned export: split the data on the 'category' column and use
# 'cat_part' as the file name inside each partition.
df_parquet.to_parquet('../results/df_partitioned.parquet', partition_cols=['category'], partition_file_name='cat_part')

#### cudf.core.dataframe.DataFrame.to_records()

#### cudf.core.dataframe.DataFrame.to_string()

#### cudf.io.avro.read_avro()

#### cudf.io.csv.read_csv()

In [145]:
# Read the CSV back with default options; the saved index has no header name
# and is loaded as an 'Unnamed: 0' column (see output).
df_csv_read = cudf.read_csv('../results/df_with_index.csv')
df_csv_read.head()

Unnamed: 0.1,Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08T12:12:01Z,0 days 03:59:38.000000000,C,2,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,2,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,2,31,4.71,2020-10-10T09:26:43Z,0 days 03:35:09.000000000,U,2,memory,cuDF allows for loading
3,3,40,0.93,2020-10-11T17:10:00Z,0 days 02:54:26.000000000,P,0,tabular,If your workflow is fast enough on a single GP...
4,4,33,9.26,2020-10-15T10:58:02Z,0 days 09:52:38.000000000,O,2,parallel,If you want to distribute your workflow across...


In [146]:
# nrows=2 limits the read to the first two data rows.
df_csv_read = cudf.read_csv('../results/df_with_index.csv', nrows=2)
df_csv_read.head()

Unnamed: 0.1,Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08T12:12:01Z,0 days 03:59:38.000000000,C,2,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,2,cuDF,cuDF is a Python GPU DataFrame (built on the A...


In [75]:
# Skip the file's own header line (skiprows=1) and supply the column names
# explicitly, labelling the saved index column 'Index'.
df_csv_read = cudf.read_csv(
    '../results/df_with_index.csv',
    skiprows=1,
    names=[
        'Index', 'num', 'float', 'datetime', 'timedelta',
        'char', 'category', 'word', 'string',
    ],
)
df_csv_read.head()

Unnamed: 0,Index,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08T12:12:01Z,0 days 03:59:38.000000000,C,2,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,2,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,2,31,4.71,2020-10-10T09:26:43Z,0 days 03:35:09.000000000,U,2,memory,cuDF allows for loading
3,3,40,0.93,2020-10-11T17:10:00Z,0 days 02:54:26.000000000,P,0,tabular,If your workflow is fast enough on a single GP...
4,4,33,9.26,2020-10-15T10:58:02Z,0 days 09:52:38.000000000,O,2,parallel,If you want to distribute your workflow across...


In [144]:
# Tab-delimited input; usecols restricts the result to the 'num' and 'float'
# columns.
df_csv_read = cudf.read_csv('../results/df_tab_sep.tsv', delimiter='\t', usecols=['num', 'float'])
df_csv_read.head()

Unnamed: 0,num,float
0,39,6.88
1,11,4.21
2,31,4.71
3,40,0.93
4,33,9.26


#### cudf.io.dlpack.from_dlpack()

#### cudf.io.feather.read_feather()

#### cudf.io.hdf.read_hdf()

#### cudf.io.hdf.to_hdf()

#### cudf.io.json.read_json()

In [117]:
# Read the default-format JSON file; cuDF warns that it uses pandas on the CPU
# for this read (see the warning in the output).
df_json_read = cudf.read_json('../results/df_default.json')
# Convert the whole 'timedelta' column to a millisecond timedelta dtype.
# Do not call .head() on the right-hand side: assigning a 5-row result back
# would leave every remaining row of the column null.
df_json_read['timedelta'] = df_json_read['timedelta'].astype('timedelta64[ms]')
# Limit only what is displayed.
df_json_read.head()

  "Using CPU via Pandas to read JSON dataset, this may "


Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01.000000000,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43.000000000,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00.000000000,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02.000000000,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [114]:
# Line-delimited JSON read with the cuDF engine (engine='cudf'); the datetime
# and timedelta columns come back as raw numbers (see output).
df_json_read = cudf.read_json('../results/df_records.json', lines=True, engine='cudf')
df_json_read.head()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,1602159121000.0,14378000.0,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,1602322003000.0,12909000.0,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,1602436200000.0,10466000.0,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,1602759482000.0,35558000.0,O,D,parallel,If you want to distribute your workflow across...


#### cudf.io.orc.read_orc_metadata()

#### cudf.io.orc.read_orc()

#### cudf.io.orc.to_orc()

#### cudf.io.parquet.merge_parquet_filemetadata()

#### cudf.io.parquet.read_parquet_metadata()

#### cudf.io.parquet.read_parquet()

In [140]:
# Read the whole Parquet file back; note this rebinds `df_parquet` to the
# frame that was just read.
df_parquet = cudf.read_parquet('../results/df_default.parquet')
df_parquet.head()

Unnamed: 0,Unnamed: 1,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08 12:12:01.000,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,2,31,4.71,2020-10-10 09:26:43.000,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,3,40,0.93,2020-10-11 17:10:00.000,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,4,33,9.26,2020-10-15 10:58:02.000,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [141]:
# Read only the listed columns from the Parquet file.
df_parquet = cudf.read_parquet('../results/df_default.parquet', columns=['num', 'float'])
df_parquet.head()

Unnamed: 0,num,float
0,39,6.88
1,11,4.21
2,31,4.71
3,40,0.93
4,33,9.26


In [139]:
import os

parq_file = '../results/df_partitioned.parquet'

# Collect every file inside each sub-directory of the partitioned dataset.
# The list is built once and passed to read_parquet(); the earlier version
# also evaluated an identical comprehension and discarded its result.
partition_files = [
    os.path.join(parq_file, partition_dir, file_name)
    for partition_dir in os.listdir(parq_file)
    # Skip any stray plain file at the top level; os.listdir() would raise on it.
    if os.path.isdir(os.path.join(parq_file, partition_dir))
    for file_name in os.listdir(os.path.join(parq_file, partition_dir))
]

# read_parquet() accepts a list of paths and returns a single DataFrame.
df_parquet = cudf.read_parquet(partition_files)
df_parquet.head()

Unnamed: 0,num,float,datetime,timedelta,char,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,1970-01-01 00:00:00,0 days 00:00:00,A,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,memory,
3,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,parallel,If you want to distribute your workflow across...
4,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,,BlazingSQL is built on the RAPIDS GPU data sci...


#### cudf.io.parquet.write_to_dataset()

## <span style="color:blue">Series</span>

#### cudf.core.series.Series.from_arrow()

#### cudf.core.series.Series.from_categorical()

#### cudf.core.series.Series.from_masked_array()

#### cudf.core.series.Series.from_pandas()

In [154]:
# This calls the top-level cudf.from_pandas() on a pandas Series (not the
# Series.from_pandas classmethod named in the heading); the output below is a
# Series repr.
cudf.from_pandas(pd.Series([1,2,3,4]))

0    1
1    2
2    3
3    4
dtype: int64

#### cudf.core.series.Series.to_array()

#### cudf.core.series.Series.to_arrow()

#### cudf.core.series.Series.to_dlpack()

In [151]:
# Export a single column (a Series) through DLPack; the result is a
# "dltensor" capsule object (see the repr below).
df['num'].to_dlpack()

<capsule object "dltensor" at 0x7f9b44bc3510>

#### cudf.core.series.Series.to_frame()

#### cudf.core.series.Series.to_gpu_array()

#### cudf.core.series.Series.to_hdf()

#### cudf.core.series.Series.to_json()

In [155]:
# Write a single column to JSON; cuDF warns that it uses pandas on the CPU for
# this write (see the warning in the output).
df['num'].to_json('../results/series_num.json')

  "Using CPU via Pandas to write JSON dataset, this may "


#### cudf.core.series.Series.to_pandas()

In [171]:
# Convert a cuDF Series to a pandas Series; the name and dtype carry over
# (see output).
df['num'].to_pandas()


0    39
1    11
2    31
3    40
4    33
5    42
6    36
7    38
8    17
9    10
Name: num, dtype: int64

#### cudf.core.series.Series.to_string()

#### cudf.core.series.Series.values_host