In [1]:
######Libraries
import cudf
import pandas as pd
import os
import numpy as np
from numba import cuda

In [2]:
# Alternative way of enabling a pooled allocator, left disabled:
#cudf.set_allocator(pool=True)
# Ask RMM (the allocator behind cuDF) for a pooled allocator on managed memory.
# NOTE(review): cudf is already imported in the first cell, before these flags
# are set -- confirm they take effect without re-initialising RMM.
from librmm_cffi import librmm_config as rmm_cfg
rmm_cfg.use_pool_allocator = True # default is False
rmm_cfg.use_managed_memory = True # default is False

# Route CuPy allocations through a memory pool built on cp.cuda.malloc_managed.
import cupy as cp
pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
cp.cuda.set_allocator(pool.malloc)

In [3]:
'''Read Files
cdf_file is the cudf file i.e. on GPU
pd_file is the pandas variant i.e. on CPU
Note cudf doesn't support read/write from/to excel,pickle files'''
# Load the same CSV twice and time each load with %time:
# once with cuDF (cdf_file) and once with pandas (pd_file).
%time cdf_file=cudf.read_csv("online_retail.csv")
%time pd_file=pd.read_csv("online_retail.csv")

CPU times: user 198 ms, sys: 173 ms, total: 371 ms
Wall time: 407 ms
CPU times: user 272 ms, sys: 23.7 ms, total: 296 ms
Wall time: 295 ms


In [4]:
# The data set records the spend at an online retail store. It has 8 columns; the metadata is as follows:
#1. Invoice         invoice number          string
#2. StockCode       Stock  code             string
#3. Description     item name               string
#4. Quantity        Quantity bought         int
#5. InvoiceDate     Date of the invoice     date-time(yyyy-mm-dd hh:mm:ss)
#6. Price           Unit Price              float
#7. Customer ID     ID of the customer      string
#8. Country         origin of the customer  string

In [5]:
'''Group-by 2 variants
1. Using value_counts
2. Using group by as a substitute for the value_counts method

CuDF doesn't support value_counts on a string column.
Similarly, other standard methods available in pandas for string data types are
unsupported unless we make use of the nvstrings package.
CuDF Documentation:https://rapidsai.github.io/projects/cudf/en/latest/api.html
'''
# pandas, variant 1: occurrences of each Country via value_counts.
%time pd_count = pd_file['Country'].value_counts()
# pandas, variant 2: group by Country and count the StockCode entries in each group
# (count() skips missing StockCode values, so this matches value_counts only when
# StockCode has no nulls).
%time pd_count1 = pd_file.groupby(['Country'])['StockCode'].count() 
# cuDF variant 1 is left disabled: per the cell notes, value_counts is not
# supported on a string column.
#%time cudf_count = cdf_file['Country'].value_counts()
# cuDF, variant 2: the same groupby/count as the pandas line above.
%time cudf_count = cdf_file.groupby(['Country'])['StockCode'].count() 

CPU times: user 35.7 ms, sys: 0 ns, total: 35.7 ms
Wall time: 35.6 ms
CPU times: user 30.8 ms, sys: 0 ns, total: 30.8 ms
Wall time: 30.6 ms
CPU times: user 247 ms, sys: 3.42 ms, total: 251 ms
Wall time: 251 ms


In [6]:
'''Computation of invoice,item level net price
Approach-1:Vectorized approach
'''
# Multiply the Quantity and Price columns element-wise and store the result as a
# new Net_Price column on each frame (adds the column to cdf_file / pd_file in place).
%time cdf_file['Net_Price']=cdf_file.Quantity*cdf_file.Price
%time pd_file['Net_Price']=pd_file.Quantity*pd_file.Price

CPU times: user 1.12 ms, sys: 167 µs, total: 1.29 ms
Wall time: 834 µs
CPU times: user 5.13 ms, sys: 4.24 ms, total: 9.37 ms
Wall time: 4.24 ms


In [7]:
'''Computation of invoice,item level net price
Approach-2:Row-wise
apply_chunks: incols are the columns passed to the kernel as inputs; outcols are the output columns it fills in.
Note: incols and outcols must be int/float/datetime arrays. chunks is the number of rows per chunk and tpb is the number of threads per block.
CUDA schedules work as threads rather than cores. apply_chunks launches the kernel once per chunk with tpb threads,
and, unlike apply_rows, it does not split the loop across those threads automatically: the kernel has to stride over
cuda.threadIdx.x / cuda.blockDim.x itself, otherwise every thread repeats the whole chunk.
'''
def set_net_item_price_cudf(Quantity, Price, out):
    """Kernel for ``DataFrame.apply_chunks``: write ``Quantity * Price`` into ``out``.

    Parameters
    ----------
    Quantity, Price : array-like
        The slices of the input columns for the chunk being processed. The
        parameter names must match the ``incols`` passed to ``apply_chunks``.
    out : array-like
        Output slice for the same chunk; filled in place. Nothing is returned.
    """
    # apply_chunks runs this body in every thread of the block, so the threads
    # have to share the chunk themselves: each one starts at its own thread
    # index and advances by the block width. Looping over the whole chunk in
    # every thread would repeat the work tpb times.
    for i in range(cuda.threadIdx.x, Quantity.size, cuda.blockDim.x):
        out[i] = Quantity[i] * Price[i]
def set_net_item_price_pd(Quantity, Price):
    """Return the net price of one line item: quantity times unit price."""
    net_price = Quantity * Price
    return net_price
# cuDF: run the kernel over cdf_file in chunks of 16 rows with 4 threads per block;
# the Quantity and Price columns are the inputs and a float64 column named "out" is produced.
%time outdf_cudf=cdf_file.apply_chunks(set_net_item_price_cudf,incols=['Quantity', 'Price'],outcols=dict(out=np.float64),kwargs=dict(),chunks=16,tpb=4)
# pandas baseline: apply(axis=1) calls set_net_item_price_pd once per row.
%time outdf_pandas=pd_file.apply(lambda x: set_net_item_price_pd(Quantity=x['Quantity'],Price=x['Price']),axis=1)

CPU times: user 267 ms, sys: 35 µs, total: 267 ms
Wall time: 266 ms
CPU times: user 8.4 s, sys: 30.4 ms, total: 8.43 s
Wall time: 8.43 s


In [8]:
'''Get Handle of current context and compute memory consumption before filter'''
# Handle to the current CUDA context; reused by the next cell for the post-filter reading.
ctx=cuda.current_context()
# get_memory_info() exposes the free and total memory figures read below.
mem=ctx.get_memory_info()
# Memory in use before the filter (total - free), scaled by 1e9 to report GB.
pre_cudf_mem_consumed=(mem.total-mem.free)/1e9

In [9]:
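'''String Functionality and memory usage
CuDF supports only nvstrings for string-based manipulations. The functionality for strings is quite similar to
regex-based manipulation with Python's re module.
Detailed Documentation:https://rapids.readthedocs.io/projects/nvstrings/en/latest/api.html
In this run the memory consumption reported on the GPU is larger than the pandas figure. Note that the two numbers
are measured differently (change in used device memory vs. DataFrame.memory_usage), so treat the comparison as indicative only.
'''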
%time string_filter_cudf=cdf_file[cdf_file.Country.str.lower().str.contains('^un',regex=True)]
mem=ctx.get_memory_info()
post_cudf_mem_consumed=(mem.total-mem.free)/1e9
print('Memory consumption of the new cudf frame post filter:',(post_cudf_mem_consumed-pre_cudf_mem_consumed),"GB")
%time string_filter_pandas=pd_file[pd_file.Country.str.lower().str.contains('^un',regex=True)]
print('Memory consumption of the new pandas frame post filter:',sum(string_filter_pandas.memory_usage()/1e9),"GB")


CPU times: user 322 ms, sys: 32 ms, total: 354 ms
Wall time: 354 ms
Memory consumption of the new cudf frame post filter: 0.10905190399999998 GB
CPU times: user 339 ms, sys: 7.95 ms, total: 346 ms
Wall time: 346 ms
Memory consumption of the new pandas frame post filter: 0.03892752 GB
