In [1]:
######Libraries
import cudf
import pandas as pd
import os
import numpy as np
from numba import cuda
import torch
import os

In [2]:
#######Select GPU
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
print("pandas version:",pd.__version__)
print("cudf version:",cudf.__version__)
print("numpy version:",np.__version__)
print("cuda version:",torch.version.cuda)

pandas version: 1.0.1
cudf version: 0.8.0+0.g8fa7bd3.dirty
numpy version: 1.18.1
cuda version: 10.1


In [4]:
######Get handle of the current CUDA context to be able to compute the memory level stats
context=cuda.current_context()
cudf_mem_space=context.get_memory_info()

In [5]:
#'''Data-set is the spends at an online retail store.It has 8 columns the metadata is as follows
#1. Invoice         invoice number          string
#2. StockCode       Stock  code             string
#3. Description     item name               string
#4. Quantity        Quantity bought         int
#5. InvoiceDate     Date of the invoce      date-time(yyyy-mm-dd hh:mm:ss)
#6. Price           Unit Price              float
#7. Customer ID     ID of the customer      string
#8. Country         origin of the customer  string'''

In [6]:
'''Read Files
cdf_file is the cudf file i.e. on GPU
pd_file is the pandas variant i.e. on CPU
Note cudf doesn't support read/write from/to excel,pickle files'''
%time cdf_file=cudf.read_csv("online_retail.csv")
%time pd_file=pd.read_csv("online_retail.csv")

CPU times: user 162 ms, sys: 47.5 ms, total: 210 ms
Wall time: 211 ms
CPU times: user 273 ms, sys: 15.9 ms, total: 289 ms
Wall time: 288 ms


In [7]:
'''Memory consumption for both cudf and pandas'''
#context=cuda.current_context()
cudf_mem_space_post_load=context.get_memory_info()
print("Memory consumed by the CuDF:",(cudf_mem_space.free-cudf_mem_space_post_load.free)/1e9,"GB")
print("Memory consumed by the pandas frame:",(sum(pd_file.memory_usage()))/1e9,"GB")

Memory consumed by the CuDF: 0.216006656 GB
Memory consumed by the pandas frame: 0.033629632 GB


In [8]:
'''Sub-setting Data-1
Using loc'''
cdf_file_subset=cdf_file.loc[1:1000]
pd_file_subset=pd_file.loc[1:1000]

In [9]:
'''Sub-setting Data-2
Using iloc'''
cdf_file_subset=cdf_file.iloc[1:5]
pd_file_subset=pd_file.iloc[1:5]

In [28]:
'''Querying data'''
cdf_file_query=cdf_file[cdf_file.Price>10]
print(cdf_file_query.shape)
pd_file_query=pd_file[pd_file.Price>10]
print(pd_file_query.shape)

(26673, 9)
(26673, 9)


In [11]:
'''Frequency counts
1. Using value_counts
2. Using group by as a substitute for value_count method

CuDF doesn't support value_counts on a string column.
Similarly other standard methods available in pandas for string data-types are unsupported unless we make us of nvstrings package
CuDF Documentation:https://rapidsai.github.io/projects/cudf/en/latest/api.html
'''
%time pd_count = pd_file['Country'].value_counts()
%time pd_count1 = pd_file.groupby(['Country'])['StockCode'].count() 
#%time cudf_count = cdf_file['Country'].value_counts()
%time cudf_count = cdf_file.groupby(['Country'])['StockCode'].count() 

CPU times: user 32.2 ms, sys: 42 µs, total: 32.2 ms
Wall time: 31.7 ms
CPU times: user 30.9 ms, sys: 0 ns, total: 30.9 ms
Wall time: 30.4 ms
CPU times: user 14.9 ms, sys: 0 ns, total: 14.9 ms
Wall time: 14.8 ms


In [12]:
del pd_count,pd_count1

In [13]:
'''Sorting data'''
cdf_file=cdf_file.sort_values(by=['Invoice','StockCode'],ascending=True)
pd_file=pd_file.sort_values(by=['Invoice','StockCode'],ascending=True)

In [14]:
'''Extending frames'''
cdf_file1=cudf.concat([cdf_file,cdf_file],ignore_index=True)
pd_file1=pd.concat([pd_file,pd_file],ignore_index=True)

In [15]:
'''Merging frames'''
cdf_file1=cdf_file.merge(cdf_file[['Invoice','StockCode','Quantity']].rename(columns={'Quantity':'Qty_y','Invoice':'inv','StockCode':"stk"}),left_on=['Invoice','StockCode'],right_on=['inv','stk'],how="inner")
pd_file1=pd_file.merge(pd_file[['Invoice','StockCode','Quantity']].rename(columns={'Quantity':'Qty_y','Invoice':'inv','StockCode':"stk"}),left_on=['Invoice','StockCode'],right_on=['inv','stk'],how="inner")

In [16]:
del cdf_file1,pd_file1

In [17]:
'''Computation of invoice,item level net price
Approach-1:Vectorized approach
'''
%time cdf_file['Net_Price']=cdf_file.Quantity*cdf_file.Price
%time pd_file['Net_Price']=pd_file.Quantity*pd_file.Price

CPU times: user 74.8 ms, sys: 0 ns, total: 74.8 ms
Wall time: 74.6 ms
CPU times: user 13.6 ms, sys: 1.03 ms, total: 14.6 ms
Wall time: 11.2 ms


In [18]:
'''Computation of invoice,item level net price
Approach-2:Row-wise
apply chunk:incols i.e. columns required as input,outcols i.e. output generated post processing.
Note incols and outcols to be of int/float/datetime arrays.chunks is the number of rows to be allotted to each block and tpb is threads per block.
CUDA works on the principle of threads and not cores.The looping construct is automatically unrolled to the parallel 
variant by the compiler.
'''
def set_net_item_price_cudf(Quantity, Price, out):
    for i, (x, y) in enumerate(zip(Quantity,Price)):
        out[i] = x * y
def set_net_item_price_pd(Quantity, Price):
    return(Quantity*Price)
%time outdf_cudf=cdf_file.apply_chunks(set_net_item_price_cudf,incols=['Quantity', 'Price'],outcols=dict(out=np.float64),kwargs=dict(),chunks=16,tpb=10)
%time outdf_pandas=pd_file.apply(lambda x: set_net_item_price_pd(Quantity=x['Quantity'],Price=x['Price']),axis=1)

CPU times: user 150 ms, sys: 76 µs, total: 150 ms
Wall time: 152 ms
CPU times: user 8.06 s, sys: 40.1 ms, total: 8.1 s
Wall time: 8.1 s


In [19]:
del outdf_cudf,outdf_pandas

In [20]:
'''Describe doesn't work with cudf if string columns are present'''
print(cdf_file[['Price','Quantity','Net_Price']].describe())
print(pd_file.describe())

   stats       Price   Quantity   Net_Price
0  count    525461.0   525461.0    525461.0
1   mean    4.688834  10.337667   18.154506
2    std  146.126914  107.42411  160.333083
3    min   -53594.36    -9600.0   -53594.36
4    25%        1.25        1.0        3.75
5    50%         2.1        3.0        9.95
6    75%        4.21       10.0        17.7
7    max    25111.09    19152.0    25111.09
            Quantity          Price    Customer ID      Net_Price
count  525461.000000  525461.000000  417534.000000  525461.000000
mean       10.337667       4.688834   15360.645478      18.154506
std       107.424110     146.126914    1680.811316     160.333083
min     -9600.000000  -53594.360000   12346.000000  -53594.360000
25%         1.000000       1.250000   13983.000000       3.750000
50%         3.000000       2.100000   15311.000000       9.950000
75%        10.000000       4.210000   16799.000000      17.700000
max     19152.000000   25111.090000   18287.000000   25111.090000


In [21]:
'''Group-By on frames'''
cdf_group_by=cdf_file.groupby('Country',as_index=False).agg({'Price':['sum','min','max'],'Quantity' : ['sum', 'max','min'],'Net_Price':['sum','max','min']})
print(cdf_group_by.head())
pd_group_by=pd_file.groupby('Country').agg({'Price' : ['sum', 'max','min'], 'Quantity' : ['sum', 'max','min'],'Net_Price':['sum','max','min']})
pd_group_by.unstack(level=0)

     Country           sum_Price            min_Price           max_Price  sum_Quantity  max_Quantity  min_Quantity ...        min_Net_Price
22  Australia  4056.3199999999956  0.29000000000000004              662.25         20053           480           -24 ...              -662.25
2    Austria  2482.8000000000015  0.12000000000000001               130.0          6479           120           -36 ...               -130.0
12    Bahrain  352.91999999999985  0.42000000000000004  14.950000000000001          1015            96           -10 ...                -42.5
26    Belgium    7226.74999999997                  0.0  1508.6499999999999         11980           120           -30 ...  -1508.6499999999999
11    Bermuda                84.7  0.21000000000000002               12.75          2798          1152             2 ...   10.200000000000001
[2 more columns]


                Country             
Price      sum  Australia                4056.32
                Austria                  2482.80
                Bahrain                   352.92
                Belgium                  7226.75
                Bermuda                    84.70
                                          ...   
Net_Price  min  USA                       -25.50
                United Arab Emirates     -503.90
                United Kingdom         -53594.36
                Unspecified             -1189.94
                West Indies                 0.65
Length: 360, dtype: float64

In [22]:
del cdf_group_by,pd_group_by

In [23]:
'''CuDF doesn't support categories natively.
Categories are inherently converted to string while representing'''
pd_file2=pd_file.copy()
pd_file2['Country_Cat']=pd_file2.Country
pd_file2['Country_Cat']=pd_file2.Country_Cat.astype("category")
cdf_file2=cudf.DataFrame.from_pandas(pd_file2.copy())
type(cdf_file2.Country_Cat[1])

str

In [24]:
del pd_file2,cdf_file2

In [25]:
'''String Functionality
CuDF supports only nvstrings for string based maipulations.
The functionality for strings is quite similar to re based maipulation.
Simple regex block
Detailed Documentation:https://rapids.readthedocs.io/projects/nvstrings/en/latest/api.html
'''
%time string_filter_cudf=cdf_file[cdf_file.Country.str.lower().str.contains('^un',regex=True)]
%time string_filter_pandas=pd_file[pd_file.Country.str.lower().str.contains('^un',regex=True)]

CPU times: user 18.8 ms, sys: 8.24 ms, total: 27.1 ms
Wall time: 26.5 ms
CPU times: user 328 ms, sys: 4.01 ms, total: 332 ms
Wall time: 332 ms


In [26]:
'''String Functionality
CuDF supports only nvstrings for string based maipulations.The functionality for strings is quite similar to re
based maipulation.
Relatively complex regex block with the same search base
Major Performance boost only when the search space/computation space is the bottleneck
'''
%time string_filter_cudf1=cdf_file[cdf_file.Country.str.lower().str.contains('^un|and[a-z]+$',regex=True)]
%time string_filter_pandas1=pd_file[pd_file.Country.str.lower().str.contains('^un|and[a-z]+$',regex=True)]

CPU times: user 26.3 ms, sys: 7.98 ms, total: 34.3 ms
Wall time: 33.7 ms
CPU times: user 326 ms, sys: 7.93 ms, total: 334 ms
Wall time: 334 ms
