In [None]:
# default_exp rapids

In [1]:
# export
# Setup in notebook flag
# Fall back to "not in a notebook" only when nbdev (or the flag itself) cannot be
# imported.  A bare `except:` would also swallow unrelated failures such as
# KeyboardInterrupt; ImportError covers ModuleNotFoundError as well.
try:
    from nbdev.imports import IN_NOTEBOOK
except ImportError:
    IN_NOTEBOOK = False


# Introduction to Rapids

<img src="../nb_images/rapids.png" alt="Drawing" style="width: 600px;"/>


Rapids is a data preparation and machine learning library that is designed to take maximum advantage of the Nvidia GPU.  The libraries are called cuDF and cuML and borrow much of their design and API semantics from the Pandas and Sklearn Python libraries.   Speedups of over 10x are not uncommon for a lot of everyday tasks.

If you are familiar with Pandas and Sklearn, the code in this lab will look familiar, but if not, that's OK too.  Rapids is still under development, so it is not as full-featured as the Pandas and Sklearn libraries, but it is continually gaining new functions.  

The following lab will walk you through how to use Rapids with a sample dataset.  **This lab will focus on the performance capabilities of RAPIDS by comparing it to Pandas and Sklearn equivalent operations.** It is not meant to be a machine learning tutorial. 


## A word on performance comparisons of RAPIDS vs Pandas

Pandas and Numpy are two of the most popular libraries for both data engineers and data scientists.  The libraries are very robust and performant, but one major drawback is that they are single-threaded libraries.  When comparing RAPIDS vs Pandas/Numpy you are seeing the benefit of parallelizing these types of tasks over potentially thousands of separate threads.  

## CuDF basics

Built based on the Apache Arrow columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and manipulating data.

cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming.

Definitions :
* GPU Dataframe : a dataframe from the RAPIDS cuDF library running on the GPU
* Apache Arrow  : common columnar in memory data format project
* Pandas        : data preparation and engineering library


### Helper functions

Execute the functions below, they are needed for follow-on parts of the lab.  Note the **pgdf** function is a convenience function to display the GPU dataframe in a nice format for jupyter notebook.

In [2]:
# export
# Imports
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
import time
import timeit

from datetime import datetime
import math

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import glob
import os
import sys
sys.path.append('../utils/') 

#dask
import dask
from dask import dataframe as dd

# Rapids
import cudf
from cudf import DataFrame as RapidsDataFrame
# cudf                      0.11.0          cuda10.2_py37_673.g45906b8    https://public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda-early-access
# libcudf                   0.11.0          cuda10.2_657.g7f5e265    https://public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda-early-access
import torch


In [3]:
# Set CUDA_VISIBLE_DEVICES to "0" for this process, report whether torch can see
# CUDA, then dump the nvidia-smi status table (IPython shell escape).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print('CUDA available: {}  Using device {}'.format(torch.cuda.is_available(), os.environ['CUDA_VISIBLE_DEVICES']))
!nvidia-smi

CUDA available: True  Using device 0
Thu Sep 24 15:59:31 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.116.00   Driver Version: 418.116.00   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000035:03:00.0 Off |                    0 |
| N/A   34C    P0    35W / 300W |     10MiB / 16130MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000035:04:00.0 Off |                    0 |
| N/A   39C    P0    36W / 300W |     10MiB / 16130MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
                                                   

In [4]:
# export
# [print gpu dataframe] helper function to print GPU dataframes 
def pgdf(gdf) :
    """Display a GPU dataframe by converting it with ``to_pandas()`` first.

    gdf : any object exposing ``to_pandas()`` (here: a cuDF dataframe).
    """
    host_frame = gdf.to_pandas()
    display(host_frame)

In [5]:
# export
def time_command(cmd,repeat=1) :
    """Return the mean number of seconds one execution of ``cmd`` takes.

    cmd    : callable (or statement string) accepted by ``timeit.timeit``.
    repeat : how many times ``cmd`` is executed; the total elapsed time
             reported by ``timeit`` is divided by this count.
    """
    total_seconds = timeit.timeit(cmd, number=repeat)
    seconds_per_run = total_seconds / repeat
    return float(seconds_per_run)

In [6]:
# export
# Dictionary to store results ..
# example "describe" : {"gpu" : []}
# TODO : make display results look better ..
class COMPARE() :
    """Collects CPU and GPU runtimes per named test and prints a comparison table.

    Attributes:
        tests        : test names in first-seen order (this is the row order of the report).
        gpu_results  : test name -> list of GPU runtimes in seconds.
        cpu_results  : test name -> list of CPU runtimes in seconds.
        df_shape     : shape of the benchmark dataframe, printed in the report header.
        df_memory_gb : size of the benchmark dataframe in GB, printed in the report header.
    """

    def __init__(self) :
        self.tests = []
        self.gpu_results = {}
        self.cpu_results = {}
        self.df_shape = (0,0)
        self.df_memory_gb = 0

    def add_result(self, test_name, gpu_result, runtime) :
        """Record one runtime measurement for ``test_name``.

        test_name  : row label; registered on first use.
        gpu_result : device label.  The exact string "gpu" files the runtime under
                     the GPU results; any other value files it under the CPU results.
        runtime    : measured runtime in seconds.
        """
        if test_name not in self.tests :
            self.tests.append(test_name)
            self.gpu_results[test_name] = []
            self.cpu_results[test_name] = []

        if gpu_result == "gpu" :
            self.gpu_results[test_name].append(runtime)
        else :
            self.cpu_results[test_name].append(runtime)

    @staticmethod
    def _mean(samples) :
        """Arithmetic mean of ``samples``; NaN when nothing has been recorded yet."""
        if not samples :
            return float("nan")
        return sum(samples) / len(samples)

    def display_results(self) :
        """Print one row per test with the mean CPU time, mean GPU time and CPU/GPU speedup.

        Means are exact (no epsilon added to the denominators, which previously
        skewed the speedup for very fast GPU timings).  A side with no samples,
        or a zero GPU mean, is reported as ``nan`` instead of a fabricated number.
        """
        print("Dataframe size : {} {} GB".format(self.df_shape, self.df_memory_gb))
        print("{:<20} {:<20} {:<20} {:<20}".format("test", "CPU(s)", "GPU(s)", "GPU Speedup"))
        for name in self.tests :
            cpu_mean = self._mean(self.cpu_results[name])
            gpu_mean = self._mean(self.gpu_results[name])
            # `gpu_mean > 0` is False for both 0 and NaN, so the ratio is only
            # computed when it is well defined.
            speedup = cpu_mean / gpu_mean if gpu_mean > 0 else float("nan")
            print("{:<20} {:<20.4f} {:<20.4f} {:<20.2f}".format(name, cpu_mean, gpu_mean, speedup))

# Shared collector that every benchmark cell appends its timings to.
run_times = COMPARE()


In [7]:
def pca_scree(pca_explained_variance, label) :
    """Plot the cumulative explained variance of a PCA as a scree-style curve.

    pca_explained_variance : sequence with one explained-variance value per
                             principal component, in component order.
    label                  : prefix for the figure title (e.g. "CPU" / "GPU").

    The number of components is taken from the length of
    ``pca_explained_variance`` itself, so the function no longer depends on a
    global ``n_components`` being defined (and matching) at call time.
    """
    n_found = len(pca_explained_variance)
    # x axis: 1-based component index
    component_idx = list(range(1, n_found + 1))
    # plot the cumulative variance against the component index
    cum_var = np.cumsum(pca_explained_variance)
    plt.plot(component_idx, cum_var)
    # reference lines so the number of components needed for 95% / 75% / 50%
    # of the variance can be read off the chart
    for threshold in (.95, .75, .50) :
        plt.plot(component_idx, [threshold] * n_found, '--')
    # turn on grid to make graph reading easier
    plt.grid(True)
    plt.suptitle(label + ' PCA Variance Explained')
    plt.xlabel('Number of PCA Components', fontsize=18)
    plt.ylabel('Fraction of Variance \nExplained', fontsize=16)
    # one tick per plotted component (ticks now line up with the 1-based x values)
    plt.xticks(component_idx)
    plt.show()


### Useful DataFrame attributes

When you create a GPU dataframe, there are a number of methods available for you to understand the composition.  The detailed list is found in the Rapids [cuDF documentation](https://docs.rapids.ai/api/cudf/0.7/) 

Below we will create a small cuDF dataframe and look at some of its attributes.  A few of these attributes come in handy when debugging 

* dtypes  :  Shows all the columns and associated data types 
* shape   :  Shows the shape (rows / columns) of the dataframe
* columns :  Show the column names in a python list


In [8]:
# Create a simple GPU dataframe with one integer, one float and one string column
df = cudf.DataFrame()
df['column1'] = [0, 1, 2, 3, 4]
df['column2'] = [float(i + 10) for i in range(5)]  # insert column
df['column3'] = ["bbb","aaa","ccc","eee","Ddd"]  # insert column

In [9]:
# Print the dataframe (pgdf converts it to pandas for display)
pgdf(df)

Unnamed: 0,column1,column2,column3
0,0,10.0,bbb
1,1,11.0,aaa
2,2,12.0,ccc
3,3,13.0,eee
4,4,14.0,Ddd


In [10]:
# Dataframe attributes: print the column dtypes, the shape, the number of
# dimensions and the column index of the small demo dataframe
print("\nDataframe datatypes\n---------------------")
print(df.dtypes)
print("\nDataframe Shape\n---------------------")
print(df.shape)
print("\nDataframe dimensions\n---------------------")
print(df.ndim)
print("\nDataframe Column names\n---------------------")
print(df.columns)


Dataframe datatypes
---------------------
column1      int64
column2    float64
column3     object
dtype: object

Dataframe Shape
---------------------
(5, 3)

Dataframe dimensions
---------------------
2

Dataframe Column names
---------------------
Index(['column1', 'column2', 'column3'], dtype='object')


### Create a cuDF dataframe from Numpy/Pandas array
Rapids cuDF supports the conversion of pandas and numpy arrays to cuDF dataframes.  In the example below we show examples of how you can do this for each type

In [11]:
# Numpy array to cuDF
# Dataframe Operations : Create random large array 100x100
a = np.random.rand(100,100)
# from_records builds a new dataframe from the array, so call it on the
# DataFrame class directly instead of going through the unrelated `df`
# instance from the previous cells (and skip the throwaway empty frame).
df_np = cudf.DataFrame.from_records(a)
pgdf(df_np.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0.487063,0.107886,0.670371,0.026962,0.560609,0.432403,0.536552,0.545312,0.592241,0.00788,0.327313,0.05172,0.829418,0.958754,0.15842,0.963313,0.873846,0.608474,0.210962,0.644148,0.059504,0.61676,0.226736,0.444431,0.861092,0.295036,0.496265,0.369788,0.010312,0.363138,0.645245,0.809001,0.346995,0.273324,0.016856,0.678432,0.461251,0.039276,0.552048,0.10715,0.873116,0.360483,0.478771,0.414374,0.339244,0.126572,0.898901,0.245395,0.172477,0.104131,0.140801,0.147049,0.451177,0.129481,0.937017,0.791244,0.187821,0.898063,0.887043,0.477089,0.467038,0.027253,0.567575,0.432047,0.950369,0.547093,0.955857,0.816821,0.371312,0.243145,0.919843,0.326362,0.751768,0.223272,0.385622,0.63262,0.855637,0.128959,0.789657,0.235604,0.321292,0.561276,0.362554,0.725306,0.538064,0.764781,0.900475,0.118097,0.100477,0.145719,0.436386,0.02618,0.763663,0.868427,0.864073,0.147392,0.916518,0.322387,0.232444,0.711803
1,0.401781,0.475451,0.654213,0.420544,0.686244,0.730971,0.223353,0.7931,0.678607,0.055624,0.886125,0.442811,0.140621,0.80012,0.796192,0.123919,0.78538,0.761437,0.18134,0.926132,0.83754,0.340437,0.924454,0.89819,0.076367,0.456116,0.494565,0.732182,0.521672,0.332849,0.584471,0.642314,0.14785,0.929042,0.747092,0.647148,0.242122,0.022352,0.898393,0.223311,0.943879,0.310344,0.721164,0.413417,0.192213,0.241785,0.508413,0.417642,0.916746,0.683465,0.928794,0.559451,0.625767,0.309821,0.993527,0.8138,0.747653,0.707095,0.101306,0.077986,0.936933,0.156015,0.319661,0.788917,0.525045,0.985885,0.321876,0.669599,0.834269,0.379714,0.525923,0.727284,0.113982,0.08639,0.199212,0.863709,0.733633,0.628827,0.099083,0.69766,0.412346,0.32883,0.875561,0.72129,0.139925,0.783358,0.949737,0.1605,0.29387,0.768275,0.506015,0.830842,0.094348,0.7166,0.527955,0.370597,0.943197,0.029915,0.643944,0.691096
2,0.022744,0.951857,0.094303,0.234897,0.599863,0.41674,0.621469,0.371451,0.938372,0.711118,0.982421,0.071402,0.921898,0.117282,0.950755,0.133048,0.437475,0.84895,0.797318,0.120704,0.957598,0.565738,0.519686,0.096124,0.690008,0.735519,0.645847,0.014807,0.767014,0.451536,0.034897,0.275563,0.187501,0.683815,0.369588,0.109024,0.046202,0.531192,0.016567,0.77103,0.312422,0.052252,0.68844,0.942304,0.461906,0.395479,0.273915,0.516898,0.5673,0.529853,0.020673,0.640746,0.482178,0.078469,0.131987,0.278422,0.117902,0.436815,0.501116,0.48919,0.007,0.357277,0.410471,0.669231,0.931071,0.412294,0.669423,0.01564,0.14703,0.423312,0.585282,0.172574,0.490988,0.565234,0.530792,0.136624,0.227413,0.977547,0.649131,0.9233,0.183887,0.184856,0.023282,0.867577,0.490615,0.254135,0.893992,0.758705,0.886665,0.934053,0.602727,0.24938,0.266062,0.570837,0.785842,0.239003,0.936134,0.716941,0.781074,0.683984
3,0.22081,0.410815,0.543956,0.603065,0.566559,0.554298,0.144652,0.159751,0.47647,0.171508,0.592539,0.81216,0.655474,0.492858,0.446876,0.615389,0.332226,0.098528,0.87845,0.679908,0.184004,0.704519,0.635723,0.660086,0.939387,0.707578,0.484488,0.166905,0.444275,0.568951,0.04567,0.442688,0.077286,0.302675,0.252797,0.333097,0.708075,0.699449,0.265802,0.927745,0.748393,0.026172,0.060312,0.075304,0.310673,0.168267,0.3423,0.768772,0.562616,0.494859,0.719226,0.033519,0.404626,0.712734,0.772613,0.981675,0.451521,0.413048,0.702225,0.471117,0.549422,0.586451,0.641735,0.762174,0.498848,0.735894,0.876914,0.218558,0.071397,0.372307,0.589504,0.032764,0.351038,0.383755,0.0399,0.686366,0.762663,0.70787,0.970392,0.844811,0.936298,0.237357,0.534223,0.635976,0.897714,0.59747,0.591318,0.787614,0.916536,0.384936,0.326255,0.86785,0.182587,0.185236,0.971285,0.54266,0.234777,0.373119,0.322263,0.919679
4,0.385721,0.248046,0.547816,0.58536,0.470623,0.71872,0.053199,0.01562,0.477535,0.644032,0.272806,0.9471,0.825091,0.769404,0.767984,0.812603,0.987485,0.751756,0.961848,0.561456,0.772748,0.263457,0.291471,0.03848,0.944864,0.738175,0.683997,0.56698,0.635521,0.557066,0.423952,0.576722,0.149454,0.15072,0.65072,0.110922,0.350051,0.056796,0.417889,0.23975,0.861446,0.874237,0.87741,0.290066,0.963722,0.4308,0.068263,0.453051,0.82126,0.292518,0.296527,0.855491,0.813805,0.916173,0.533173,0.460046,0.957569,0.894071,0.921891,0.279941,0.698328,0.834685,0.325597,0.869973,0.629648,0.474953,0.954589,0.961897,0.251461,0.74695,0.804705,0.948099,0.235968,0.802481,0.848239,0.807711,0.826352,0.467043,0.944694,0.990046,0.328633,0.394857,0.557224,0.062083,0.854199,0.449028,0.348674,0.765135,0.492598,0.131543,0.24489,0.651983,0.844964,0.340633,0.749634,0.661969,0.397946,0.892203,0.04861,0.77917


In [12]:
# Pandas Example
# Build a small pandas frame (column b contains a missing value) and convert it
# to a cuDF dataframe.  Note that this rebinds the name `df`.
pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]})
df = cudf.from_pandas(pdf)
pgdf(df)

Unnamed: 0,a,b
0,0,0.1
1,1,0.2
2,2,
3,3,0.3


### Dataframe Operations : Slice  Example - grab 3 arbitrary columns

Sometimes you want to grab slices of dataframes.  Here you can just pass a list of column names to the GPU dataframe to return the columns you want.

In [13]:
# Use the df_np array we created above and grab columns 0,1, and 5
# (indexing with a list of column labels returns just those columns)
new_df = df_np[[0,1,5]] 
# Show the first 10 rows of the selection
pgdf(new_df.head(10))

Unnamed: 0,0,1,5
0,0.487063,0.107886,0.432403
1,0.401781,0.475451,0.730971
2,0.022744,0.951857,0.41674
3,0.22081,0.410815,0.554298
4,0.385721,0.248046,0.71872
5,0.848052,0.650879,0.361669
6,0.492547,0.184379,0.045849
7,0.454711,0.2423,0.993297
8,0.998014,0.679075,0.702568
9,0.294045,0.003177,0.422079


### Optional Exercise : Create a random numpy array 1000 x 1000 and then convert it to a GPU dataframe.  Then select columns 444, 555 and 888 from the dataframe.



## CuML basics

CuML is a machine learning library implemented on the Nvidia GPU.  This allows you to use many of the most common machine learning algorithms without having to write CUDA code.  The list of algorithms is growing with each release, so it's worth taking a look at the cuML github repo, but in general you can expect a 10x to 50x performance speedup when using the GPU-enabled algorithm.  **Later in this lab we will see examples of PCA and linear regression.**

## Dask

<img src="../nb_images/dask.png" alt="Drawing" style="float :left; margin-right: 20px; width: 200px;" />


Dask is an extremely useful python library that enables parallel execution of arbitrary python programs, allowing you to make maximum use of system resources.  It is typically used for libraries that are written as single-threaded implementations like pandas and numpy, but it's also very useful for running many parallel tasks when using RAPIDS.  We will have a code sample to demonstrate this at the end of the lab.


# Lab Use Case 

**The main goal of this lab is to focus on the performance differences of Rapids(GPU) vs Pandas/Sklearn (CPU) implementations.**  
<br>

<img src="../nb_images/lendingclub.png" alt="Drawing" style="float :left; margin-right: 20px; width: 300px;" />
<br>To do this we will use the Lending Club publicly available dataset. 
This data set is published by lending club and contains information regarding prospective loan applicants.    
<br><br><br><br>

**As we go through the lab, we will show the similarity in the syntax/usage of the RAPIDS library using this real world dataset and keep track of the runtimes in a comparison report.**



# Lending Club data and Lab Details

Here we will load in the lending club dataset and perform some basic data preparation steps.  

Each section is composed of the same workflow

- timed cpu example
- timed gpu example
- comparison of results
- logging of runtimes into a comparison table


## Load the Lending Club Data

Here we will load the data twice.  Once into a pandas dataframe **loan_pdf** and once into a rapids dataframe **loan_rdf**.  

In [14]:
# export

# Resolve the dataset location: a live notebook reads the file from the current
# directory, anything else reads it from ../Rapids/.
filename = "./loan_project_df.parquet.gzip" if IN_NOTEBOOK else "../Rapids/loan_project_df.parquet.gzip"

# Expand data to highlight performance difference
# 3 ~ 1GB dataset
# 4 ~ 2GB dataset 
# ... etc
DATA_DOUBLE_FACTOR=3

# Load the same parquet file twice: once with pandas, once with cuDF
loan_pdf = pd.read_parquet(filename)
loan_rdf = cudf.read_parquet(filename)

In [15]:
# export
# Scale up data to 10 million rows
# Each pass concatenates a frame with itself, so the row count grows by a
# factor of 2**DATA_DOUBLE_FACTOR overall.
for doubling_pass in range(DATA_DOUBLE_FACTOR) :
    # reset_index(drop=True) renumbers the rows 0..n-1 directly, instead of first
    # materialising the old index as an "index" column and dropping it again
    # (which also breaks if the index is ever named something else).
    loan_pdf = pd.concat([loan_pdf,loan_pdf],axis=0).reset_index(drop=True)
    loan_rdf = cudf.concat([loan_rdf,loan_rdf],axis=0).reset_index(drop=True)
    

### RAPIDS Dataframe attributes
Take a look at both the Rapids dataframe and Pandas dataframe printouts below and convince yourself these are the same

In [16]:
# export

# Dataframe attributes of the cuDF (GPU) copy: dtypes, shape, number of
# dimensions and column index
print("Rapids")
print("\nDataframe datatypes\n---------------------")
print(loan_rdf.dtypes)
print("\nDataframe Shape (rows,cols)\n---------------------")
print(loan_rdf.shape)
print("\nDataframe dimensions\n---------------------")
print(loan_rdf.ndim)
print("\nDataframe Column names\n---------------------")
print(loan_rdf.columns)


Rapids

Dataframe datatypes
---------------------
loan_amnt                  float64
annual_inc                 float64
dti                        float64
fico_range_high            float64
open_acc                   float64
funded_amnt                float64
total_acc                  float64
grade                       object
default                      int64
issue_d             datetime64[ms]
earliest_cr_line    datetime64[ms]
addr_state                  object
dtype: object

Dataframe Shape (rows,cols)
---------------------
(10224344, 12)

Dataframe dimensions
---------------------
2

Dataframe Column names
---------------------
Index(['loan_amnt', 'annual_inc', 'dti', 'fico_range_high', 'open_acc', 'funded_amnt', 'total_acc', 'grade', 'default', 'issue_d', 'earliest_cr_line', 'addr_state'], dtype='object')


### Pandas Dataframe attributes


In [17]:
# export

# Dataframe attributes of the pandas (CPU) copy: dtypes, shape, number of
# dimensions, column index and total memory usage in bytes (index included)
print("\n\nPandas")
print("\nDataframe datatypes\n---------------------")
print(loan_pdf.dtypes)
print("\nDataframe Shape (rows,cols)\n---------------------")
print(loan_pdf.shape)
print("\nDataframe dimensions\n---------------------")
print(loan_pdf.ndim)
print("\nDataframe Column names\n---------------------")
print(loan_pdf.columns)
print("\nDataframe Memory Usage\n---------------------")
print(loan_pdf.memory_usage(index=True).sum())




Pandas

Dataframe datatypes
---------------------
loan_amnt                  float64
annual_inc                 float64
dti                        float64
fico_range_high            float64
open_acc                   float64
funded_amnt                float64
total_acc                  float64
grade                       object
default                      int64
issue_d             datetime64[ns]
earliest_cr_line    datetime64[ns]
addr_state                  object
dtype: object

Dataframe Shape (rows,cols)
---------------------
(10224344, 12)

Dataframe dimensions
---------------------
2

Dataframe Column names
---------------------
Index(['loan_amnt', 'annual_inc', 'dti', 'fico_range_high', 'open_acc', 'funded_amnt', 'total_acc', 'grade', 'default', 'issue_d', 'earliest_cr_line', 'addr_state'], dtype='object')

Dataframe Memory Usage
---------------------
981537104


### Inspect DataSet and Size

In [18]:
%nbdev_export

# Store the pandas frame's shape and memory footprint (bytes / 10**9) on the
# results collector so they appear in the report header
run_times.df_shape = loan_pdf.shape
run_times.df_memory_gb = loan_pdf.memory_usage(index=True).sum() /10**9

print("Initial Data set size ~= {:0.3f} GB for this experiment".format(run_times.df_memory_gb))


Initial Data set size ~= 0.982 GB for this experiment


In [19]:
# Print out a small sample of the dataframe
# Take the first 10 rows on the GPU *before* converting: pgdf copies whatever it
# is given to pandas, and the full frame is ~10 million rows.
pgdf(loan_rdf.head(10))

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_high,open_acc,funded_amnt,total_acc,grade,default,issue_d,earliest_cr_line,addr_state
0,2500.0,53700.00,28.56,779.0,20.0,2500.0,38.0,A,0,2018-03-01,2001-11-01,PA
1,25000.0,40000.00,27.78,729.0,8.0,25000.0,16.0,C,0,2018-03-01,2003-03-01,OR
2,20000.0,19000.00,45.17,799.0,6.0,20000.0,9.0,B,0,2018-03-01,1987-10-01,FL
3,30000.0,51500.00,9.93,669.0,7.0,30000.0,9.0,C,0,2018-03-01,2011-03-01,FL
4,15000.0,195000.00,21.10,679.0,18.0,15000.0,34.0,C,0,2018-03-01,2000-10-01,FL
5,12000.0,240000.00,10.99,729.0,14.0,12000.0,23.0,A,0,2018-03-01,1990-01-01,CT
6,7000.0,180700.00,18.98,714.0,13.0,7000.0,17.0,A,0,2018-03-01,2004-06-01,IL
7,34000.0,105000.00,20.16,714.0,18.0,34000.0,22.0,B,0,2018-03-01,2007-04-01,HI
8,5000.0,36610.00,13.85,769.0,15.0,5000.0,20.0,A,0,2018-03-01,2005-09-01,NJ
9,26000.0,33000.00,20.05,709.0,9.0,26000.0,21.0,B,0,2018-03-01,1994-02-01,IL


## Descriptive Statistics - Describe Performance comparison

The first comparison we will make is using the describe function.  Describe is useful because it looks at all the descriptive statistics of the dataset.  It calculates **mean/standard deviation/median statistics** for all the numerical columns.  If you have a large dataframe it can take some time to calculate.  Let's see how Rapids performs with this dataset.

In [23]:
# CPU / pandas: summary statistics computed on the pandas copy
loan_pdf.describe()

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_high,open_acc,funded_amnt,total_acc,default
count,10224340.0,10224340.0,10224340.0,10224340.0,10224340.0,10224340.0,10224340.0,10224340.0
mean,15500.27,80379.44,19.44029,706.1987,11.64129,15500.25,23.35774,0.04722298
std,9750.741,140596.1,17.78771,34.72743,5.863428,9750.743,12.06642,0.2121155
min,1000.0,0.0,-1.0,664.0,0.0,1000.0,2.0,0.0
25%,8000.0,47000.0,11.83,679.0,8.0,8000.0,15.0,0.0
50%,13000.0,66840.0,17.97,699.0,11.0,13000.0,21.0,0.0
75%,20500.0,96000.0,25.01,724.0,15.0,20500.0,30.0,0.0
max,40000.0,110000000.0,999.0,850.0,101.0,40000.0,176.0,1.0


In [21]:
# GPU / Rapids: same summary computed by cuDF; pgdf converts the result to pandas for display
pgdf(loan_rdf.describe())

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_high,open_acc,funded_amnt,total_acc,default
count,10224340.0,10224340.0,10224340.0,10224340.0,10224340.0,10224340.0,10224340.0,10224340.0
mean,15500.27,80379.44,19.44029,706.1987,11.64129,15500.25,23.35774,0.047223
std,9750.741,140596.1,17.78771,34.72743,5.863428,9750.743,12.06642,0.212115
min,1000.0,0.0,-1.0,664.0,0.0,1000.0,2.0,0.0
25%,8000.0,47000.0,11.83,679.0,8.0,8000.0,15.0,0.0
50%,13000.0,66840.0,17.97,699.0,11.0,13000.0,21.0,0.0
75%,20500.0,96000.0,25.01,724.0,15.0,20500.0,30.0,0.0
max,40000.0,110000000.0,999.0,850.0,101.0,40000.0,176.0,1.0


In [24]:
# export

# Record results
def describe_gpu():
    """Run describe() on the cuDF frame; the result is discarded (timing only)."""
    loan_rdf.describe()

def describe_cpu():
    """Run describe() on the pandas frame; the result is discarded (timing only)."""
    loan_pdf.describe()

#display(loan_rdf.describe().to_pandas())

# Time each variant once (time_command's default repeat=1) and log it under "describe"
run_times.add_result("describe", "gpu", time_command(describe_gpu))
run_times.add_result("describe", "cpu", time_command(describe_cpu))

run_times.display_results()

Dataframe size : (10224344, 12) 0.981537104 GB
test                 CPU(s)               GPU(s)               GPU Speedup         
describe             4.0281               0.2536               15.88               


## One Hot Encoding (OHE) Performance Comparison

One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction.

Currently, one hot encoding for Rapids requires the column that is to be encoded to be an integer or float, not a string.  You will need to create an integer column prior to using this!  You can use the hash_encode method to accomplish this, although you lose a little bit of readability.  This is expected to be fixed in future versions of the software.

In [None]:
# CPU / pandas example: one-hot encode the grade column of the pandas frame
ohe_cpu_df = pd.get_dummies(loan_pdf['grade'])

In [None]:
# GPU / Rapids example: one-hot encode the grade column.
# Take the column from the cuDF frame (loan_rdf) so the example actually starts
# from data that is already on the GPU, rather than copying the pandas column over.
ohe_gpu_df = cudf.get_dummies(loan_rdf[['grade']])
#loan_rdf['grade'].dtype

In [None]:
# Compare the results: first 10 rows of each one-hot encoded frame
print("Pandas ...")
display(ohe_cpu_df.head(10))
#pgdf(ohe_gpu_df[ohe_gpu_df['grade']=='A'].head(20))
print("Rapids ...")
display(ohe_gpu_df.head(10))



In [None]:
# export
# Record the results

def ohe_cpu() :
    """One-hot encode the pandas ``grade`` column; the result is discarded (timing only)."""
    pd.get_dummies(loan_pdf['grade'])

def ohe_gpu() :
    """One-hot encode the cuDF ``grade`` column; the result is discarded (timing only).

    Reads from ``loan_rdf`` so that only the GPU encoding is timed; sourcing the
    column from the pandas frame would add a host-to-device copy to every run.
    """
    cudf.get_dummies(loan_rdf[['grade']])

run_times.add_result("one_hot_encode", "cpu", time_command(ohe_cpu))
run_times.add_result("one_hot_encode", "gpu", time_command(ohe_gpu))
run_times.display_results()


## Filter with Date and Time ops - Performance comparison

Current datetime functionality is limited to filtering a data set for specific times.  Datetime does not yet support math operations.

Here we will find loan applicants that have a credit line prior to 2010.

In [None]:
# export
import datetime as dt

# Cut-off used by the filter benchmark: midnight on 1 January 2010 (naive datetime)
search_date = dt.datetime(2010, 1, 1)

In [None]:
# CPU / pandas: keep rows whose earliest credit line is on or before search_date
query_cpu=loan_pdf.query('earliest_cr_line <= @search_date')


In [None]:
# GPU / Rapids: identical query string, evaluated by cuDF
query_gpu=loan_rdf.query('earliest_cr_line <= @search_date')


In [None]:
# compare results: first rows of the pandas result, then of the cuDF result
display(query_cpu.head())
pgdf(query_gpu.head())

In [None]:
# export
# Filter Record results
def filter_cpu():
    """Run the date filter on the pandas frame; the result is discarded (timing only)."""
    loan_pdf.query('earliest_cr_line <= @search_date')

def filter_gpu():
    """Run the date filter on the cuDF frame; the result is discarded (timing only)."""
    loan_rdf.query('earliest_cr_line <= @search_date')

# Average over 3 runs each and log under "filter_dt"
run_times.add_result("filter_dt", "cpu", time_command(filter_cpu,repeat=3))
run_times.add_result("filter_dt", "gpu", time_command(filter_gpu,repeat=3))
run_times.display_results()


## Sort by value

Sorting is a very expensive operation in data preparation, so it's useful to evaluate the performance of this method.  Here we select a column to sort by and then compare the results.

In [None]:
# CPU / pandas
sort_cpu=loan_pdf.sort_values(by='fico_range_high')

In [None]:
# GPU / Rapids
sort_gpu=loan_rdf.sort_values(by='fico_range_high')

In [None]:
# compare results
display(sort_cpu.head())
pgdf(sort_gpu.head())

In [None]:
# export
# Sorting Record results
def sort_cpu():
    """Sort the pandas frame by fico_range_high; the result is discarded (timing only)."""
    loan_pdf.sort_values(by='fico_range_high')

def sort_gpu():
    """Sort the cuDF frame by fico_range_high; the result is discarded (timing only)."""
    loan_rdf.sort_values(by='fico_range_high')

# Average over 2 runs each and log under "sorting"
run_times.add_result("sorting", "cpu", time_command(sort_cpu, repeat=2))
run_times.add_result("sorting", "gpu", time_command(sort_gpu, repeat=2))
run_times.display_results()


## Histograms and Custom functions

Here we demonstrate how fast Rapids is at creating histogram bins.  We use the loan_amount column with a custom function to create a loan_bins column.  Then we grab the value counts using both Pandas and Rapids to get a rough comparison of the speed of these types of operations.


In [None]:
# export
# custom function example : creates simple bins for loan_amount histogram
def roundto(num):
    """Bin ``num`` into 5000-wide buckets.

    Divides by 5000, truncates toward zero with ``int()`` and returns the
    corresponding multiple of 5000 as a float.
    """
    bin_width = 5000
    whole_bins = int(num / bin_width)
    return float(whole_bins * bin_width)


In [None]:
# CPU / pandas
# Adds a loan_bins column to loan_pdf (loan_amnt passed through roundto), then
# counts the rows per bin

loan_pdf['loan_bins'] = loan_pdf.loan_amnt.apply(roundto)
loan_pdf['loan_bins'].value_counts()


In [None]:
# GPU / rapids
# Same binning on the cuDF frame via applymap; adds a loan_bins column to loan_rdf
loan_rdf['loan_bins'] = loan_rdf.loan_amnt.applymap(roundto)
print(loan_rdf['loan_bins'].value_counts())



In [None]:
# export
# Record the results
def hist_cpu() :
    """Bin loan_amnt with roundto on the pandas frame and count rows per bin.

    Side effect: (re)assigns the ``loan_bins`` column on the module-level ``loan_pdf``.
    """
    loan_pdf['loan_bins'] = loan_pdf.loan_amnt.apply(roundto)
    loan_pdf['loan_bins'].value_counts()

def hist_gpu() :
    """Bin loan_amnt with roundto on the cuDF frame and count rows per bin.

    Side effect: (re)assigns the ``loan_bins`` column on the module-level ``loan_rdf``.
    """
    loan_rdf['loan_bins'] = loan_rdf.loan_amnt.applymap(roundto)
    loan_rdf['loan_bins'].value_counts()

# Average over 3 runs each and log under "histogram_ops"
run_times.add_result("histogram_ops", "cpu", time_command(hist_cpu,repeat=3))
run_times.add_result("histogram_ops", "gpu", time_command(hist_gpu,repeat=3))
run_times.display_results()


## Groupby 

Here we perform some aggregation on the lending club data set to get some **per grade statistics**.  For this exercise we will compare the speed of aggregating over Pandas dataframes and Rapids dataframes using the **groupby** function as shown in the [Rapids documentation](https://docs.rapids.ai/api/cudf/stable/) .  Notice how the syntax is exactly the same!

In [None]:
# export
# CPU / Pandas
# stats by grade: mean annual_inc and mean loan_amnt per grade, with grade kept
# as a regular column (as_index=False)
#grade_stats_pdf = loan_pdf.groupby('grade', as_index=False).agg({"annual_inc": ["count","mean"], "loan_amnt": ["count","mean"]})
grade_stats_pdf = loan_pdf.groupby('grade', as_index=False).agg({"annual_inc": "mean", "loan_amnt": "mean"})


In [None]:
# export
#GPU / Rapids
# stats by grade: same aggregation as the pandas cell, run on the cuDF frame
# grade_stats_rdf = loan_rdf.groupby('grade', as_index=False).agg({"annual_inc": "count","annual_inc": "mean", "loan_amnt": "count","loan_amnt":"mean"})
grade_stats_rdf = loan_rdf.groupby('grade', as_index=False).agg({"annual_inc": "mean","loan_amnt":"mean"})


In [None]:
# Grade summary statistics: pandas result first, then the cuDF result (converted via pgdf)
display(grade_stats_pdf)
pgdf(grade_stats_rdf)

In [None]:
# export
# Record the results

def groupby_cpu() :
    """Per-grade mean of annual_inc and loan_amnt on the pandas frame (timing only)."""
    loan_pdf.groupby('grade', as_index=False).agg({"annual_inc": "mean", "loan_amnt": "mean"})
# export

def groupby_gpu() :
    """Per-grade mean of annual_inc and loan_amnt on the cuDF frame (timing only)."""
    loan_rdf.groupby('grade', as_index=False).agg({"annual_inc": "mean","loan_amnt":"mean"})

# Average over 4 runs each and log under "groupby_ops"
run_times.add_result("groupby_ops", "cpu", time_command(groupby_cpu,repeat=4))
run_times.add_result("groupby_ops", "gpu", time_command(groupby_gpu,repeat=4))
run_times.display_results()

## Join 

Joining two dataframes can be an extremely computationally expensive task.  **Here we take the grade summary statistics computed in the groupby experiment above, and join it back with our table using grade as the key**.  This is a common practice in machine learning to apply average values per group back to the individual row.  This is a form of [mean encoding](https://towardsdatascience.com/why-you-should-try-mean-encoding-17057262cd0)

In [None]:
# Pandas Join
# Left-join the per-grade means back onto every loan row, keyed on grade;
# overlapping column names get the _l (loan row) / _r (grade mean) suffixes
loan_join_pdf = loan_pdf.set_index('grade').join(grade_stats_pdf.set_index('grade'),lsuffix='_l',rsuffix='_r',on="grade",how="left").reset_index()

In [None]:
#cuDF Join
# Same left join on the cuDF frames; both sides are indexed by grade and no
# `on` argument is passed here
loan_join_rdf = loan_rdf.set_index('grade').join(grade_stats_rdf.set_index('grade'),lsuffix='_l',rsuffix='_r',how="left").reset_index()

In [None]:
# display the results: first 5 rows of the pandas join, then of the cuDF join
display(loan_join_pdf.head(5))
display(loan_join_rdf.head(5))


In [None]:
# export
# Record the results
def join_cpu() :
    """Left-join the per-grade means onto the pandas loan frame (timing only; result discarded)."""
    loan_pdf.set_index('grade').join(grade_stats_pdf.set_index('grade'),lsuffix='_l',rsuffix='_r',on="grade",how="left").reset_index()

def join_gpu() :
    """Left-join the per-grade means onto the cuDF loan frame (timing only; result discarded).

    Both sides are indexed by grade and joined index-to-index, exactly like the
    cuDF join demonstrated in the cell above; the stray ``on="grade"`` argument
    that only this timed copy carried has been dropped so the benchmark measures
    the same call that was shown.
    """
    loan_rdf.set_index('grade').join(grade_stats_rdf.set_index('grade'),lsuffix='_l',rsuffix='_r',how="left").reset_index()

run_times.add_result("join_ops", "cpu", time_command(join_cpu,repeat=1))
run_times.add_result("join_ops", "gpu", time_command(join_gpu,repeat=1))
run_times.display_results()

# Machine Learning

## PCA (cuML and sklearn) - Performance comparison

<img src="https://github.com/dustinvanstee/random-public-files/raw/master/techu-pca.png"  width="200" height="125" align="middle"/>

Principal component analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. The number of principal components is less than or equal to the number of original variables.

A simple way to think about PCA is that it helps compress the data in a lossy representation of the original dataset.

**Let's compare the performance of the Sklearn (CPU-based) implementation vs cuML!**

In [None]:
# export
# Helper function to normalize GPU dataframe function
def normalize_df(gdf) :
    """Standardize every column of a dataframe to zero mean and unit standard deviation.

    Each column is replaced by (column - mean) / std, using the frame's own
    mean() and std() methods. The frame passed in is modified in place and
    the same object is returned.
    """
    for name in gdf.columns :
        column = gdf[name]
        mu, sigma = column.mean(), column.std()
        # Write the standardized values back under the same column name.
        gdf[name] = (column - mu) / sigma
    return gdf

### Prepare the data for PCA [not timed]
Here we do some initial data preparation to normalize the dataframe columns.  We aren't comparing the performance of this step; it's just to get us ready to do the comparison.

In [None]:
# export
# Start from every column of the GPU dataframe.
X_cols = list(loan_rdf.columns)
print("Analysis Continuing with {}".format(X_cols))
# Drop the regression target ('default') and the 'grade' column from the feature list.
X_cols.remove('default')
X_cols.remove('grade')
# X_cols.remove('grade_hash')
# Keep only the columns whose dtype is float64 or int8.
X_cols = [x for x in X_cols if loan_rdf[x].dtype == "float64" or loan_rdf[x].dtype == "int8"]
print("Analysis Continuing with {}".format(X_cols))
# All types must be same ....
# Cast every kept column to float64. Note that this writes back into loan_rdf itself.
for x in X_cols :
    loan_rdf[x] = loan_rdf[x].astype("float64")

#print(loan_rdf[X_cols].dtypes)
print("Normalizing dataframe prior to PCA")
# normalize_df standardizes each column ((col - mean) / std) and returns the frame it was given.
loan_norm_rdf = normalize_df(loan_rdf[X_cols])
print("Copying dataframe to pandas")
# Pandas copy of the normalized features, used by the sklearn (CPU) runs.
loan_norm_pdf = loan_norm_rdf.to_pandas()


In [None]:
print("Normalized Dataframe")
# Print the dtypes of the feature columns (all were cast to float64 during preparation).
print(loan_norm_rdf[X_cols].dtypes)
# NOTE(review): pgdf is a helper defined outside this section — presumably it pretty-prints
# a GPU dataframe; confirm against its definition.
pgdf(loan_norm_rdf.head(10)) #.describe()


## Principal Component Analysis (PCA) Performance 

We will compare the runtimes of PCA on CPU and then on GPU and also compare the results to make sure they are the same.  

Here we take the normalized frame we built above and copy to pandas.  The two dataframes we will be working with are 

* loan_norm_pdf : normalized pandas dataframe
* loan_norm_rdf : normalized GPU/RAPIDS dataframe

these are exactly the same dataframe ...

In [None]:
# export
# PCA
# Both import methods supported
from cuml import PCA
from cuml.decomposition import PCA as PCA_gpu
from sklearn.decomposition import PCA as PCA_cpu
# Number of principal components requested from both the sklearn and the cuML PCA runs below.
n_components=5


In [None]:
# RUN PCA ! : CPU / Sklearn implementation
# Fit on the pandas copy of the normalized features.
pca_loan_cpu = PCA_cpu(n_components=n_components)
pca_loan_cpu.fit(loan_norm_pdf)


In [None]:
# RUN PCA ! : GPU / cuML implementation
# Same number of components as the CPU run, fitted on the GPU dataframe.
pca_loan_gpu = PCA_gpu(n_components=n_components)
pca_loan_gpu.fit(loan_norm_rdf)


**Compare results** : For PCA we use a scree plot to compare the results.  Scree plots show how much variance in the dataset is explained by each additional principal component.  Below, run the cell and just eyeball the graphs and convince yourself they are the same
    

In [None]:
# Compare results ...

# NOTE(review): pca_scree is defined outside this section — presumably it draws a scree plot
# titled with the given label; confirm against its definition.
# The cuML ratios are converted with .tolist() before plotting; the sklearn ratios are passed as-is.
display(pca_scree(pca_loan_cpu.explained_variance_ratio_, "CPU"))
pca_scree(pca_loan_gpu.explained_variance_ratio_.tolist(), "GPU")

In [None]:
# export
# record PCA performance results
def pca_cpu() :
    """Timed CPU workload: fit a fresh sklearn PCA on the pandas normalized frame.

    Uses the module-level n_components and loan_norm_pdf; prints a short
    marker first. The fitted model is discarded and nothing is returned.
    """
    print("cpu pca")
    PCA_cpu(n_components=n_components).fit(loan_norm_pdf)


def pca_gpu() :
    """Timed GPU workload: fit a fresh cuML PCA on the GPU normalized frame.

    Uses the module-level n_components and loan_norm_rdf; prints a short
    marker first. The fitted model is discarded and nothing is returned.
    """
    print("gpu pca")
    PCA_gpu(n_components=n_components).fit(loan_norm_rdf)

    
#print(loan_norm_rdf.shape)    
# Record both PCA fits under the "pca" label with repeat=2.
# The GPU run is measured first here, then the CPU run.
run_times.add_result("pca", "gpu", time_command(pca_gpu, repeat=2))
run_times.add_result("pca", "cpu", time_command(pca_cpu, repeat=2))

# Show the accumulated results.
run_times.display_results()

## Linear Regression (cuML / sklearn // snapML)

Linear regression is one of the most common algorithms applied to structured data.  It's useful when trying to make a prediction of a continuous variable.  For example, you could use linear regression to try to predict the total expected payment of a loan given historical data about default rates.  Let's try this below with our data set.  (Note: Lending Club doesn't explicitly provide this data in its data set, so we will use a fictitious total_payment column in our analysis.)

In [None]:
# export
from sklearn.linear_model import LinearRegression as LRSKL
from cuml.linear_model import LinearRegression as LRCUML


In [None]:
# Linear Regression : CPU / Sklearn
# Features: pandas copy of the normalized GPU frame. Target: the 'default' column.
X = loan_norm_rdf.to_pandas()
y = loan_rdf['default'].to_pandas()
# `normalize` is intentionally not passed: scikit-learn deprecated it in 1.0 and removed it
# in 1.2 (passing it raises TypeError there). Its old default was False, so omitting it keeps
# the same behavior on every version.
lr_cpu = LRSKL(fit_intercept = True)
res_cpu = lr_cpu.fit(X,y)

In [None]:
# Linear Regression : GPU / Rapids cuML example
# Features stay on the GPU. Note that X is rebound here; the CPU cell bound it to a pandas copy.
X = loan_norm_rdf
# Target cast to float64, the same dtype the feature columns were cast to during preparation.
y2 = loan_rdf['default'].astype("float64")    
lr_gpu = LRCUML(fit_intercept = True, normalize = False) #, algorithm = "eig")
res_gpu = lr_gpu.fit(X,y2)


In [None]:
# Compare results
# First block: sklearn (CPU) fit.
print("Coefficients:")
print(res_cpu.coef_)
print("intercept:")
print(res_cpu.intercept_)

# Second block: cuML (GPU) fit. The printed labels are identical, so order tells them apart.
print("Coefficients:")
print(res_gpu.coef_)
print("intercept:")
print(res_gpu.intercept_)


In [None]:
# export
#Record Results 

# CPU 
def lr_cpu() :
    """Timed CPU workload: fit one sklearn LinearRegression on the module-level X / y.

    X and y are looked up when the function is called, so they must hold the
    pandas feature frame and target series at that moment. The fitted model is
    discarded and nothing is returned.
    """
    # `normalize` is not passed: scikit-learn deprecated it in 1.0 and removed it in 1.2,
    # and its old default (False) is the behavior wanted here.
    # The estimator gets its own name so it no longer shadows this function's name.
    model = LRSKL(fit_intercept = True)
    model.fit(X,y)
    
# lr_cpu reads the module-level X and y, so bind the pandas (CPU) copies before timing it.
X = loan_norm_rdf.to_pandas()
y = loan_rdf['default'].to_pandas()    
# Record the CPU fit under "linear_reg" with repeat=5.
run_times.add_result("linear_reg", "cpu", time_command(lr_cpu, repeat=5))


# GPU
def lr_gpu() :
    """Timed GPU workload: fit one cuML LinearRegression ("eig" algorithm) on the module-level X / y.

    X and y are looked up when the function is called. The fitted model is
    discarded and nothing is returned.
    """
    regressor = LRCUML(fit_intercept = True, normalize = False, algorithm = "eig")
    regressor.fit(X,y)

# lr_gpu reads the module-level X and y, so rebind them to the GPU frame and a float64 target.
# After this point X / y no longer hold the pandas copies used for the CPU measurement.
X = loan_norm_rdf
y = loan_rdf['default'].astype("float64")    
# Record the GPU fit under "linear_reg" with repeat=5.
run_times.add_result("linear_reg", "gpu", time_command(lr_gpu, repeat=5))



# Show the accumulated results.
run_times.display_results()    



# Summary

In this lab we covered a number of common functions used by both data engineers and data scientists to manipulate dataframes and also build machine learning models.  The RAPIDS implementation demonstrates how much time you can save by running a lot of these operations on the GPU.   As data set sizes grow, and the number of experiments required increases, this performance gain can be a real advantage for getting to the answers faster.  Let's recap your speedups here ...

In [None]:
# Show the accumulated run_times results once more as the recap.
run_times.display_results()    

Note, you can play with the dataset size and rerun the notebook to see how that impacts your run results!  TL;DR the larger your dataframe the better the GPU speedups ...

## Optional:  Dask and CuDF Short Demo

Scaling out is pretty easy with Dask.  You can reuse the same Rapids based code you used in the above examples with fairly minimal changes.  Below is a quick example of how to run on a cluster of Nodes using Summit.

In [None]:
from dask_jobqueue import LSFCluster
# Per node specification
# Describes the LSF batch job that backs each Dask worker: 8 cores, a single worker process,
# 4 GB of memory, and a "03:30" walltime, communicating over the 'ib0' network interface.
# NOTE(review): project="VEN201" looks like a site-specific allocation name — replace with your own.
cluster = LSFCluster(
    scheduler_options={"dashboard_address": ":3762"},
    cores=8,
    processes=1, # default sqrt(cores). set to one to max threads per machine.  better for numpy.  see https://docs.dask.org/en/latest/setup/single-machine.html
    memory="4 GB",
    project="VEN201",
    walltime="03:30",
    job_extra=["-nnodes 1"],          # <--- new!
    header_skip=["-R", "-n ", "-M"],  # <--- new!
    interface='ib0',
    use_stdin=False
)
# Ask the cluster to scale to 4 workers.
cluster.scale(4)


In [None]:
from dask.distributed import Client
# Connect a distributed client to the LSF-backed cluster created in the previous cell.
client = Client(cluster)
# Bare expression: the notebook renders the client's summary as the cell output.
client

In [None]:
### Dask CuDF demo


import dask_cudf    
filename = "./loan_project_df.parquet.gzip"

# Rapids Dataframe
loan_ddf = dask_cudf.read_parquet(filename)#  , names=ts_cols,dtype=ts_dtypes,skiprows=1)

# Scale up data to 20 million rows 
# Each pass concatenates the frame with itself, so 4 passes give 16x the original row count.
for i in range(4) :
    loan_ddf = dask_cudf.concat([loan_ddf,loan_ddf],axis=0)
    # Rebuild a clean index after the concat and drop the old one.
    loan_ddf = loan_ddf.reset_index().drop("index",axis=1)

# Dask collections are immutable: repartition() returns a NEW dataframe, so the result
# must be rebound. Calling it without assignment (as before) left the partitioning unchanged.
loan_ddf = loan_ddf.repartition(npartitions=4)
#df = dask_cudf.from_cudf(loan_rdf, npartitions=4)



loan_ddf.dtypes


In [None]:
# Define the mean of every float64 column of the distributed dataframe.
ddf_mean = loan_ddf.select_dtypes(include='float64').mean()
# .compute() runs the work and returns the concrete result.
ddf_mean.compute()

In [None]:
# Per-grade mean of annual_inc and loan_amnt, defined on the distributed dataframe.
grade_stats_ddf = loan_ddf.groupby('grade').agg({"annual_inc": "mean", "loan_amnt": "mean"})
# .compute() runs the groupby and returns the concrete result.
grade_stats_ddf.compute()

In [None]:
# Render the Dask task graph behind the per-grade aggregation.
grade_stats_ddf.visualize()

## Credits

This notebook was built by  Dustin VanStee (vanstee@us.ibm.com) from IBM Garage for Systems AI team.  Special thanks to Steve LaFalce and Loic Fura for reviewing the content and suggesting edits.


In [None]:
# Export to python library
from nbdev.export import *
# nbdev: write the cells tagged `# export` out to the python library module
# (the target module is named by `default_exp` at the top of the notebook).
notebook2script()