# NVIDIA RAPIDS on Azure ML
## MLADS Fall'19

In this notebook we use the NYC Taxi dataset to showcase the speedup gained, and how easy it is to convert the code, when building a Random Forest regression model on the GPU instead of the CPU.

**AUTHORS**
* Tom Drabas (Microsoft)
* Brad Rees (NVIDIA)
* John Zedlewski (NVIDIA)
* Paul Mahler (NVIDIA)
* Nick Becker (NVIDIA)
* Chau Dang (NVIDIA)

**GREATER TEAM**
* Joshua Patterson (NVIDIA)
* Keith Kraus (NVIDIA)
* Michael Beaumont (NVIDIA)
* Manuel Reyes Gomez (NVIDIA)

# Load the modules

In [1]:
import numpy as np
import pandas as pd
import cudf

import os
from collections import OrderedDict
import datetime

from sklearn.model_selection import train_test_split as skTTS
from cuml.preprocessing.model_selection import train_test_split as cumlTTS

from sklearn.ensemble import RandomForestRegressor as skRF
from cuml.ensemble import RandomForestRegressor as cumlRF

from sklearn.metrics import r2_score as sk_r2
from cuml.metrics.regression import r2_score as cuml_r2

# Define helper functions

In [10]:
def print_message(msg, length=80, filler='#', pre_post=''):
    """Print ``msg`` as a banner centered within ``length`` characters.

    The text is laid out as ``'<pre_post> <msg> <pre_post>'`` and whatever
    width is left over on either side is filled with ``filler``.
    """
    banner = '{0} {1} {0}'.format(pre_post, msg)
    print(banner.center(length, filler))
    
def print_time(t_curr, t_next, t_start, length=80):
    """Print the duration of the latest step and the total elapsed time.

    ``t_curr - t_next`` is reported as the step time and ``t_curr - t_start``
    as the elapsed time. The line is right-justified to ``length`` characters
    and padded on the left with ``'-'``.
    """
    step = str(t_curr - t_next)
    elapsed = str(t_curr - t_start)
    line = f'> Step time: {step}, elapsed time: {elapsed}'
    print(line.rjust(length, '-'))

# Define data location

In [8]:
# Root of the data store -- REPLACE WITH THE DATA STORE PATH
data_dir = '../../../'
# Folder with the NYC Taxi files, and the featurized 2016-01 yellow-cab CSV inside it.
data_path = os.path.join(data_dir, "data/nyctaxi/")
dataset = os.path.join(data_path, "2016/featurized_yellow_tripdata_2016-01.csv")

# Echo both resolved locations, one per line.
print(data_path, dataset, sep='\n')

../../../data/nyctaxi/
../../../data/nyctaxi/2016/featurized_yellow_tripdata_2016-01.csv


In [9]:
%%bash -s "$dataset"
# Preview the first lines of the CSV. "$1" is quoted so that a path containing
# spaces or glob characters reaches head as one unmodified argument.
head "$1"

"passenger_count","trip_distance","pickup_longitude","pickup_latitude","rate_code","dropoff_longitude","dropoff_latitude","fare_amount","hour","year","month","day","diff","pickup_latitude_r","pickup_longitude_r","dropoff_latitude_r","dropoff_longitude_r"
2,1.1,-73.9903717,40.73469543,1,-73.98184204,40.73240662,7.5,0,2016,1,1,0,40.73,-74.0,40.73,-73.99
5,4.9,-73.98078156,40.7299118,1,-73.94447327,40.71667862,18.0,0,2016,1,1,0,40.72,-73.99,40.71,-73.95
1,10.54,-73.98455048,40.67956543,1,-73.95027161,40.78892517,33.0,0,2016,1,1,0,40.67,-73.99,40.78,-73.96
1,4.75,-73.99346924,40.71899033,1,-73.96224213,40.65733337,16.5,0,2016,1,1,0,40.71,-74.0,40.65,-73.97
3,1.76,-73.96062469,40.78133011,1,-73.9772644,40.7585144,8.0,0,2016,1,1,0,40.78,-73.97,40.75,-73.98
2,5.52,-73.9801178,40.74304962,1,-73.9134903,40.76314163,19.0,0,2016,1,1,1110000,40.74,-73.99,40.76,-73.92
2,7.45,-73.9940567,40.71998978,1,-73.966362,40.78987122,26.0,0,2016,1,1,1605000,40.71,-74.0,40.78,-73.97
1,1.2,-73.97942352,40.74461

In [16]:
# Count the columns by splitting the CSV header line on commas.
with open(dataset, 'r') as f:
    ncols = len(f.readline().split(','))

# Define GPU workflow

In [39]:
def gpu_load_data(fname, ncols):
    """Read the CSV at ``fname`` with cuDF, requesting float32 for ``ncols`` columns."""
    column_types = ["float32"] * ncols
    return cudf.read_csv(fname, delimiter=',', dtype=column_types)

def run_gpu_workflow(fname, ncols):
    """Run the load / split / fit / predict pipeline with cuDF and cuML.

    A banner is printed before each stage and the step / elapsed times after
    it, together with the train and test sizes and the R^2 score obtained on
    the test split.

    Parameters
    ----------
    fname
        Path of the CSV file, handed to ``gpu_load_data``.
    ncols
        Number of columns in that file, forwarded to ``gpu_load_data``.

    Returns
    -------
    datetime.timedelta
        Time between entering the function and the end of the scoring stage.
    """
    started = datetime.datetime.now()
    previous = started

    def lap():
        # Timestamp the end of a stage, report it, and make it the new baseline.
        nonlocal previous
        now = datetime.datetime.now()
        print_time(now, previous, started)
        previous = now
        return now

    print_message('LOADING DATA')
    taxi = gpu_load_data(fname, ncols)
    lap()

    print_message('SPLITTING INTO TRAIN AND TEST')
    # 'fare_amount' is the target column; train_size=0.75 is passed to the splitter.
    features_train, features_test, target_train, target_test = cumlTTS(
        taxi, 'fare_amount', train_size=0.75
    )
    lap()

    print()
    print_message(f'Train size: {len(features_train):,}', filler='-', pre_post='+')
    print_message(f'Test  size: {len(features_test):,}', filler='-', pre_post='+')
    print()

    print_message('FITTING MODEL')
    forest = cumlRF(
        max_features=1.0,
        n_estimators=40,
        split_algo=1,  # global_quantile
        n_bins=16,
    )
    forest.fit(features_train, target_train)
    lap()

    print()
    print_message('PREDICTING')
    predictions = forest.predict(features_test)
    print()
    score = cuml_r2(target_test, predictions)
    print_message(f'R^2 of the model: {score:.4f}', filler='-', pre_post='+')
    print()

    # The final lap covers both prediction and scoring.
    finished = lap()
    return finished - started

# Define CPU workflow

In [34]:
def cpu_load_data(fname, ncols):
    """Read the CSV at ``fname`` with pandas, parsing every column as float32.

    ``ncols`` is not used here; the signature mirrors ``gpu_load_data``.
    """
    frame = pd.read_csv(fname, sep=',', dtype=np.float32)
    return frame

def run_cpu_workflow(fname, ncols, n_jobs=4, random_state=None):
    """Run the load / split / fit / predict pipeline with pandas and scikit-learn.

    A banner is printed before each stage and the step / elapsed times after
    it, together with the train and test sizes and the R^2 score obtained on
    the test split.

    Parameters
    ----------
    fname
        Path of the CSV file, handed to ``cpu_load_data``.
    ncols
        Number of columns in that file, forwarded to ``cpu_load_data``.
    n_jobs
        Number of parallel jobs given to the random forest. Defaults to 4,
        the value that used to be hard-coded.
    random_state
        Seed passed to both the train/test split and the random forest so a
        run can be reproduced. The default ``None`` leaves both unseeded,
        which is the previous behaviour.

    Returns
    -------
    datetime.timedelta
        Time between entering the function and the end of the scoring stage.
    """
    t_start = datetime.datetime.now()
    print_message('LOADING DATA')
    df = cpu_load_data(fname, ncols)
    t_next = datetime.datetime.now()
    print_time(t_next, t_start, t_start)

    print_message('SPLITTING INTO TRAIN AND TEST')
    # 'fare_amount' is the regression target; every other column is a feature.
    X = df.drop('fare_amount', axis=1)
    y = df['fare_amount']

    X_train, X_test, y_train, y_test = skTTS(
        X, y, train_size=0.75, random_state=random_state
    )
    t_curr = datetime.datetime.now()
    print_time(t_curr, t_next, t_start)
    t_next = t_curr

    print()
    print_message('Train size: {0:,}'.format(len(X_train)), filler='-', pre_post='+')
    print_message('Test  size: {0:,}'.format(len(X_test)), filler='-', pre_post='+')
    print()

    print_message('FITTING MODEL')
    model = skRF(
        max_features=1.0,
        n_estimators=40,
        n_jobs=n_jobs,
        random_state=random_state,
    )

    model.fit(X_train, y_train)
    t_curr = datetime.datetime.now()
    print_time(t_curr, t_next, t_start)
    t_next = t_curr

    print()
    print_message('PREDICTING')
    y_hat = model.predict(X_test)
    print()
    print_message('R^2 of the model: {0:.4f}'.format(sk_r2(y_test, y_hat)), filler='-', pre_post='+')
    print()

    # The final step time covers both prediction and scoring.
    t_curr = datetime.datetime.now()
    print_time(t_curr, t_next, t_start)

    return t_curr - t_start

In [40]:
# Run the full GPU pipeline on the dataset and keep its total duration.
gpu_runtime = run_gpu_workflow(fname=dataset, ncols=ncols)

################################# LOADING DATA #################################
-----------------------> Step time: 0:00:00.859018, elapsed time: 0:00:00.859018
######################## SPLITTING INTO TRAIN AND TEST #########################
-----------------------> Step time: 0:00:01.426499, elapsed time: 0:00:02.285517

---------------------------+ Train size: 7,760,776 +----------------------------
---------------------------+ Test  size: 2,586,926 +----------------------------

################################ FITTING MODEL #################################
-----------------------> Step time: 0:02:04.595323, elapsed time: 0:02:06.880840

################################## PREDICTING ##################################

--------------------------+ R^2 of the model: 0.9287 +--------------------------

-----------------------> Step time: 0:00:01.586121, elapsed time: 0:02:08.466961


In [36]:
# Run the full CPU pipeline on the dataset and keep its total duration.
cpu_runtime = run_cpu_workflow(fname=dataset, ncols=ncols)

################################# LOADING DATA #################################
-----------------------> Step time: 0:00:11.707485, elapsed time: 0:00:11.707485
######################## SPLITTING INTO TRAIN AND TEST #########################
-----------------------> Step time: 0:00:02.565499, elapsed time: 0:00:14.272984

---------------------------+ Train size: 7,760,776 +----------------------------
---------------------------+ Test  size: 2,586,926 +----------------------------

################################ FITTING MODEL #################################
-----------------------> Step time: 0:16:22.040610, elapsed time: 0:16:36.313594

################################## PREDICTING ##################################

--------------------------+ R^2 of the model: 0.9685 +--------------------------

-----------------------> Step time: 0:00:20.727196, elapsed time: 0:16:57.040790


In [41]:
# Report both totals, then the ratio of the CPU total to the GPU total.
print_message(f'Total CPU time: {cpu_runtime}')
print_message(f'Total GPU time: {gpu_runtime}')
print_message(f'Speedup over CPU: {cpu_runtime / gpu_runtime:.3f}')

######################## Total CPU time: 0:16:57.040790 ########################
######################## Total GPU time: 0:02:08.466961 ########################
########################### Speedup over CPU: 7.917 ############################
