# NVIDIA RAPIDS on Azure ML
## GTC 2020 DLI WORKSHOP

In this notebook we use NYC Taxi dataset to showcase how easy it is to *translate* the single-GPU RAPIDS code into a multi-GPU equivalent.

**AUTHORS**
* Tom Drabas (Microsoft)
* Manuel Reyes Gomez (NVIDIA)


**GREATER TEAM**
* Joshua Patterson (NVIDIA)
* Keith Kraus (NVIDIA)
* Brad Rees (NVIDIA)
* John Zedlewski (NVIDIA)
* Paul Mahler (NVIDIA)
* Nick Becker (NVIDIA)
* Michael Beaumont (NVIDIA)
* Chau Dang (NVIDIA)

# Import modules

In [1]:
import dask
import dask_cudf
import cudf
import os
import socket
import dask_xgboost as dxgb
import distributed
import pickle
import glob
from collections import OrderedDict
import numpy as np
from math import cos, sin, asin, sqrt, pi

from dask.delayed import delayed

from azureml.core import Run

In [2]:
print("Setting dask settings...")
dask.config.set({'distributed.scheduler.work-stealing': False})
dask.config.set({'distributed.scheduler.bandwidth': 20})
print("Changes to dask settings")
print("-> Setting work-stealing to ", dask.config.get('distributed.scheduler.work-stealing'))
print("-> Setting scheduler bandwidth to ", dask.config.get('distributed.scheduler.bandwidth'))
print("Settings updates complete")

Setting dask settings...
Changes to dask settings
-> Setting work-stealing to  False
-> Setting scheduler bandwidth to  20
Settings updates complete


In [3]:
run = Run.get_context()
ip = socket.gethostbyname(socket.gethostname())
scheduler = run.get_metrics()["scheduler"]
client = distributed.Client(scheduler)
client.restart()

0,1
Client  Scheduler: tcp://10.7.0.5:8786  Dashboard: http://10.7.0.5:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 0 B


In [4]:
!nvidia-smi

Fri Feb 14 20:46:26 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  On   | 00009026:00:00.0 Off |                    0 |
| N/A   34C    P0    37W / 250W |    622MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

# Read, clean and featurize data

In [5]:
STORAGE_OPTIONS = {
    'account_name': run.experiment.workspace.datastores['datafiles'].account_name,
    'account_key' : run.experiment.workspace.datastores['datafiles'].account_key
}

protocol  = 'abfs'      # change to 'adl' for gen 1
container = 'datasets'

In [6]:
data_path = '../../../../../../datafiles'
datafiles = glob.glob(data_path + '/nyctaxi/2015/*')
datafiles = ['/'.join(f.split('/')[7:]) for f in datafiles[0:8]]
datafiles

['nyctaxi/2015/yellow_tripdata_2015-01.csv',
 'nyctaxi/2015/yellow_tripdata_2015-02.csv',
 'nyctaxi/2015/yellow_tripdata_2015-03.csv',
 'nyctaxi/2015/yellow_tripdata_2015-04.csv',
 'nyctaxi/2015/yellow_tripdata_2015-05.csv',
 'nyctaxi/2015/yellow_tripdata_2015-06.csv',
 'nyctaxi/2015/yellow_tripdata_2015-07.csv',
 'nyctaxi/2015/yellow_tripdata_2015-08.csv']

In [7]:
files = [f'{protocol}://{container}/{f}' for f in datafiles]
files

['abfs://datasets/nyctaxi/2015/yellow_tripdata_2015-01.csv',
 'abfs://datasets/nyctaxi/2015/yellow_tripdata_2015-02.csv',
 'abfs://datasets/nyctaxi/2015/yellow_tripdata_2015-03.csv',
 'abfs://datasets/nyctaxi/2015/yellow_tripdata_2015-04.csv',
 'abfs://datasets/nyctaxi/2015/yellow_tripdata_2015-05.csv',
 'abfs://datasets/nyctaxi/2015/yellow_tripdata_2015-06.csv',
 'abfs://datasets/nyctaxi/2015/yellow_tripdata_2015-07.csv',
 'abfs://datasets/nyctaxi/2015/yellow_tripdata_2015-08.csv']

# Define global vars and methods

In [8]:
columns_dtypes = OrderedDict(
    [
        ('vendor_id', 'int32'),
        ('pickup_datetime', 'date'),
        ('dropoff_datetime', 'date'),
        ('passenger_count', 'int32'),
        ('trip_distance', 'int32'),
        ('pickup_longitude', 'float64'),
        ('pickup_latitude', 'float64'),
        ('rate_code', 'int32'),
        ('store_and_fwd_flag', 'int32'),
        ('dropoff_longitude', 'float64'),
        ('dropoff_latitude', 'float64'),
        ('payment_type', 'int32'),
        ('fare_amount', 'float64'),
        ('extra', 'float64'),
        ('mta_tax', 'float64'),
        ('tip_amount', 'float64'),
        ('tolls_amount', 'float64'),
        ('surcharge', 'float64'),
        ('total_amount', 'float64')
    ]
)

use_col  = [
      'pickup_datetime'
    , 'dropoff_datetime'
    , 'passenger_count'
    , 'trip_distance'
    , 'pickup_longitude'
    , 'pickup_latitude'
    , 'rate_code'
    , 'dropoff_longitude'
    , 'dropoff_latitude'
    , 'fare_amount'
]

query_frags = [
    'fare_amount > 0 and fare_amount < 500',
    'passenger_count > 0 and passenger_count < 6',
    'pickup_longitude > -75 and pickup_longitude < -73',
    'dropoff_longitude > -75 and dropoff_longitude < -73',
    'pickup_latitude > 40 and pickup_latitude < 42',
    'dropoff_latitude > 40 and dropoff_latitude < 42'
]

In [9]:
def print_message(msg, length=80, filler='#', pre_post=''):
    print(f'{pre_post} {msg} {pre_post}'.center(length, filler))
    
def print_time(t_curr, t_next, t_start, length=80):
    print('> Step time: {0}, elapsed time: {1}'
          .format(str(t_curr - t_next), str(t_curr - t_start)).rjust(length, '-'))
    
def haversine_distance_kernel_gpu(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude, h_distance):
    for i, (x_1, y_1, x_2, y_2) in enumerate(zip(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude)):
        x_1 = pi / 180 * x_1
        y_1 = pi / 180 * y_1
        x_2 = pi / 180 * x_2
        y_2 = pi / 180 * y_2
        
        dlon = y_2 - y_1
        dlat = x_2 - x_1
        a = sin(dlat / 2)**2 + cos(x_1) * cos(x_2) * sin(dlon / 2)**2
        
        c = 2 * asin(sqrt(a)) 
        r = 3959 # Radius of earth in miles
        
        h_distance[i] = c * r
        
def haversine_distance_kernel_cpu(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2]) 
    
    newlon = lon2 - lon1
    newlat = lat2 - lat1
 
    haver_formula = np.sin(newlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(newlon/2.0)**2
 
    dist = 2 * np.arcsin(np.sqrt(haver_formula ))
    km = 3959 * dist
    return km

def bin_fares(fare_amount, fare_bin):
    for i, fare in enumerate(fare_amount):
        fare_bin[i] = int(fare / 10.0) * 10
    
def add_features(df):
    df['pickup_datetime'] = df['pickup_datetime'].astype('datetime64[ms]')
    
    df['hour']  = df['pickup_datetime'].dt.hour
    df['year']  = df['pickup_datetime'].dt.year
    df['month'] = df['pickup_datetime'].dt.month
    df['day']   = df['pickup_datetime'].dt.day
    
    df['pickup_latitude_r']   = (df['pickup_latitude']   / .01).astype('int') / 100.0
    df['pickup_longitude_r']  = (df['pickup_longitude']  / .01).astype('int') / 100.0
    df['dropoff_latitude_r']  = (df['dropoff_latitude']  / .01).astype('int') / 100.0
    df['dropoff_longitude_r'] = (df['dropoff_longitude'] / .01).astype('int') / 100.0
    

    df = df.drop('pickup_datetime', axis=1)
    df = df.drop('dropoff_datetime', axis=1)

    df = df.apply_rows(
        haversine_distance_kernel_gpu
        , incols=['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
        , outcols=dict(h_distance=np.float32)
        , kwargs=dict()
    )
        
    return df

# Read the data

In [10]:
%%time
taxi_df = (
    dask_cudf
    .read_csv(
        files
        , storage_options=STORAGE_OPTIONS
        , names=list(columns_dtypes.keys())
        , dtype=list(columns_dtypes.values())
        , skiprows=1
        , usecols=use_col)
    .repartition(npartitions=72)
    .persist()
)

print(f' Number of records: {len(taxi_df):,} '.center(80, '#'))

######################## Number of records: 99,799,189 #########################
CPU times: user 1.05 s, sys: 347 ms, total: 1.4 s
Wall time: 1min 28s


# Featurize the data

In [11]:
print_message('SUBSETTING DATA')
# apply a list of filter conditions to throw out records with missing or outlier values
taxi_df = taxi_df.query(' and '.join(query_frags))

############################### SUBSETTING DATA ################################


In [12]:
print_message('FEATURIZING DATA')
taxi_df = taxi_df.map_partitions(add_features)

############################### FEATURIZING DATA ###############################


In [14]:
%%time
print_message('GROUPING DATA')

def bin_fares_frame(df):
    ### PUT THE FARE IN BINS OF $10
    df = df.apply_rows(
        bin_fares
        , incols = {'fare_amount': 'fare_amount'}
        , outcols = {'fare_bin': np.int32}
        , kwargs = {}
    )
    
    return df

taxi_df = taxi_df.map_partitions(bin_fares_frame).persist()
done = distributed.wait(taxi_df)
taxi_df_fare = taxi_df[['fare_bin', 'passenger_count']].groupby(by='fare_bin').count().compute()

################################ GROUPING DATA #################################
CPU times: user 2.78 s, sys: 24.8 ms, total: 2.81 s
Wall time: 18.3 s


In [15]:
taxi_df.head()

Unnamed: 0,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,dropoff_longitude,dropoff_latitude,fare_amount,hour,year,month,day,pickup_latitude_r,pickup_longitude_r,dropoff_latitude_r,dropoff_longitude_r,h_distance,fare_bin
0,1,1,-73.993896,40.750111,1,-73.974785,40.750618,12.0,19,2015,1,15,40.75,-73.99,40.75,-73.97,1.001024,10
1,1,3,-74.001648,40.724243,1,-73.994415,40.759109,14.5,20,2015,1,10,40.72,-74.0,40.75,-73.99,2.438754,10
2,1,1,-73.963341,40.802788,1,-73.95182,40.824413,9.5,20,2015,1,10,40.8,-73.96,40.82,-73.95,1.611153,0
3,1,0,-74.009087,40.713818,1,-74.004326,40.719986,3.5,20,2015,1,10,40.71,-74.0,40.71,-74.0,0.493789,0
4,1,3,-73.971176,40.762428,1,-74.004181,40.742653,15.0,20,2015,1,10,40.76,-73.97,40.74,-74.0,2.202665,10


In [16]:
%%time
print_message('SORTING DATA')
taxi_df = taxi_df.sort_values(by='fare_amount').persist()

################################# SORTING DATA #################################
CPU times: user 17.7 s, sys: 262 ms, total: 18 s
Wall time: 7min 17s
