# Dask and cuDF working together

Converting some of the code to work with Dask

In [None]:
# Import system and Python modules

import os
import time
import random
from pprint import pprint
import numpy as np
import cupy as cp

# Import RAPIDS specific modules

import cudf as df
import cuml
from cuml import train_test_split
from cuml.metrics.regression import r2_score as r2d2

# Import Dask specific modules
from cuml.dask.common import utils as dask_utils
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
import dask_cudf

from cuml.dask.ensemble import RandomForestRegressor as cumlDaskRF

# Import sklearn specific modules
from sklearn.model_selection import KFold
from sklearn.inspection import permutation_importance

# Import data-visualization modules

import matplotlib.pyplot as plt

# Start Dask Cluster

In [None]:
# This will use all GPUs on the local host by default
cluster = LocalCUDACluster(threads_per_worker=1)
# Client handle for the cluster; later cells pass `c` when persisting the
# partitioned data across workers.
c = Client(cluster)

# Query the client for all connected workers
# (the keys of has_what() are taken as the worker identifiers)
workers = c.has_what().keys()
# The worker count is reused later as the number of data partitions.
n_workers = len(workers)
# NOTE(review): n_streams is not referenced in the cells visible here.
n_streams = 8 # Performance optimization

# Bare expression so the notebook renders the client as the cell output.
c

# Data Preprocessing

In [None]:
# Location of the input feature table
FEATURES_PATH = '../data/pts_merged_final.csv'

# Column names used by the notebook
FID = 'FID'
DATE = 'Date'
DEPTH = 'Depth_m'

# Train/test split settings
RANDOM_STATE = 42
TEST_SIZE = 0.2

In [None]:
# Load everything into GPU-based DF
# (`df` here is the cuDF module, imported as `import cudf as df`, not a
# DataFrame instance)
lakes_depth_df = df.read_csv(FEATURES_PATH)

In [None]:
# Drop the identifier and date columns so they are not carried into the
# label/covariate frames built from lakes_depth_nd.
# The names come from the FID / DATE constants rather than repeated string
# literals, so the column schema is spelled out in one place only.
lakes_depth_nd = lakes_depth_df.drop([FID, DATE], axis=1)

# Preview the first rows of the trimmed frame.
lakes_depth_nd.head(5)

In [None]:
# Inspect data for any anomalies or anything else odd-looking
# (summary statistics of the remaining columns, shown as the cell output)
lakes_depth_nd.describe()

In [None]:
# Split the frame into the regression target (labels) and the predictor
# columns (covariates). The target column is named through the DEPTH
# constant instead of a repeated 'Depth_m' literal.
labels = lakes_depth_nd[DEPTH]
covariates = lakes_depth_nd.drop([DEPTH], axis=1)

# Check to ensure everything looks good
labels.head(5)

In [None]:
# Preview the first five rows of the covariates frame.
covariates.head(5)

In [None]:
# Cast both the covariates frame and the label series to float32.
covariates = covariates.astype(cp.float32)
labels = labels.astype(cp.float32)

In [None]:
# Hold out TEST_SIZE of the rows for evaluation; RANDOM_STATE is passed
# through to the splitter.
cv_train, cv_test, labels_train, labels_test = train_test_split(
    covariates,
    labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Report the shape of each piece of the split as a sanity check.
for caption, part in (
    ('Training features shape:', cv_train),
    ('Testing features shape:', cv_test),
    ('Training labels shape:', labels_train),
    ('Testing labels shape:', labels_test),
):
    print(caption, part.shape)

## Distribute data to worker GPUs

In [None]:
# One partition per connected worker.
n_partitions = n_workers


def distribute(covariates, labels):
    """Partition a covariates/labels pair with Dask and persist both.

    Both inputs are converted with ``dask_cudf.from_cudf`` into
    ``n_partitions`` partitions, so each worker trains on a
    1/n_partitions fraction of the data, and are then persisted across
    ``workers`` through the client ``c`` to cache them in active memory.

    Returns the persisted ``(covariates, labels)`` Dask collections.
    """
    features_ddf = dask_cudf.from_cudf(covariates, npartitions=n_partitions)
    target_ddf = dask_cudf.from_cudf(labels, npartitions=n_partitions)

    features_ddf, target_ddf = dask_utils.persist_across_workers(
        c, [features_ddf, target_ddf], workers=workers
    )
    return features_ddf, target_ddf


# Distribute the training split and the test split separately.
cv_train_dask, labels_train_dask = distribute(cv_train, labels_train)
cv_test_dask, labels_test_dask = distribute(cv_test, labels_test)

## Train the distributed cuML model

In [None]:
# Hyper-parameters for the training phase, grouped by purpose.

# Forest size and per-tree limits
N_ESTIMATORS = 2000
MAX_DEPTH = 16
MAX_LEAVES = -1
MAX_FEATURES = 'auto'
MIN_ROWS_PER_NODE = 2
MIN_IMPURITY_DECREASE = 0.0

# Split search
SPLIT_ALGO = 1
SPLIT_CRITERION = 2
N_BINS = 8
QUANTILEPT = False  # passed to the model as quantile_per_tree

# Row / feature sampling
BOOTSTRAP = True
BOOTSTRAP_FEATURES = False
ROWS_SAMPLE = 1.0

# Scoring, seeding and logging
# Other metric names noted by the author: 'mse', 'r2', 'median_aw'
# (the last is presumably a typo for 'median_ae' -- confirm against cuML docs)
ACCURACY_METRIC = 'mean_ae'
SEED = 42
VERBOSE = False

In [None]:
# Gather the estimator settings in one mapping, then build the distributed
# random-forest regressor from it.
rf_params = {
    'n_estimators': N_ESTIMATORS,
    'split_algo': SPLIT_ALGO,
    'split_criterion': SPLIT_CRITERION,
    'bootstrap': BOOTSTRAP,
    'bootstrap_features': BOOTSTRAP_FEATURES,
    'rows_sample': ROWS_SAMPLE,
    'max_depth': MAX_DEPTH,
    'max_leaves': MAX_LEAVES,
    'max_features': MAX_FEATURES,
    'n_bins': N_BINS,
    'min_rows_per_node': MIN_ROWS_PER_NODE,
    'min_impurity_decrease': MIN_IMPURITY_DECREASE,
    'accuracy_metric': ACCURACY_METRIC,
    'quantile_per_tree': QUANTILEPT,
    'seed': SEED,
    'verbose': VERBOSE,
}
depth_rf_model_0 = cumlDaskRF(**rf_params)

In [None]:
%%time

# The %%time cell magic above has to stay on the first line of the cell; it
# times the whole cell, so the measurement includes the wait() below.
# Fit the forest on the persisted training partitions.
depth_rf_model_0.fit(cv_train_dask, labels_train_dask)
wait(depth_rf_model_0.rfs) # Allow asynchronous training tasks to finish

# Predict and check accuracy

In [None]:
# Predict on the distributed test covariates, then compute the lazy result
# and convert it to an array for scoring.
test_pred_dask = depth_rf_model_0.predict(cv_test_dask)
cuml_y_pred = test_pred_dask.compute().to_array()

In [None]:
# Score the predictions against the held-out test labels.
# NOTE(review): this import rebinds `r2d2`, which the import cell bound to
# cuML's r2_score; from this cell onward `r2d2` is scikit-learn's version.
from sklearn.metrics import mean_absolute_error as m_a_e, r2_score as r2d2

# Convert the test labels to an array once and reuse it for both metrics,
# instead of converting the same series twice.
labels_test_host = labels_test.to_array()

mae_score = m_a_e(labels_test_host, cuml_y_pred)
r2_score = r2d2(labels_test_host, cuml_y_pred)
print("Scores --")
print("MAE: ", mae_score)
print("r2: ", r2_score)