In [1]:
from cuml.dask.ensemble import RandomForestRegressor as cuRF
from cuml.dask.common import to_dask_df
from cuml.metrics import r2_score
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

In [2]:
cluster = LocalCUDACluster(threads_per_worker=1)
client = Client(cluster)

In [3]:
n_total_partitions = len(list(client.has_what().keys()))

In [4]:
import os
import cudf
import numpy as np
print(os.getcwd())
data = cudf.read_csv('load_dataset/LakeDepth/pts_merged_final.csv')
predictor = 'Depth_m'
cols_to_drop = ['FID', 'Date']
for col_to_drop in cols_to_drop:
	data = data.drop([col_to_drop], axis = 1)
	print(" - from DATA: dropped column:", col_to_drop)
    
y = data[predictor]
X = data.drop([predictor], axis = 1)

X = X.astype(np.float32)
y = y.astype(np.float32)

/att/gpfsfs/briskfs01/ppl/cssprad1/projects/rapids_rf_lake_depth
 - from DATA: dropped column: FID
 - from DATA: dropped column: Date


In [5]:
import dask_cudf
X_cudf = dask_cudf.from_cudf(X, npartitions=n_total_partitions)
wait(X_cudf)
y_cudf = dask_cudf.from_cudf(y, npartitions=n_total_partitions)
wait(y_cudf)

DoneAndNotDoneFutures(done=set(), not_done=set())

In [6]:
x_df = to_dask_df(X_cudf)

In [7]:
y_df = to_dask_df(y_cudf)

In [8]:
%%time

cuml_model = cuRF(n_estimators = 500)

CPU times: user 71.9 ms, sys: 10.7 ms, total: 82.6 ms
Wall time: 1.32 s


In [9]:
%%time
cuml_model.fit(X_cudf, y_cudf)

CPU times: user 211 ms, sys: 42.3 ms, total: 253 ms
Wall time: 4.17 s


<cuml.dask.ensemble.randomforestregressor.RandomForestRegressor at 0x7f9c9fc3eed0>

In [10]:
labels_cuml = cuml_model.predict(X_cudf).compute()

In [14]:
from sklearn.metrics import mean_absolute_error as m_a_e, r2_score as r2d2

mae_score = m_a_e(y.to_pandas(), labels_cuml.to_pandas())
r2_score = r2d2(y.to_pandas(), labels_cuml.to_pandas())
print("Scores --")
print("MAE: ", mae_score)
print("r2: ", r2_score)

Scores --
MAE:  0.6949109
r2:  0.5861952469666736
