# Testing Dask on GPUs

In [5]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

import cudf
import cuml
import cupy as cp
import dask_cudf

cluster = LocalCUDACluster(n_workers=2, protocol="ucx", rmm_pool_size="180GB")
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46595 instead
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [6]:
print(client)

<Client: 'ucx://127.0.0.1:58385' processes=2 threads=2, memory=191.00 GiB>


This is the test dataset used to train the ML code:

In [7]:
load_dir_dataset = "/g/data/w97/sho561/Downscale/BARRA/Training_Testing_new/"

In [8]:
train_grids = cp.array([642, 714, 720, 1207, 1233, 1682, 1728, 2348, 2817, 2855, 3002, 3114, 3346, 3809, 4233, 4322, 4615, 4623, 6081, 6145])
all_years = cp.arange(1990,2019, step=1)
train_years = cp.array([1990, 1991, 1992, 1995, 1996, 2001, 2003, 2004, 2016, 2018])
test_years = cp.array([1993, 1994, 1997, 1998, 1999, 2000, 2002, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017, 2019]) 

featuresList = ['av_lat_hflx', 'av_mslp', 'av_netlwsfc', 'av_netswsfc', 'av_qsair_scrn', 'av_temp_scrn', 
'av_canopy_height', 'av_uwnd10m', 'av_vwnd10m', 'av_leaf_area_index', 'soil_albedo', 'soil_porosity', 'soil_bulk_density', 'topog' ]

seed = 100
ntrees = 100

## Using dask to concatonate files

- I struggled to open datasets into a cudf dataframe and concatonate
- So I'm using dask_cudf instead, seems to work pretty well. 
- It's probably good that dask takes control of the data tasks initially.

In [9]:
%%time

dask_sample_cudf = dask_cudf.concat([dask_cudf.read_csv(load_dir_dataset +'%s_%s_predictors_target.csv' %(642, year)) for year in all_years], axis=0).reset_index()

CPU times: user 2.45 s, sys: 539 ms, total: 2.99 s
Wall time: 2.72 s


In [10]:
dask_sample_cudf.compute().dropna(axis=0)

Unnamed: 0,index,ref_coarse_cell,ref_fine_cell,year,month,day,target,av_lat_hflx,av_mslp,av_netlwsfc,...,topog,soil_albedo,ETnw,ETn,ETne,ETw,ETe,ETsw,ETs,ETse
0,0,642,38410,1990,1,1,127.458336,103.0,101483.25,-95.066406,...,558.253296,0.115572,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
1,1,642,38411,1990,1,1,130.458328,103.0,101483.25,-95.066406,...,578.896240,0.115198,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
2,2,642,38412,1990,1,1,130.916672,103.0,101483.25,-95.066406,...,589.701294,0.114823,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
3,3,642,38413,1990,1,1,131.083328,103.0,101483.25,-95.066406,...,571.424072,0.114449,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
4,4,642,38414,1990,1,1,132.666672,103.0,101483.25,-95.066406,...,535.188477,0.114075,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23355,23355,642,42571,2018,9,9,69.166664,69.5,101989.75,-109.976562,...,697.690308,0.106115,67.00,72.25,72.25,59.00,74.0,63.00,70.50,78.5
23356,23356,642,42572,2018,9,9,72.000000,69.5,101989.75,-109.976562,...,609.880127,0.107231,67.00,72.25,72.25,59.00,74.0,63.00,70.50,78.5
23357,23357,642,42573,2018,9,9,71.791664,69.5,101989.75,-109.976562,...,558.244934,0.108348,67.00,72.25,72.25,59.00,74.0,63.00,70.50,78.5
23358,23358,642,42574,2018,9,9,73.083336,69.5,101989.75,-109.976562,...,542.575623,0.109464,67.00,72.25,72.25,59.00,74.0,63.00,70.50,78.5


In [11]:
index_testing = dask_sample_cudf.where(dask_sample_cudf['year'].isin(test_years))
index_training = dask_sample_cudf.where(dask_sample_cudf['year'].isin(train_years))

In [12]:
index_testing.compute().dropna(axis=0)

Unnamed: 0,index,ref_coarse_cell,ref_fine_cell,year,month,day,target,av_lat_hflx,av_mslp,av_netlwsfc,...,topog,soil_albedo,ETnw,ETn,ETne,ETw,ETe,ETsw,ETs,ETse
0,0.0,642.0,38410.0,1993.0,1.0,1.0,107.375000,87.50,101638.0,-58.515625,...,558.253296,0.115572,79.25,82.25,89.25,84.50,99.50,84.00,80.75,102.25
1,1.0,642.0,38411.0,1993.0,1.0,1.0,106.791664,87.50,101638.0,-58.515625,...,578.896240,0.115198,79.25,82.25,89.25,84.50,99.50,84.00,80.75,102.25
2,2.0,642.0,38412.0,1993.0,1.0,1.0,105.250000,87.50,101638.0,-58.515625,...,589.701294,0.114823,79.25,82.25,89.25,84.50,99.50,84.00,80.75,102.25
3,3.0,642.0,38413.0,1993.0,1.0,1.0,105.791664,87.50,101638.0,-58.515625,...,571.424072,0.114449,79.25,82.25,89.25,84.50,99.50,84.00,80.75,102.25
4,4.0,642.0,38414.0,1993.0,1.0,1.0,107.166664,87.50,101638.0,-58.515625,...,535.188477,0.114075,79.25,82.25,89.25,84.50,99.50,84.00,80.75,102.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23355,23355.0,642.0,42571.0,2017.0,9.0,9.0,39.083332,46.25,102075.5,-97.476562,...,697.690308,0.106115,45.75,51.00,60.00,46.75,58.25,57.25,48.50,59.75
23356,23356.0,642.0,42572.0,2017.0,9.0,9.0,39.333332,46.25,102075.5,-97.476562,...,609.880127,0.107231,45.75,51.00,60.00,46.75,58.25,57.25,48.50,59.75
23357,23357.0,642.0,42573.0,2017.0,9.0,9.0,40.375000,46.25,102075.5,-97.476562,...,558.244934,0.108348,45.75,51.00,60.00,46.75,58.25,57.25,48.50,59.75
23358,23358.0,642.0,42574.0,2017.0,9.0,9.0,41.041668,46.25,102075.5,-97.476562,...,542.575623,0.109464,45.75,51.00,60.00,46.75,58.25,57.25,48.50,59.75


In [13]:
index_training.compute().dropna(axis=0)

Unnamed: 0,index,ref_coarse_cell,ref_fine_cell,year,month,day,target,av_lat_hflx,av_mslp,av_netlwsfc,...,topog,soil_albedo,ETnw,ETn,ETne,ETw,ETe,ETsw,ETs,ETse
0,0.0,642.0,38410.0,1990.0,1.0,1.0,127.458336,103.0,101483.25,-95.066406,...,558.253296,0.115572,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
1,1.0,642.0,38411.0,1990.0,1.0,1.0,130.458328,103.0,101483.25,-95.066406,...,578.896240,0.115198,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
2,2.0,642.0,38412.0,1990.0,1.0,1.0,130.916672,103.0,101483.25,-95.066406,...,589.701294,0.114823,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
3,3.0,642.0,38413.0,1990.0,1.0,1.0,131.083328,103.0,101483.25,-95.066406,...,571.424072,0.114449,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
4,4.0,642.0,38414.0,1990.0,1.0,1.0,132.666672,103.0,101483.25,-95.066406,...,535.188477,0.114075,108.75,114.75,138.75,100.25,116.5,102.25,107.75,126.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23355,23355.0,642.0,42571.0,2018.0,9.0,9.0,69.166664,69.5,101989.75,-109.976562,...,697.690308,0.106115,67.00,72.25,72.25,59.00,74.0,63.00,70.50,78.5
23356,23356.0,642.0,42572.0,2018.0,9.0,9.0,72.000000,69.5,101989.75,-109.976562,...,609.880127,0.107231,67.00,72.25,72.25,59.00,74.0,63.00,70.50,78.5
23357,23357.0,642.0,42573.0,2018.0,9.0,9.0,71.791664,69.5,101989.75,-109.976562,...,558.244934,0.108348,67.00,72.25,72.25,59.00,74.0,63.00,70.50,78.5
23358,23358.0,642.0,42574.0,2018.0,9.0,9.0,73.083336,69.5,101989.75,-109.976562,...,542.575623,0.109464,67.00,72.25,72.25,59.00,74.0,63.00,70.50,78.5




## Testing Dask_delayed

- Gotton dask to open the csv files and concatenate the years.
- Now to create a delayed For loop to do all the grids in parallel..

First create the non-parallel function to test against:

In [14]:
def load_data(grid):
    
    dask_sample_cudf = dask_cudf.concat([dask_cudf.read_csv(load_dir_dataset +'%s_%s_predictors_target.csv' %(grid, year)) for year in all_years], axis=0).reset_index()
    
    return dask_sample_cudf
    

In [None]:
%%time

for grid in train_grids:
    load_data(grid).compute().dropna(axis=0)
    

Now lets try the dask_delayed approach:

In [23]:
# I think this is how you import delayed on GPUs
from dask import delayed

In [24]:
lazy_load_data = delayed(load_data)

In [30]:
%%time

for grid in train_grids:
    lazy_load_data(grid)

CPU times: user 7.04 ms, sys: 1.53 ms, total: 8.58 ms
Wall time: 5.43 ms
