In [1]:
# Import sytem and python modules

import os
import time
import random
from pprint import pprint
import numpy as np
import cupy as cp
from pprint import pprint
# Import RAPIDS specific modules

import cudf as df
import cuml
from cuml import train_test_split
from cuml.metrics.regression import r2_score as r2d2
from cuml.ensemble import RandomForestRegressor as clRF

# Import sklearn specific modules
from sklearn.model_selection import KFold
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
# Import data-visualization modules

import matplotlib.pyplot as plt

In [2]:
FEATURES_PATH = 'load_dataset/LakeDepth/pts_merged_final.csv'
DEPTH = 'Depth_m'
DATE = 'Date'
FID = 'FID'

TEST_SIZE = 0.2
RANDOM_STATE = 42

# Load everything into GPU-based DF
lakes_depth_df = df.read_csv(FEATURES_PATH)
lakes_depth_nd = lakes_depth_df.drop(['FID', 'Date'], axis = 1)
lakes_depth_nd.head(5)

Unnamed: 0,Depth_m,b1_LC8_075,b2_LC8_075,b3_LC8_075,b4_LC8_075,b5_LC8_075,b6_LC8_075,b7_LC8_075,b8_LC8_075,b9_LC8_075,...,b26_LC8_07,b27_LC8_07,b28_LC8_07,b29_LC8_07,b30_LC8_07,b31_LC8_07,b32_LC8_07,b33_LC8_07,b34_LC8_07,b35_LC8_07
0,0.63,164,271,199,42,27,16,605,824,3905,...,2625,165,100,136,643,98,59,80,381,593
1,0.672727,165,272,196,44,29,16,607,842,3750,...,2750,176,107,148,659,97,59,82,364,552
2,0.670588,154,260,193,40,32,19,592,798,3850,...,2105,208,123,166,800,123,73,98,475,594
3,0.822222,156,250,195,48,40,26,624,800,3250,...,1846,256,160,205,833,167,104,133,542,650
4,1.725,117,164,78,38,23,17,713,1500,3079,...,2235,197,140,295,605,145,104,218,447,739


In [3]:
# Make our acutal_predictions i.e. labels and our covariates dataframes
labels = lakes_depth_nd['Depth_m']
covariates = lakes_depth_nd.drop(['Depth_m'], axis=1)

# Check to ensure everything looks good
labels.head(5)

0    0.630000
1    0.672727
2    0.670588
3    0.822222
4    1.725000
Name: Depth_m, dtype: float64

In [4]:
# Make sure we change all our covariate and label data to float32

labels = labels.astype(cp.float32)
covariates = covariates.astype(cp.float32)

In [5]:
cv_train, cv_test, labels_train, labels_test = train_test_split(covariates, labels,
                                                               test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [6]:
N_ESTIMATORS = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
SPLIT_ALGO = 1
SPLIT_CRITERION = 2
BOOTSTRAP = [True, False]
BOOTSTRAP_FEATURES = False
ROWS_SAMPLE = 1.0
MAX_DEPTH = [int(x) for x in np.linspace(10, 110, num=11)]
MAX_LEAVES = -1
MAX_FEATURES = ['auto', 'sqrt']
N_BINS = [int(x) for x in np.linspace(start=5, stop=20, num=10)]
MIN_ROWS_PER_NODE = 2
MIN_IMPURITY_DECREASE = 0.0
ACCURACY_METRIC = 'mean_ae'  # 'mse' #'r2' # 'median_aw' #
QUANTILEPT = False
SEED = 42
VERBOSE = False

random_grid = {'n_estimators': N_ESTIMATORS,
               'max_depth': MAX_DEPTH,
               'bootstrap': BOOTSTRAP,
               'max_features': MAX_FEATURES,
               'n_bins': N_BINS}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
 'max_features': ['auto', 'sqrt'],
 'n_bins': [5, 6, 8, 10, 11, 13, 15, 16, 18, 20],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [7]:
rf = clRF()

In [8]:
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 5, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42,
                               n_jobs = -1)

In [9]:
rf_random.fit(cv_train, labels_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  5.5min remaining:  4.8min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  7.1min finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestRegressor(n_estimators=100, max_depth=16, handle=<cuml.common.handle.Handle object at 0x7f96873d6d70>, max_features='auto', n_bins=8, n_streams=8, split_algo=1, split_criterion=2, bootstrap=True, bootstrap_features=False, verbose=2, min_rows_per_node=2, rows_sample=1.0, max_leaves=-1, accuracy_metric='mse', output_type='input', dtype=None, min_impurity_decrease=0.0, quantile_per_tree=False, seed=None),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_bins': [5, 6, 8, 10, 11, 13, 15, 16,
                                                   18, 20],
                                        'n_estimato

In [10]:
rf_random.best_params_

{'n_estimators': 1000,
 'n_bins': 11,
 'max_features': 'auto',
 'max_depth': 70,
 'bootstrap': False}

In [11]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    mae_score = mean_absolute_error(test_labels.to_pandas(), predictions.to_pandas())
    r2 = r2_score(test_labels.to_pandas(), predictions.to_pandas())
    mse_score = mean_squared_error(test_labels.to_pandas(), predictions.to_pandas())
    print('Mean Absolute Error: {:0.4f} meters.'.format(mae_score))
    print('Mean Squared Error: {:0.4f}'.format(mse_score))
    print('r2 score: {:0.4f}'.format(r2))
    return mae_score

In [14]:
base_model = clRF(n_estimators = 200)
base_model.fit(cv_train, labels_train)
base_accuracy = evaluate(base_model, cv_test, labels_test)


Mean Absolute Error: 0.4693 meters.
Mean Squared Error: 0.8310
r2 score: 0.7368


In [15]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, cv_test, labels_test)

Mean Absolute Error: 0.5584 meters.
Mean Squared Error: 1.4277
r2 score: 0.5477


In [None]:
# Lets look at a random sample of datapoints to see the actual vs predicted depths
for i in range(20):
    random_data_point = random.randint(4635)
    print('Actual: {:20} Predicted: {}'.format(labels_test.to_array()[random_data_point],
                                               model_0_predictions.to_array()[random_data_point]))