In [None]:
# Import system and Python modules

import os
import time
import random
from pprint import pprint
import numpy as np
import cupy as cp

# Import RAPIDS specific modules

import cudf as df
import cuml
from cuml import train_test_split
from cuml.metrics.regression import r2_score as r2d2
from cuml.ensemble import RandomForestRegressor as clRF

# Import sklearn specific modules
from sklearn.model_selection import KFold
from sklearn.inspection import permutation_importance

# Import data-visualization modules

import matplotlib.pyplot as plt

In [None]:
# Declare global constants: the input data path and the column names of interest
# Path of the CSV file that is read into the dataframe
FEATURES_PATH = '../data/pts_merged_final.csv'
# Name of the depth column, which is the prediction target (label)
DEPTH = 'Depth_m'
# Names of the date and feature-ID columns, which are dropped before modelling
DATE = 'Date'
FID = 'FID'

In [None]:
# Load the whole CSV into a GPU-based (cuDF) dataframe.
# Note: `df` here is the alias given to the cudf module in the import cell,
# not a dataframe object.
lakes_depth_df = df.read_csv(FEATURES_PATH)

In [None]:
# Drop the ID and date columns, which are not used for modelling.
# The column names come from the constants declared above so that each name
# is spelled in exactly one place.
lakes_depth_nd = lakes_depth_df.drop([FID, DATE], axis = 1)
lakes_depth_nd.head(5)

In [None]:
# Build the labels (the actual depth values to predict) and the covariates
# dataframe (every remaining column). The target column name comes from the
# DEPTH constant rather than a repeated string literal.
labels = lakes_depth_nd[DEPTH]
covariates = lakes_depth_nd.drop([DEPTH], axis=1)

# Check to ensure everything looks good
labels.head(5)

In [None]:
# Cast both the labels and the covariates to 32-bit floats in one step
# (presumably the dtype the cuML estimator expects -- TODO confirm)
labels, covariates = labels.astype(cp.float32), covariates.astype(cp.float32)

In [None]:
# Hyperparameter configuration for the random-forest search.

# Candidate values: the search functions draw one value per model from each list.
N_ESTIMATORS = list(map(int, np.linspace(200, 2000, 10)))
MAX_DEPTH = list(map(int, np.linspace(10, 110, 11)))
N_BINS = list(map(int, np.linspace(5, 20, 10)))
BOOTSTRAP = [True, False]
MAX_FEATURES = ['auto', 'sqrt']

# Fixed settings, handed unchanged to every model.
# NOTE(review): SPLIT_ALGO and SPLIT_CRITERION are integer codes defined by cuML;
# presumably 1 selects the global-quantile split algorithm and 2 the MSE
# criterion -- confirm against the installed cuML release.
SPLIT_ALGO = 1
SPLIT_CRITERION = 2
BOOTSTRAP_FEATURES = False
ROWS_SAMPLE = 1.0
MAX_LEAVES = -1
MIN_ROWS_PER_NODE = 2
MIN_IMPURITY_DECREASE = 0.0
ACCURACY_METRIC = 'mean_ae'  # alternatives noted by the author: 'mse', 'r2', 'median_ae'
QUANTILEPT = False
SEED = 42
VERBOSE = False

# Summary of the searched values, printed for reference only.
random_grid = dict(n_estimators=N_ESTIMATORS,
                   max_depth=MAX_DEPTH,
                   bootstrap=BOOTSTRAP,
                   max_features=MAX_FEATURES,
                   n_bins=N_BINS)

pprint(random_grid)

In [None]:
def random_search_(N_SEARCH, test_size=0.2, random_state=None):
    """Random hyperparameter search scored on a single hold-out split.

    Draws N_SEARCH random combinations from the notebook-level candidate lists
    (N_ESTIMATORS, BOOTSTRAP, MAX_DEPTH, MAX_FEATURES, N_BINS), fits one random
    forest per combination on the training part of `covariates` / `labels`,
    and scores it on the held-out part.

    Parameters
    ----------
    N_SEARCH : int
        Number of random hyperparameter combinations to evaluate.
    test_size : float, optional
        Forwarded to train_test_split as `test_size`. The notebook never
        defined the TEST_SIZE global the original code referred to, so 0.2 is
        only a conventional placeholder -- adjust as needed.
    random_state : int or None, optional
        Seed for the shuffled split. None (the default) means the notebook's
        SEED constant is used.

    Returns
    -------
    list of dict
        One dict per combination with the keys 'n_estimators', 'bootstrap',
        'max_depth', 'max_features', 'n_bins' and 'performance', ordered from
        best to worst.

    Notes
    -----
    The hyperparameter draws use the global `random` module, which this
    function does not seed; call random.seed(...) first for a repeatable search.
    """
    if random_state is None:
        random_state = SEED

    # The split is seeded, hence identical for every candidate: do it once up
    # front instead of repeating it inside the loop.
    st = time.time()
    cv_train, cv_test, labels_train, labels_test = train_test_split(covariates, labels,
                                                           test_size=test_size,
                                                           shuffle=True,
                                                           random_state=random_state)
    et = time.time()
    print("   -time to split data (sec): ", et-st)

    results = []

    for i in range(N_SEARCH):
        print("Random search epoch: ", i)
        depth_rf_model = clRF(n_estimators = random.choice(N_ESTIMATORS),
                            split_algo = SPLIT_ALGO,
                            split_criterion = SPLIT_CRITERION,
                            bootstrap = random.choice(BOOTSTRAP),
                            bootstrap_features = BOOTSTRAP_FEATURES,
                            rows_sample = ROWS_SAMPLE,
                            max_depth = random.choice(MAX_DEPTH),
                            max_leaves = MAX_LEAVES,
                            max_features = random.choice(MAX_FEATURES),
                            n_bins = random.choice(N_BINS),
                            min_rows_per_node = MIN_ROWS_PER_NODE,
                            min_impurity_decrease = MIN_IMPURITY_DECREASE,
                            accuracy_metric = ACCURACY_METRIC,
                            quantile_per_tree = QUANTILEPT,
                            seed = SEED,
                            verbose = VERBOSE)

        # Fit the model with the newly drawn parameters
        st = time.time()
        depth_rf_model.fit(cv_train, labels_train)
        et = time.time()
        print("   -time to train (sec): ", et-st)

        score = depth_rf_model.score(cv_test, labels_test)
        print("   -score (mae): ", score)

        results.append({'n_estimators':depth_rf_model.n_estimators, 'bootstrap':depth_rf_model.bootstrap,
                  'max_depth':depth_rf_model.max_depth, 'max_features':depth_rf_model.max_features,
                  'n_bins':depth_rf_model.n_bins, 'performance':score})

    # Sort once, after all candidates are scored. Per the cuML docs, score()
    # reports the metric chosen via accuracy_metric: for the error metrics
    # ('mean_ae', 'median_ae', 'mse') smaller is better, only 'r2' is
    # larger-is-better.
    higher_is_better = ACCURACY_METRIC == 'r2'
    results.sort(key=lambda x : x['performance'], reverse=higher_is_better)

    return results

In [None]:
def random_search_kfold(N_SEARCH):
    """Random hyperparameter search scored with 5-fold cross-validation.

    Draws N_SEARCH random combinations from the notebook-level candidate lists
    (N_ESTIMATORS, BOOTSTRAP, MAX_DEPTH, MAX_FEATURES, N_BINS). Each combination
    is fitted and scored once per fold of `covariates` / `labels`, and the mean
    of the fold scores is recorded as its performance.

    Parameters
    ----------
    N_SEARCH : int
        Number of random hyperparameter combinations to evaluate.

    Returns
    -------
    list of dict
        One dict per combination with the keys 'n_estimators', 'bootstrap',
        'max_depth', 'max_features', 'n_bins' and 'performance' (mean fold
        score), ordered from best to worst.

    Notes
    -----
    * KFold(5) is used with its defaults, i.e. without shuffling, so folds are
      contiguous row blocks. NOTE(review): if the CSV rows are ordered (e.g. by
      lake or date) consider KFold(5, shuffle=True, random_state=SEED).
    * The hyperparameter draws use the global `random` module, which this
      function does not seed; call random.seed(...) first for a repeatable search.
    """
    k_fold = KFold(5)
    results = []

    for i in range(N_SEARCH):
        print("Random search epoch: ", i)
        depth_rf_model = clRF(n_estimators = random.choice(N_ESTIMATORS),
                            split_algo = SPLIT_ALGO,
                            split_criterion = SPLIT_CRITERION,
                            bootstrap = random.choice(BOOTSTRAP),
                            bootstrap_features = BOOTSTRAP_FEATURES,
                            rows_sample = ROWS_SAMPLE,
                            max_depth = random.choice(MAX_DEPTH),
                            max_leaves = MAX_LEAVES,
                            max_features = random.choice(MAX_FEATURES),
                            n_bins = random.choice(N_BINS),
                            min_rows_per_node = MIN_ROWS_PER_NODE,
                            min_impurity_decrease = MIN_IMPURITY_DECREASE,
                            accuracy_metric = ACCURACY_METRIC,
                            quantile_per_tree = QUANTILEPT,
                            seed = SEED,
                            verbose = VERBOSE)

        # The same estimator object is re-fitted on each fold's training rows
        # and scored on that fold's held-out rows; the timer covers both steps.
        scores = []
        st = time.time()
        for train, test in k_fold.split(covariates, labels):
            depth_rf_model.fit(covariates.iloc[train], labels.iloc[train])
            score = depth_rf_model.score(covariates.iloc[test], labels.iloc[test])
            scores.append(score)
        et = time.time()
        print("   -time to train (sec): ", et-st)

        results.append({'n_estimators':depth_rf_model.n_estimators, 'bootstrap':depth_rf_model.bootstrap,
                  'max_depth':depth_rf_model.max_depth, 'max_features':depth_rf_model.max_features,
                  'n_bins':depth_rf_model.n_bins, 'performance':np.mean(scores)})

    # Sort once, after all candidates are scored. Per the cuML docs, score()
    # reports the metric chosen via accuracy_metric: for the error metrics
    # ('mean_ae', 'median_ae', 'mse') smaller is better, only 'r2' is
    # larger-is-better.
    higher_is_better = ACCURACY_METRIC == 'r2'
    results.sort(key=lambda x : x['performance'], reverse=higher_is_better)

    return results

In [None]:
# Run the k-fold random search over 100 random hyperparameter combinations
# (every combination is fitted once per fold, so this cell is expensive) and
# print the list of per-combination results it returns.
rs_search_100 = random_search_kfold(100)
print(rs_search_100)