# Random forest regression

## RAPIDS single GPU

<img src="https://rapids.ai/assets/images/RAPIDS-logo-purple.svg" width="400">

In [1]:
import os

# Load data and feature engineering

Load a full month for this exercise. Note that the data is now loaded with RAPIDS (`cudf.read_csv` instead of `pd.read_csv`).

In [2]:
import cudf
import s3fs

# Connect to S3 with anon=True, as the original cell does.
s3 = s3fs.S3FileSystem(anon=True)

# Open the remote CSV inside a context manager so the S3 file handle is closed
# as soon as cuDF has finished reading it, instead of staying open for the
# lifetime of the kernel.
with s3.open('s3://kjkasjdk2934872398ojljosudfsu8fuj23/data_rev8.csv', mode='rb') as remote_csv:
    data = cudf.read_csv(remote_csv)

In [3]:
# Report the row count and the frame's deep memory footprint in megabytes.
print(
    f'Num rows: {len(data)}, Size: {data.memory_usage(deep=True).sum() / 1e6} MB'
)

Num rows: 200000, Size: 305.688903 MB


In [4]:
# Dimensions of the frame as loaded: (rows, columns).
data.shape

(200000, 189)

In [5]:
# Remove the 'Unnamed: 0' and 'Time' columns, then cast every remaining
# column to float32.
data = (
    data
    .drop(columns=['Unnamed: 0', 'Time'])
    .astype('float32')
)

In [6]:
# The first column is the prediction target; every other column is a feature.
target, *features = data.columns

# Train model

In [7]:
%pip install pyDOE

Note: you may need to restart the kernel to use updated packages.


In [60]:
# Number of hyperparameter combinations to draw from the design.
n_samples = 15

# [lower, upper] bounds for each tuned hyperparameter; the unit-interval
# design values are rescaled into these ranges further down.
min_rows_per_node = [2, 5]
rows_sample = [0.03, 0.15]
max_features = [70, 186]

In [61]:
from pyDOE import lhs
import numpy as np
# Fix NumPy's global seed before sampling (presumably the RNG that lhs draws
# from -- TODO confirm) so the design can be reproduced.
np.random.seed(42)

# n_samples design points over 3 factors, one factor per tuned hyperparameter.
lhd = lhs(3, samples=n_samples)

In [62]:
# Re-check the frame's dimensions after the two columns were dropped.
data.shape

(200000, 187)

In [63]:
import pandas as pd

def scale_param(x, limits):
    """Linearly map unit-interval value(s) onto the interval given by ``limits``.

    Parameters
    ----------
    x : float or numpy.ndarray
        Value(s) to rescale; 0 maps to the lower bound and 1 to the upper bound.
    limits : sequence of two numbers
        Bounds of the target interval. They may be given in either order.

    Returns
    -------
    float or numpy.ndarray
        ``lower + x * (upper - lower)``, with the same shape as ``x``.
    """
    # Derive both the span and the offset from the same (min, max) pair.
    # Taking the span from limits[1] - limits[0] while offsetting by
    # min(limits) would send reversed bounds outside the target interval.
    lower, upper = min(limits), max(limits)
    return x * (upper - lower) + lower

# Rescale each design column into its hyperparameter range. The two
# integer-valued parameters are rounded to whole numbers; rows_sample stays
# fractional.
samples = pd.DataFrame(
    {
        'min_rows_per_node': scale_param(lhd[:, 0], min_rows_per_node).round().astype(int).tolist(),
        'rows_sample': scale_param(lhd[:, 1], rows_sample).tolist(),
        'max_features': scale_param(lhd[:, 2], max_features).round().astype(int).tolist(),
    }
)
samples.head()

Unnamed: 0,min_rows_per_node,rows_sample,max_features
0,2,0.052929,113
1,2,0.037606,79
2,3,0.072434,140
3,4,0.149275,102
4,5,0.125725,90


In [64]:
from cuml.ensemble import RandomForestRegressor
from cuml.metrics.regression import mean_absolute_error, mean_squared_error, r2_score
from tqdm.auto import tqdm

In [65]:
# Evaluates to 2016, the test-window length used when the cumulative folds are
# built. Presumably one week of rows at 12 samples per hour
# (7 days x 24 hours x 12) -- TODO confirm the sampling interval.
7*24*12

2016

In [66]:
fold_train = []
fold_test = []

n_folds = 4
folds_cumul = True


def _fold_bounds(fold, cumulative):
    """Return ``(train_start, train_end, test_end)`` row offsets for ``fold``.

    Training rows are ``[train_start, train_end)`` and test rows are
    ``[train_end, test_end)``, so every test window begins on the row right
    after its training window.
    """
    if cumulative:
        # Expanding window: always train from row 0, move the cut-off 3000
        # rows later per fold, and test on the 2016 rows that follow it.
        train_end = 180000 + fold * 3000
        return 0, train_end, train_end + 2016

    # Non-cumulative: train on the fold-th 40000-row slice and test from its
    # end up to (fold + 1) * 50000.
    # NOTE(review): the training stride (40000) and the test-end stride (50000)
    # differ, so the test windows grow (10000, 20000, 30000, 40000 rows) and
    # later training windows overlap earlier test windows -- confirm that this
    # is intended before enabling this mode.
    return fold * 40000, (fold + 1) * 40000, (fold + 1) * 50000


if n_folds != 4:
    # The offsets above are hard-coded for exactly four folds; fail loudly
    # instead of silently leaving fold_train / fold_test empty.
    raise ValueError(f'n_folds={n_folds} is not supported: fold offsets are defined for 4 folds only')

for fold in tqdm(range(n_folds), total=n_folds):
    fold_train_start, fold_train_end, fold_test_end = _fold_bounds(fold, folds_cumul)

    train_data_x = data[features].iloc[fold_train_start:fold_train_end]
    train_data_y = data[target].iloc[fold_train_start:fold_train_end]

    test_data_x = data[features].iloc[fold_train_end:fold_test_end]
    test_data_y = data[target].iloc[fold_train_end:fold_test_end]

    # Each entry is an [X, y] pair so it can be unpacked straight into fit().
    fold_train.append([train_data_x, train_data_y])
    fold_test.append([test_data_x, test_data_y])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [None]:
# One entry per sampled hyperparameter combination: the parameters used, the
# per-fold metrics and their averages across folds.
res = []

for sample in tqdm(list(samples.index), total=samples.shape[0]):
    # Read each hyperparameter once per design point and cast it to a built-in
    # type, so the estimator gets plain int/float values for all three
    # arguments rather than a mix of NumPy scalars and Python ints.
    min_rows = int(samples.loc[sample, 'min_rows_per_node'])
    row_fraction = float(samples.loc[sample, 'rows_sample'])
    n_features = int(samples.loc[sample, 'max_features'])

    fold_scores = []
    # Walk the prepared folds directly instead of assuming there are four.
    for (train_x, train_y), (test_x, test_y) in tqdm(
        zip(fold_train, fold_test), total=len(fold_train), leave=False
    ):
        rfr = RandomForestRegressor(n_estimators=1000,
                                    min_rows_per_node=min_rows,
                                    rows_sample=row_fraction,
                                    max_features=n_features)
        rfr.fit(train_x, train_y)

        preds = rfr.predict(test_x)

        # Store every metric as a built-in float so the results are uniform.
        fold_scores.append({
            'mae': float(mean_absolute_error(test_y, preds)),
            'rmse': float(mean_squared_error(test_y, preds, squared=False)),
            'r2': float(r2_score(test_y, preds)),
        })

    this_res = {
        'min_rows_per_node': min_rows,
        'rows_sample': row_fraction,
        'max_features': n_features,
        'res': {
            'folds': fold_scores,
            # Unweighted mean of each metric over the folds.
            'mae': np.mean([x['mae'] for x in fold_scores]),
            'rmse': np.mean([x['rmse'] for x in fold_scores]),
            'r2': np.mean([x['r2'] for x in fold_scores]),
        },
    }
    print("min_rows_per_node:{} rows_sample:{} max_features:{} mae:{}".format(
        this_res['min_rows_per_node'], this_res['rows_sample'],
        this_res['max_features'], this_res['res']['mae']))
    res.append(this_res)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:2 rows_sample:0.05292940916619948 max_features:113 mae:506.0132369995117


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:2 rows_sample:0.03760571445127933 max_features:79 mae:505.39683532714844


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:3 rows_sample:0.07243393794367631 max_features:140 mae:508.29700469970703


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:4 rows_sample:0.14927456321663024 max_features:102 mae:515.4037399291992


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:5 rows_sample:0.12572505626459646 max_features:90 mae:512.9100570678711


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:4 rows_sample:0.039248149123539494 max_features:101 mae:504.1115951538086


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:3 rows_sample:0.11136419298949833 max_features:127 mae:510.44600677490234


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:4 rows_sample:0.10673931655089633 max_features:148 mae:510.07994842529297


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:5 rows_sample:0.06369871288542621 max_features:121 mae:505.4130630493164


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:4 rows_sample:0.12678137691205107 max_features:168 mae:513.9752197265625


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:4 rows_sample:0.1002814076911441 max_features:76 mae:512.1823043823242


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

min_rows_per_node:3 rows_sample:0.08032983312158434 max_features:133 mae:510.12745666503906


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

In [None]:
# Flatten the nested search results into one row per hyperparameter
# combination: the three parameters first, then the fold-averaged metrics.
rdf = pd.DataFrame({
    **{name: [r[name] for r in res]
       for name in ('min_rows_per_node', 'rows_sample', 'max_features')},
    **{name: [r['res'][name] for r in res]
       for name in ('mae', 'rmse', 'r2')},
})

In [None]:
# Rank the sampled configurations from lowest to highest mean absolute error.
rdf.sort_values(by='mae')

In [None]:
# Persist the summary table. With pandas' default settings the DataFrame index
# is written as the first, unnamed column of the CSV.
rdf.to_csv('run5_15_cv.csv')

In [None]:
# Pairwise correlations between the sampled hyperparameters and the
# fold-averaged metrics.
rdf.corr()

In [None]:
# Mean absolute error against the minimum rows per node.
rdf.plot.scatter(x='min_rows_per_node', y='mae');

In [None]:
# Mean absolute error against the fraction of rows sampled per tree.
rdf.plot.scatter(x='rows_sample', y='mae');

In [None]:
# Mean absolute error against the max_features setting.
rdf.plot.scatter(x='max_features', y='mae');