# Random Forest

## Setup

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

# Fixed seed so the train/validation split (and therefore the validation
# score reported further down) is identical on every Restart & Run All.
SEED = 42

data = np.load('./dataset/preprocessed.npz')
x_train_whole = data['train_x']
y_train_whole = data['train_y']
x_test = data['test_x']

# set nan to 0 (done in place on the freshly loaded arrays)
x_train_whole[np.isnan(x_train_whole)] = 0
x_test[np.isnan(x_test)] = 0

# Hold out 20% of the labelled rows for validation.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_whole,
    y_train_whole,
    test_size=0.2,
    random_state=SEED,
)

# Report the shape of every array produced above.
for label, array in (
    ('x_test shape:', x_test),
    ('x_train shape:', x_train),
    ('x_val shape:', x_val),
    ('y_train shape:', y_train),
    ('y_val shape:', y_val),
):
    print(label, array.shape)

x_test shape: (4398, 20083)
x_train shape: (2400, 20083)
x_val shape: (600, 20083)
y_train shape: (2400,)
y_val shape: (600,)


## Training and Validation

In [8]:
# Only the non-default hyper-parameter is spelled out. The arguments that used
# to be listed here merely restated scikit-learn's defaults, and three of them
# (criterion='mse', max_features='auto', min_impurity_split) have since been
# removed from the library, so passing them raises on current releases.
# random_state is pinned so the scores below are reproducible.
forest = RandomForestRegressor(
    n_estimators=125,
    random_state=42,
)

# Column 0 holds the id, which is not a feature.
features_train = x_train[:, 1:]
features_val = x_val[:, 1:]

forest.fit(features_train, y_train)
print('Feature importances: ', forest.feature_importances_)

# Score = root mean squared log error. Predictions are clipped at 0 because
# mean_squared_log_error rejects negative values.
y_pred_train = np.clip(forest.predict(features_train), 0.0, None)
train_score = np.sqrt(mean_squared_log_error(y_train, y_pred_train))
print('Train set score: ', train_score)

y_pred_val = np.clip(forest.predict(features_val), 0.0, None)
val_score = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
print('Validation set score: ', val_score)

# Drop the fitted model; the export cell trains its own on the full data.
del forest

Feature importances:  [5.53089753e-01 1.38294419e-01 4.02775243e-02 ... 0.00000000e+00
 0.00000000e+00 5.11576374e-07]
Train set score:  1.8624725525605823
Validation set score:  2.109640402162157


## Export test output

In [21]:
# Only the non-default hyper-parameter is spelled out. The arguments that used
# to be listed here merely restated scikit-learn's defaults, and three of them
# (criterion='mse', max_features='auto', min_impurity_split) have since been
# removed from the library, so passing them raises on current releases.
# random_state is pinned so the exported predictions are reproducible.
forest = RandomForestRegressor(
    n_estimators=125,
    random_state=42,
)

# Refit on every labelled row; column 0 holds the id, which is not a feature.
features_train_whole = x_train_whole[:, 1:]
forest.fit(features_train_whole, y_train_whole)
print('Feature importances: ', forest.feature_importances_)

# Root mean squared log error on the training data. Predictions are clipped at
# 0 because mean_squared_log_error rejects negative values.
y_pred_train_whole = np.clip(forest.predict(features_train_whole), 0.0, None)
train_score = np.sqrt(mean_squared_log_error(y_train_whole, y_pred_train_whole))
print('Train set score: ', train_score)

# Predict the test rows and write them out as two columns: id, revenue.
y_pred_test = forest.predict(x_test[:, 1:])
test_ids = x_test[:, 0].reshape(-1, 1)
output = np.concatenate((test_ids, y_pred_test.reshape(-1, 1)), axis=1)
# fmt='%i' writes both columns as integers (fractional parts are dropped).
np.savetxt('./test_out_randomforest.csv', output, header='id,revenue', delimiter=',', fmt='%i', comments='')
print('file saved')

Feature importances:  [5.53407557e-01 1.39469179e-01 3.53911931e-02 ... 0.00000000e+00
 0.00000000e+00 1.39687282e-06]
Train set score:  1.8320865020150159
file saved
