In [10]:
#Reading in and reviewing training and test datasets
import pandas as pd
import numpy as np
%matplotlib inline

train_x = pd.read_csv('../Data/train_x2.csv')
train_y = pd.read_csv('../Data/train_y2.csv',header=None)
test_x = pd.read_csv('../Data/test_x2.csv')

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
#train_x.head()
#train_y.head()
#test_x.head()
train_y = np.ravel(train_y,'C') #converting dependent variable to 1d array

(1458, 181)
(1458, 1)
(1459, 181)


**Fitting initial random forest**

In [17]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(0)
initial_forest = RandomForestRegressor(n_estimators=50, n_jobs=-1, oob_score=True)
initial_forest.fit(train_x.values, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=-1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

OOB error from initial forest

In [18]:
print('OOB Error:',initial_forest.oob_score_) #out-of-bag error
print('Acc. Score:',initial_forest.score(train_x, train_y)) #accuracy score

OOB Error: 0.838430502062
Acc. Score: 0.97628425657


Using GridSearchCV to experiment with hyperparameters

In [19]:
from sklearn.model_selection import GridSearchCV
initial_forest.get_params
param_grid = [{'n_estimators':[800,1000],'max_features':[11,7,9],'random_state':[43],'n_jobs':[-1]}]
grid_search = GridSearchCV(initial_forest, param_grid, cv=5)
grid_search.fit(train_x, train_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=-1, oob_score=True, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_jobs': [-1], 'n_estimators': [800, 1000], 'max_features': [11, 7, 9], 'random_state': [43]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [25]:
print('Best Parameters:',grid_search.best_params_)
print('Training accuracy:',grid_search.score(train_x, train_y)) #training accuracy
print('Training error:', 1 - .979138289648) #training error
print('Best Score:', grid_search.best_score_) #best score
print('Error of best model:', 1 - .840457869031) #best model error
#print('CV Results:',grid_search.cv_results_)

Best Parameters: {'n_jobs': -1, 'n_estimators': 800, 'max_features': 11, 'random_state': 43}
Training accuracy: 0.979138289648
Training error: 0.020861710352000018
Best Score: 0.840457869031
Error of best model: 0.15954213096900005


### Fitting model with second dataset

In [8]:
from sklearn.ensemble import RandomForestRegressor
initial_forest2 = RandomForestRegressor(n_estimators=50, n_jobs=-1, oob_score=True)

In [11]:
initial_forest2.fit(train_x, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=-1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [15]:
print(initial_forest2.score(train_x, train_y))

0.982954264926


In [22]:
from sklearn.model_selection import GridSearchCV
initial_forest2.get_params
param_grid = [{'n_estimators':[600,800],'max_features':[13,14,15],'n_jobs':[-1],'max_depth':[7,8,9]}]
grid_search = GridSearchCV(initial_forest2, param_grid, cv=5)
grid_search.fit(train_x, train_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=-1, oob_score=True, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_jobs': [-1], 'max_depth': [7, 8, 9], 'max_features': [13, 14, 15], 'n_estimators': [600, 800]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [23]:
print('Best Parameters:',grid_search.best_params_)
print('Training accuracy:',grid_search.score(train_x, train_y)) #training accuracy
print('Training error:', 1 - .979138289648) #training error
print('Best Score:', grid_search.best_score_) #best score
print('Error of best model:', 1 - .840457869031) #best model error
#print('CV Results:',grid_search.cv_results_)

Best Parameters: {'n_estimators': 800, 'max_features': 15, 'max_depth': 9, 'n_jobs': -1}
Training accuracy: 0.957289033879
Training error: 0.020861710352000018
Best Score: 0.876504988228
Error of best model: 0.15954213096900005
