# Random search cross validation

In [1]:
from osgeo import gdal, gdal_array
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score
import numpy as np
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from pprint import pprint
gdal.UseExceptions()
gdal.AllRegister()

In [2]:
#read all training raster dataset
tr_stacked = gdal.Open('trainingImage/stacked_indicies_kampar.tif', gdal.GA_ReadOnly)
tr_smi = gdal.Open('trainingImage/smi_training_1.tif', gdal.GA_ReadOnly)

In [3]:
#convert to array for training data
stacked_zeros = np.zeros((tr_stacked.RasterYSize, tr_stacked.RasterXSize, tr_stacked.RasterCount),
                            gdal_array.GDALTypeCodeToNumericTypeCode(tr_stacked.GetRasterBand(1).DataType))
for a in range(stacked_zeros.shape[2]):
    stacked_zeros[:, :, a] = tr_stacked.GetRasterBand(a + 1).ReadAsArray()
smi = tr_smi.GetRasterBand(1).ReadAsArray().astype(np.float32)
smi_masked = np.ma.masked_where(smi == 0, smi)
#let's create feature and label array of training data
x = stacked_zeros[smi > 0, :] #feature
y = smi[smi > 0] #label
print('X shape {x} and Y shape {y}'.format(x = x.shape, y = y.shape))

X shape (176810, 15) and Y shape (176810,)


In [4]:
#split our data to train and test data by 75% and 25%
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)
print('Training features shape :', x_train.shape)
print('Training labels shape :', y_train.shape)
print('Testing features shape', x_test.shape)
print('Testing labels shape:', y_test.shape)

Training features shape : (132607, 15)
Training labels shape : (132607,)
Testing features shape (44203, 15)
Testing labels shape: (44203,)


In [5]:
#Random search cross validation parameters
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 250, num = 7)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(1, 42, num = 7)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators'   : n_estimators,
    'max_features'   : max_features,
    'max_depth'     : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
    'bootstrap' : bootstrap
}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [1, 7, 14, 21, 28, 35, 42, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 125, 150, 175, 200, 225, 250]}


In [6]:
#create a base model
rfReg = RandomForestRegressor(random_state = 42)
#initiate random search model
rf_random = RandomizedSearchCV(estimator = rfReg, param_distributions = random_grid, 
                               n_iter = 30, scoring='neg_mean_absolute_error', cv = 3, verbose = 2, random_state = 42, n_jobs = -1, return_train_score = True)

In [7]:
#fit the grid
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 42.9min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 97.8min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [8]:
#print the best params
pprint(rf_random.best_params_)

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}


In [9]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    rsq = r2_score(test_labels, predictions)
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('R Squared score = {r}'.format(r = rsq))
    
    return accuracy

In [10]:
best_random = rf_random.best_estimator_
grid_accuracy = evaluate(best_random, x_test, y_test)

Model Performance
Average Error: 0.0786 degrees.
Accuracy = 83.55%.
R Squared score = 0.5506993815454889
