# ___

# [ Machine Learning in Geosciences ]

**Department of Applied Geoinformatics and Cartography, Charles University** 

*Lukas Brodsky lukas.brodsky@natur.cuni.cz*

    
___


## Geographic Random Forest

Example based on simulated data 

In [None]:
import os
from copy import deepcopy
import numpy as np
import gdal
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

### Data

In [None]:
# Directory that holds the example data -- update this to your local copy.
PATH = '/Users/lukasbrodsky/Work/Prfuk/Vyuka/Machine_Learning_in_Geosciences/mlgeo/10_geographic_rf/data/'
# Name of the raster file that is opened in the next cell.
filename = 'syngeo.tif'

# Full path of the raster: PATH joined with filename.
fn = os.path.join(PATH, filename)

In [None]:
# Open the raster and read its first band into a 2-D array.
ds = gdal.Open(fn)
if ds is None:
    # Unless GDAL exceptions are enabled, gdal.Open() reports failure by
    # returning None; fail here with the offending path instead of with an
    # AttributeError on the next line.
    raise FileNotFoundError('Could not open raster: {}'.format(fn))
X_full = ds.GetRasterBand(1).ReadAsArray()
# Displayed as the cell output: (rows, cols) of the band.
X_full.shape

In [None]:
def build_checkerboard(w, h):
    """Return a 0/1 checkerboard array of shape ``(2 * h, 2 * w)``.

    The top-left cell is 0 and the values alternate along both axes.

    Parameters
    ----------
    w : int
        Number of ``[0, 1]`` pairs per row; the result has ``2 * w`` columns.
    h : int
        Number of row pairs; the result has ``2 * h`` rows.

    Returns
    -------
    numpy.ndarray
        Integer array holding the checkerboard pattern.
    """
    # Cell (r, c) is 1 exactly when r + c is odd.  Deriving the pattern from
    # index parity avoids np.row_stack (deprecated in NumPy 2.0) and also
    # works when h == 0, where stacking an empty sequence of rows raises.
    row_idx = np.arange(2 * h).reshape(-1, 1)
    col_idx = np.arange(2 * w)
    return (row_idx + col_idx) % 2

In [None]:
# Checkerboard mask sized from the raster (columns -> w, rows -> h).
# X_full is used because X is only assigned in a later cell; referencing X
# here raises NameError when the notebook is run top to bottom.
# build_checkerboard returns (2 * h, 2 * w), so for an odd raster dimension
# the mask is one row/column smaller than the raster.
samples = build_checkerboard(X_full.shape[1] // 2, X_full.shape[0] // 2)

In [None]:
# The predictor is the raster itself; the target is the raster plus noise.
X = X_full

# Seeded generator so the simulated target -- and every RMSE reported below --
# is reproducible between runs, in line with random_state=42 on the forests.
rng = np.random.default_rng(42)
y = X_full + rng.standard_normal(X_full.shape) * 500  # add noise

# Column vectors with one pixel per row, as passed to the estimators below.
X_ = X.reshape(-1, 1)
y_ = y.reshape(-1, 1)

In [None]:
# Scatter of predictor against noisy target: one blue dot per pixel.
plt.plot(X.ravel(), y.ravel(), 'b.')

### Global RF

In [None]:
# Global Random Forest: a single model for the whole raster.  The number of
# trees is chosen by 5-fold cross-validation scored on negative MSE.
forest_reg = RandomForestRegressor(random_state=42)
param_grid = [{'n_estimators': [10, 30, 50, 100, 200]}]

grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
# The target is passed as a 1-D vector.
grid_search.fit(X_, y_.ravel())

In [None]:
# Parameter setting selected by the global grid search (shown as cell output).
grid_search.best_params_

In [None]:
# Best estimator found by the global grid search.
final_model = grid_search.best_estimator_

# NOTE: the error is measured on the same pixels that were used for fitting,
# so it is an in-sample RMSE.
final_predictions = final_model.predict(X_)
final_mse = mean_squared_error(y_.ravel(), final_predictions)
global_rmse = np.sqrt(final_mse)
print(global_rmse)

### Local RFs

In [None]:
# Local RFs: fit one small forest per non-overlapping
# bandwidth x bandwidth window of the raster and record its in-sample RMSE.

bandwidth = 20
# TODO: add stride = bandwidth

local_param_grid = [{'n_estimators': [3, 5, 10, 15, 30]}]
local_rmse = []
lrf = RandomForestRegressor(oob_score=True, max_features=None, random_state=42)

# Only complete windows are processed; trailing rows/columns that do not fill
# a whole window are skipped.
n_win_rows = X.shape[0] // bandwidth
n_win_cols = X.shape[1] // bandwidth

for i in range(n_win_rows):
    row_start = i * bandwidth
    i_coord = int(row_start + (bandwidth / 2))  # window centre (row)

    for j in range(n_win_cols):
        col_start = j * bandwidth
        j_coord = int(col_start + (bandwidth / 2))  # window centre (column)
        print('Local coords: ', i_coord, j_coord)

        # Slice from the window origin rather than centre +/- half-width, so
        # the window is exactly `bandwidth` wide even for an odd bandwidth
        # (identical to the centre-based slice when bandwidth is even).
        window = (slice(row_start, row_start + bandwidth),
                  slice(col_start, col_start + bandwidth))
        X_local = X[window].reshape(-1, 1)  # one pixel per row
        y_local = y[window].ravel()

        # A dedicated name is used on purpose: rebinding `grid_search` here
        # would overwrite the global search fitted earlier in the notebook.
        local_search = GridSearchCV(deepcopy(lrf), local_param_grid, cv=3,
                                    scoring='neg_mean_squared_error',
                                    return_train_score=True)
        local_search.fit(X_local, y_local)
        best_model = local_search.best_estimator_
        print(local_search.best_params_)

        # RMSE of the best local model on the window it was fitted on.
        best_predictions = best_model.predict(X_local)
        best_mse = mean_squared_error(y_local, best_predictions)
        best_rmse = np.sqrt(best_mse)
        local_rmse.append(best_rmse)
        print(best_rmse)
        print('---')


In [None]:
# Summary: the single global RMSE next to the list of per-window RMSEs.
print(f'Globale model RMSE: {global_rmse}')
print(f'Local models RMSE: {local_rmse}')

In [None]:
# Per-window RMSE (green dots and line) against the global RMSE, drawn as a
# red horizontal reference line across the whole x-range.
plt.hlines(
    y=global_rmse,
    xmin=0,
    xmax=len(local_rmse),
    color='red',
)  # global
plt.plot(local_rmse, 'g.-')  # locals
plt.ylabel('RMSE')
plt.show()