# AVIRIS 2020 random forest model training and testing

In [1]:
import os
import time
from pprint import pprint

import numpy as np
import cupy

from models.custom_RF import cuRF
from load_dataset.aviris_dataset import Aviris
from src.k_fold_train import k_fold_train
from src import apply_model_to_raster as amr

## Control Panel

In [4]:
# Data sampling
NTH_BAND = 10  # Sample every N-th band from the dataset
START_INDEX = 0  # Index at which the N-th band sampling starts

# Directory holding the raster stacks to be predicted.
# The original cluster location stays the default; set the AVIRIS_STACKS_DIR
# environment variable to run the notebook elsewhere without editing this cell.
# os.path.join(..., '') guarantees a trailing separator.
# NOTE(review): downstream helpers appear to append the file name to PATH -- confirm.
PATH = os.path.join(
    os.environ.get('AVIRIS_STACKS_DIR', '/att/nobackup/maronne/AVIRIS_2020/stacks/'),
    '',
)


## Data preprocessing

In [5]:
# Create the project dataset object (load_dataset.aviris_dataset.Aviris).
# The recorded output of this cell shows a CSV being read into GPU memory.
aviris_dataset = Aviris()
# Sub-sample the covariates using the Control Panel settings.
aviris_dataset.nth_row_sampling(START_INDEX, NTH_BAND)
# Print the remaining covariate columns so the sampled bands can be checked.
print(aviris_dataset.covariates.columns)
# NOTE(review): 0.10 is presumably the held-out test fraction -- confirm against Aviris.split.
aviris_dataset.split(0.10)

# You can verify in the output below that the correct bands were sampled

 - from DATA: reading csv into GPU memory
 - from DATA: done reading csv into GPU memory
Index(['b36_ang201', 'b46_ang201', 'b56_ang201', 'b66_ang201', 'b76_ang201',
       'b86_ang201', 'b96_ang201', 'b106_ang20', 'b116_ang20', 'b126_ang20',
       'b136_ang20', 'b146_ang20', 'b156_ang20', 'b166_ang20', 'b176_ang20',
       'b186_ang20', 'b196_ang20', 'b206_ang20', 'b216_ang20', 'b226_ang20',
       'b236_ang20', 'b246_ang20', 'b256_ang20', 'b266_ang20', 'b276_ang20',
       'b286_ang20', 'b296_ang20', 'b306_ang20', 'b316_ang20', 'b326_ang20',
       'b336_ang20', 'b346_ang20', 'b356_ang20', 'b366_ang20', 'b376_ang20',
       'b386_ang20', 'b396_ang20', 'b406_ang20', 'b416_ang20'],
      dtype='object')


## Hyperparameters for the random forest model

Change as you see fit

In [6]:
# Settings consumed by cuRF(hyperparameters) below.
# Keys are upper-case strings; insertion order is kept as listed here.
hyperparameters = dict(
    N_ESTIMATORS=1052,
    SPLIT_ALGO=1,
    SPLIT_CRITERION=2,
    BOOTSTRAP=True,
    BOOTSTRAP_FEATURES=False,
    ROWS_SAMPLE=1.0,
    MAX_DEPTH=30,
    MAX_LEAVES=-1,
    MAX_FEATURES='auto',
    N_BINS=6,
    MIN_ROWS_PER_NODE=2,
    MIN_IMPURITY_DECREASE=0.0,
    # Alternatives noted by the author: 'mse', 'r2', 'median_aw'
    ACCURACY_METRIC='mean_ae',
    QUANTILEPT=False,
    SEED=42,
    VERBOSE=False,
)

## Random forest model
- Initialization
- Training
- Metrics gathering

In [7]:
# Instantiate the project's cuRF model (models.custom_RF) from the hyperparameter dict.
rf_0 = cuRF(hyperparameters)

  verbose=param['VERBOSE'])


We train with k-fold cross-validation to help guard against overfitting.

In [8]:
# Run k_fold_train on the training covariates/labels and rebind rf_0 to its return value.
# The recorded log shows folds 0 and 1, so the leading 2 is presumably the fold count -- confirm.
# NOTE(review): rf_0 is both input and output here, so re-running this cell feeds
# the previous result back in; restart from the cuRF(...) cell for a clean run.
rf_0 = k_fold_train(2, rf_0, aviris_dataset.covariates_train, aviris_dataset.labels_train)
# Disabled alternative left by the author (direct training call, no k-fold).
# NOTE(review): it pairs aviris_dataset.covariates with labels_train; it likely
# should use covariates_train -- confirm before enabling.
#rf_0.train(aviris_dataset.covariates, aviris_dataset.labels_train)

   - from RS_CV: Fold #: 0
   - from RS_CV: time to train (sec): 5.188384056091309
   - from RS_CV: Score: 1.4465113832845464
   - from RS_CV: Fold #: 1
   - from RS_CV: time to train (sec): 3.935577869415283
   - from RS_CV: Score: 1.4798963479476983
 - from k_fold: time to train and eval: 14.882799625396729


In [9]:
# Compute metrics on the test covariates/labels. The call is the cell's last
# expression, so its return value is displayed: in the recorded run a 4-tuple of
# a float32 Series of length 675 followed by three numbers matching the printed
# MAE, r2 and MSE.
rf_0.get_metrics(aviris_dataset.covariates_test, aviris_dataset.labels_test)

Scores ------
 MAE:  0.27725598
  r2:  0.8135172818188533
 MSE:  0.6662468


(0      1.298750
 1      1.813153
 2      1.722613
 3      0.801375
 4      0.703305
          ...   
 670    1.682699
 671    1.334512
 672    1.646680
 673    1.519925
 674    0.764098
 Length: 675, dtype: float32,
 0.27725598,
 0.8135172818188533,
 0.6662468)

# Raster processing
- Raster loading
- Reshaping to model-friendly format
- Batch prediction
- Writing predictions out as tif

In [10]:
# Materialise what amr.list_files yields for PATH and the 'tif' argument.
TIF_FILES = list(amr.list_files(PATH, 'tif'))
# Show the entry at index 3, the raster that the later cells load and write predictions for.
# NOTE(review): if list_files does not return a stable order, index 3 may select
# a different file between runs -- confirm.
print(TIF_FILES[3])

ang20170709t224222_corr_v2p9.tif


In [11]:
# Read the selected raster; returns the pixel array and a properties object
# that is later handed to amr.output_gtiff.
img_nd_array, img_nd_array_properties = amr.get_array_from_raster(PATH, TIF_FILES[3])
# In the recorded run the properties print as a tuple: three ints, an Affine
# transform, a CRS (EPSG:32604) and -9999.0.
# NOTE(review): -9999.0 is presumably the nodata value -- confirm.
print(img_nd_array_properties)

/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170709t224222_corr_v2p9.tif
(3103, 3421, 425, Affine(5.0999999999998975, 0.0, 576931.114813,
       0.0, -5.0999999999998975, 7915247.221175592), CRS.from_epsg(32604), -9999.0)


In [12]:
# Dimensions of the loaded raster array; (3421, 3103, 425) in the recorded run.
print(img_nd_array.shape)

(3421, 3103, 425)


In [13]:
# Reshape the raster for the model. In the recorded run the shape goes from
# (3421, 3103, 425) to (10615363, 425), i.e. the first two axes are collapsed
# into one (3421 * 3103 = 10615363) while the 425 bands stay as columns.
img_nd_array1 = amr.change_img_shape(img_nd_array)

In [14]:
# Shape after reshaping; (10615363, 425) in the recorded run.
print(img_nd_array1.shape)

(10615363, 425)


In [15]:
# Keep every NTH_BAND-th column, starting at column 36 + START_INDEX, so the
# raster has the same number of features (39 in the recorded run) as the
# training covariates.
# NOTE(review): the offset 36 is hard-coded; the first training column is named
# 'b36_ang201'. If those band names are 1-based, column 36 here would be one
# band off -- confirm.
band_selector = slice(36 + START_INDEX, None, NTH_BAND)
img_nd_array2 = img_nd_array1[:, band_selector]

In [16]:
# Shape after band sub-sampling; (10615363, 39) in the recorded run.
img_nd_array2.shape
# Disabled debug line. NOTE(review): np.array_split returns a list, which has
# no .shape attribute, so this would raise if re-enabled.
#print(np.array_split(img_nd_array2, 10).shape)

(10615363, 39)

In [17]:
# Apply the trained model to the sub-sampled pixel table. The third argument is
# the 2-D shape of the source raster (its first two axes); the result has that
# shape in the recorded run.
raster_2d_shape = img_nd_array.shape[:2]
prediction_raster = amr.map_apply_reduce_pandas(img_nd_array2, rf_0, raster_2d_shape)

In [18]:
# Shape of the prediction raster; (3421, 3103) in the recorded run, matching
# the first two axes of the source raster.
prediction_raster.shape

(3421, 3103)

In [19]:
# Write the predictions out via amr.output_gtiff, passing the source raster's
# properties and array, its file name, and the 'predictions/' argument.
# The recorded output shows the result as predictions/<source name>_predicted.tif.
# NOTE(review): presumably 'predictions/' must already exist -- confirm in output_gtiff.
amr.output_gtiff(prediction_raster, img_nd_array_properties, img_nd_array, TIF_FILES[3], 'predictions/')

ang20170709t224222_corr_v2p9_predicted.tif
predictions/ang20170709t224222_corr_v2p9_predicted.tif
