# AVIRIS 2020 random forest model training and testing

In [1]:
import os
import time
from pprint import pprint

import numpy as np

from models.custom_RF import cuRF
from load_dataset.aviris_dataset import Aviris
from src.k_fold_train import k_fold_train
from src import apply_model_to_raster as amr

## Control Panel

In [7]:

# --- Band sampling ---------------------------------------------------------
# Sample every N-th band from the dataset.
NTH_BAND = 10
# Index at which the N-th band sampling starts.
START_INDEX = 0

# --- Train / test split and cross validation -------------------------------
# Controls how much data is reserved for testing.
DATA_SPLIT_RATIO = 0.10
# Number of folds used by the k-fold cross-validation training.
K_FOLD_FOLDS = 5

# --- File locations --------------------------------------------------------
# Directory holding the raster stacks to be predicted.
PATH = '/att/nobackup/maronne/AVIRIS_2020/stacks/'
# Directory that receives the prediction rasters.
OUTPUT_PATH = '/att/nobackup/cssprad1/test_predictions/'

## Data preprocessing

We load the dataset object, then keep every n-th band using `START_INDEX` and `NTH_BAND` from the Control Panel above.

We then re-split the sampled data into training and testing sets using `DATA_SPLIT_RATIO`.

In [3]:

# Build the project dataset object (the recorded output below reports a CSV
# being read into GPU memory at this point).
aviris_dataset = Aviris()
# Keep every NTH_BAND-th band starting at START_INDEX.
# NOTE(review): the method is named "nth_row_sampling" although the notebook
# describes it as band sampling -- confirm which axis it thins.
aviris_dataset.nth_row_sampling(START_INDEX, NTH_BAND)
# Show the retained covariate columns so the sampling can be checked by eye.
print(aviris_dataset.covariates.columns)
# Re-split after sampling so the train/test sets (covariates_train,
# labels_train, covariates_test, labels_test used in later cells) reflect
# the reduced band set.
aviris_dataset.split(DATA_SPLIT_RATIO)

# The column index printed below lets you verify that the expected bands were sampled.

 - from DATA: reading csv into GPU memory
 - from DATA: done reading csv into GPU memory
Index(['b36_ang201', 'b46_ang201', 'b56_ang201', 'b66_ang201', 'b76_ang201',
       'b86_ang201', 'b96_ang201', 'b106_ang20', 'b116_ang20', 'b126_ang20',
       'b136_ang20', 'b146_ang20', 'b156_ang20', 'b166_ang20', 'b176_ang20',
       'b186_ang20', 'b196_ang20', 'b206_ang20', 'b216_ang20', 'b226_ang20',
       'b236_ang20', 'b246_ang20', 'b256_ang20', 'b266_ang20', 'b276_ang20',
       'b286_ang20', 'b296_ang20', 'b306_ang20', 'b316_ang20', 'b326_ang20',
       'b336_ang20', 'b346_ang20', 'b356_ang20', 'b366_ang20', 'b376_ang20',
       'b386_ang20', 'b396_ang20', 'b406_ang20', 'b416_ang20'],
      dtype='object')


## Hyperparameters for the random forest model

Change as you see fit

In [4]:
# Settings handed to cuRF when the model is constructed.
# Key order is kept exactly as originally written.
hyperparameters = {
    'N_ESTIMATORS': 1052,
    'SPLIT_ALGO': 1,
    'SPLIT_CRITERION': 2,
    'BOOTSTRAP': True,
    'BOOTSTRAP_FEATURES': False,
    'ROWS_SAMPLE': 1.0,
    'MAX_DEPTH': 30,
    'MAX_LEAVES': -1,
    'MAX_FEATURES': 'auto',
    'N_BINS': 6,
    'MIN_ROWS_PER_NODE': 2,
    'MIN_IMPURITY_DECREASE': 0.0,
    # Alternatives noted by the original author: 'mse', 'r2', 'median_aw'.
    # NOTE(review): 'median_aw' is possibly a typo for 'median_ae' -- confirm
    # against the metrics cuRF accepts.
    'ACCURACY_METRIC': 'mean_ae',
    'QUANTILEPT': False,
    'SEED': 42,
    'VERBOSE': False,
}

## Random forest model
- Initialization
- Training
- Metrics gathering

In [5]:
# Instantiate the project's cuRF model (models.custom_RF) from the
# hyperparameter dictionary defined in the previous cell.
rf_0 = cuRF(hyperparameters)

  verbose=param['VERBOSE'])


### We use k-fold cross-validation during training to help guard against overfitting

In [8]:
# Train with K_FOLD_FOLDS-fold cross validation on the training split.
# rf_0 is both passed in and rebound to the result, so re-running this cell
# feeds the previously returned model back into k_fold_train.
rf_0 = k_fold_train(
    K_FOLD_FOLDS,
    rf_0,
    aviris_dataset.covariates_train,
    aviris_dataset.labels_train,
)

   - from RS_CV: Fold #: 0
   - from RS_CV: time to train (sec): 5.578750133514404
   - from RS_CV: Score: 1.5235434376977777
   - from RS_CV: Fold #: 1
   - from RS_CV: time to train (sec): 5.3650617599487305
   - from RS_CV: Score: 1.4189050754924353
   - from RS_CV: Fold #: 2
   - from RS_CV: time to train (sec): 4.87740421295166
   - from RS_CV: Score: 1.4151493333249425
   - from RS_CV: Fold #: 3
   - from RS_CV: time to train (sec): 4.229202747344971
   - from RS_CV: Score: 1.4488961392469366
   - from RS_CV: Fold #: 4
   - from RS_CV: time to train (sec): 4.2104222774505615
   - from RS_CV: Score: 1.4189650373939624
 - from k_fold: time to train and eval: 46.106565952301025


### Metrics

- MAE: mean absolute error
- r2: r^2 score (coefficient of determination)
- MSE: mean squared error

In [9]:
# Score the trained model on the held-out test split.
# Kept as the cell's final expression so the returned tuple is displayed
# (the recorded output shows a Series followed by the MAE, r2 and MSE values).
rf_0.get_metrics(aviris_dataset.covariates_test, aviris_dataset.labels_test)

Scores ------
 MAE:  0.2696717
  r2:  0.8030987939898706
 MSE:  0.63031584


(0      1.272924
 1      1.725956
 2      1.537837
 3      0.694491
 4      1.410125
          ...   
 670    1.237621
 671    1.446139
 672    1.718790
 673    1.725609
 674    1.255140
 Length: 675, dtype: float32,
 0.2696717,
 0.8030987939898706,
 0.63031584)

# Raster processing
- Raster loading
- Reshaping to model-friendly format
- Batch prediction
- Writing predictions out as tif

In [10]:
# Materialise the 'tif' entries that amr.list_files yields for PATH into a list
# (the recorded output shows bare file names, without the directory).
TIF_FILES = [*amr.list_files(PATH, 'tif')]
pprint(TIF_FILES)

['ang20170714t213741_corr_v2p9.tif',
 'ang20170709t230326_corr_v2p9.tif',
 'ang20170714t234307_corr_v2p9.tif',
 'ang20170709t224222_corr_v2p9.tif',
 'ang20170714t212855_corr_v2p9.tif',
 'ang20170715t005002_corr_v2p9.tif']


In [11]:
# Predict every raster listed in TIF_FILES and write each result under OUTPUT_PATH.
for index, TIF_FILE in enumerate(TIF_FILES):

    print("TIF prediction: ", index+1, "/", len(TIF_FILES))

    # Raster loading: the pixel array plus the properties object that
    # output_gtiff needs later.
    raster_cube, img_nd_array_properties = amr.get_array_from_raster(PATH, TIF_FILE)

    # Reshaping to the model-friendly layout; the result is indexed as 2-D below.
    pixel_table = amr.change_img_shape(raster_cube)

    # Keep every NTH_BAND-th column starting at 36 + START_INDEX.
    # NOTE(review): the literal 36 appears to line up with the first sampled
    # training column ('b36_...' when START_INDEX is 0) -- confirm the raster's
    # band axis uses the same numbering (0- vs 1-based).
    sampled_bands = pixel_table[:, 36+START_INDEX::NTH_BAND]

    # Shape of the first plane along the last axis of the loaded array,
    # passed to the prediction step.
    plane_shape = raster_cube[:, :, 0].shape

    # Batch prediction with the trained model.
    predicted = amr.map_apply_reduce_pandas(sampled_bands, rf_0, plane_shape)

    # Writing predictions out as tif.
    amr.output_gtiff(predicted, img_nd_array_properties, raster_cube, TIF_FILE, OUTPUT_PATH)

    # Drop the per-raster temporaries before the next iteration.
    del raster_cube, pixel_table, sampled_bands, plane_shape, predicted
    

TIF prediction: 1 out of : 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170714t213741_corr_v2p9.tif
ang20170714t213741_corr_v2p9_predicted.tif
/att/nobackup/cssprad1/test_predictions/ang20170714t213741_corr_v2p9_predicted.tif
TIF prediction: 2 out of : 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170709t230326_corr_v2p9.tif
ang20170709t230326_corr_v2p9_predicted.tif
/att/nobackup/cssprad1/test_predictions/ang20170709t230326_corr_v2p9_predicted.tif
TIF prediction: 3 out of : 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170714t234307_corr_v2p9.tif
ang20170714t234307_corr_v2p9_predicted.tif
/att/nobackup/cssprad1/test_predictions/ang20170714t234307_corr_v2p9_predicted.tif
TIF prediction: 4 out of : 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170709t224222_corr_v2p9.tif
ang20170709t224222_corr_v2p9_predicted.tif
/att/nobackup/cssprad1/test_predictions/ang20170709t224222_corr_v2p9_predicted.tif
TIF prediction: 5 out of : 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170714t21