# Aviris 2020 random forest model training and testing

### What this notebook is doing.

#### This notebook takes the AVIRIS data, trains a GPU-based model, and applies the model to the raster stack


1. Data preprocessing- Load the Aviris data into the GPU, narrow down to the bands we want to sample
2. Model initialization / training - Initialize a GPU random forest model based on hyperparameters, train through k_fold, save model
3. Model raster application - Get the location of the tifs to apply the GPU RF model to, put them in a form (GPU ndarray) that's friendly to the model, then make batch predictions
4. Output the batch predictions back to an image shape, write using rasterio to a GTiff at the specified location.


#### Things to know

1. Before running this make sure you make a directory 'saved_models' inside the 'models' directory
2. The models saved during training will overwrite themselves if they are trained on the same nth band and starting index
3.  The rasters written at the end will overwrite themselves if the output path isn't changed, or the tifs aren't changed.
4.  If you want to keep every run's output rasters, change the output path or the input tifs before re-running (see item 3).

In [1]:
import os
import logging
import time
from pprint import pprint

import numpy as np

from models.custom_RF import cuRF # GPU/RAPIDS custom Random Forest objects
from load_dataset.aviris_dataset import Aviris # Aviris is now a dataset object
from src.k_fold_train import k_fold_train # Facilitates training using sklearn's kfold
from src import apply_model_to_raster as amr # Handles all raster application, io, etc.


In [2]:
# Configure a dedicated, file-backed logger for this notebook run.
# Every execution of this cell logs to its own file, named after the module
# name plus the current timestamp, inside the 'logs' directory.
logger = logging.getLogger(__name__)
logName = __name__+str(time.time())
logger.setLevel(logging.INFO)
logger.propagate = False  # do not forward run messages to ancestor loggers

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more handler each time and keep writing to
# the files of earlier runs. Detach and close any handler left over.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# FileHandler opens the file immediately and does not create missing parent
# directories, so make sure 'logs' exists first.
os.makedirs('logs', exist_ok=True)
file_handler = logging.FileHandler(os.path.join('logs', logName))
formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(name)s : %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.info('Start logging for run')
print("Logging found in logs/"+logName)

Logging found in logs/__main__1606324561.2129352


## Control Panel

This is where we can set what we need to set.

Ideally we'll only need to change stuff in this cell and just run the others without changing them.

- Make sure we make the changes before the cell is run

In [3]:
# Run configuration. These constants are read by the cells below; change them
# here, before running the rest of the notebook.
NTH_BAND = 10 # Sample every N-th band from dataset.

START_INDEX = 0 # Index to start nth band sampling.

DATA_SPLIT_RATIO = 0.10 # Controls how much data is reserved for testing. 

PATH = '/att/nobackup/maronne/AVIRIS_2020/stacks/' # Path to raster stacks to be predicted.

K_FOLD_FOLDS = 5 # Controls the folds to use in k_fold cross validation training.

OUTPUT_PATH = '/att/nobackup/cssprad1/test_predictions/' # Path for the prediction rasters

####################### logging ############################
# Record every tunable of this run, including the train/test split ratio,
# so the log file alone is enough to reproduce the configuration.
logger.info('\nNTH_BAND:' + str(NTH_BAND) + '\n' +
            'START_INDEX: ' + str(START_INDEX) + '\n' +
            'DATA_SPLIT_RATIO: ' + str(DATA_SPLIT_RATIO) + '\n' +
            'Input path: ' + PATH + '\n' +
            'Output path: ' + OUTPUT_PATH + '\n' +
            'k_folds: ' + str(K_FOLD_FOLDS) + '\n')
############################################################

## Data preprocessing

We load the dataset object in, then sample every nth band using variables provided above.

We then resplit the 'new' data.

In [4]:

# Create the dataset object and apply the band sampling configured in the
# control panel (start at START_INDEX, step by NTH_BAND).
aviris_dataset = Aviris()
aviris_dataset.nth_row_sampling(START_INDEX, NTH_BAND)

# Print the retained covariate columns so the sampled bands can be checked by eye.
print(aviris_dataset.covariates.columns)

# Re-split the sampled data; DATA_SPLIT_RATIO is the share reserved for testing.
aviris_dataset.split(DATA_SPLIT_RATIO)

####################### logging ############################
# Keep a record of the covariate columns in the run log as well.
logger.info('\nBands sampled:{!s}\n'.format(aviris_dataset.covariates.columns))
############################################################

 - from DATA: reading csv into GPU memory
 - from DATA: done reading csv into GPU memory
Index(['b36_ang201', 'b46_ang201', 'b56_ang201', 'b66_ang201', 'b76_ang201',
       'b86_ang201', 'b96_ang201', 'b106_ang20', 'b116_ang20', 'b126_ang20',
       'b136_ang20', 'b146_ang20', 'b156_ang20', 'b166_ang20', 'b176_ang20',
       'b186_ang20', 'b196_ang20', 'b206_ang20', 'b216_ang20', 'b226_ang20',
       'b236_ang20', 'b246_ang20', 'b256_ang20', 'b266_ang20', 'b276_ang20',
       'b286_ang20', 'b296_ang20', 'b306_ang20', 'b316_ang20', 'b326_ang20',
       'b336_ang20', 'b346_ang20', 'b356_ang20', 'b366_ang20', 'b376_ang20',
       'b386_ang20', 'b396_ang20', 'b406_ang20', 'b416_ang20'],
      dtype='object')


## Hyperparameters for the random forest model

Change as you see fit

In [5]:
# Settings handed to the cuRF wrapper when the model is constructed.
# Key names and insertion order are kept exactly as before.
hyperparameters = dict(
    N_ESTIMATORS=1052,
    SPLIT_ALGO=1,
    SPLIT_CRITERION=2,
    BOOTSTRAP=True,
    BOOTSTRAP_FEATURES=False,
    ROWS_SAMPLE=1.0,
    MAX_DEPTH=30,
    MAX_LEAVES=-1,
    MAX_FEATURES='auto',
    N_BINS=6,
    MIN_ROWS_PER_NODE=2,
    MIN_IMPURITY_DECREASE=0.0,
    # Alternatives noted by the author: 'mse', 'r2', 'median_aw'
    # (presumably 'median_ae' — confirm against what cuRF accepts).
    ACCURACY_METRIC='mean_ae',
    QUANTILEPT=False,
    SEED=42,
    VERBOSE=False,
)

####################### logging ############################
# Store the full hyperparameter set in the run log.
logger.info('\nModel Hyperparameters:{!s}\n'.format(hyperparameters))
############################################################

## Random forest model
- Initialization
- Training
- Metrics gathering

In [6]:
# Construct the custom GPU/RAPIDS random forest object from the
# hyperparameter dictionary; the model is not trained yet at this point.
rf_0 = cuRF(hyperparameters)

  verbose=param['VERBOSE'])


### We're going to use k_fold validation training to ensure some sort of protection against overfitting

In [7]:
# Train with K_FOLD_FOLDS-fold cross validation on the training split and
# rebind rf_0 to whatever k_fold_train returns.
# NTH_BAND and START_INDEX are passed through to the helper — presumably to
# name the saved model file (see "Things to know"); confirm in src/k_fold_train.
# NOTE(review): re-running this cell feeds the already-trained rf_0 back in;
# re-run the model-initialization cell first for a fresh model.
rf_0 = k_fold_train(K_FOLD_FOLDS, 
                    rf_0,
                    NTH_BAND,
                    START_INDEX,
                    aviris_dataset.covariates_train, 
                    aviris_dataset.labels_train)

   - from RS_CV: Fold #: 0
   - from RS_CV: time to train (sec): 5.209261417388916
   - from RS_CV: Score: 1.5542646694536273
   - from RS_CV: Fold #: 1
   - from RS_CV: time to train (sec): 4.454071760177612
   - from RS_CV: Score: 1.5027715215998654
   - from RS_CV: Fold #: 2
   - from RS_CV: time to train (sec): 4.39711856842041
   - from RS_CV: Score: 1.3702002606519457
   - from RS_CV: Fold #: 3
   - from RS_CV: time to train (sec): 4.7152252197265625
   - from RS_CV: Score: 1.5108692938162958
   - from RS_CV: Fold #: 4
   - from RS_CV: time to train (sec): 5.60482120513916
   - from RS_CV: Score: 1.4251251378667698
 - from k_fold: time to train and eval: 49.14971685409546


### Metrics

MAE: mean absolute error
r2: r^2 score
MSE: mean square error

In [8]:
# Evaluate the trained model on the held-out test split.
# The first returned value is not needed here and is discarded.
_, mae, r2, mse = rf_0.get_metrics(
    aviris_dataset.covariates_test,
    aviris_dataset.labels_test,
)

####################### logging ############################
# Write the three scores to the run log, one per line.
logger.info('\nMAE: {!s}\nr2: {!s}\nMSE: {!s}\n'.format(mae, r2, mse))
############################################################

Scores ------
 MAE:  0.20766893
  r2:  0.8535329947786158
 MSE:  0.39215142


# Raster processing
- Raster loading
- Reshaping to model-friendly format
- Batch prediction
- Writing predictions out as tif

In [9]:
# Ask the raster helper for the 'tif' entries under PATH and materialize them
# as a list so it can be printed, logged and iterated.
TIF_FILES = list(amr.list_files(PATH, 'tif'))
pprint(TIF_FILES)
# Confirm that the list printed below holds the tifs the model should be applied to.
# If it does not, check the input 'PATH' set in the control panel.
####################### logging ############################
logger.info('\nTif files input:{!s}'.format(TIF_FILES))
############################################################

['ang20170714t213741_corr_v2p9.tif',
 'ang20170709t230326_corr_v2p9.tif',
 'ang20170714t234307_corr_v2p9.tif',
 'ang20170709t224222_corr_v2p9.tif',
 'ang20170714t212855_corr_v2p9.tif',
 'ang20170715t005002_corr_v2p9.tif']


In [10]:
# Column offset of the first band fed to the model. With START_INDEX = 0 the
# sampled training columns start at 'b36_...', so the raster is sliced from
# the same offset. NOTE(review): presumably this must stay in step with the
# band selection done inside the Aviris dataset — confirm before changing.
FIRST_BAND_OFFSET = 36

# Create the destination up front so a missing directory cannot make the
# write step fail after the expensive prediction work. This is a no-op when
# the directory already exists.
os.makedirs(OUTPUT_PATH, exist_ok=True)

for index, TIF_FILE in enumerate(TIF_FILES):

    print("TIF prediction: ", index+1, "/", len(TIF_FILES))

    # Load the raster and the properties later passed to the GTiff writer.
    img_nd_array, img_nd_array_properties = amr.get_array_from_raster(PATH,
                                                                      TIF_FILE)

    # Reshape for the model, then keep the same every-NTH_BAND-th column
    # pattern that was used when sampling the training data.
    img_nd_array_reshape = amr.change_img_shape(img_nd_array)
    img_nd_array_resample = img_nd_array_reshape[:, FIRST_BAND_OFFSET+START_INDEX::NTH_BAND]

    # Predict, passing the 2-D shape of the first band of the input raster
    # as the target shape.
    prediction_raster = amr.map_apply_reduce_pandas(img_nd_array_resample,
                                                    rf_0,
                                                    img_nd_array[:, :, 0].shape)

    # Write the prediction for this tif under OUTPUT_PATH.
    amr.output_gtiff(prediction_raster,
                     img_nd_array_properties,
                     img_nd_array,
                     TIF_FILE,
                     OUTPUT_PATH)

    # Drop the per-raster arrays before loading the next tif to release memory.
    del img_nd_array, img_nd_array_reshape, img_nd_array_resample, prediction_raster
    

TIF prediction:  1 / 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170714t213741_corr_v2p9.tif
ang20170714t213741_corr_v2p9_predicted.tif
/att/nobackup/cssprad1/test_predictions/ang20170714t213741_corr_v2p9_predicted.tif
TIF prediction:  2 / 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170709t230326_corr_v2p9.tif
ang20170709t230326_corr_v2p9_predicted.tif
/att/nobackup/cssprad1/test_predictions/ang20170709t230326_corr_v2p9_predicted.tif
TIF prediction:  3 / 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170714t234307_corr_v2p9.tif
ang20170714t234307_corr_v2p9_predicted.tif
/att/nobackup/cssprad1/test_predictions/ang20170714t234307_corr_v2p9_predicted.tif
TIF prediction:  4 / 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170709t224222_corr_v2p9.tif
ang20170709t224222_corr_v2p9_predicted.tif
/att/nobackup/cssprad1/test_predictions/ang20170709t224222_corr_v2p9_predicted.tif
TIF prediction:  5 / 6
/att/nobackup/maronne/AVIRIS_2020/stacks/ang20170714t212855_corr_v2p9.tif
ang20170714