## Apply models to new data

In [1]:
#import all necessary libraries and project modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import _pickle as cPickle
from bedmap import ClassifyBedforms
from utilities import plot_rate_matrix, thresh
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix

New input datasets must contain columns for bed geology ("Bed"), local topographic conditions ("Topo"), bedform elongation ("Elong"), and bedform area ("Area"). To trace individual features that are classified as "bedforms" or "not bedforms", rather than only computing performance statistics, numeric "site" and "ObjectID" columns are also required.

In [2]:
# Load the data you've collected into a DataFrame (point input_csv at your own CSV file)
input_csv = '~/rf_bedform_mapping/data/wi_tpi.csv'
newdata = pd.read_csv(input_csv)

### Run Optimized Random Forest Model on new data
Notes: the input option "model" selects which model to run: Random Forest ('random_forest'), XGBoost ('xgboost'), or the ensemble model ('ensemble_average'). Setting "probability=True" returns the probability that each feature is glacially derived, rather than binary 1/0 values. Setting "probability=False" without specifying a threshold defaults to a threshold of 50%. The input option "threshold" is the probability threshold. Metrics and comparisons showing which model and probability threshold are best suited to your data can be found in "performance_metrics.ipynb" and "RandomForest.ipynb".

In [4]:
# Optimized Random Forest model for highest bedform detection probability.
# probability=False requests binary predictions; threshold is the probability cut-off.
rf_bedforms = ClassifyBedforms(
    '~/rf_bedform_mapping/data/wi_tpi.csv',
    model='random_forest',
    threshold=0.5,
    probability=False,
)

In [5]:
# Add the Random Forest predictions to the original dataset as a new column.
# .tolist() yields a plain list, so values are matched to rows by position.
# NOTE(review): assumes the predictions are in the same row order as newdata — confirm against ClassifyBedforms.
rf_predicted_bedforms = rf_bedforms.predicted_bedforms
newdata['rf_bedform_pred'] = rf_predicted_bedforms.tolist()

### Run Optimized XGBoost Model on new data

In [3]:
# Optimized XGBoost model for highest bedform detection probability.
# Requires the `xgboost` package to be installed (otherwise ModuleNotFoundError is raised).
# probability=False requests binary predictions; threshold is the probability cut-off.
xgb_bedforms = ClassifyBedforms(
    '~/rf_bedform_mapping/data/wi_tpi.csv',
    model='xgboost',
    threshold=0.4,
    probability=False,
)

ModuleNotFoundError: No module named 'xgboost'

In [9]:
# Add the XGBoost predictions to the original dataset as a new column.
# Depends on xgb_bedforms from the XGBoost cell above, which only succeeds when `xgboost` is installed.
# .tolist() yields a plain list, so values are matched to rows by position.
# NOTE(review): assumes the predictions are in the same row order as newdata — confirm against ClassifyBedforms.
xgb_predicted_bedforms = xgb_bedforms.predicted_bedforms
newdata['xgb_bedform_pred'] = xgb_predicted_bedforms.tolist()

### Run Optimized Ensemble Model on new data

In [6]:
# Optimized ensemble model for highest bedform detection probability.
# Requires the `xgboost` package to be installed (otherwise ModuleNotFoundError is raised).
# probability=False requests binary predictions; threshold is the probability cut-off.
ens_bedforms = ClassifyBedforms(
    '~/rf_bedform_mapping/data/wi_tpi.csv',
    model='ensemble_average',
    threshold=0.45,
    probability=False,
)

ModuleNotFoundError: No module named 'xgboost'

In [9]:
# Add the ensemble predictions to the original dataset as a new column.
# Depends on ens_bedforms from the ensemble cell above, which only succeeds when `xgboost` is installed.
# .tolist() yields a plain list, so values are matched to rows by position.
# NOTE(review): assumes the predictions are in the same row order as newdata — confirm against ClassifyBedforms.
ens_predicted_bedforms = ens_bedforms.predicted_bedforms
newdata['ens_bedform_pred'] = ens_predicted_bedforms.tolist()

In [11]:
# Export the dataset, including every predicted-bedform column added to it, to a new CSV.
# index=False leaves the DataFrame row index out of the file.
output_csv = '~/rf_bedform_mapping/data/wi_models_output.csv'
newdata.to_csv(output_csv, index=False)