## Code accompaniment to "Machine learning applied to a modern-Pleistocene petrographic dataset: The global prediction of sand modal composition (GloPrSM) model"
### J. Isaac Johnson, Glenn R. Sharman, Eugene Szymanski, and Xiao Huang
### University of Arkansas, Department of Geosciences
### Please direct questions and correspondence to gsharman@uark.edu

## Step 1: Load sand modal composition data and make random forest models

In [None]:
# Import required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, ShuffleSplit, train_test_split#, cross_validate, cross_val_score
from sklearn.metrics import *
import time
import pickle
import pathlib

### Load dependent variable data

In [None]:
# Read the GloPrSM input workbook into a DataFrame (openpyxl handles the .xlsx format)
rf_data = pd.read_excel(
    'GloPrSM_Input_v1.0.xlsx',
    engine='openpyxl',
)

# Report the number of rows loaded, then preview the first row
print(rf_data.shape[0])
rf_data.head(1)

### Feature selection

In [None]:
# Columns of rf_data that are used as predictor variables (model inputs)
feature_list = [
    'Relief_AVG',
    'Area_sq_km',
    'Pre_mm_AVG',
    'Tmp_dc_AVG',
    'Slope_AVG',
    'Lith_PYVAVI',
    'Lith_PAPI',
    'Lith_VB',
    'Lith_PB',
    'Lith_EVSC',
    'Lith_SMSSSU',
    'Lith_MT',
]

# Keep only the predictor columns and preview the first row
features = rf_data.loc[:, feature_list]
features.head(1)

### Feature correlation (optional)

In [None]:
# Human-readable tick labels, listed in the same order as feature_list
feature_list_clean = [
    'Relief',
    'Catchment\n Area',
    'Precipitation',
    'Temperature',
    'Slope',
    'PY+VA+VI',
    'PA+PI',
    'VB',
    'PB',
    'EV+SC',
    'SM+SS+SU',
    'MT',
]

# Pairwise correlation between the selected features
corr = features.corr()
top_corr_features = corr.index
corr_matrix = features[top_corr_features].corr()

# Draw the correlation matrix as an annotated heatmap on a fixed -1..1 colour scale
fig, ax = plt.subplots(figsize=(12, 10))
ax.axis('on')
ax.patch.set_edgecolor('black')
g = sns.heatmap(
    corr_matrix,
    vmin=-1.,
    vmax=1.,
    xticklabels=feature_list_clean,
    yticklabels=feature_list_clean,
    annot=True,
    cmap="bwr",
)

### Model training
Note: this code can take many hours to run, depending on the number of splits chosen. A large amount of hard drive space is required (~2.5 GB per split used).

In [None]:
# Root output folder for this notebook -- edit to suit your machine
base_path = r'Z:\Sharman\GloPrSM_git\v1.0'

# Sub-folders for the saved models, the validation output, and the test labels
model_path = '\\'.join((base_path, 'models'))
val_path = '\\'.join((base_path, 'validation'))
test_path = '\\'.join((base_path, 'test_labels'))

# Create each folder together with any missing parents; folders that already exist are left as they are
for out_dir in (model_path, val_path, test_path):
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Response variables; a separate set of random forest models is trained for each one
labels = ['FQ_QFL_IJ', 'LQ_QFL_IJ', 'QmQch_QmQpQch_IJ', 'QpQch_QmQpQch_IJ', 'FkFp_FpFk_IJ', 'LsLv_LvLsLm_IJ', 'LmLv_LvLsLm_IJ']

splits = 10 # Note, 100 splits are used in the article
# Repeated random train/test partitions (20% test); the fixed seed keeps the partitions reproducible
rs = ShuffleSplit(n_splits=splits, test_size=.2, random_state=0)
# R2 score of every split (rows) for every label (columns)
stats = np.zeros(shape=(splits, len(labels)))

start = time.time()  # Start of the whole training run

# To train only the QFL models, iterate over labels[0:2] instead of labels
for j, label in enumerate(labels):

    val_df = pd.DataFrame()  # Predictions for the test rows, one column per split
    tst_df = pd.DataFrame()  # Observed label values for the same rows, one column per split

    # max_features=1.0 considers all features at every node. For regressors this is
    # what the former 'auto' setting meant; 'auto' itself was removed in scikit-learn 1.3.
    model = RandomForestRegressor(n_estimators=len(rf_data), random_state=0, max_features=1.0)

    # Keep only the samples that have a value for this label
    data = rf_data[rf_data[label].notna()].reset_index()
    print(label,len(data))

    # Every label gets its own sub-folder of pickled models; create it once per label
    model_dir = pathlib.Path(model_path) / str(label)
    model_dir.mkdir(parents=True, exist_ok=True)

    for i, (train_index, test_index) in enumerate(rs.split(data)):
        split_start = time.time()

        train_rows = data.iloc[train_index]
        test_rows = data.iloc[test_index]
        train_features = train_rows.loc[:, feature_list]
        test_features = test_rows.loc[:, feature_list]
        train_labels = train_rows.loc[:, label]
        test_labels = test_rows.loc[:, label]

        # Refit the forest on this split and score it on the held-out rows
        model.fit(train_features, train_labels)
        prediction = model.predict(test_features)
        r2 = r2_score(test_labels, prediction)

        stats[i, j] = r2
        val_df['{}_valid_{}'.format(label, i)] = prediction
        # Store the labels positionally. Assigning the Series itself would align it on the
        # row index of the first split, leaving NaN wherever the test rows differ and
        # breaking the row-by-row correspondence with the predictions in val_df.
        tst_df['{}_label_{}'.format(label, i)] = test_labels.to_numpy()

        # Export the validation results after every split so that an interrupted run keeps its progress
        val_df.to_csv(pathlib.Path(val_path) / '{}_validation_rlf.csv'.format(label), index=False)
        tst_df.to_csv(pathlib.Path(test_path) / '{}_label_rlf.csv'.format(label), index=False)

        # Save the model; the context manager closes the file even if pickling fails
        model_filename = 'model_' + str(i) + '.sav'
        with open(model_dir / model_filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        print(i, 'R2: {}, {} sec'.format(round(r2,6), round(time.time()-split_start,1)))
    print()

print('Total: {} sec'.format(round(time.time()-start,1)))

# Save the R2 scores of all splits, one column per label
r2_df = pd.DataFrame(stats, columns=[x+'_R2' for x in labels])
r2_df.to_csv(pathlib.Path(base_path) / 'R2_stats_rlf.csv', index=False)

# Save the list of features used in the models, so you know what is going on
# NOTE: this rebinds `features`, which previously held the predictor columns of rf_data
features = pd.DataFrame({'Inputs': feature_list})
features.to_csv(pathlib.Path(base_path) / 'feature_list.csv', index=False)