This notebook fits a random forest classifier using filtered training features, evaluates classification performance through cross-validation, and saves the fitted classifier.

### load packages

In [None]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from odc.io.cgroups import get_cpu_quota
import xarray as xr
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score,cohen_kappa_score,confusion_matrix,ConfusionMatrixDisplay,balanced_accuracy_score
from sklearn.metrics import  precision_score, recall_score
from joblib import dump

# Number of parallel workers used by the grid search and the classifier.
# get_cpu_quota() can return None when no CPU quota is configured, which would make
# round() raise a TypeError, so fall back to the machine's CPU count in that case.
# Keep at least one worker: a quota below 0.5 would otherwise round down to 0.
cpu_quota=get_cpu_quota()
if cpu_quota:
    ncpus=max(1,round(cpu_quota))
else:
    ncpus=os.cpu_count() or 1
print('ncpus = '+str(ncpus))

### load training data and set parameters

In [None]:
# Lookup from merged land cover class code to its display name.
# Insertion order is kept as-is because later cells iterate over the keys.
dict_map_merged={
    11:'Tree crops',
    12:'Field crops',
    21:'Forest plantations',
    31:'Grassland',
    41:'Aquatic or regularly flooded herbaceous vegetation',
    44:'Water body',
    51:'Settlements',
    61:'Bare soils',
    70:'Mangrove',
    71:'Mecrusse',
    72:'Broadleaved (Semi-) evergreen forest',
    75:'Mopane',
    74:'Broadleaved (Semi-) deciduous forest',
}

# Path of the training table to load.
# Alternative input: 'Results/train_poly_848_20171124_signatures_2021_filtered.txt'
training_data='Results/train_poly_848_20171124_signatures_2021_force_15pct_filtered.txt'

# Measurement name prefixes; each one contributes six columns suffixed _0 .. _5.
measurements = ['blue','green','red','red_edge_1','red_edge_2', 'red_edge_3','nir_1','swir_1','swir_2','NDVI']
# MAD measurement name prefixes; each one contributes two columns suffixed _0 and _1.
measurements_MAD=['smad','emad','bcmad']

# Column holding the class label in integer format (alternative name: 'LC_Class_I').
class_name = 'Class_I'

# Columns to keep: the class label first, then every feature column.
column_names=[class_name]
column_names+=[f'{measurement}_{i}' for measurement in measurements for i in range(6)]
column_names+=[f'{measurement}_{i}' for measurement in measurements_MAD for i in range(2)]
# Read the space-delimited training table, then keep only the class label and
# feature columns, in the order given by column_names.
df_training_data=pd.read_csv(training_data,sep=' ')
df_training_data=df_training_data.loc[:,column_names]
print(df_training_data)

### plot training samples distribution

In [None]:
# Count the training samples per class and translate the class codes to display names.
class_counts=df_training_data[class_name].value_counts()
class_indices=class_counts.index
class_legends=[dict_map_merged[class_index] for class_index in class_indices]

# Bar chart of the number of samples per class, with the count printed on each bar.
plt.figure(figsize=(15,5))
bars=plt.bar(class_legends,height=class_counts.to_numpy())
plt.bar_label(bars)
current_axes=plt.gca()
current_axes.set_ylabel('Number of training samples')
current_axes.set_xlabel('Land cover class')
current_axes.tick_params(axis='x', rotation=45)

### grid search to find optimal hyperparameters

In [None]:
# Convert the training table to a NumPy array: column 0 is the class label and the
# remaining columns are the features.
model_input=df_training_data.to_numpy()
# Grid search for the random forest hyperparameters, evaluated on five stratified
# shuffle splits (random train/test splits that preserve class proportions).
cv=model_selection.StratifiedShuffleSplit(n_splits=5,random_state=1)
# Seed the searched estimator so candidates are compared on equal footing and the
# selected parameters are reproducible between runs.
rf = RandomForestClassifier(random_state=1)
# max_samples: a float is a fraction of the training rows, while an integer is an
# absolute number of rows. An integer 1 would therefore train every tree on a single
# sample; None is used to draw as many samples as there are training rows.
grid_parameters={'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 400, num = 15)],
                 'max_samples':[0.3,0.5,None],'min_samples_split':[2,4,8],'max_features': ['sqrt', 'log2'],
                'class_weight':['balanced', None]}
print('Grid searching best hyper-parameters...')
grid_search=model_selection.GridSearchCV(estimator = rf,param_grid=grid_parameters,cv=cv,n_jobs=ncpus)
grid_search.fit(model_input[:,1:],model_input[:,0])
print('Optimal parameters: \n',grid_search.best_params_)
# Classifier configured with the selected hyperparameters (not yet fitted).
rf = RandomForestClassifier(**grid_search.best_params_, random_state=1, n_jobs=ncpus)

### fit classifier using the optimised hyperparameters

In [None]:
# Fit the classifier on all features using the hyperparameters selected by the grid
# search, instead of overwriting them with hard-coded values. The fixed seed keeps
# the fitted model and the evaluation that follows reproducible.
rf=RandomForestClassifier(**grid_search.best_params_,random_state=1,n_jobs=ncpus)
rf.fit(model_input[:,1:],model_input[:,0])

### plot feature importance

In [None]:
# Rank the features once, from least to most important, and reuse the ranking for
# both the printed top-10 list and the plot.
importances=rf.feature_importances_
feature_names=np.array(df_training_data.columns[1:])
order=np.argsort(importances)
# The ten most important features (in ascending order of importance)
feat_importance_indices=order[-10:]
print('most importance features: \n',df_training_data.columns[1:][feat_importance_indices])
plt.figure(figsize=(5,30))
plt.barh(y=feature_names[order],width=importances[order])
# Horizontal bars: feature names run along the y axis, importance along the x axis.
plt.gca().set_ylabel('Variable', labelpad=10)
plt.gca().set_xlabel('Importance', labelpad=10)

### save the fitted model

In [None]:
# Write the fitted classifier to disk with joblib.
# File names used for other runs: 'Results/RF_model_Mozambique_2017.joblib',
# 'Results/RF_model_Mozambique_2021.joblib'
model_path='Results/RF_model_using_filtered_15ptc_td_Mozambique_2021.joblib'
dump(rf, model_path)

### evaluate performance

In [None]:
model_input_selected=model_input
# The first column is the class label; every remaining column is a feature.
eval_features=model_input_selected[:,1:]
eval_labels=model_input_selected[:,0]

# Five stratified folds, shuffled with a fixed seed so the splits are repeatable.
skf=model_selection.StratifiedKFold(n_splits=5,shuffle=True,random_state=1)

# Mean cross-validated accuracy and macro-averaged F1.
overall_acc=model_selection.cross_val_score(rf,eval_features,eval_labels,cv=skf,scoring='accuracy')
print('Overall accuracy from cv scores: ',overall_acc.mean())
f1_macro=model_selection.cross_val_score(rf,eval_features,eval_labels,cv=skf,scoring='f1_macro')
print('f1_macro from cv scores: ',f1_macro.mean())

# The same evaluation through cross_validate, which uses the estimator's default scorer.
cv_results=model_selection.cross_validate(rf,eval_features,eval_labels,cv=skf)
print('Overall accuracy from cv: ',cv_results['test_score'].mean())

# Out-of-fold predictions, used for the per-class metrics and the confusion matrix.
predictions=model_selection.cross_val_predict(rf,eval_features,eval_labels,cv=skf)
print('Overall accuracy from cv predict: ',accuracy_score(eval_labels,predictions))

# Per-class precision and recall, reported for every class code in dict_map_merged.
values=list(dict_map_merged)
value_names=[dict_map_merged[value] for value in values]

precision=precision_score(eval_labels,predictions,labels=values,average=None)
print('Precision for each class: \n',dict(zip(value_names,np.around(precision,3))))

recall=recall_score(eval_labels,predictions,labels=values,average=None)
print('Recall for each class: \n',dict(zip(value_names,np.around(recall,3))))

# Confusion matrix, labelled with the display names of the classes the model was fitted on.
cm=confusion_matrix(eval_labels,predictions)
fitted_class_names=[dict_map_merged[fitted_class] for fitted_class in rf.classes_]
disp=ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=fitted_class_names)
print('Confusion matrix:\n')
fig, ax = plt.subplots(figsize=(15,15))
disp.plot(ax=ax)