# 1) Training and Evaluating a Random Forest Classifier for Image Classification

    This code performs image classification using a Random Forest classifier. It loads the training samples, removes rows that contain missing (NaN) or infinite values, calculates the distribution of image classes within the area of interest (AOI), trains the Random Forest model with fixed hyperparameter settings, evaluates it on the training samples via a confusion matrix, a classification report, and the out-of-bag (OOB) score, and saves the model for future use.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import pickle

# Lookup table pairing each layer name with its numeric class code (1-6).
data = [
    [layer, code]
    for code, layer in enumerate(
        ('water', 'urbanized', 'soil', 'cropland', 'grassland', 'forest/treecover'),
        start=1,
    )
]
df = pd.DataFrame(data=data, columns=['layer', 'class'])

# Load the extracted training samples; the raw table may contain NaN or +/-Inf entries.
training_samples_inf = pd.read_csv('/path/to/point_feature_value_extracted/training_samples.csv')

# Turn +/-Inf into NaN so that a single dropna() removes every unusable row.
# DataFrame.replace(..., inplace=True) returns None, so its result must not be
# bound to a name. Rebinding the returned copy leaves training_samples_inf in
# the same end state (Inf replaced by NaN, row count unchanged).
training_samples_inf = training_samples_inf.replace([np.inf, -np.inf], np.nan)
training_samples = training_samples_inf.dropna(axis=0)

# Feature matrix: positional columns 3 through 32 of the cleaned table.
# NOTE(review): assumes the 'class' column lies outside this slice -- confirm
# against the CSV layout.
original_features = training_samples.iloc[:, 3:33]
column_names = training_samples.columns

# Count the training samples per class, keeping the classes in order of first
# appearance in the cleaned table.
unique = training_samples['class'].unique()
stat = np.array(
    [len(training_samples[training_samples['class'] == label]) for label in unique]
)

# Two columns side by side: class code, sample count.
combined = np.column_stack((unique, stat))

df1 = pd.DataFrame(combined, columns=['class', 'no_of_found_images_within_AOI'])
# NOTE: despite its name, the '%' column holds each class's share as a
# fraction of the total (rounded to two decimals), not a value scaled to 100.
df1['%'] = (
    df1['no_of_found_images_within_AOI'] / df1['no_of_found_images_within_AOI'].sum()
).round(2)

# Attach the layer names from the lookup table and order the rows by class code.
df2 = df1.merge(df, on='class').sort_values('class')

# Report the table shape before and after dropping NaN/Inf rows.
print('Shape of the training samples with NaN, Inf/no.features', training_samples_inf.shape)
print('Shape of the training samples without NaN, Inf/no.features', training_samples.shape)

# Emit the per-class table that the heading announces (count, share, layer name).
print('Contribution of each class in training samples')
print(df2)

# Path the fitted model is written to.
filename = "/path/to/save/RF_model.sav"

# Labels to learn: the 'class' column of the cleaned samples.
target = training_samples['class']

# 500 trees built on bootstrap samples using all available cores.
# oob_score=True makes the out-of-bag score available after fitting.
# NOTE(review): no random_state is set, so repeated runs are not reproducible.
randomforest = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=1,
    bootstrap=True,
    oob_score=True,
    class_weight="balanced",
    n_jobs=-1,
)

# fit() returns the estimator itself, so both names refer to the same fitted model.
randomforest.fit(original_features, target)
rf_model_initial = randomforest

# Predictions on the very samples used for fitting (not a held-out set).
y_rf_train_pred = rf_model_initial.predict(original_features)

# Classes that actually occur in the labels or predictions, in sorted order.
# Deriving them from the data (rather than hard-coding '1'..'6') keeps every
# row/column of the matrix paired with the right tick label even when a class
# is absent from the cleaned training samples.
class_ids = np.unique(np.concatenate((np.asarray(target), np.asarray(y_rf_train_pred))))

# Confusion matrix normalised over each actual-class row (fractions, scaled to
# percent when plotted below).
conf = confusion_matrix(target, y_rf_train_pred, labels=class_ids, normalize='true')

# The ':g' format renders both 1 and 1.0 as '1'.
labels = [f"{class_id:g}" for class_id in class_ids]

# Figure size given in centimetres (10 x 8) and converted to inches.
plt.figure(figsize=(10/2.54, 8/2.54))
plt.title("Confusion Matrix [%]", fontsize=9)
ax = sns.heatmap(conf * 100, annot=True, cmap="Greens", fmt='.2f', cbar=True, xticklabels=labels, yticklabels=labels, vmin=0, vmax=100)
ax.set_xlabel("Predicted", fontsize=9)
ax.set_ylabel("Actual", fontsize=9)
ax.set_xticklabels(ax.get_xticklabels(), ha="right", fontsize=9)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=9)
# Shrink the per-cell annotations so the two-decimal values fit inside the cells.
for text in ax.texts:
    text.set_fontsize(8)
plt.tight_layout(pad=1.5)
# NOTE(review): this path is relative, unlike the other placeholder paths in
# this script -- confirm the intended output location.
plt.savefig("path/to/save/RF_confusion_matrix.png", dpi=500)

# Classification report for the training predictions, converted to a DataFrame
# with one row per report entry.
# NOTE(review): per the scikit-learn docs, `digits` is ignored when
# output_dict=True, so the values in df3 are not rounded -- confirm.
report = metrics.classification_report(target, y_rf_train_pred, digits=3, output_dict=True)
df3 = pd.DataFrame(report).transpose()

# Persist the fitted model. The context manager closes the file handle even if
# pickling raises, instead of leaving it to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model_initial, model_file)

# Out-of-bag error is the complement of the out-of-bag score stored on the
# fitted forest.
oob_error = 1 - randomforest.oob_score_

print("Classification accuracy report:", df3)
print(f'OOB error: {oob_error:.3f}')
print(f'OOB score: {randomforest.oob_score_:.3f}')