# 1) Forest Classification Validation Analysis

    This code validates forest classification predictions by comparing the predicted class against the actual target class in each CSV file. For every file it calculates per-class metrics (F1-score, precision, recall, user's/producer's accuracy and AUC) together with the overall accuracy and Cohen's Kappa, writes them to a text report, and saves a confusion-matrix heatmap for visual assessment.

In [None]:
import pandas as pd
from sklearn.metrics import f1_score, classification_report, confusion_matrix, cohen_kappa_score, accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_fscore_support
from sklearn.exceptions import UndefinedMetricWarning  # Import this (used by the warnings filter below)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import time
import warnings
# NOTE(review): duplicate of the UndefinedMetricWarning import above; harmless but redundant.
from sklearn.exceptions import UndefinedMetricWarning
# Silence scikit-learn's UndefinedMetricWarning globally so that metrics which
# are ill-defined for some input do not flood the output with warnings.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# ---------------------------------------------------------------------------
# Per-tile validation of forest classification results.
#
# For every CSV in `csv_folder` (expected columns: 'target' and 'predicted'),
# rows whose target AND prediction both lie in the class range 1-6 are scored:
# per-class F1 / user's / producer's accuracy / AUC, overall accuracy, Cohen's
# Kappa and a scikit-learn classification report are appended to
# `output_file_path`, and a confusion-matrix heatmap is saved as a PNG in
# `confusion_matrix_folder`.
# ---------------------------------------------------------------------------
csv_folder = '/path/to/extracted_feature_values/csv'
output_file_path = '/path/to/save/tiles_accuracy.txt'
confusion_matrix_folder = '/path/to/save/confusion_matrix/'

# Class labels evaluated by this analysis (6 is reported as "Forest").
classes = list(range(1, 7))

# os.listdir returns entries in arbitrary order; sort so the report is reproducible.
csv_files = sorted(file for file in os.listdir(csv_folder) if file.endswith('.csv'))

# plt.savefig does not create missing directories, so make sure the target exists.
os.makedirs(confusion_matrix_folder, exist_ok=True)

start = time.process_time()

# One 6x6 confusion matrix per processed tile, in processing order.
confusion_matrices = []
with open(output_file_path, 'w') as output_file:
    for csv_file_name in csv_files:
        csv_file_path = os.path.join(csv_folder, csv_file_name)
        df = pd.read_csv(csv_file_path)
        # Non-numeric entries become NaN and are dropped by the range filter below.
        df['target'] = pd.to_numeric(df['target'], errors='coerce')
        df['predicted'] = pd.to_numeric(df['predicted'], errors='coerce')

        df_filtered = df[(df['target'].between(1, 6)) & (df['predicted'].between(1, 6))]
        if df_filtered.empty:
            print(f"No valid data for {csv_file_name}, skipping.")
            continue

        y_true = df_filtered['target']
        y_pred = df_filtered['predicted']

        # --- Per-class F1 / precision / recall (one-vs-rest) -----------------
        # Reported as "n/a" unless the class occurs in both the reference and
        # the predictions of this tile.
        f1_scores, precisions, recalls = {}, {}, {}
        for class_label in classes:
            if (class_label in y_true.values) and (class_label in y_pred.values):
                precision, recall, f1, _ = precision_recall_fscore_support(
                    y_true == class_label,
                    y_pred == class_label,
                    average='binary',
                    zero_division=0,
                )
                f1_scores[class_label] = f"{f1:.4f}"
                precisions[class_label] = f"{precision:.4f}"
                recalls[class_label] = f"{recall:.4f}"
            else:
                f1_scores[class_label] = "n/a"
                precisions[class_label] = "n/a"
                recalls[class_label] = "n/a"

        # --- Overall metrics --------------------------------------------------
        overall_accuracy_all_classes = accuracy_score(y_true, y_pred)
        kappa_score_all_classes = cohen_kappa_score(y_true, y_pred)

        # Accuracy restricted to rows whose reference class is 6. A tile may
        # contain no such rows; report "n/a" instead of scoring an empty selection.
        class_6_indices = y_true == 6
        if class_6_indices.any():
            overall_accuracy_class_6 = "{:.4f}".format(
                accuracy_score(y_true[class_6_indices], y_pred[class_6_indices])
            )
        else:
            overall_accuracy_class_6 = "n/a"

        # --- Per-class ROC / AUC (one-vs-rest) --------------------------------
        # The "score" passed to roc_curve is the hard 0/1 prediction, not a
        # probability, so each curve has a single operating point.
        auc_per_class = {}
        roc_per_class = {}
        for class_label in classes:
            binary_labels = y_true == class_label
            # A ROC curve needs both positives and negatives; with only one of
            # them present the AUC is undefined (NaN) and would poison the average.
            if binary_labels.any() and not binary_labels.all():
                fpr, tpr, thresholds = roc_curve(
                    binary_labels, (y_pred == class_label).astype(int)
                )
                auc_value = auc(fpr, tpr)
                auc_per_class[f'class {class_label}'] = "{:.4f}".format(auc_value)
                roc_per_class[class_label] = (fpr, tpr, thresholds)
            else:
                auc_per_class[f'class {class_label}'] = "n/a"
                roc_per_class[class_label] = (None, None, None)

        # --- User's (precision) and producer's (recall) accuracy --------------
        # Always numeric: zero_division=0 yields 0.0000 for absent classes.
        user_accuracy_per_class = {}
        producer_accuracy_per_class = {}
        for class_label in classes:
            binary_target = (y_true == class_label).astype(int)
            binary_predicted = (y_pred == class_label).astype(int)
            user_accuracy = "{:.4f}".format(precision_score(binary_target, binary_predicted, zero_division=0))
            producer_accuracy = "{:.4f}".format(recall_score(binary_target, binary_predicted, zero_division=0))
            user_accuracy_per_class[f'User_Accuracy_class_{class_label}'] = user_accuracy
            producer_accuracy_per_class[f'Producer_Accuracy_class_{class_label}'] = producer_accuracy

        # --- Text report ------------------------------------------------------
        output_file.write("_" * 70)
        output_file.write(f"\nResult for {csv_file_name}:\n")

        output_file.write("\nF1-scores for each class:\n")
        for class_label, score in f1_scores.items():
            output_file.write(f"Class {class_label}: {score}\n")

        output_file.write(f"\nOverall Accuracy for all classes: {overall_accuracy_all_classes:.4f}\n")
        output_file.write(f"Overall Accuracy for class 6 (Forest): {overall_accuracy_class_6}\n")
        output_file.write(f"\nCohen's Kappa for all classes: {kappa_score_all_classes:.4f}\n")

        # Average only over classes with a defined AUC; format like the other metrics.
        auc_values = [float(value) for value in auc_per_class.values() if value != "n/a"]
        auc_average = "{:.4f}".format(np.mean(auc_values)) if auc_values else "n/a"
        output_file.write(f"\nAUC average for all classes: {auc_average}\n")

        class_report = classification_report(y_true, y_pred, digits=4)

        output_file.write("\nClassification Report:\n")
        output_file.write(class_report + '\n')

        for class_label in classes:
            output_file.write(f"\nUser Accuracy for class {class_label}: {user_accuracy_per_class[f'User_Accuracy_class_{class_label}']}\n")
            output_file.write(f"Producer Accuracy for class {class_label}: {producer_accuracy_per_class[f'Producer_Accuracy_class_{class_label}']}\n")

        # Rows of the unfiltered tile whose prediction is missing, non-numeric
        # or outside 1-6. Previously computed but never reported.
        unclassified_points = int((~df['predicted'].between(1, 6)).sum())
        output_file.write(f"\nUnclassified points (prediction missing or outside 1-6): {unclassified_points}\n")

        # --- Confusion matrix heatmap ----------------------------------------
        # Fixed label list keeps the matrix 6x6 even when a class is absent.
        conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)

        confusion_matrix_filename = os.path.splitext(csv_file_name)[0] + '_confusion_matrix.png'
        confusion_matrix_filepath = os.path.join(confusion_matrix_folder, confusion_matrix_filename)

        # Figure size is given in centimetres (10 x 8), converted to inches.
        plt.figure(figsize=(10/2.54, 8/2.54))
        ax = sns.heatmap(conf_matrix, annot=True, cmap="Greens", fmt='.0f', cbar=True, xticklabels=classes, yticklabels=classes)

        ax.set_xlabel("Predicted", fontsize=9)
        ax.set_ylabel("Actual", fontsize=9)
        plt.title("Confusion Matrix", fontsize=9)
        ax.set_xticklabels(ax.get_xticklabels(), ha="right", fontsize=9)
        ax.set_yticklabels(ax.get_yticklabels(), fontsize=9)
        for text in ax.texts:
            text.set_fontsize(8)
        plt.tight_layout(pad=1.5)
        plt.savefig(confusion_matrix_filepath, bbox_inches='tight', pad_inches=0.1, dpi=800)
        # Close the figure so memory does not grow with the number of tiles.
        plt.close()
        confusion_matrices.append(conf_matrix)

print("Processing time in [s]", time.process_time() - start)
print("\nAll results saved successfully :)")
