# Accuracy Assessment of Water Observations from Space (WOfS) Product in Africa


The cleaned validation samples from the previous step `02b_Convert_Institution_to_AEZ.ipynb` are ingested here to create confusion mattrices for each agro-ecological zone, and one for the entire continent.


**Input data** : `<AEZ>_wofs_ls_validation_points.csv>`

**Output_data** : `<AEZ>_confusion_matrix.csv`

Last modified: 13/02/2023


### Load packages
Import Python packages that are used for the analysis.

In [1]:
%matplotlib inline

import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import plot_confusion_matrix, f1_score  

## Load the datasets
Ground truth points 

In [2]:
#Read the ground truth data 
#For each AEZ
file_path = ('../02_Validation_results/WOfS_Assessment/wofs_ls/')
validation_files = [i for i in glob.glob(os.path.join(file_path, '*.{}'.format('csv'))) if not 'Continental' in i]
validation_files

['../02_Validation_results/WOfS_Assessment/wofs_ls/Indian_ocean_wofs_ls_validation_points.csv',
 '../02_Validation_results/WOfS_Assessment/wofs_ls/Sahel_wofs_ls_validation_points.csv',
 '../02_Validation_results/WOfS_Assessment/wofs_ls/Central_wofs_ls_validation_points.csv',
 '../02_Validation_results/WOfS_Assessment/wofs_ls/Southern_wofs_ls_validation_points.csv',
 '../02_Validation_results/WOfS_Assessment/wofs_ls/Eastern_wofs_ls_validation_points.csv',
 '../02_Validation_results/WOfS_Assessment/wofs_ls/Northern_wofs_ls_validation_points.csv',
 '../02_Validation_results/WOfS_Assessment/wofs_ls/Western_wofs_ls_validation_points.csv']

## Create a continental validation dataset

In [3]:
continental = pd.concat([pd.read_csv(f) for f in validation_files])

#export to disk
continental.to_csv('../02_Validation_results/WOfS_Assessment/wofs_ls/Continental_wofs_ls_validation_points.csv')

## Function for creating confusion matrix

In [4]:
def create_confusion_matrix(df, aez):

    # create a confusion matrix
    confusion_matrix = pd.crosstab(
        df["ACTUAL"],
        df["PREDICTION"],
        rownames=["ACTUAL"],
        colnames=["PREDICTION"],
        margins=True,
    )
    
    #producer's accuracy
    confusion_matrix["Producer's"] = [
        confusion_matrix.loc[0][0] / confusion_matrix.loc[0]["All"] * 100,
        confusion_matrix.loc[1][1] / confusion_matrix.loc[1]["All"] * 100,
        np.nan,
    ]
    
    #user's acc
    users_accuracy = pd.Series(
        [
            confusion_matrix[0][0] / confusion_matrix[0]["All"] * 100,
            confusion_matrix[1][1] / confusion_matrix[1]["All"] * 100,
        ]
    ).rename("User's")

    confusion_matrix = confusion_matrix.append( 
        users_accuracy
    )
    
    #overall acc
    confusion_matrix.loc["User's", "Producer's"] = (
        (confusion_matrix[0][0] + confusion_matrix[1][1])
        / confusion_matrix["All"]["All"]
        * 100
    )
    df["PREDICTION"] = df["PREDICTION"].astype(str).astype(int)
    
    #fscore
    fscore = pd.Series(
        [
            (
                2
                * (
                    confusion_matrix.loc["User's"][0]
                    * confusion_matrix.loc[0]["Producer's"]
                )
                / (
                    confusion_matrix.loc["User's"][0]
                    + confusion_matrix.loc[0]["Producer's"]
                )
            )
            / 100,
            f1_score(df["ACTUAL"], df["PREDICTION"]),
        ]
    ).rename("F-score")
    
    #tidy confusion matrix
    confusion_matrix = confusion_matrix.append(fscore)
    confusion_matrix = confusion_matrix.round(decimals=2)
    confusion_matrix = confusion_matrix.rename(
        columns={
            "0": "NoWater",
            "1": "Water",
            0: "NoWater",
            1: "Water",
            "All": "Total",
        },
        index={"0": "NoWater", "1": "Water", 0: "NoWater", 1: "Water", "All": "Total"},
    )
    
    #remove the nonsensical values in the table
    confusion_matrix.loc["User's", 'Total'] = '--'
    confusion_matrix.loc['Total', "Producer's"] = '--'
    confusion_matrix.loc["F-score", 'Total'] = '--'
    confusion_matrix.loc["F-score", "Producer's"] = '--'
    
    print('\n')
    print('n samples for', aez,':', len(df))
    print(confusion_matrix)
    # saving out the confusion matrix
    confusion_matrix.to_csv('../02_Validation_results/WOfS_Assessment/wofs_ls/ConfusionMatrix/'+aez+'_confusion_matrix.csv')

## AEZ confusion matrices

In [5]:
for v in validation_files:
    df = pd.read_csv(v, delimiter=",")
    aez = v[49:-30]
    create_confusion_matrix(df, aez)


  confusion_matrix = confusion_matrix.append(
  confusion_matrix = confusion_matrix.append(fscore)
  confusion_matrix = confusion_matrix.append(
  confusion_matrix = confusion_matrix.append(fscore)
  confusion_matrix = confusion_matrix.append(
  confusion_matrix = confusion_matrix.append(fscore)
  confusion_matrix = confusion_matrix.append(
  confusion_matrix = confusion_matrix.append(fscore)
  confusion_matrix = confusion_matrix.append(
  confusion_matrix = confusion_matrix.append(fscore)
  confusion_matrix = confusion_matrix.append(
  confusion_matrix = confusion_matrix.append(fscore)
  confusion_matrix = confusion_matrix.append(




n samples for Indian_ocean : 2082
         NoWater    Water   Total Producer's
ACTUAL                                      
NoWater   333.00    45.00   378.0       88.1
Water     455.00  1249.00  1704.0       73.3
Total     788.00  1294.00  2082.0         --
User's     42.26    96.52      --      75.98
F-score     0.57     0.83      --         --


n samples for Sahel : 1236
         NoWater   Water   Total Producer's
ACTUAL                                     
NoWater   409.00  119.00   528.0      77.46
Water      85.00  623.00   708.0      87.99
Total     494.00  742.00  1236.0         --
User's     82.79   83.96      --       83.5
F-score     0.80    0.86      --         --


n samples for Central : 1096
         NoWater   Water   Total Producer's
ACTUAL                                     
NoWater   111.00   26.00   137.0      81.02
Water     169.00  790.00   959.0      82.38
Total     280.00  816.00  1096.0         --
User's     39.64   96.81      --      82.21
F-score     0.53 

  confusion_matrix = confusion_matrix.append(fscore)


## Continental confusion matrix

In [6]:
df = pd.concat([pd.read_csv(f) for f in validation_files])

create_confusion_matrix(df, 'Continental')



n samples for Continental : 11363
         NoWater    Water    Total Producer's
ACTUAL                                       
NoWater  2721.00   369.00   3090.0      88.06
Water    1669.00  6604.00   8273.0      79.83
Total    4390.00  6973.00  11363.0         --
User's     61.98    94.71       --      82.06
F-score     0.73     0.87       --         --


  confusion_matrix = confusion_matrix.append(
  confusion_matrix = confusion_matrix.append(fscore)
