sample allocation for the Hispaniola accuracy assessment

In [1]:
import numpy as np
import os
from os.path import join
import pandas as pd

from accuracy_assessment.good_practice_sample_allocation import total_sample_num_calculate, sample_allocation

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def sample_allocation_pf_preference(total_sample_num, array_weight, pf_class_num=150, rare_class_num=75,
                                    rare_class_threshold=0.1, pf_class_indices=(1, 2)):
    """
        allocate the sample number for each class.
        We have a preference for the primary forest here: the primary forest classes receive a fixed
        sample number (pf_class_num), rare classes receive a fixed sample number (rare_class_num), and
        the samples left over are shared among the remaining classes in proportion to their weights.

    Args:
        total_sample_num: total sample number
        array_weight: weight for each class (any 1-D array-like, e.g. list or numpy array)
        pf_class_num: the sample number allocated to each primary forest class
        rare_class_num: the sample number allocated to each rare class
        rare_class_threshold: the weight threshold to determine the rare class; a class whose weight is
            less than or equal to this value is treated as rare
        pf_class_indices: positions of the primary forest classes in array_weight.
            The default (1, 2) corresponds to primary wet forest and primary dry forest.

    Returns:
        array_selected_num: the selected sample number of each class (strata), integer numpy array.
            The proportional shares are rounded with np.round independently, so the sum of the returned
            array can differ slightly from total_sample_num.

    Raises:
        ValueError: if the fixed allocation (rare + primary forest classes) already exceeds
            total_sample_num while some classes still need a proportional share
    """

    # accept lists / tuples as well as numpy arrays; boolean-mask indexing below needs an ndarray
    array_weight = np.asarray(array_weight, dtype=float)
    class_count = len(array_weight)

    array_selected_num = np.zeros(class_count, dtype=int)

    # explicit masks are used instead of testing "array_selected_num == 0", so that a fixed allocation
    # of 0 samples (rare_class_num=0 or pf_class_num=0) is not mistaken for an unallocated class
    mask_rare = array_weight <= rare_class_threshold
    mask_pf = np.zeros(class_count, dtype=bool)
    mask_pf[np.asarray(pf_class_indices, dtype=int)] = True

    # set the sample number for rare class
    array_selected_num[mask_rare] = rare_class_num

    # set the sample number for primary forest (takes precedence over the rare class number)
    array_selected_num[mask_pf] = pf_class_num

    # classes that are neither rare nor primary forest share the remaining samples
    mask_rest = ~(mask_rare | mask_pf)
    if not mask_rest.any():
        return array_selected_num

    rest_total_num = total_sample_num - np.nansum(array_selected_num)
    if rest_total_num < 0:
        raise ValueError(
            f'total_sample_num ({total_sample_num}) is smaller than the fixed allocation for rare and '
            f'primary forest classes ({np.nansum(array_selected_num)})'
        )

    # calculate the proportion for the remaining classes
    rest_proportion = array_weight[mask_rest]
    rest_proportion_redistribute = rest_proportion / np.nansum(rest_proportion)

    # set the sample number for rest class based on the area proportion
    rest_sample_count = np.round(rest_total_num * rest_proportion_redistribute)
    array_selected_num[mask_rest] = rest_sample_count.astype(int)

    return array_selected_num

In [3]:
# Land cover accuracy assessment: derive stratum weights from the land cover table,
# compute the total sample size, then allocate it with a preference for primary forest.
pwd = os.getcwd()
rootpath = os.path.abspath(join(pwd, os.pardir))  # parent of the current working directory

# workbooks live in <rootpath>/results
filename_accuracy_assessment_tabel = join(rootpath, 'results', 'accuracy_assessment_table.xlsx')
filename_lc_pct = join(rootpath, 'results', 'land_cover_pct.xlsx')

df_lc_pct = pd.read_excel(filename_lc_pct, sheet_name='Hispaniola')

# get the count and weight for land cover classification accuracy assessment:
# column-wise sum over all rows of positional columns 2..9, normalised so the weights sum to 1
# (presumably one column per land cover class, in the order printed below -- confirm against the sheet)
array_count_lc = df_lc_pct.iloc[:, 2:10].sum().values
array_weight_lc = array_count_lc / np.nansum(array_count_lc)

# target standard error of the estimated overall accuracy
standar_error_est_overall_accu = 0.02
# one conjectured user's accuracy shared by all classes; a per-class alternative would be
# np.array([0.85, 0.8, 0.8, 0.85, 0.85, 0.85, 0.95, 0.85])
conjecture_user_accuracy = 0.80

total_sample_num = total_sample_num_calculate(
    standar_error_est_overall_accu,
    array_weight_lc,
    conjecture_user_accuracy,
)

# allocation with fixed sample numbers for primary forest and rare classes
array_selected_num_pf_preference = sample_allocation_pf_preference(
    total_sample_num,
    array_weight_lc,
    pf_class_num=50,
    rare_class_num=25,
    rare_class_threshold=0.1,
)

print(f'total sample count: {total_sample_num}')
print(f'sample count for each land cover stratum: {array_selected_num_pf_preference}')

# one line per stratum, indexed explicitly so a missing stratum still raises IndexError
for i_class, class_name in enumerate(('developed', 'primary wet forest', 'primary dry forest',
                                      'secondary forest', 'shrub/grass', 'water', 'wetland', 'other')):
    print(f'{class_name}: {array_selected_num_pf_preference[i_class]}')


total sample count: 400
sample count for each land cover stratum: [ 25  50  50  86  25  25  25 114]
developed: 25
primary wet forest: 50
primary dry forest: 50
secondary forest: 86
shrub/grass: 25
water: 25
wetland: 25
other: 114


In [4]:
# Primary forest (PF) loss detection accuracy assessment with two strata: [other, PF loss].

# get the count and weight for primary forest loss detection accuracy assessment:
# PF loss = sum of positional columns 3..4 in the first row minus the same sum in the last row
# (presumably the two primary forest columns at the first and last time step -- confirm against the sheet)
count_pf_first_row = df_lc_pct.iloc[0, 3:5].sum()
count_pf_last_row = df_lc_pct.iloc[-1, 3:5].sum()
count_pf_loss = count_pf_first_row - count_pf_last_row

# everything in the first row's TOTAL that is not PF loss
count_other = df_lc_pct.loc[0, 'TOTAL'] - count_pf_loss

array_count_pf_loss = np.array([count_other, count_pf_loss])
array_weight_pf_loss = array_count_pf_loss / np.nansum(array_count_pf_loss)

# target standard error of the estimated overall accuracy
standard_error_est_overall_accu = 0.01
# per-stratum conjectured user's accuracy, presumably ordered like the weights ([other, PF loss]);
# a single shared value such as 0.85 is the alternative
conjecture_user_accuracy = np.array([0.95, 0.7])

total_sample_num = total_sample_num_calculate(
    standard_error_est_overall_accu,
    array_weight_pf_loss,
    conjecture_user_accuracy,
)
array_selected_num = sample_allocation(
    total_sample_num,
    array_weight_pf_loss,
    rare_class_num=100,
    rare_class_threshold=0.1,
)

print(f'total sample count: {total_sample_num}')
print(f'sample count for each stratum: {array_selected_num}')

# one line per stratum, indexed explicitly so a missing stratum still raises IndexError
for i_stratum, stratum_name in enumerate(('other', 'PF loss')):
    print(f'{stratum_name}: {array_selected_num[i_stratum]}')


total sample count: 486
sample count for each stratum: [386 100]
other: 386
PF loss: 100
