### Process stats for multiband image

In [1]:
import ee
import pandas as pd
import geemap
import geopandas as gpd
import modules.tidy_tables as tidy_tables
import json
import numpy as np
import time

from modules.gee_initialize import initialize_ee 

initialize_ee()

from parameters.config_runtime import country_dataset_name
from parameters.config_output_naming import geo_id_column, geometry_area_column, plot_id_column

from parameters.config_lookups import lookup_gee_datasets

## get lists from lookup for different streams of processing
from  dataset_properties.make_processing_lists_from_lookup import buffer_stats_list,\
                                                                  presence_only_flag_list,\
                                                                  country_allocation_stats_only_list, \
                                                                  plot_stats_list, \
                                                                  decimal_place_column_list

Earth Engine has been initialized with the specified credentials.


In [1]:
# %run alt_whisp_prep_multiband_image.ipynb # get stats and image
from modules.multiband_stats import get_stats, get_stats_formatted 

##### TODO: move this function into a Python script
NB: this is for on-the-fly processing, as pandas may be quicker than working with Earth Engine feature collections (FCs).

In [2]:
def process_dataframe(df, geo_id_column, geometry_area_column, country_dataset_name, lookup_gee_datasets,
                      presence_only_flag_list, decimal_place_column_list, plot_id_column, keep_system_index=False, debug=False):
    """Tidy a stats data frame using lists of column names that need different handling.

    Steps, in order:
      1. build the list of leading ("prefix") columns;
      2. reorder columns via ``tidy_tables.reorder_columns_by_lookup`` and drop
         ``country_dataset_name`` if it is present;
      3. turn positive values in ``presence_only_flag_list`` columns into "True",
         everything else into "-";
      4. round ``decimal_place_column_list`` columns to whole numbers (int);
      5. add a 1-based plot id (row index + 1) and use it as the index;
      6. replace underscores in column names with spaces.

    Args:
        df: input data frame of stats.
        geo_id_column: name of the geo id column; only used as a prefix column
            when it exists in ``df``.
        geometry_area_column: name of the area column (always a prefix column).
        country_dataset_name: column dropped from the output when present.
        lookup_gee_datasets: lookup passed straight to
            ``tidy_tables.reorder_columns_by_lookup``.
        presence_only_flag_list: columns converted to "True" / "-" flags.
        decimal_place_column_list: columns rounded and cast to int.
        plot_id_column: name given to the new 1-based id index.
        keep_system_index: when True, rename "system_index" to "system:index"
            and place it first among the prefix columns.
        debug: when True, print the time spent tidying.

    Returns:
        The tidied data frame, indexed by ``plot_id_column``.
    """
    start_time = time.time() if debug else None

    # The area and country columns always lead; the geo id joins them only if present.
    leading_columns = [geometry_area_column, "Country"]
    if geo_id_column in df.columns:
        leading_columns.insert(0, geo_id_column)
    else:
        print(geo_id_column, "column not found, so skipping inclusion")

    # Kept so geo ids from the asset registry can be joined on later.
    if keep_system_index:
        df = df.rename(columns={"system_index": "system:index"})
        leading_columns.insert(0, "system:index")
        print("keeping 'system:index' column (e.g. to allow joining Geo ids to df later etc)")

    reordered = tidy_tables.reorder_columns_by_lookup(
        df, lookup_gee_datasets, 'dataset_order', "dataset_name", leading_columns
    )
    df_out = reordered.drop(country_dataset_name, axis=1, errors='ignore')

    # Presence-only columns: any positive value becomes "True", the rest "-".
    has_positive_value = df_out[presence_only_flag_list] > 0
    df_out[presence_only_flag_list] = np.where(has_positive_value, "True", "-")

    # Whole numbers only for the rounded columns.
    rounded_values = df_out[decimal_place_column_list].round(decimals=0)
    df_out[decimal_place_column_list] = rounded_values.astype(int)

    # Simple 1-based plot id for quick cross referencing (e.g. if shown on a map),
    # used as the index of the result.
    df_out[plot_id_column] = df_out.index + 1
    df_out = df_out.set_index(plot_id_column)

    # Column names are shown with spaces instead of underscores.
    df_out.columns = df_out.columns.str.replace('_', ' ')

    if debug:
        print('...table tidying time:', time.time() - start_time, 'seconds.\n Processing finished')

    return df_out

##### Define the reformatting lists manually here for now, for faster processing (avoids use of parameters/lookup_gee_datasets.csv)

In [3]:

# Manual definitions of the column lists used for reformatting.
# NOTE: this cell reads geo_id_column and geometry_area_column, so the import
# cell at the top of the notebook must have been run first.

# Dataset columns in the order they should appear in the output table.
order = [
    'EUFO_2020',
    'GLAD_Primary',
    'TMF_undist',
    'JAXA_FNF_2020',
    'GFC_TC_2020',
    'GLAD_LULC_2020',
    'ESA_TC_2020',
    'RADD_alerts',
    'TMF_disturbed',
    'TMF_plant',
    'Oil_palm_Descals',
    'Oil_palm_FDaP',
    'Cocoa_ETH',
    'WDPA',
    'KBA',
]

# Datasets reported as presence flags only.
presence_only_list = ['RADD_alerts', 'WDPA', 'KBA']

# The processing cell reads `presence_only_flag_list`, so the manual list has to be
# bound to that name for this override to take effect (as is already done for
# decimal_place_column_list). `presence_only_list` is kept for anything still using it.
presence_only_flag_list = list(presence_only_list)

# Every dataset that is not presence-only gets rounded; deriving the list from
# `order` keeps the two in sync (same 12 names, same order, as the previous literal).
decimal_place_column_list = [name for name in order if name not in presence_only_list]

id_name = "PLOTID"

# Leading columns, followed by the datasets in display order.
prefix_columns_list = [geo_id_column, geometry_area_column, "Country"]
dataset_order_list = prefix_columns_list + order

NameError: name 'geo_id_column' is not defined

In [4]:
st = time.time()

%store -r roi

keep_system_index = True

# print (ee.FeatureCollection(roi).size().getInfo())

if ee.FeatureCollection(roi).size().getInfo()>500:#update this code to depend on a variable? or if memory error?
    print ("Over size limit for on the fly processing. \n Attempting export to Google Drive 'Whisp_results/Whisp_output_table_export.csv' as a batch task... ")
    stats_fc_formatted = get_stats_formatted(
        roi,
        order = dataset_order_list, # not working as reverts to alphabetical
        id_name = id_name,
        flag_positive = presence_only_flag_list,
        round_properties = decimal_place_column_list,
        # exclude_properties = ['GLAD_LULC_2020'],
        select_and_rename = True # underscore removal (slow as relies on get info currently)
        )
    
    task = ee.batch.Export.table.toDrive(collection=stats_fc_formatted, 
                     description="whisp_output_table_export", 
                     folder="Whisp_results",
                     fileFormat="CSV")

    task.start()
    
    df_out =  ("Task running. Check https://code.earthengine.google.com/tasks. You can manually upload to Sepal when complete")
    
    print (df_out)
else:
    print ('...stats processing started')
    
    def add_system_index_as_property_to_feature(feature):
        # Get the system:index of the feature
        system_index = feature.get('system:index')
           
        # Set the 'id' property of the feature
        return feature.set('system_index',system_index)
    
    def add_system_index_as_property_to_fc(feature_col):    
       return feature_col.map(add_system_index_as_property_to_feature)
        
    stats_fc = get_stats(roi)
    
    stats_fc = add_system_index_as_property_to_fc(stats_fc)
    

    
    if keep_system_index:
        stats_fc = add_system_index_as_property_to_fc(stats_fc)
        
    df = geemap.ee_to_df(stats_fc)    
    
    df_out = process_dataframe(df, geo_id_column, geometry_area_column, country_dataset_name, lookup_gee_datasets,
                          presence_only_flag_list, decimal_place_column_list, plot_id_column,keep_system_index=keep_system_index,debug=False)

    print ('...ee to df processing time:', time.time() - st, 'seconds')# get the execution time


NameError: name 'time' is not defined