### Process stats for image collection

In [1]:
import json
import ee
import geemap
import folium ###
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import time
import math ###
import geojson 
from datetime import datetime ###
from sidecar import Sidecar ###

from modules.gee_initialize import initialize_ee 

# Initialise Earth Engine here, before the remaining project modules are imported.
# NOTE(review): presumably some of the modules imported below use the `ee` API at
# import time, which is why this call sits in the middle of the imports — confirm
# before moving it.
initialize_ee()

from modules.json_to_ee import json_to_feature_with_id ###
# from modules.agstack_setup import start_agstack_session ###
import modules.agstack_to_gee as agstack_to_gee ###
import modules.area_stats as area_stats
from modules.country_allocation import find_country_from_modal_stats
import modules.tidy_tables as tidy_tables

from parameters import * # NB for run-specific parameters edit "parameters/config_runtime"

from modules.file_to_ceo import get_ceo_url ###
from modules.image_prep import export_image_collection_to_asset, add_multi_lookup_properties_to_image_collection
from modules.image_prep import add_multi_lookup_properties_to_image_collection
from parameters.config_lookups import lookup_gee_datasets

print ("imports complete")

Earth Engine has been initialized with the specified credentials.
imports complete


In [None]:
# Restore the `roi` variable that was previously persisted with `%store`.
%store -r roi

# Execute the preparation notebook in this kernel.
# NOTE(review): presumably this is what defines `images_iCol_filt` and the other
# names used by later cells — confirm against that notebook.
# NOTE(review): line magics receive the rest of the line verbatim, so the trailing
# "# get stats and image" text is handed to %run as extra arguments rather than
# being treated as a Python comment — confirm this is harmless.
%run whisp_prep_singleband_collection.ipynb # get stats and image

In [None]:
# Rebind `roi` to the collection returned by the area helper; the helper is given
# the name of the area property (`geometry_area_column`) to work with.
roi = area_stats.add_area_hectares_property_to_feature_collection(
    roi,
    geometry_area_column,
)


Calculating zonal statistics for continuous data (e.g. tree cover) within polygon(s)

##### i) Mapping over image collection with reduce regions (creates long format temporary table)

In [13]:
# Record the start time so the elapsed time can be reported in debug mode.
st = time.time()

if debug:
    print ("processing stats...")

# Run the zonal statistics helper for the ROI features over the filtered image collection.
zonal_stats_out = area_stats.zonal_stats_iCol(
    feature_collection=roi,
    image_collection=images_iCol_filt,
    reducer_choice=reducer_choice,
)

# Bring the result client-side as a pandas DataFrame.
# Limit of 5000 rows (unlikely to need more; alternative code exists if required).
df = geemap.ee_to_df(zonal_stats_out)

# Report how long the stats step took.
if debug:
    print ('stats execution time:', time.time() - st, 'seconds')

processing stats...
no buffer stats required
Total execution time: 2.869277238845825 seconds


The lookup table linking country codes to country names (derived from the administrative feature collection) is stored here: scripts: create_country_lookup.py

##### ii) Create lookup tables for country allocation
The approach is based on raster stats: each geometry is assigned the country with which it has the most overlap.



Make an on-the-fly lookup table linking country name to geo id, based on raster stats
- uses rasterised admin layer with admin codes as pixel values
- for each geo id finds most common value in that geometry (i.e. "mode" statistic)

In [14]:
# Restart the timer; later debug timing output is measured from this point.
st = time.time()

# Build the table linking each geo id to a country name.
lookup_geo_id_to_country_names = find_country_from_modal_stats(
    # what to summarise, and how
    roi=roi,
    image_collection=images_iCol_filt,
    reducer_choice=reducer_choice,
    # identifiers used to key and translate the result
    geo_id_column=geo_id_column,
    country_dataset_id=country_dataset_id,
    admin_code_col_name=admin_code_col_name,
    lookup_country_codes_to_names=lookup_country_codes_to_names,
)


##### iii) Reformat results table
- convert areas to percentage cover of plot
- long to wide format
- set presence only flags
- add in country names (using lookup tables) to the final results

In [15]:
# Percentage of each plot covered: "sum" divided by the plot area, times 100.
df["percentage"] = df["sum"].div(df[geometry_area_column]).mul(100)

# Long -> wide: one row per (geo id, area) pair, one percentage column per dataset.
df_wide = df.pivot_table(
    values=['percentage'],
    index=[geo_id_column, geometry_area_column],
    columns=['dataset_name'],
)

# Tidy the headers left over from the pivot (operates in place, nothing to assign).
tidy_tables.tidy_dataframe_after_pivot(df_wide)

# Presence-only columns: any positive value becomes "True", everything else "-".
df_wide[presence_only_flag_list] = np.where(
    df_wide[presence_only_flag_list].gt(0), "True", "-"
)

# Selected columns are rounded to whole numbers and stored as integers.
# NOTE(review): astype(int) raises if any of these columns contains NaN — confirm
# every plot always has a value for each of these datasets.
df_wide[decimal_place_column_list] = (
    df_wide[decimal_place_column_list].round(0).astype(int)
)

# Move the index levels (geo id and area) back into ordinary columns.
df_wide = df_wide.reset_index()

# Plot area is reported to one decimal place.
df_wide[geometry_area_column] = df_wide[geometry_area_column].round(1)

# Attach the country name for each geo id from the lookup table.
df_wide_w_country = pd.merge(
    df_wide,
    lookup_geo_id_to_country_names,
    on=geo_id_column,
)

##### iv) Further reformatting
- reorder columns
- reorder rows to match Geo_ID / ROI order
- add plot id column for easy reference
- remove underscores in column headers

In [16]:
# Columns placed at the start of the table, ahead of the dataset columns.
prefix_columns_list = [geo_id_column, geometry_area_column, "Country"]

# Use the dataset lookup to order the columns, then drop the country dataset
# column if it is present (errors='ignore' makes the drop a no-op otherwise).
df_wide_w_country_reordered_cols = tidy_tables.reorder_columns_by_lookup(
    df_wide_w_country,
    lookup_gee_datasets,
    'dataset_order',
    "dataset_name",
    prefix_columns_list).drop(country_dataset_name, axis=1, errors='ignore')

# Reorder rows to follow the geo id order of the feature collection (only needed
# when there is more than one feature). NB some repetition here with below.
# Avoiding Geopandas for speed of csv production.
if len(df_wide_w_country_reordered_cols) > 1:
    df_wide_w_country_reordered_cols_n_rows = pd.merge(
        geemap.ee_to_pandas(roi)[geo_id_column],
        df_wide_w_country_reordered_cols,
        on=geo_id_column,
        how='inner')
else:
    # Zero or one row: nothing to reorder. This branch was previously missing, so
    # the name below was never assigned and the cell failed with NameError for a
    # single-feature ROI. reset_index(drop=True) returns a new frame with a 0-based
    # index, matching what the merge branch produces.
    df_wide_w_country_reordered_cols_n_rows = (
        df_wide_w_country_reordered_cols.reset_index(drop=True)
    )

# Renaming for clarity.
df_out = df_wide_w_country_reordered_cols_n_rows

# Add a 1-based plot id column for visual cross referencing on map etc., and use it as the index.
df_out[plot_id_column] = df_out.index + 1

df_out.set_index(plot_id_column, inplace=True)

# Keep the column names with underscores for use in the CEO export.
ceo_cols_for_exporting = df_out.columns

# Remove underscores in the displayed column names.
df_out.columns = df_out.columns.str.replace('_', ' ')


In [None]:
# Debug-only timing, measured from wherever `st` was last set.
if debug:
    print ('tidying tables execution time:', time.time() - st, 'seconds')

In [4]:
# Expose the final table under the name `df`, which is the name stored for the CEO notebook.
df=df_out
def whisp_to_ceo(roi,df):
    """Persist `roi` and `df` with %store, run whisp_ceo.ipynb and return `ceo_url`.

    Parameters
    ----------
    roi : object to persist under the stored name "roi".
    df : object to persist under the stored name "df".

    Returns
    -------
    The value of `ceo_url` after the notebook has run.
    NOTE(review): presumably `ceo_url` is defined by whisp_ceo.ipynb — confirm.

    Notes
    -----
    `%store <name>` looks the variable up in the interactive (user) namespace, not
    in the calling function's locals. Using the magics directly inside this
    function therefore stored the notebook-level `roi` and `df` and silently
    ignored the arguments. The arguments are now pushed into the user namespace
    first so that what is stored is what was passed in.
    """
    shell = get_ipython()
    # Make the arguments visible to %store under the expected names.
    shell.push({"roi": roi, "df": df})
    shell.run_line_magic("store", "roi")
    shell.run_line_magic("store", "df")
    shell.run_line_magic("run", "whisp_ceo.ipynb")
    return ceo_url

NameError: name 'df_out' is not defined

In [17]:
# #export output csv
# df_out.to_csv(path_or_buf=out_file_wide,header=True)

# if debug: print ("output csv: ", out_file_wide)
# # df_wide

output csv:  /home/sepal-user/whisp/whisp_output_table.csv
