In [83]:
import requests
import pandas as pd
from pyogrio import read_dataframe
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta
import glob
import json
import numpy as np
import ee
import geemap as gee
import time
import standardize_pft_funcs as spf
import math
from pprint import pprint

In [84]:
try:
    ee.Initialize()
except Exception as e:
    print('GEE token expired. Use Jupyter Lab to authenticate')
    ee.Authenticate()

In [85]:
# parameters
base = '/mnt/poseidon/remotesensing/arctic/data/'
out_path = f'{base}/training/Test_06/fcover/temp'
data_path = f'{base}/vectors/abr/original_data'
temp_path = f'{base}/vectors/abr/standardization_temp'
stand_path = f'{base}/vectors/abr/standardization_info'
harm_path = f'{base}/vectors/abr/harmonized_data'
checklist_path = '/mnt/poseidon/remotesensing/arctic/data/vectors/akveg/AKVEG_species_checklist.csv'
leaf_path = '/mnt/poseidon/remotesensing/arctic/data/vectors/ava_post2000/evergreendecid_macander2022.csv'

---
---
# 1. Data Preparation
---

## 1.1 Load data

In [86]:
# read vegetation fcover as dataframe
file = f'{data_path}/vpi_plot_species_cover_with_trace.csv'
rs_veg_df = pd.read_csv(file)
print(f'Number of plot-species rows: {len(rs_veg_df)}')

Number of plot-species rows: 15866


In [87]:
# read non-vegetation fcover as dataframe
file = f'{data_path}/vpi_plot_summary.csv'
rs_nonveg_df = pd.read_csv(file)
nonveg_top = rs_nonveg_df[['plot_id', 'litter_topcov', 'water_topcov', 'bareground_topcov']]
print(f'Number of plots: {len(nonveg_top)}')

Number of plots: 107


In [88]:
# get ancillary data (lat,lon,etc)
file = f'{data_path}/plot.csv'
ancillary = pd.read_csv(file, index_col=0)
print(f'Non-vegetation table columns: {nonveg_top.columns.tolist()}')
print(f'Vegetation table columns: {rs.columns.tolist()}')

Non-vegetation table columns: ['plot_id', 'litter_topcov', 'water_topcov', 'bareground_topcov']
Vegetation table columns: ['plot_id', 'veg_taxonomy', 'family', 'species_cover', 'vascular', 'plot_sw_digit']


In [90]:
# drop els plots (plots that start with a letter)
# els crew used handheld remote sensing devices,
# which is not the data we want
startswith_digit = [x.isdigit() for x in rs_veg_df['plot_id'].str[0]]
rs_veg_df['plot_sw_digit'] = startswith_digit
rs_veg_df = rs_veg_df[rs_veg_df['plot_sw_digit'] == True]

## 1.2 Extract species information

In [96]:
# create table of unique species
unique_species_df = spf.get_unique_species(DFRAME=rs_veg_df, 
                                           SCOL='veg_taxonomy', 
                                           DNAME='abr_rs', 
                                           SAVE=True, 
                                           OUTP=stand_path)

# load species checklist
checklist_df = read_dataframe(checklist_path)

# get first 2 words (genus-species) from checklist accepted name and data species name
checklist_df['Mapping Name'] = checklist_df['Accepted Name'].apply(spf.get_substrings)
unique_species_df['Mapping Name'] = unique_species_df['veg_taxonomy'].apply(spf.get_substrings)

Saved unique species list to /mnt/poseidon/remotesensing/arctic/data//vectors/abr/standardization_info/abr_rs_unique_species.csv.


## 1.3 Assign species habits

In [98]:
# get potential habits
habits_df = spf.fill_habits(unique_species=unique_species_df, 
                            checklist=checklist_df, 
                            u_name='veg_taxonomy', 
                            c_unofficial_name='Name', 
                            c_official_name='Accepted Name', 
                            mapping_name='Mapping Name',
                            habit='Habit')

161 species are missing habits.
93 species still missing habits.
27 species still missing habits.
25 species still missing habits.


## 1.4 Add leaf retention

In [99]:
# add leaf retention (lr) columns
leaf_retention_df = pd.read_csv(leaf_path, header=None)
leaf_retention_df.columns = ['evergreendecid', 'species']
habits_df_with_lr = spf.add_leaf_habit(habits_df, leaf_retention_df)

# export
habits_df_with_lr.to_csv(f'{temp_path}/abr_species_habit_00.csv')

# habits---some species can have many growth forms and
# thus the string contains a list of habits
habit_list = list(habits_df_with_lr['Potential Habit'].unique())
print(f'List of current potential habits:\n{habit_list}')

List of current potential habits:
['forb', 'graminoid', 'lichen', 'shrub', 'liverwort', 'dwarf shrub', 'forb, dwarf shrub, forb, shrub', 'moss', nan, 'spore-bearing', ' shrub,dwarf shrub']


## 1.5 Export shrubs, nonshrubs, null habit seperately

In [107]:
# get all non-null habit names
nonnull_habit_df = habits_df_with_lr[~habits_df_with_lr['Potential Habit'].isnull()]

In [108]:
# get all shrub species
shrubs_df = nonnull_habit_df[nonnull_habit_df['Potential Habit'].str.contains('shrub')]
shrubs_df.to_csv(f'{temp_path}/abr_shrubs_00.csv')

In [109]:
# get all non-shrub species
nonshrubs_df = nonnull_habit_df[~nonnull_habit_df['Potential Habit'].str.contains('shrub')]
nonshrubs_df.to_csv(f'{temp_path}/abr_nonshrubs_00.csv')

In [110]:
# get null habits
null_habit_df = habits_df_with_lr[habits_df_with_lr['Potential Habit'].isnull()]
null_habit_df.to_csv(f'{temp_path}/abr_nullhabit_00.csv')

In [117]:
print('Total number of rows before separating: ', 
      len(habits_df_with_lr))

Total number of rows before separating:  385


In [119]:
print('Separate null habit, nonshrub habit, shrub habit row sum: ', 
      (len(null_habit_df) + len(nonshrubs_df) + len(shrubs_df)))

Separate null habit, nonshrub habit, shrub habit row sum:  385


---
---
# 2. Manual Cleaning
- Manually fix habits from `shrubs_00` and `nullhabit_00`
    - Consulted with arctic ecologist for shrubs
    - Used web search results for null habits
- Rename cleaned files by suffixing with `_01`
---
---

---
---
# 3. Standardization

#### PFT Schema
---
- veg: lichen = algae, lichen
- veg: bryophyte = liverwort, moss
- veg: graminoid = graminoid
- veg: forb = spore-bearing, forb
- veg: evergreen shrub = evergreen shrubs of all heights
- veg: deciduous shrub = deciduous shrubs of all heights
---
- nonveg: litter = litter + scat
- nonveg: bare ground = bare mineral + crust + bare ground
- nonveg: water = water
---

In [120]:
# get manually adjusted data
nonshrub_path = f'{temp_path}/abr_rs_nonshrubs_00.csv'
nullhabit_path = f'{temp_path}/abr_rs_nullhabit_01.csv'
shrub_path = f'{temp_path}/abr_rs_shrubs_01.csv'

alltypes = [nonshrub_path, shrub_path, nullhabit_path]

dfs = []
for path in alltypes:
    habit_df = pd.read_csv(path, index_col=0)
    dfs.append(habit_df)

In [121]:
# show all habits after habit names have been adjusted manually
clean_habits_df = pd.concat(dfs)
clean_habits_df['Habit'].unique().tolist()

['forb',
 'graminoid',
 'lichen',
 'liverwort',
 'moss',
 'spore-bearing',
 'shrub',
 'dwarf shrub',
 'shrub, tree',
 'bare ground',
 'bare mineral',
 'crust',
 'litter',
 'scat',
 'algae',
 'fungus',
 'water']

## 3.1 Standardize habit (PFT) names

In [122]:
# definition that aggregates habits into parent habits
# this is abr-specific, which is why it's not in the SPF module
def standardize_habit(habits, leaftypes, heights):
    
    """
    Description
    -----------
    Reads habit names and assigns a standardized name. E.g. Habits
    called 'moss' or 'liverwort' will be standardized to 'bryophyte'.
    Column names will have the suffix 'cover (%)'
    
    Paramters
    -----------
    habits     : [pandas column] dataframe selection 
                 with orignal habit names
    leaftypes  : [pandas column] dataframe selection 
                 with leaf retention type
    heights    : [pandas column] dataframe selection
                 with potential height information
    
    Returns
    -----------
    List of new habit names associated with each original habit name
    
    """
    
    habits = habits.to_numpy()
    leaftypes = leaftypes.to_numpy()
    heights = heights.to_numpy()
    new_habits = []
    
    for habit, leaf, height in zip(habits, leaftypes, heights):
        habit = habit.lower()
        if 'algae' in habit:
            new_habit = 'lichen cover (%)'
        elif habit == np.nan or 'unknown' in habit:
            new_habit = 'unknown cover (%)'
        elif 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte cover (%)'
        elif 'spore-bearing' in habit:
            new_habit = 'forb cover (%)'
        elif habit == 'grass':
            new_habit = 'graminoid cover (%)'
            
        elif habit == 'shrub, tree':
            new_habit = f'{leaf} dwarf to tree cover (%)'
        elif habit == 'dwarf shrub':
            new_habit = f'{leaf} {habit} cover (%)'
        elif 'shrub' in habit:
            new_habit = f'{leaf} {height} cover (%)'
            
        elif 'litter' in habit or 'scat' in habit:
            new_habit = 'litter cover (%)'
        elif 'bare ground' in habit or 'crust' in habit or 'mineral' in habit:
            new_habit = 'bare ground cover (%)'
        else:
            new_habit = f'{habit} cover (%)'
        new_habits.append(" ".join(new_habit.split()).lower())
    
    return new_habits

In [123]:
# standardize habit names
standard_habit_cols_list = standardize_habit(clean_habits_df['Habit'], 
                                             clean_habits_df['Leaf Retention'], 
                                             clean_habits_df['Height'])

# add standardized habit names as column
clean_habits_df['Standard Habit'] = standard_habit_cols_list

# export
file = f'{stand_path}/abr_rs_species_habit_standardized.csv'
clean_habits_df.to_csv(file)

## 3.2 Join PFT (habit) to fcover data

In [124]:
# add vegetation fcover and sum to PFT level
# this is abr-specific, which is why it's not in the SPF module
def add_cover(cover_df, pft_groups, habit_col):
    
    """
    Description
    -----------
    Joins standardized PFT names with fcover data via species name.
    For every plot, aggregate habits by summing fcover values.
    
    Parameters
    -----------
    cover_df   : [pandas dataframe] dataframe with species rowwise 
                 and plots columnwise
    pft_groups : [pandas dataframe] dataframe with species rowwise and
                 a standard PFTs column
    habit_col  : [string] name of column in pft_groups that have 
                 standardized PFT names
    
    Returns
    -----------
    Multi-index pandas dataframe with plots (idx0) and associated habit fcovers (idx1)
    """
    
    cover = cover_df.reset_index()
    withfcover = cover.merge(pft_groups, 
                             left_on='veg_taxonomy', 
                             right_on='Name', 
                             how='left')
    grouped = withfcover.groupby(['plot_id', habit_col], 
                                 group_keys=True).agg({'species_cover': 'sum'})
    return grouped

In [125]:
# create standardized table layout
grouped_standard = add_cover(rs_veg_df, clean_habits_df, 'Standard Habit')
grouped_standard = spf.flatten_multilevel(grouped_standard)
grouped_standard = spf.transpose_df(grouped_standard, 'Standard Habit')
standard_groups = spf.add_standard_cols(grouped_standard)

## 3.3 Read ancillary table

In [128]:
# add ancillary information
anc_df = ancillary.copy()
anc_df.rename(columns={'plot_id':'Site Code'}, inplace=True)
# year
anc_df['year'] = pd.DatetimeIndex(anc_df['field_start_ts']).year
anc_df['year'] = anc_df['year'].astype('Int64')
# plot size -- ABR is all 55m
anc_df['plot_radius_m'] = '55'
# dataset source
anc_df['source'] = 'ABR_RS'
# subselect ancillary columns of interest
anc_df = anc_df[['Site Code', 'latitude', 'longitude', 'year', 'plot_radius_m', 'source']]
# print number of rows
print('Number of ancillary rows: ', len(anc_df))

Number of ancillary rows:  913


## 3.4 Create top cover table

In [137]:
# rename columns and set index
topcover = nonveg_top.copy()
topcover.rename(columns={'litter_topcov':'litter total cover (%)',
                         'water_topcov':'water top cover (%)',
                         'bareground_topcov':'bare ground top cover (%)',
                         'plot_id':'Site Code'}, inplace=True)
topcover1 = topcover.drop(columns=['litter total cover (%)'])
topcover1.set_index('Site Code', inplace=True)
topcover1 = topcover1[sorted(topcover1.columns.tolist())]

In [138]:
# add ancillary
topcover1 = topcover1.merge(anc_df, on='Site Code', how='left')
topcover1.set_index('Site Code', inplace=True)

In [139]:
# optional export
file = f'{harm_path}/abr_rs_fcover_top_standardized.csv'
topcover1.to_csv(file)

In [140]:
print('Number of top cover rows: ', len(topcover1))

Number of top cover rows:  107


## 3.5 Create total cover file (aggregated habits)

In [141]:
# set index and sort columns
totcover = standard_groups.copy()
totcover.rename(columns={'plot_id':'Site Code'}, inplace=True)
totcover.drop(columns=['fungus cover (%)'], inplace=True)
totcover.set_index('Site Code', inplace=True)
totcover.columns.name = None

# add litter
litter = topcover[['litter total cover (%)']]
litter2 = litter.copy()
litter2 = litter2.rename(columns={'litter total cover (%)':'litter cover (%)'}, inplace=True)
totcover = pd.concat([totcover, litter2], axis=1)
totcover = totcover[sorted(totcover.columns.tolist())]

In [142]:
# add "total cover" to column names
cols = totcover.columns.tolist()
cols = [x.replace('cover (%)' , 'total cover (%)') for x in cols]
totcover.columns = cols

In [143]:
# add ancillary
totcover = totcover.merge(anc_df, on='Site Code', how='left')

In [144]:
totcover.set_index('Site Code', inplace=True)
totcover.drop(columns=['water total cover (%)', 'bare ground total cover (%)'], inplace=True)

In [146]:
print('Number of total cover rows: ', len(totcover))

Number of total cover rows:  107


In [147]:
# export
file = f'{harm_path}/abr_rs_fcover_total_standardized.csv'
totcover.to_csv(file)

## 3.7 Creat combined top and total cover file (aggregated habits)

In [148]:
topcover.set_index('Site Code', inplace=True)

In [149]:
allcols = pd.concat([totcover, topcover], axis=1)
cover = sorted([col for col in allcols.columns if 'cover (%)' in col])
other = sorted([col for col in allcols.columns if 'cover (%)' not in col])
c = allcols[cover]
o = allcols[other]
allcols_final = pd.concat([c, o], axis=1)
allcols_final = allcols_final.loc[:,~allcols_final.columns.duplicated()].copy()

In [150]:
# ABR plots are not near each other, so they are not
# spatially aggregated. Regardless, I created a "parent"
# and "child" file for consistency with plots that DO spatially
# aggregate (aka, are within 55m of each other)
aggregated_PFT_fcover = spf.agg_to_pft_schema(allcols_final)
p = 'abr_fcover_parent.csv'
c = 'abr_fcover_child.csv'
aggregated_PFT_fcover.to_csv(f'{harm_path}/{p}')
aggregated_PFT_fcover.to_csv(f'{harm_path}/{c}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_shrub['evergreen shrub total cover (%)'] = e_shrub.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_shrub['deciduous shrub total cover (%)'] = d_shrub.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_tree['evergreen tree total cover (%)'] = e_tree.sum(axis=1)
A value is tr