In [1]:
import requests
import pandas as pd
from pyogrio import read_dataframe
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta
import glob
import json
import numpy as np
import ee
import geemap as gee
import time
import standardize_pft_funcs as spf
import geopandas as gpd
import chardet
import os

In [2]:
try:
    ee.Initialize()
except Exception as e:
    print('GEE token expired. Use Jupyter Lab to authenticate')
    ee.Authenticate()

## Load data

In [3]:
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NGEE_SP'
spp_csvs = sorted(glob.glob(f'{p}/*_spp.csv'))

In [4]:
dfs = []
for csv_path in spp_csvs:
    #print(csv_path)
    with open(csv_path, 'rb') as f:
        result = chardet.detect(f.read())
        df = pd.read_csv(csv_path, encoding=result['encoding'])
        dfs.append(df)

In [5]:
# ancillary
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NGEE_SP/'
f = 'ngee_arctic_seward_peninsula_ancillary.csv'
ancillary = pd.read_csv(p+f)
ancillary.drop(columns=['TID'], inplace=True)
ancillary.dropna(inplace=True) #remove sites with missing ancillary data

## Get species info

In [6]:
# get accepted names (first column) from each csv
spp_names = []
for df in dfs: 
    spp_name = df.iloc[:, 0] # first col with author
    spp_names.append(spp_name)

In [7]:
# concatenate all spp names into one dataframe
all_spp_names = pd.concat(spp_names, ignore_index=True)
all_spp_names = all_spp_names.dropna().reset_index(drop=True)
all_spp_names = pd.DataFrame(all_spp_names)
all_spp_names.columns = ['Accepted Name']

In [8]:
# get unique species names
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NGEE_SP'
species = spf.get_unique_species(all_spp_names, 'Accepted Name', 'NGEE_SP', True, p)

Saved unique species list to /mnt/poseidon/remotesensing/arctic/data/vectors/NGEE_SP/NGEE_SP_unique_species.csv.


## Extract potential habits for each species

In [9]:
# load species checklist
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'
f = 'AKVEG_species_checklist.csv'
checklist = read_dataframe(p + f)

# get first 2 words (genus-species) from checklist accepted name and data species name
checklist['Mapping Name'] = checklist['Accepted Name'].apply(spf.get_substrings)
species['Mapping Name'] = species['Accepted Name'].apply(spf.get_substrings)

In [10]:
habits = spf.fill_habits(unique_species=species, 
                         checklist=checklist, 
                         u_name='Accepted Name', 
                         c_unofficial_name='Name', 
                         c_official_name='Accepted Name', 
                         mapping_name='Mapping Name',
                         habit='Habit')

96 species are missing habits.
60 species still missing habits.
21 species still missing habits.
19 species still missing habits.


## Add leaf retention column

In [11]:
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AK-AVA_post2000/'
f = 'evergreendecid_macander2022.csv'
evergrndecid = pd.read_csv(p + f, header=None)
evergrndecid.columns = ['evergreendecid', 'species']
final = spf.add_leaf_habit(habits, evergrndecid)

In [12]:
# export
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NGEE_SP/'
f = 'NGEE_SP_species_habit_00.csv'
final.to_csv(p + f)

In [13]:
list(final['Potential Habit'].unique())

[nan,
 'forb',
 'lichen',
 'shrub',
 'dwarf shrub',
 'graminoid',
 'moss',
 'liverwort',
 ' forb,shrub, multiple',
 'spore-bearing',
 'shrub, shrub, dwarf shrub',
 'coniferous tree',
 'dwarf shrub, shrub']

In [14]:
# get all shrub species
finalnonnull = final[~final['Potential Habit'].isnull()]
shrubs = finalnonnull[finalnonnull['Potential Habit'].str.contains('shrub')]
shrubs.to_csv(p + 'NGEE_SP_shrubs_00.csv')

In [15]:
# get all non-shrub species
nonshrubs = finalnonnull[~finalnonnull['Potential Habit'].str.contains('shrub')]
nonshrubs.to_csv(p + 'NGEE_SP_nonshrubs_00.csv')

In [16]:
# get null habits
finalnull = final[final['Potential Habit'].isnull()]
finalnull.to_csv(p + 'NGEE_SP_nullhabit_00.csv')

In [17]:
len(finalnull) + len(nonshrubs) + len(shrubs)

266

In [18]:
len(final)

266

## Manual editing on shrubs_00 and nullhabit_00

## Standardization

In [19]:
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NGEE_SP/'
nonshrub = 'NGEE_SP_nonshrubs_01.csv'
null = 'NGEE_SP_nullhabit_01.csv'
shrub = 'NGEE_SP_shrubs_01.csv'

alltypes = [nonshrub, shrub, null]

d = []
for t in alltypes:
    habits = pd.read_csv(p + t, index_col=0)
    d.append(habits)

In [20]:
completehabits = pd.concat(d)
completehabits['Habit'].unique().tolist()

['forb',
 'lichen',
 'graminoid',
 'moss',
 'liverwort',
 'spore-bearing',
 'coniferous tree',
 'shrub',
 'dwarf shrub',
 'bryophyte',
 'bare soil',
 'litter',
 'unknown',
 'rock',
 'water']

In [21]:
def standardize_habit_nonstandard(habitrow, leafrow, heightrow):
    
    habitrow = habitrow.to_numpy()
    leafrow = leafrow.to_numpy()
    heightrow = heightrow.to_numpy()
    new_row = []
    
    for habit, leaf, height in zip(habitrow, leafrow, heightrow):
        habit = habit.lower()
        if habit == 'coniferous tree':
            new_habit = f'{leaf} tree cover (%)'
        elif habit == 'dwarf shrub':
            new_habit = f'{leaf} {habit} cover (%)'
        elif 'shrub' in habit:
            new_habit = f'{leaf} {height} cover (%)'
        elif 'tree' in habit:
            new_habit = f'{leaf} tree cover (%)'
        else:
            new_habit = f'{habit} cover (%)'
        new_row.append(" ".join(new_habit.split()).lower())
    
    return new_row

In [22]:
def standardize_habit(habitrow, leafrow, heightrow):
    
    habitrow = habitrow.to_numpy()
    leafrow = leafrow.to_numpy()
    heightrow = heightrow.to_numpy()
    new_row = []
    
    for habit, leaf, height in zip(habitrow, leafrow, heightrow):
        habit = habit.lower()
        if 'algae' in habit or 'cyanobacteria' in habit:
            new_habit = 'lichen cover (%)'
        elif habit == np.nan or 'unknown' in habit:
            new_habit = 'unknown cover (%)'
        elif 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte cover (%)'
        elif 'spore-bearing' in habit:
            new_habit = 'forb cover (%)'
        elif habit == 'grass':
            new_habit = 'graminoid cover (%)'
            
        elif habit == 'shrub, tree':
            new_habit = f'{leaf} dwarf to tree cover (%)'
        elif 'tree' in habit:
            new_habit = f'{leaf} tree cover (%)'
        elif habit == 'dwarf shrub':
            new_habit = f'{leaf} {habit} cover (%)'
        elif 'shrub' in habit:
            new_habit = f'{leaf} {height} cover (%)'
            
        elif 'litter' in habit or 'scat' in habit:
            new_habit = 'litter cover (%)'
        elif 'rock' in habit or 'bare soil' in habit:
            new_habit = 'bare ground cover (%)'
        else:
            new_habit = f'{habit} cover (%)'
        new_row.append(" ".join(new_habit.split()).lower())
    
    return new_row

In [23]:
# standardized PFT
standard_habit_col = standardize_habit(completehabits['Habit'], 
                                       completehabits['Leaf Retention'], 
                                       completehabits['Height'])

In [24]:
# non-standardized PFT
nonstandard_habit_col = standardize_habit_nonstandard(completehabits['Habit'], 
                                                      completehabits['Leaf Retention'],
                                                      completehabits['Height'])

In [25]:
completehabits['Standard Habit'] = standard_habit_col
completehabits['NonStandard Habit'] = nonstandard_habit_col

In [26]:
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NGEE_SP/'
f = 'NGEE_SP_species_habit_standardized.csv'
completehabits.to_csv(p + f)

## Clean up fcover data

In [27]:
spp = spp_csvs

In [28]:
def get_fcover_percent(spp, habit_name):
    
    plot_data = []
    names = [os.path.basename(x) for x in spp]
    convs = []
    for n in names:
        conv = n.split('_')[-2]
        convs.append(conv)
    finalhabits = completehabits[['Name', habit_name]]
    
    
    for s, conv in zip(spp, convs):

        # read species data
        with open(s, 'rb') as f:
            result = chardet.detect(f.read())
            df = pd.read_csv(s, encoding=result['encoding'], header=1)
        print(habit_name, s, end='\x1b[1K\r')

        # rename columns
        cols = df.columns.to_numpy()
        cols[0] = 'Name'
        cols[1] = 'Name NoAuthor'
        cols[2] = 'Study Name'
        df.columns = cols

        # select plot TIDs
        tid_cols = df.columns.to_list()[3:]

        # get habit for each species in file
        plotlevel_habit = df.merge(finalhabits, left_on='Name', right_on='Name') # survey plot names will be dropped here

        # br-bq to percentage conversion dict
        bb_dict = {'r': 0.05, '+': 0.55, '1': 3.0, '2': 15.0, '3': 37.5, '4': 62.5, '5': 87.5, '-9': np.nan}
        # Westoff converstion dict
        wv_dict = {'1': 0.05, '2': 1.0, '3': 2.5, '4': 4.5, '5': 9.0, '6': 18.5, '7': 37.0, '8': 62.0, '9': 87.5}
        # Hult-Serander conversion dict
        hs_dict = {'+': 0.5, '1': 4.0, '2': 9.0, '3': 18.0, '4': 38.0, '5': 63.0, '6': 87.0}

        # convert wv codes
        if conv == 'wv':

            # replace westoff-van den maarel code with percentage
            cols = plotlevel_habit.columns.to_list()
            plotlevel_habit = plotlevel_habit.astype(str)
            for col in cols:
                plotlevel_habit.replace({col: wv_dict}, inplace=True)

        # covert br-bl codes        
        elif conv == 'brbl':

            # replace br-bl code with percentage
            cols = plotlevel_habit.columns.to_list()
            plotlevel_habit = plotlevel_habit.astype(str)
            for col in cols:
                plotlevel_habit.replace({col: bb_dict}, inplace=True)

        elif conv == 'hs':

            # replace hs code with percentage
            cols = plotlevel_habit.columns.to_list()
            plotlevel_habit = plotlevel_habit.astype(str)
            for col in cols:
                plotlevel_habit.replace({col: hs_dict}, inplace=True)

        # if already percent
        else:

            plotlevel_habit = plotlevel_habit.astype(str)

        # clean % data
        plot_pft = plotlevel_habit.drop(columns=['Name NoAuthor', 'Name', 'Study Name'])
        plot_pft = plot_pft.loc[:, plot_pft.columns != habit_name].replace(' ', '0', regex=True)
        plot_pft = plot_pft.loc[:, plot_pft.columns != habit_name].replace('', '0', regex=True)
        plot_pft = plot_pft.loc[:, plot_pft.columns != habit_name].replace('\*', '0', regex=True)
        plot_pft = plot_pft.loc[:, plot_pft.columns != habit_name].astype(float)

        # add back the standard habit column
        plot_pft[habit_name] = plotlevel_habit[habit_name] # replace codes with new fcover %

        # loop through each plot and sum fcover per pft
        final = plot_pft.groupby(by=[habit_name]).sum().T
        final.index = tid_cols
        plot_data.append(final)
        
    return plot_data

In [29]:
sf = get_fcover_percent(spp, 'Standard Habit')
nsf = get_fcover_percent(spp, 'NonStandard Habit')

NonStandard Habit /mnt/poseidon/remotesensing/arctic/data/vectors/NGEE_SP/ngee_arctic_seward_peninsula_teller_perc_spp.csv[1K1K

In [30]:
standard_fcover = pd.concat(sf, axis=0)
nonstandard_fcover = pd.concat(nsf, axis=0)

In [31]:
standard_fcover.index.name = 'Site Code'
standard_fcover.index = standard_fcover.index.astype(str)
standard_fcover.columns.name = None
nonstandard_fcover.index.name = 'Site Code'
nonstandard_fcover.index = nonstandard_fcover.index.astype(str)
nonstandard_fcover.columns.name = None

In [32]:
def add_standard_cols(groups):
    
    # required columns
    necessary_cols = ['deciduous dwarf shrub cover (%)',
                      'deciduous dwarf to low shrub cover (%)',
                      'deciduous dwarf to tall shrub cover (%)',
                      'deciduous dwarf to tree cover (%)',
                      'deciduous tree cover (%)',
                      'evergreen dwarf shrub cover (%)',
                      'evergreen dwarf to low shrub cover (%)',
                      'evergreen dwarf to tall shrub cover (%)',
                      'evergreen dwarf to tree cover (%)',
                      'evergreen tree cover (%)',
                      'bryophyte cover (%)',
                      'forb cover (%)',
                      'graminoid cover (%)',
                      'lichen cover (%)']
    
    # add missing columns and fill with nan
    cols = groups.columns.tolist()
    addcols = []
    for nc in necessary_cols:
        if nc not in cols:
            addcols.append(nc)
    groups[addcols] = np.nan
    return groups

In [33]:
standard_fcover = add_standard_cols(standard_fcover)

In [37]:
len(standard_fcover)

101

## ancillary

In [34]:
anc = ancillary.copy()
anc.drop(columns=['date'], inplace=True)
anc = anc[['Site Code', 'latitude', 'longitude', 'year', 'plot_radius_m', 'source']]
anc = anc.dropna() #remove sites with missing ancillary data
anc.set_index('Site Code', inplace=True)
anc.index = anc.index.astype(str)

In [38]:
len(anc)

98

## top cover non-veg

In [39]:
topcover = standard_fcover.copy()
topcover.columns.name = None
topcover = topcover[sorted(topcover.columns.tolist())]

In [40]:
# add "total cover" to column names
cols = topcover.columns.tolist()
cols = [x.replace('cover (%)' , 'total cover (%)') for x in cols]
topcover.columns = cols

In [41]:
# assume non-veg is total cover
topcover.rename(columns={'bare ground total cover (%)':'bare ground top cover (%)',
                         'litter total cover (%)':'litter total cover (%)',
                         'water total cover (%)':'water top cover (%)'},
              inplace=True)
# join topcover and ancillary
topcover.index = topcover.index.astype(str)
topcover = topcover.groupby(lambda x:x, axis=1).sum()
topcover = pd.concat([topcover, anc], axis=1)
# drop water = 100%
topcover = topcover[topcover['water top cover (%)'] != 100]
cols = [col for col in topcover.columns if not 'cover (%)' in col]
# re-create ancillary without 100% water cols
anc = topcover[cols]
# finish top cover
cols = [col for col in topcover.columns if not 'total cover (%)' in col]
topcover1 = topcover[cols]
topcover1 = topcover1.dropna(how='all', subset=['latitude', 'longitude'])

In [43]:
len(topcover1)

98

In [44]:
# export
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'NGEE_SP_fcover_top_nonPFT.csv'
topcover1.to_csv(p + f)

## total cover (not merged)

In [45]:
# set index and sort columns
totcover_nm = nonstandard_fcover.copy()
totcover_nm.columns.name = None
totcover_nm = totcover_nm[sorted(totcover_nm.columns.tolist())]

In [46]:
# add "total cover" to column names
cols = totcover_nm.columns.tolist()
cols = [x.replace('cover (%)' , 'total cover (%)') for x in cols]
totcover_nm.columns = cols

In [47]:
# assume non-veg is total cover
totcover_nm.rename(columns={'bare soil total cover (%)':'bare ground top cover (%)',
                            'rock total cover (%)':'rock top cover (%)',
                            'litter total cover (%)':'litter total cover (%)',
                            'water total cover (%)':'water top cover (%)'},
              inplace=True)
# join topcover and ancillary
totcover_nm.index = totcover_nm.index.astype(str)
totcover_nm = pd.concat([totcover_nm, anc], axis=1)
# drop water = 100%
totcover_nm = totcover_nm[totcover_nm['water top cover (%)'] != 100]

In [48]:
# drop non-veg
cols = [col for col in totcover_nm.columns if not 'top cover (%)' in col]
totcover_nm = totcover_nm[cols]

In [49]:
totcover_nm = totcover_nm.dropna(how='all')
totcover_nm.drop(columns=['unknown total cover (%)'], inplace=True)
totcover_nm = totcover_nm.dropna(how='all', subset=['latitude', 'longitude'])

In [50]:
# export
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'NGEE_SP_fcover_total_orig.csv'
totcover_nm.to_csv(p + f)

In [51]:
len(totcover_nm)

98

## Create total cover file (merged)

In [54]:
# set index and sort columns
totcover = topcover.copy()

In [55]:
# get total cover from topcover df
cols = [col for col in totcover.columns if not 'top cover (%)' in col]
totcover = totcover[cols]
# remove things
totcover = totcover.dropna(how='all')
totcover.drop(columns=['unknown total cover (%)'], inplace=True)
totcover = totcover.dropna(how='all', subset=['latitude', 'longitude'])

In [56]:
# export
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'NGEE_SP_fcover_total_merged.csv'
totcover.to_csv(p + f)

In [58]:
len(totcover)

98

## Create all cover file for test 05

In [59]:
allcover = topcover.copy()
# remove things
allcover = allcover.dropna(how='all')
allcover.drop(columns=['unknown total cover (%)'], inplace=True)
allcover = allcover.dropna(how='all', subset=['latitude', 'longitude'])

In [60]:
# export
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_05/temp_fcover/'
f = 'SP_fcover_child.csv'
allcover.to_csv(p + f)

## Aggregate child plots

In [61]:
# load total cover
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_05/temp_fcover/'
f = 'SP_fcover_child.csv'
akava = pd.read_csv(p + f)

# only plots with very small sizes need to be processed, 
# the other remains the same, discarding all plots earlier than 2010
akava['plot_radius_m'].astype(float)
akava_small_fcover = akava[(akava['plot_radius_m']<10) &  (akava['year']>=2010)]
akava_large_fcover = akava[(akava['plot_radius_m']>=10) &  (akava['year']>=2010)]

In [62]:
len(akava_small_fcover)

98

In [63]:
selected_columns = ['Site Code', 'year', 'latitude', 'longitude', 'source', 'plot_radius_m']
akava_small = akava_small_fcover[selected_columns]
# akava_small = akava.fillna(0)
akava_small.head()

Unnamed: 0,Site Code,year,latitude,longitude,source,plot_radius_m
0,CL_EL1_Vg_Comp,2018.0,64.84525,-163.782758,SP,1.25
1,CL_EL2_Vg_Comp,2018.0,64.845515,-163.784122,SP,1.25
2,CL_PMRB1_Vg_Comp,2018.0,64.85961,-163.69164,SP,1.25
3,CL_PMRB2_Vg_Comp,2018.0,64.859995,-163.687948,SP,1.25
4,CL_PMRB3_Vg_Comp,2018.0,64.856385,-163.692985,SP,1.25


In [64]:
#### transform geographic to utm so that distance caculation is more intuitive
from pyproj import Transformer
src_crs = "EPSG:4326"
target_crs = "EPSG:32606"
transformer = Transformer.from_crs(src_crs, target_crs)

lon = akava_small['longitude'].to_numpy()
lat = akava_small['latitude'].to_numpy()

#### store the projected coords
projcoords = []   
for i in range(0,akava_small.shape[0]):
    xcoord, ycoord = lon[i],lat[i]
    projcoords.append(transformer.transform(ycoord,xcoord))

In [65]:
#### calculate the distance matrix of all small plots (for examination)
from scipy.spatial.distance import cdist
coord = np.array(projcoords)
dist_mat = cdist(coord, coord, 'euclidean')
# dist_mat[:3,:] <= 60

In [66]:
### group pixels/plots based on their euclidean distance
def group_pixels_by_distance(pixel_data, distance_threshold):
    """
        pixel_data: ndarray of coordinate pair: n by 2, default is utm projection
        distance_threshold: threshold used for grouping, default is 60m
        
        return:
        a list of values indicating the group id of each pixel
    
    """
    cluster_id = 0
    pixel_clusters = {}
   
    def expand_cluster(pixel, cluster_id):
        if pixel_clusters.get(cluster_id) is None:
            pixel_clusters[cluster_id] = []
       
        pixel_clusters[cluster_id].append(pixel)

    cluster_array = np.full(len(pixel_data), -1)  # Initialize with -1 (unassigned)
   
    for i, pixel in enumerate(pixel_data):
        assigned = False
       
        for c_id, cluster_pixels in pixel_clusters.items():
            cluster_pixels = np.array(cluster_pixels)
            distances = np.linalg.norm(cluster_pixels - pixel, axis=1)
            if np.any(distances <= distance_threshold):
                expand_cluster(pixel, c_id)
                assigned = True
                cluster_array[i] = c_id
                break
       
        if not assigned:
            expand_cluster(pixel, cluster_id)
            cluster_array[i] = cluster_id
            cluster_id += 1

    return cluster_array

dist_thres = 55
coord = np.array(projcoords)  ## UTM coords, unit is meter
group = group_pixels_by_distance(coord, dist_thres)

In [67]:
### add the group id to df for aggregation
akava_small['group_id'] = group
akava_small.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  akava_small['group_id'] = group


Unnamed: 0,Site Code,year,latitude,longitude,source,plot_radius_m,group_id
0,CL_EL1_Vg_Comp,2018.0,64.84525,-163.782758,SP,1.25,0
1,CL_EL2_Vg_Comp,2018.0,64.845515,-163.784122,SP,1.25,1
2,CL_PMRB1_Vg_Comp,2018.0,64.85961,-163.69164,SP,1.25,2
3,CL_PMRB2_Vg_Comp,2018.0,64.859995,-163.687948,SP,1.25,3
4,CL_PMRB3_Vg_Comp,2018.0,64.856385,-163.692985,SP,1.25,4


In [74]:
groups = akava_small.groupby(['group_id', 
                              'year', 'source']).agg({'latitude':'mean',
                                            'longitude':'mean',
                                            'Site Code':list,
                                            'plot_radius_m':list})

In [75]:
def get_plot_size(rowlst):
    if len(rowlst) == 1:
        val = rowlst[0]
    else:
        val = 55
    return val
groups['plot_radius_m'] = groups['plot_radius_m'].apply(lambda row: get_plot_size(row))

In [76]:
groups2 = groups.explode('Site Code')

In [77]:
groups2.reset_index(inplace=True)
groups2['parent_id'] = (groups2['group_id'].astype(str) + '_' + 
                        groups2['year'].astype(int).astype(str) + '_' +
                        groups2['source'])
groups2.rename(columns={'latitude':'parent_latitude',
                        'longitude':'parent_longitude'},
               inplace=True)
groups2.drop(columns=['group_id', 'year', 'source'], inplace=True)

In [78]:
groups2.set_index('Site Code', inplace=True)
asf = akava_small_fcover.set_index('Site Code')
asf.drop(columns=['plot_radius_m'], inplace=True)
joined = pd.concat([asf, groups2], axis=1)
joined.columns

Index(['bare ground top cover (%)', 'bryophyte total cover (%)',
       'deciduous dwarf shrub total cover (%)',
       'deciduous dwarf to low shrub total cover (%)',
       'deciduous dwarf to tall shrub total cover (%)',
       'deciduous dwarf to tree total cover (%)',
       'deciduous tree total cover (%)',
       'evergreen dwarf shrub total cover (%)',
       'evergreen dwarf to low shrub total cover (%)',
       'evergreen dwarf to tall shrub total cover (%)',
       'evergreen dwarf to tree total cover (%)',
       'evergreen tree total cover (%)', 'forb total cover (%)',
       'graminoid total cover (%)', 'lichen total cover (%)',
       'litter total cover (%)', 'water top cover (%)', 'latitude',
       'longitude', 'year', 'source', 'parent_latitude', 'parent_longitude',
       'plot_radius_m', 'parent_id'],
      dtype='object')

In [89]:
info_cols = ['parent_latitude', 'parent_longitude', 'plot_radius_m']
data_cols = [col for col in joined.columns if 'cover (%)' in col]
anci_cols = set(joined.columns) - set(data_cols) - set(info_cols)
info_dict = dict.fromkeys(info_cols, 'mean')
data_dict = dict.fromkeys(data_cols, 'mean')
anci_dict = dict.fromkeys(anci_cols, set)
d = {**data_dict, **info_dict, **anci_dict}

parent_plots = joined.groupby('parent_id').agg(d)

In [90]:
parent_plots.drop(columns=['latitude', 'longitude', 'parent_id', 
                           'source'],
                 inplace=True)

In [91]:
parent_plots['year'] = parent_plots['year'].explode()
parent_plots['source'] = 'NGEE_SP'
parent_plots.rename(columns={'Site Code':'child_site_codes',
                             'parent_longitude':'longitude',
                             'parent_latitude':'latitude'}, 
                    inplace=True)
parent_plots.index.name = 'Site Code'

In [92]:
parent_plots['year'] = parent_plots['year'].astype(int)

In [93]:
parent_plots

Unnamed: 0_level_0,bare ground top cover (%),bryophyte total cover (%),deciduous dwarf shrub total cover (%),deciduous dwarf to low shrub total cover (%),deciduous dwarf to tall shrub total cover (%),deciduous dwarf to tree total cover (%),deciduous tree total cover (%),evergreen dwarf shrub total cover (%),evergreen dwarf to low shrub total cover (%),evergreen dwarf to tall shrub total cover (%),...,forb total cover (%),graminoid total cover (%),lichen total cover (%),litter total cover (%),water top cover (%),latitude,longitude,plot_radius_m,year,source
Site Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0_2018_SP,1.0,9.0,3.0,12.0,0.0,0.0,0.0,49.0,2.0,0.0,...,0.0,3.0,48.0,2.0,0.0,64.845250,-163.782758,1.25,2018,NGEE_SP
10_2018_SP,0.0,53.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,44.0,0.0,5.0,0.0,64.859385,-163.703835,1.25,2018,NGEE_SP
11_2018_SP,0.0,30.0,0.0,70.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,63.0,0.0,4.0,10.0,64.858285,-163.712740,1.25,2018,NGEE_SP
12_2018_SP,0.0,3.0,0.0,37.0,0.0,0.0,0.0,17.0,5.0,0.0,...,1.0,3.0,122.0,3.0,0.0,64.857520,-163.693148,1.25,2018,NGEE_SP
13_2018_SP,0.0,6.0,0.0,26.0,0.0,0.0,0.0,9.0,7.0,0.0,...,0.0,2.0,99.0,5.0,0.0,64.860790,-163.698248,1.25,2018,NGEE_SP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83_2017_SP,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,64.743175,-165.963990,1.25,2017,NGEE_SP
84_2017_SP,0.0,48.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,64.729570,-165.944735,55.00,2017,NGEE_SP
85_2017_SP,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,64.736497,-165.948845,1.25,2017,NGEE_SP
8_2018_SP,0.0,35.0,0.0,12.0,0.0,0.0,0.0,9.5,4.0,0.0,...,2.5,40.5,18.5,8.0,2.0,64.860154,-163.697310,55.00,2018,NGEE_SP


In [94]:
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_05/temp_fcover/'
f = 'SP_fcover_parent.csv'
parent_plots.to_csv(p + f)

## total/top cover parent test 04

In [95]:
# # ancillary
# noncover = [col for col in parent_plots.columns if not 'cover (%)' in col]
# noncover = parent_plots[noncover]
# # total
# totcover_parent = [col for col in parent_plots.columns if 'total cover (%)' in col]
# totcover_parent = parent_plots[totcover_parent]
# # top
# topcover_parent = [col for col in parent_plots.columns if 'top cover (%)' in col]
# topcover_parent = parent_plots[topcover_parent]

In [96]:
# totcover_parent = pd.concat([totcover_parent, noncover], axis=1)
# topcover_parent = pd.concat([topcover_parent, noncover], axis=1)

In [97]:
# # total
# p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
# f = 'NGEE_SP_PARENT_fcover_total_merged.csv'
# totcover_parent.to_csv(p + f)
# # top
# f = 'NGEE_SP_PARENT_fcover_top_nonPFT.csv'
# topcover_parent.to_csv(p + f)

## map

In [98]:
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/supplementary/tundra_alaska/'
f = 'tundra_alaska.shp'
ak_tundra = gpd.read_file(p + f)

In [99]:
fc = gee.geopandas_to_ee(ak_tundra)
pp2 = parent_plots.reset_index()

In [100]:
par_geom = pp2[['Site Code', 'latitude', 'longitude']]
par_points = gee.df_to_ee(par_geom, 
                          latitude='latitude', 
                          longitude='longitude')

In [101]:
chi_geom = akava[['Site Code', 'latitude', 'longitude']]
chi_points = gee.df_to_ee(chi_geom, 
                          latitude='latitude', 
                          longitude='longitude')

In [102]:
Map = gee.Map()
Map.center_object(fc, 6)
Map.addLayer(fc, {}, 'arctic')
Map.addLayer(chi_points, {}, 'child')
Map.addLayer(par_points, {'color':'red'}, 'parent')
Map

Map(center=[66.56551547720869, -158.1544109640721], controls=(WidgetControl(options=['position', 'transparent_…

In [431]:
len(pp2)

86