In [211]:
import requests
import pandas as pd
from pyogrio import read_dataframe
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta
import glob
import json
import numpy as np
import ee
import geemap as gee
import time
import standardize_pft_funcs as spf
import geopandas as gpd

In [212]:
# Start the Earth Engine session. If initialization fails (e.g. no valid cached
# credentials), run the interactive authentication flow and then initialize
# again so the rest of the notebook has a usable session.
try:
    ee.Initialize()
except Exception:
    print('GEE token expired. Use Jupyter Lab to authenticate')
    ee.Authenticate()
    ee.Initialize()

## Load NEON data

In [213]:
# Read every NEON file in DIR whose name ends in "Z.csv", in sorted filename order
DIR = '/mnt/poseidon/remotesensing/arctic/data/vectors/NEON_2021'
FILES = sorted(glob.glob(DIR + '/*Z.csv'))

dfs = [read_dataframe(path) for path in FILES]

## Extract plot centroid coords

In [215]:
# create name column to extract plot centroids
# df = pd.concat(dfs)
# df.reset_index(inplace=True, drop=True)
# df['name'] = df.namedLocation + '.' + df.subplotID

In [216]:
# get subplot lat/lon from server
# requests_dict = {}
# plots = df['name'].to_list()
# url = 'http://data.neonscience.org/api/v0/locations/'
# locs = []

# for plot in plots:
    
#     # only get response when the request is new
#     response = None
#     if plot not in list(requests_dict.keys()):
#         response = requests.get(url + plot)
#         requests_dict[plot] = response
#     else:
#         response = requests_dict[plot]
#     print(url + plot, end='\r')
    
#     # extract lat/lon
#     try:
#         lat = response.json()['data']['locationDecimalLatitude']
#         lon = response.json()['data']['locationDecimalLongitude']
#     except Exception as e:
#         print(response.content)
#         #print(e)
#         lat, lon = None, None
#     locs.append([lat,lon])

In [217]:
# add coordinate data to rows
# coords = pd.DataFrame(locs, columns=['subplot_lat','subplot_lon'])
# df2 = pd.concat([df,coords], axis=1)

In [218]:
# df2.to_csv(DIR + '/NEON.D18.TOOLBARR.DP1.10058.001.div_1m2Data.2021.csv')

## Read NEON plots w/ lat/lon

In [219]:
# Output directory handed to spf.get_unique_species (OUTP) further down
out_path = '/mnt/poseidon/remotesensing/arctic/data/vectors/NEON_2021'

In [220]:
# Load the NEON 1 m2 diversity table; the CSV's first column ('Unnamed: 0')
# holds the saved row index.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NEON_2021/'
f = 'NEON.D18.TOOLBARR.DP1.10058.001.div_1m2Data.2021.csv'
neon = pd.read_csv(f'{p}{f}', index_col='Unnamed: 0')

In [221]:
neon.columns

Index(['uid', 'namedLocation', 'domainID', 'siteID', 'decimalLatitude',
       'decimalLongitude', 'geodeticDatum', 'coordinateUncertainty',
       'elevation', 'elevationUncertainty', 'plotType', 'nlcdClass', 'plotID',
       'subplotID', 'endDate', 'boutNumber', 'eventID',
       'samplingProtocolVersion', 'divDataType', 'targetTaxaPresent',
       'otherVariablesPresent', 'taxonID', 'scientificName', 'taxonRank',
       'family', 'nativeStatusCode', 'identificationQualifier',
       'taxonIDRemarks', 'morphospeciesID', 'morphospeciesIDRemarks',
       'identificationReferences', 'otherVariables', 'percentCover',
       'heightPlantOver300cm', 'heightPlantSpecies', 'remarks', 'measuredBy',
       'recordedBy', 'samplingImpractical', 'samplingImpracticalRemarks',
       'biophysicalCriteria', 'name', 'subplot_lat', 'subplot_lon'],
      dtype='object')

In [222]:
# Gather the percentCover readings into one list per (subplot name, label),
# once keyed by scientific name and once keyed by the otherVariables category.
cover_as_list = {'percentCover': list}
plant = neon.groupby(['name', 'scientificName']).agg(cover_as_list)
other = neon.groupby(['name', 'otherVariables']).agg(cover_as_list)

In [223]:
# Put the species-keyed and otherVariables-keyed cover lists side by side,
# aligned on their (name, label) index.
cover = pd.concat([plant, other], axis=1)
cover.columns = ['pcover_species', 'pcover_other']
# Rows without a species-level list take the otherVariables list instead,
# after which the second column is no longer needed.
cover = cover.where(cover['pcover_species'].notnull(), cover['pcover_other'], axis=0)
cover = cover.drop(columns='pcover_other')
# Move the (name, label) index into ordinary columns and order rows by subplot.
cover = cover.reset_index()
cover = cover.sort_values('name')
cover = cover.reset_index(drop=True)
# Collapse each list of readings into a single integer total.
cover['pcover_species'] = cover['pcover_species'].apply(sum).astype(int)
cover.columns = ['Site Code', 'Name', 'Cover']
cover

Unnamed: 0,Site Code,Name,Cover
0,BARR_001.basePlot.div.31.1.1,Carex aquatilis Wahlenb.,6
1,BARR_001.basePlot.div.31.1.1,moss,97
2,BARR_001.basePlot.div.31.1.1,otherNonVascular,0
3,BARR_001.basePlot.div.31.1.1,scat,0
4,BARR_001.basePlot.div.31.1.1,standingDeadHerbaceous,12
...,...,...,...
3801,TOOL_043.basePlot.div.41.4.1,Eriophorum vaginatum L.,14
3802,TOOL_043.basePlot.div.41.4.1,Ledum palustre L.,16
3803,TOOL_043.basePlot.div.41.4.1,Rubus chamaemorus L.,37
3804,TOOL_043.basePlot.div.41.4.1,standingDeadHerbaceous,88


In [224]:
# Table of unique scientific names from the project helper. With SAVE=True the
# recorded run reports writing NEON_unique_species.csv under out_path.
species = spf.get_unique_species(
    DFRAME=neon,
    SCOL='scientificName',
    DNAME='NEON',
    SAVE=True,
    OUTP=out_path,
)

Saved unique species list to /mnt/poseidon/remotesensing/arctic/data/vectors/NEON_2021/NEON_unique_species.csv.


In [225]:
species

Unnamed: 0,scientificName
0,Alopecurus magellanicus Lam.
1,Andromeda polifolia L.
2,Anemone parviflora Michx.
3,Arctagrostis latifolia (R. Br.) Griseb.
4,Arctophila fulva (Trin.) Rupr. ex Andersson
...,...
107,Tofieldia pusilla (Michx.) Pers.
108,Trichophorum cespitosum (L.) Hartm.
109,Unknown plant
110,Vaccinium uliginosum L.


In [226]:
# Per-subplot ancillary table (site, dates, subplot coordinates); first CSV
# column is the row index.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NEON_2021/'
f = 'NEON_ancillary_01.csv'
ancillary = pd.read_csv(f'{p}{f}', index_col=0)
ancillary

Unnamed: 0,name,namedLocation,domainID,siteID,geodeticDatum,coordinateUncertainty,elevation,elevationUncertainty,plotType,nlcdClass,plotID,subplotID,endDate,boutNumber,eventID,samplingProtocolVersion,subplot_lat,subplot_lon
0,BARR_001.basePlot.div.31.1.1,BARR_001.basePlot.div,D18,BARR,WGS84,20.10,5.7,0.1,distributed,sedgeHerbaceous,BARR_001,31.1.1,2021-07-19,1,BARR.1.2021,NEON.DOC.014042vL,71.244022,-156.421657
1,BARR_001.basePlot.div.32.2.1,BARR_001.basePlot.div,D18,BARR,WGS84,20.10,5.7,0.1,distributed,sedgeHerbaceous,BARR_001,32.2.1,2021-07-19,1,BARR.1.2021,NEON.DOC.014042vL,71.244021,-156.421132
2,BARR_001.basePlot.div.32.4.1,BARR_001.basePlot.div,D18,BARR,WGS84,20.10,5.7,0.1,distributed,sedgeHerbaceous,BARR_001,32.4.1,2021-07-19,1,BARR.1.2021,NEON.DOC.014042vL,71.244102,-156.421122
3,BARR_001.basePlot.div.40.1.1,BARR_001.basePlot.div,D18,BARR,WGS84,20.10,5.7,0.1,distributed,sedgeHerbaceous,BARR_001,40.1.1,2021-07-19,1,BARR.1.2021,NEON.DOC.014042vL,71.244110,-156.421666
4,BARR_001.basePlot.div.40.3.1,BARR_001.basePlot.div,D18,BARR,WGS84,20.10,5.7,0.1,distributed,sedgeHerbaceous,BARR_001,40.3.1,2021-07-19,1,BARR.1.2021,NEON.DOC.014042vL,71.244191,-156.421656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,TOOL_043.basePlot.div.32.2.1,TOOL_043.basePlot.div,D18,TOOL,WGS84,20.25,821.6,0.2,tower,sedgeHerbaceous,TOOL_043,32.2.1,8/6/21,1,TOOL.1.2021,NEON.DOC.014042vL,68.665429,-149.375293
276,TOOL_043.basePlot.div.32.4.1,TOOL_043.basePlot.div,D18,TOOL,WGS84,20.25,821.6,0.2,tower,sedgeHerbaceous,TOOL_043,32.4.1,8/6/21,1,TOOL.1.2021,NEON.DOC.014042vL,68.665510,-149.375301
277,TOOL_043.basePlot.div.40.1.1,TOOL_043.basePlot.div,D18,TOOL,WGS84,20.25,821.6,0.2,tower,sedgeHerbaceous,TOOL_043,40.1.1,8/6/21,1,TOOL.1.2021,NEON.DOC.014042vL,68.665518,-149.375736
278,TOOL_043.basePlot.div.40.3.1,TOOL_043.basePlot.div,D18,TOOL,WGS84,20.25,821.6,0.2,tower,sedgeHerbaceous,TOOL_043,40.3.1,8/6/21,1,TOOL.1.2021,NEON.DOC.014042vL,68.665599,-149.375744


In [227]:
# load species checklist
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'
f = 'AKVEG_species_checklist.csv'
checklist = read_dataframe(f'{p}{f}')

# derive a shortened "Mapping Name" with the same helper for both the checklist's
# accepted names and the NEON species names (intended as genus-species)
shorten_name = spf.get_substrings
checklist['Mapping Name'] = checklist['Accepted Name'].apply(shorten_name)
species['Mapping Name'] = species['scientificName'].apply(shorten_name)

In [228]:
# Look up a growth habit for every unique NEON species in the checklist; the
# helper reports how many species are still missing a habit as it goes.
habits = spf.fill_habits(
    unique_species=species,
    checklist=checklist,
    u_name='scientificName',
    c_unofficial_name='Name',
    c_official_name='Accepted Name',
    mapping_name='Mapping Name',
    habit='Habit',
)

38 species are missing habits.
15 species still missing habits.
4 species still missing habits.
3 species still missing habits.


In [229]:
habits

Unnamed: 0_level_0,Name,Mapping Name,Habit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Alopecurus magellanicus Lam.,Alopecurus magellanicus,graminoid
1,Andromeda polifolia L.,Andromeda polifolia,dwarf shrub
2,Anemone parviflora Michx.,Anemone parviflora,forb
3,Arctagrostis latifolia (R. Br.) Griseb.,Arctagrostis latifolia,graminoid
4,Arctophila fulva (Trin.) Rupr. ex Andersson,Arctophila fulva,graminoid
...,...,...,...
107,Tofieldia pusilla (Michx.) Pers.,Tofieldia pusilla,forb
108,Trichophorum cespitosum (L.) Hartm.,Trichophorum cespitosum,graminoid
109,Unknown plant,plant,
110,Vaccinium uliginosum L.,Vaccinium uliginosum,"dwarf shrub, shrub"


## Add evergreen/deciduous distinctions from the Macander 2022 supplementary data

In [230]:
# Header-less two-column lookup: leaf retention label, then species name
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AK-AVA_post2000/'
f = 'evergreendecid_macander2022.csv'
evergrndecid = pd.read_csv(f'{p}{f}', header=None)
evergrndecid.columns = ['evergreendecid', 'species']

# attach leaf-retention information to the habit table
final = spf.add_leaf_habit(habits, evergrndecid)

In [231]:
final

Unnamed: 0_level_0,Name,Mapping Name,Potential Habit,Habit,Leaf Retention,Potential Height,Height
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Alopecurus magellanicus Lam.,Alopecurus magellanicus,graminoid,graminoid,,,
1,Andromeda polifolia L.,Andromeda polifolia,dwarf shrub,dwarf shrub,Evergreen,,
2,Anemone parviflora Michx.,Anemone parviflora,forb,forb,,,
3,Arctagrostis latifolia (R. Br.) Griseb.,Arctagrostis latifolia,graminoid,graminoid,,,
4,Arctophila fulva (Trin.) Rupr. ex Andersson,Arctophila fulva,graminoid,graminoid,,,
...,...,...,...,...,...,...,...
107,Tofieldia pusilla (Michx.) Pers.,Tofieldia pusilla,forb,forb,,,
108,Trichophorum cespitosum (L.) Hartm.,Trichophorum cespitosum,graminoid,graminoid,,,
109,Unknown plant,plant,,,,,
110,Vaccinium uliginosum L.,Vaccinium uliginosum,"dwarf shrub, shrub","dwarf shrub, shrub","Evergreen,Deciduous",,


In [232]:
# write the habit table (version 00) next to the NEON inputs
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NEON_2021/'
f = 'NEON_species_habit_00.csv'
final.to_csv(f'{p}{f}')

In [233]:
list(final['Potential Habit'].unique())

['graminoid',
 'dwarf shrub',
 'forb',
 'deciduous tree, dwarf shrub, shrub, deciduous tree, tree',
 nan,
 'spore-bearing',
 'dwarf shrub, shrub',
 'shrub',
 ' dwarf shrub, shrub, deciduous tree,shrub, tree']

## Export shrubs, nonshrubs, and null

In [234]:
# rows with a known potential habit, then the subset whose habit mentions "shrub"
has_habit = ~final['Potential Habit'].isnull()
finalnonnull = final[has_habit]
mentions_shrub = finalnonnull['Potential Habit'].str.contains('shrub')
shrubs = finalnonnull[mentions_shrub]
shrubs.to_csv(f'{p}NEON_shrubs_00.csv')

In [235]:
# rows with a known potential habit that does not mention "shrub"
mentions_shrub = finalnonnull['Potential Habit'].str.contains('shrub')
nonshrubs = finalnonnull[~mentions_shrub]
nonshrubs.to_csv(f'{p}NEON_nonshrubs_00.csv')

In [236]:
# rows for which no potential habit was found
habit_missing = final['Potential Habit'].isnull()
finalnull = final[habit_missing]
finalnull.to_csv(f'{p}NEON_nullhabit_00.csv')

In [237]:
len(finalnull) + len(nonshrubs) + len(shrubs)

112

In [238]:
len(final)

112

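## Manual step: add data from Amy's shrub03 file before continuing (the next section reads `NEON_shrubs_01.csv` and `NEON_nullhabit_01.csv`, which this notebook does not write)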

## Standardize PFT Schema

In [239]:
# Habit tables to combine: the exported non-shrub table plus the "_01" shrub
# and null-habit tables.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NEON_2021/'
nonshrub = 'NEON_nonshrubs_00.csv'
shrub = 'NEON_shrubs_01.csv'
null = 'NEON_nullhabit_01.csv'
alltypes = [nonshrub, shrub, null]

# one DataFrame per file, in the order listed above
d = [pd.read_csv(p + fname, index_col=0) for fname in alltypes]

In [240]:
# stack the three habit tables row-wise, then list the distinct habit labels
completehabits = pd.concat(d, axis=0)
completehabits['Habit'].unique()

array(['graminoid', 'forb', 'spore-bearing', 'dwarf shrub', 'shrub',
       'unknown', 'lichen', 'litter', 'moss', 'other', 'fungus', 'rock',
       'scat', 'soil', 'dead herbaceous', 'dead woody', 'water'],
      dtype=object)

In [241]:
def standardize_habit_nonstandard(habitrow, leafrow, heightrow):
    """Build a '<...> cover (%)' label per row without merging habit categories.

    The three inputs are read position-by-position (via ``to_numpy``). Shrub
    habits are labelled from leaf retention and height; every other habit keeps
    its own (lower-cased) name.

    Returns a list of lower-cased labels with runs of whitespace collapsed.
    """

    def _label(raw_habit, leaf, height):
        habit = raw_habit.lower()
        if habit == 'shrub, tree':
            return f'{leaf} dwarf to tree cover (%)'
        if habit == 'dwarf shrub':
            return f'{leaf} {habit} cover (%)'
        if 'shrub' in habit:
            return f'{leaf} {height} cover (%)'
        return f'{habit} cover (%)'

    rows = zip(habitrow.to_numpy(), leafrow.to_numpy(), heightrow.to_numpy())
    return [" ".join(_label(h, l, ht).split()).lower() for h, l, ht in rows]

In [242]:
def standardize_habit(habitrow, leafrow, heightrow):
    """Map raw habit labels onto the standard PFT cover-column names.

    Parameters
    ----------
    habitrow, leafrow, heightrow : pandas.Series
        Habit label, leaf retention and height class. They are read
        position-by-position (via ``to_numpy``), so they must be the same length.

    Returns
    -------
    list of str
        One '<pft> cover (%)' label per row, lower-cased and with runs of
        whitespace collapsed. A missing habit yields 'unknown cover (%)'.
    """
    habitrow = habitrow.to_numpy()
    leafrow = leafrow.to_numpy()
    heightrow = heightrow.to_numpy()
    new_row = []

    for habit, leaf, height in zip(habitrow, leafrow, heightrow):
        # A missing habit arrives as float NaN, which has no .lower() and never
        # compares equal to np.nan, so it must be detected before any string work.
        if pd.isna(habit):
            new_row.append('unknown cover (%)')
            continue

        habit = habit.lower()
        if 'algae' in habit:
            new_habit = 'lichen cover (%)'
        elif 'unknown' in habit:
            new_habit = 'unknown cover (%)'
        elif 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte cover (%)'
        elif 'spore-bearing' in habit:
            new_habit = 'forb cover (%)'
        elif habit == 'grass':
            new_habit = 'graminoid cover (%)'

        # Shrub labels are built from leaf retention (and height class).
        # NOTE: a missing leaf/height value is formatted as the text 'nan'.
        elif habit == 'shrub, tree':
            new_habit = f'{leaf} dwarf to tree cover (%)'
        elif habit == 'dwarf shrub':
            new_habit = f'{leaf} {habit} cover (%)'
        elif 'shrub' in habit:
            new_habit = f'{leaf} {height} cover (%)'

        elif 'litter' in habit or 'scat' in habit:
            new_habit = 'litter cover (%)'
        elif 'soil' in habit or 'rock' in habit:
            new_habit = 'bare ground cover (%)'

        else:
            new_habit = f'{habit} cover (%)'
        new_row.append(" ".join(new_habit.split()).lower())

    return new_row

In [243]:
# standardized PFT labels, one per row of completehabits
standard_habit_col = standardize_habit(
    habitrow=completehabits['Habit'],
    leafrow=completehabits['Leaf Retention'],
    heightrow=completehabits['Height'],
)

In [244]:
# non-standardized PFT labels, one per row of completehabits
nonstandard_habit_col = standardize_habit_nonstandard(
    habitrow=completehabits['Habit'],
    leafrow=completehabits['Leaf Retention'],
    heightrow=completehabits['Height'],
)

In [245]:
# attach both label lists as new columns
completehabits = completehabits.assign(**{
    'Standard Habit': standard_habit_col,
    'NonStandard Habit': nonstandard_habit_col,
})

In [246]:
# save the habit table with both label columns
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/NEON_2021/'
f = 'NEON_species_habit_standardized.csv'
completehabits.to_csv(f'{p}{f}')

In [247]:
# one copy of the habit table per labelling scheme
standard_fcover = completehabits.drop(columns='NonStandard Habit')
nonstandard_fcover = completehabits.drop(columns='Standard Habit')

## Join habit to fcover data

In [248]:
# add vegetation fcover and sum to PFT level
def add_cover(cover_df, pft_groups, habit_col):
    """Label each cover record with its PFT and total the cover per site and PFT.

    Parameters
    ----------
    cover_df : pandas.DataFrame
        Cover records with 'Site Code', 'Name' and 'Cover' columns.
    pft_groups : pandas.DataFrame
        Lookup with a 'Name' column and the column named by ``habit_col``.
    habit_col : str
        Column of ``pft_groups`` holding the PFT label to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('Site Code', habit_col) with one summed 'Cover' column.
        Records whose 'Name' has no match in ``pft_groups`` get a missing label
        from the left join and fall out of the grouping (groupby drops NaN keys
        by default).
    """
    # Join on the passed-in frame, not on a notebook-level variable.
    withfcover = cover_df.merge(pft_groups, on='Name', how='left')
    grouped = withfcover.groupby(['Site Code', habit_col],
                                 group_keys=True).agg({'Cover': 'sum'})
    return grouped

In [249]:
# flatten multilevel
def flatten_multilevel(grouped):
    """Reduce the columns to their first level and turn the index into columns.

    The column labels of ``grouped`` are reassigned in place; the returned frame
    is the result of ``reset_index``.
    """
    first_level = grouped.columns.get_level_values(0)
    grouped.columns = first_level
    return grouped.reset_index()

In [250]:
# transpose
def transpose_df(grouped, habit_col):
    """Pivot long (site, PFT, value) rows into one row per site, one column per PFT.

    ``grouped`` needs a 'Site Code' column and the column named by ``habit_col``;
    the remaining columns are stacked and then spread out by PFT label.
    """
    long_form = grouped.set_index(['Site Code', habit_col]).stack()
    wide = long_form.unstack([1, 2])
    # keep only the PFT label from the two-level column index
    wide.columns = wide.columns.get_level_values(0)
    return wide.reset_index()

In [251]:
def add_standard_cols(groups):
    """Ensure every required PFT cover column exists, adding absent ones as NaN.

    ``groups`` is modified in place and also returned. New columns are appended
    in the order they appear in the required list.
    """
    # required columns
    necessary_cols = (
        'deciduous dwarf shrub cover (%)',
        'deciduous dwarf to low shrub cover (%)',
        'deciduous dwarf to tall shrub cover (%)',
        'deciduous dwarf to tree cover (%)',
        'deciduous tree cover (%)',
        'evergreen dwarf shrub cover (%)',
        'evergreen dwarf to low shrub cover (%)',
        'evergreen dwarf to tall shrub cover (%)',
        'evergreen dwarf to tree cover (%)',
        'evergreen tree cover (%)',
        'bryophyte cover (%)',
        'forb cover (%)',
        'graminoid cover (%)',
        'lichen cover (%)',
    )

    # add missing columns and fill with nan
    present = groups.columns.tolist()
    absent = [name for name in necessary_cols if name not in present]
    groups[absent] = np.nan
    return groups

In [252]:
# standard labels: sum cover per site/PFT, pivot to one row per site, then make
# sure every required PFT column is present
grouped_standard = transpose_df(
    flatten_multilevel(add_cover(cover, completehabits, 'Standard Habit')),
    'Standard Habit',
)
standard_groups = add_standard_cols(grouped_standard)

In [253]:
# non-standard labels: sum cover per site/PFT and pivot to one row per site
grouped_nonstand = transpose_df(
    flatten_multilevel(add_cover(cover, completehabits, 'NonStandard Habit')),
    'NonStandard Habit',
)

## Read ancillary table

In [254]:
# Work on a copy of the ancillary table and add the fields the cover files need:
# survey year (from endDate), a fixed plot radius of 1, and the dataset source.
end_dates = pd.DatetimeIndex(ancillary['endDate'])
anc = (
    ancillary.copy()
    .assign(year=end_dates.year.astype('int'),
            plot_radius_m=1,
            source='NEON')
    .rename(columns={'subplot_lat': 'latitude', 'subplot_lon': 'longitude'})
)

In [255]:
# keep only the identifier, coordinates, year, plot size and source
anc = anc.loc[:, ['name', 'latitude', 'longitude', 'year', 'plot_radius_m', 'source']]

In [256]:
# index the ancillary table by site code so it lines up with the cover tables
anc = anc.rename(columns={'name': 'Site Code'}).set_index('Site Code')

In [257]:
anc

Unnamed: 0_level_0,latitude,longitude,year,plot_radius_m,source
Site Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BARR_001.basePlot.div.31.1.1,71.244022,-156.421657,2021,1,NEON
BARR_001.basePlot.div.32.2.1,71.244021,-156.421132,2021,1,NEON
BARR_001.basePlot.div.32.4.1,71.244102,-156.421122,2021,1,NEON
BARR_001.basePlot.div.40.1.1,71.244110,-156.421666,2021,1,NEON
BARR_001.basePlot.div.40.3.1,71.244191,-156.421656,2021,1,NEON
...,...,...,...,...,...
TOOL_043.basePlot.div.32.2.1,68.665429,-149.375293,2021,1,NEON
TOOL_043.basePlot.div.32.4.1,68.665510,-149.375301,2021,1,NEON
TOOL_043.basePlot.div.40.1.1,68.665518,-149.375736,2021,1,NEON
TOOL_043.basePlot.div.40.3.1,68.665599,-149.375744,2021,1,NEON


## Create total cover file (not merged)

In [258]:
# index by site, drop the fungus column, and put the columns in sorted order
totcover_nm = (
    grouped_nonstand
    .set_index('Site Code')
    .drop(columns=['fungus cover (%)'])
)
totcover_nm.columns.name = None
totcover_nm = totcover_nm[sorted(totcover_nm.columns.tolist())]

In [259]:
# relabel every "... cover (%)" column as "... total cover (%)"
totcover_nm.columns = [
    name.replace('cover (%)', 'total cover (%)')
    for name in totcover_nm.columns.tolist()
]

In [260]:
# add ancillary
# Both indexes are cast to str so site codes align; the inner join keeps only
# sites present in both tables.
totcover_nm.index = totcover_nm.index.astype(str)
anc.index = anc.index.astype(str)
totcover_nm = pd.concat([totcover_nm, anc], axis=1, join='inner')
# Zeros become missing values. The replacement runs over the whole joined frame,
# ancillary columns included. `np.nan` is used because the `np.NaN` alias no
# longer exists in NumPy 2.0.
totcover_nm = totcover_nm.replace(0.0, np.nan)
totcover_nm.index.name = 'Site Code'

In [261]:
# drop the "other" and "unknown" columns, then discard plots that are 100% water
totcover_nm = totcover_nm.drop(columns=['other total cover (%)',
                                        'unknown total cover (%)'])
not_all_water = totcover_nm['water total cover (%)'] != 100
totcover_nm = totcover_nm[not_all_water]

In [262]:
# write the non-merged total cover table
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'NEON_fcover_total_orig.csv'
totcover_nm.to_csv(f'{p}{f}')

In [263]:
len(totcover_nm)

275

## Create total cover file (merged)

In [264]:
# index by site, drop the fungus column, and put the columns in sorted order
totcover = (
    standard_groups
    .set_index('Site Code')
    .drop(columns=['fungus cover (%)'])
)
totcover.columns.name = None
totcover = totcover[sorted(totcover.columns.tolist())]

In [265]:
# relabel every "... cover (%)" column as "... total cover (%)"
totcover.columns = [
    name.replace('cover (%)', 'total cover (%)')
    for name in totcover.columns.tolist()
]

In [266]:
# add ancillary
# Both indexes are cast to str so site codes align; the inner join keeps only
# sites present in both tables.
totcover.index = totcover.index.astype(str)
anc.index = anc.index.astype(str)
totcover = pd.concat([totcover, anc], axis=1, join='inner')
# Zeros become missing values. The replacement runs over the whole joined frame,
# ancillary columns included. `np.nan` is used because the `np.NaN` alias no
# longer exists in NumPy 2.0.
totcover = totcover.replace(0.0, np.nan)
totcover.index.name = 'Site Code'

In [267]:
# drop the "other" and "unknown" columns, discard plots that are 100% water,
# and label water / bare ground as "top cover" instead of "total cover"
totcover = totcover.drop(columns=['other total cover (%)',
                                  'unknown total cover (%)'])
not_all_water = totcover['water total cover (%)'] != 100
totcover = totcover[not_all_water]
totcover = totcover.rename(
    columns={'water total cover (%)': 'water top cover (%)',
             'bare ground total cover (%)': 'bare ground top cover (%)'}
)

In [268]:
# write the child-plot cover table (read back in the aggregation section)
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_05/temp_fcover/'
f = 'NEON_fcover_child.csv'
totcover.to_csv(f'{p}{f}')

In [269]:
len(totcover)

275

## Aggregate child plots to parent plots

In [192]:
import warnings

# Silence all warnings for the rest of the session through the warnings filter,
# leaving the standard-library `warnings.warn` function itself untouched.
warnings.filterwarnings('ignore')

In [193]:
# load the plots of interest
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_05/temp_fcover/'
f = 'NEON_fcover_child.csv'
akava = pd.read_csv(p + f)

# Only small plots (radius < 10 m) are aggregated into parent plots; larger
# plots are split off unchanged. Plots from before 2010 are discarded.
# The float conversion is stored back on the frame so the comparisons below
# run on numeric values.
akava['plot_radius_m'] = akava['plot_radius_m'].astype(float)
recent = akava['year'] >= 2010
# .copy() gives each subset its own data, so later in-place edits
# (e.g. set_index(..., inplace=True)) do not act on a slice of `akava`.
akava_small_fcover = akava[(akava['plot_radius_m'] < 10) & recent].copy()
akava_large_fcover = akava[(akava['plot_radius_m'] >= 10) & recent].copy()

In [194]:
len(akava_small_fcover)

275

In [195]:
# Location/identity columns used for grouping child plots.
selected_columns = ['Site Code','year','latitude', 'longitude', 'source']
# .copy() makes this an independent frame: a column (group_id) is assigned to it
# further down, which on a plain slice raises SettingWithCopyWarning.
akava_small = akava_small_fcover[selected_columns].copy()
# akava_small = akava.fillna(0)
akava_small.head()

Unnamed: 0,Site Code,year,latitude,longitude,source
0,BARR_001.basePlot.div.31.1.1,2021,71.244022,-156.421657,NEON
1,BARR_001.basePlot.div.32.2.1,2021,71.244021,-156.421132,NEON
2,BARR_001.basePlot.div.32.4.1,2021,71.244102,-156.421122,NEON
3,BARR_001.basePlot.div.40.1.1,2021,71.24411,-156.421666,NEON
4,BARR_001.basePlot.div.40.3.1,2021,71.244191,-156.421656,NEON


In [196]:
#### transform geographic to utm so that distance calculation is more intuitive
from pyproj import Transformer
src_crs = "EPSG:4326"
target_crs = "EPSG:32606"
transformer = Transformer.from_crs(src_crs, target_crs)

lon = akava_small['longitude'].to_numpy()
lat = akava_small['latitude'].to_numpy()

#### store the projected coords
# transform() is given (latitude, longitude) for each plot, one plot at a time;
# each result is kept as the tuple the transformer returns.
projcoords = [transformer.transform(lat_i, lon_i) for lon_i, lat_i in zip(lon, lat)]

In [197]:
#### pairwise euclidean distances between all small plots (for examination)
from scipy.spatial.distance import cdist
coord = np.array(projcoords)
dist_mat = cdist(coord, coord, metric='euclidean')
# dist_mat[:3,:] <= 60

In [198]:
### group pixels/plots based on their euclidean distance
def group_pixels_by_distance(pixel_data, distance_threshold):
    """
        Assign each coordinate pair to a group in a single pass, in input order.

        pixel_data: ndarray of coordinate pairs, n by 2
        distance_threshold: a point joins the first existing group that has any
            member within this euclidean distance (inclusive); if no group
            qualifies, the point starts a new group

        return:
        an array of length n holding the group id of each pixel; ids start at 0
        and increase in order of group creation

    """
    members = {}          # group id -> list of points already in that group
    next_id = 0
    cluster_array = np.full(len(pixel_data), -1)  # -1 means "not yet assigned"

    for idx, point in enumerate(pixel_data):
        # groups are tried in creation order; the first match wins
        for group_id, points in members.items():
            gaps = np.linalg.norm(np.array(points) - point, axis=1)
            if np.any(gaps <= distance_threshold):
                break
        else:
            group_id = next_id
            next_id += 1

        members.setdefault(group_id, []).append(point)
        cluster_array[idx] = group_id

    return cluster_array

# group the projected plot coordinates with a 55-unit distance threshold
dist_thres = 55
coord = np.asarray(projcoords)  ## UTM coords, unit is meter
group = group_pixels_by_distance(pixel_data=coord, distance_threshold=dist_thres)

In [199]:
### add the group id to df for aggregation
# assign() builds a new frame with the extra column instead of writing into
# akava_small, which was created as a column slice of akava_small_fcover.
akava_small = akava_small.assign(group_id=group)
akava_small.head()

Unnamed: 0,Site Code,year,latitude,longitude,source,group_id
0,BARR_001.basePlot.div.31.1.1,2021,71.244022,-156.421657,NEON,0
1,BARR_001.basePlot.div.32.2.1,2021,71.244021,-156.421132,NEON,0
2,BARR_001.basePlot.div.32.4.1,2021,71.244102,-156.421122,NEON,0
3,BARR_001.basePlot.div.40.1.1,2021,71.24411,-156.421666,NEON,0
4,BARR_001.basePlot.div.40.3.1,2021,71.244191,-156.421656,NEON,0


In [200]:
# One row per (group, year, source): mean coordinates plus the list of member
# site codes.
group_keys = ['group_id', 'year', 'source']
group_summary = {'latitude': 'mean',
                 'longitude': 'mean',
                 'Site Code': list}
groups = akava_small.groupby(group_keys).agg(group_summary)

In [201]:
def get_plot_size(rowlst):
    """Return 1 when the group has exactly one member, otherwise 55."""
    return 1 if len(rowlst) == 1 else 55
# radius for each parent plot, derived from how many child sites it contains
groups['plot_radius_m_2'] = groups['Site Code'].apply(get_plot_size)

In [202]:
# Back to one row per child site: each entry of the 'Site Code' list gets its
# own row, carrying the group-level values with it.
groups2 = groups.explode('Site Code')

In [203]:
# Turn the (group_id, year, source) index into columns, combine them into a
# parent id of the form "<group_id>_<year>_<source>", and keep only the
# parent-level fields.
groups2 = groups2.reset_index()
group_txt = groups2['group_id'].astype(str)
year_txt = groups2['year'].astype(str)
groups2['parent_id'] = group_txt + '_' + year_txt + '_' + groups2['source']
groups2 = (
    groups2
    .rename(columns={'latitude': 'parent_latitude',
                     'longitude': 'parent_longitude'})
    .drop(columns=['group_id', 'year', 'source'])
)

In [204]:
# index both tables by site code so they can be joined column-wise
groups2 = groups2.set_index('Site Code')
akava_small_fcover = akava_small_fcover.set_index('Site Code')

In [205]:
# child cover values side by side with their parent-plot fields, aligned on
# site code; the site code then goes back to being a regular column
joined = pd.concat([akava_small_fcover, groups2], axis=1).reset_index()

In [206]:
# Parent-level fields (already the mean of the child lats/lons) and the cover
# columns are averaged per parent plot.
info_cols = ['parent_latitude', 'parent_longitude', 'plot_radius_m_2']
data_cols = [col for col in joined.columns if 'cover (%)' in col]
# Every remaining column is kept as the set of distinct child values. The names
# are collected as a list in `joined` column order: iterating a Python set of
# strings gives an order that can change between kernel sessions, which would
# reorder the output columns from run to run.
averaged = set(data_cols) | set(info_cols)
anci_cols = [col for col in joined.columns if col not in averaged]
info_dict = dict.fromkeys(info_cols, 'mean')
data_dict = dict.fromkeys(data_cols, 'mean')
anci_dict = dict.fromkeys(anci_cols, set)
d = {**data_dict, **info_dict, **anci_dict}

# NOTE(review): 'mean' skips missing values, and the child file was written
# after zeros were replaced with NaN, so each cover mean is taken over only the
# child plots where that cover type was recorded — confirm this is intended.
parent_plots = joined.groupby('parent_id').agg(d)

In [207]:
# discard the child-level columns that the parent-level fields replace
child_level_cols = ['latitude', 'longitude', 'parent_id',
                    'source', 'plot_radius_m']
parent_plots = parent_plots.drop(columns=child_level_cols)

In [208]:
# 'year' was aggregated into a set per parent plot; explode() unpacks it
parent_plots['year'] = parent_plots['year'].explode()
parent_plots['source'] = 'NEON'
# give the parent-level fields their final names (keys that are not present
# as columns are simply ignored by rename)
final_names = {'Site Code': 'child_site_codes',
               'parent_longitude': 'longitude',
               'parent_latitude': 'latitude',
               'plot_radius_m_2': 'plot_radius_m',
               'water total cover (%)': 'water top cover (%)',
               'bare ground total cover (%)': 'bare ground top cover (%)'}
parent_plots = parent_plots.rename(columns=final_names)
parent_plots['plot_radius_m'] = parent_plots['plot_radius_m'].astype(int)
parent_plots.index.name = 'Site Code'

In [209]:
parent_plots.head(10)

Unnamed: 0_level_0,bare ground top cover (%),bryophyte total cover (%),dead herbaceous total cover (%),dead woody total cover (%),deciduous dwarf shrub total cover (%),deciduous dwarf to low shrub total cover (%),deciduous dwarf to tall shrub total cover (%),deciduous dwarf to tree total cover (%),deciduous tree total cover (%),evergreen dwarf shrub total cover (%),...,graminoid total cover (%),lichen total cover (%),litter total cover (%),water top cover (%),latitude,longitude,plot_radius_m,year,child_site_codes,source
Site Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0_2021_NEON,,73.833333,10.0,,9.0,,,,,,...,9.333333,1.0,27.5,1.0,71.244106,-156.421395,55,2021,"{BARR_001.basePlot.div.41.4.1, BARR_001.basePl...",NEON
10_2021_NEON,9.5,44.166667,10.6,,3.0,,,,,,...,10.833333,13.0,17.666667,3.0,71.253775,-156.380753,55,2021,"{BARR_012.basePlot.div.32.4.1, BARR_012.basePl...",NEON
11_2021_NEON,,74.333333,9.8,,6.4,,,,,,...,7.2,7.2,22.8,,71.265929,-156.60609,55,2021,"{BARR_013.basePlot.div.40.3.1, BARR_013.basePl...",NEON
12_2021_NEON,,96.0,5.75,,5.0,,,,,,...,6.75,3.0,8.25,1.0,71.298787,-156.648069,55,2021,"{BARR_014.basePlot.div.40.3.1, BARR_014.basePl...",NEON
13_2021_NEON,,87.0,11.833333,,2.0,,2.0,,,,...,8.333333,6.666667,13.0,,71.267254,-156.562378,55,2021,"{BARR_015.basePlot.div.40.3.1, BARR_015.basePl...",NEON
14_2021_NEON,6.5,37.833333,29.0,,,,11.0,,,,...,12.333333,6.0,30.166667,21.0,71.246041,-156.504816,55,2021,"{BARR_016.basePlot.div.41.4.1, BARR_016.basePl...",NEON
15_2021_NEON,1.0,69.833333,5.0,,,,,,,,...,11.166667,3.4,28.0,,71.244828,-156.43466,55,2021,"{BARR_017.basePlot.div.32.2.1, BARR_017.basePl...",NEON
16_2021_NEON,,97.0,2.6,,,,,,,,...,9.833333,,3.75,6.0,71.306721,-156.616993,55,2021,"{BARR_018.basePlot.div.40.3.1, BARR_018.basePl...",NEON
17_2021_NEON,,78.333333,7.5,,3.0,,7.666667,,,,...,6.0,10.5,55.0,32.666667,71.226668,-156.546572,55,2021,"{BARR_019.basePlot.div.32.2.1, BARR_019.basePl...",NEON
18_2021_NEON,1.0,76.666667,7.166667,,,,,,,,...,16.666667,,14.5,34.0,71.261167,-156.489465,55,2021,"{BARR_020.basePlot.div.41.4.1, BARR_020.basePl...",NEON


In [210]:
# write the parent-plot cover table
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_05/temp_fcover/'
f = 'NEON_fcover_parent.csv'
parent_plots.to_csv(f'{p}{f}')

## Map child and parent plots

In [77]:
# Alaska tundra outline used as the map backdrop
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/supplementary/tundra_alaska/'
f = 'tundra_alaska.shp'
ak_tundra = gpd.read_file(f'{p}{f}')

In [78]:
# parent plots with the site code as a regular column, and the tundra outline
# converted to an Earth Engine object
pp2 = parent_plots.reset_index()
fc = gee.geopandas_to_ee(ak_tundra)

In [79]:
# parent plot locations as Earth Engine points
par_geom = pp2.loc[:, ['latitude', 'longitude']]
par_points = gee.df_to_ee(par_geom, latitude='latitude', longitude='longitude')

In [80]:
# child plot locations as Earth Engine points
chi_geom = totcover.loc[:, ['latitude', 'longitude']]
chi_points = gee.df_to_ee(chi_geom, latitude='latitude', longitude='longitude')

In [81]:
# Interactive map centred on the tundra outline (zoom 6). Layers are added in
# this order: outline, child plots, then parent plots in red.
Map = gee.Map()
Map.center_object(fc, 6)
Map.addLayer(fc, {}, 'arctic')
Map.addLayer(chi_points, {}, 'child')
Map.addLayer(par_points, {'color':'red'}, 'parent')
# last expression of the cell: renders the map widget
Map

Map(center=[66.56551547720869, -158.1544109640721], controls=(WidgetControl(options=['position', 'transparent_…