# NEON Standardization Notebook
---
---

# Table of contents
1. [PRE-PROCESSING](#preprocess)
    1. [Load fcover data](#loadfcover)
    2. [Join fcover to species checklist table](#joinchecklist)
    3. [Join fcover to leaf retention table](#joinleafretention)
    4. [Export species-PFT tables](#exporthabits)
    5. [Clean exported species-PFT tables](#cleanhabits)
2. [STANDARDIZATION](#standardization)
    1. [Standardize PFT names](#pftnames)
    2. [Sum species fcover to PFT-level](#sumfcover)
    3. [Standardize table layout](#tablelayout)
    4. [Standardize auxiliary columns](#aux)
        1. [Basic plot information](#basics)
        2. [Geographical intersections](#geo)
        3. [Duplicate coordinates](#dupes)
3. [EXPORT](#export)

# Load required packages

In [23]:
import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import date, timedelta
from pyogrio import read_dataframe
import glob
import geopandas as gpd
import sys
import os
import chardet
import tarfile
from urllib.request import urlretrieve
import regex as re
from shapely.validation import make_valid

In [24]:
# import local functions
current_dir = os.path.abspath('..')
if current_dir not in sys.path:
    sys.path.append(current_dir)
import standardize_pft_funcs as spf

---
---
# 1. Pre-processing <a name="preprocess"></a>
After visually inspecting each species-level fcover table for obvious errors, we saved the "cleaned" tables to a folder that we then standardized here.

---
## 1.1. Load fcover data <a name="loadfcover"></a>

In [25]:
# # complete the fcover file by adding coordinates
# DIR = '/mnt/poseidon/remotesensing/arctic/data/vectors/neon_2021'
# FILES = sorted(glob.glob(DIR + '/*Z.csv'))

# dfs = []
# for file in FILES:
#     df = read_dataframe(file)
#     dfs.append(df)
    
# spf.neon_plot_centroids(dfs, DIR)

In [26]:
source = 'neon'

In [27]:
# Read the NEON foliar-cover table; the unnamed first CSV column holds the row index.
neon = pd.read_csv(f'../{source}/input_data/neon_foliar_cover.csv', index_col='Unnamed: 0')

# Work on an independent copy restricted to the plot name, the two taxon label
# columns and the cover value; `neon` itself is left untouched.
fcover_columns = ['name', 'scientificName', 'otherVariables', 'percentCover']
fcover = neon[fcover_columns].copy()
fcover.head(3)

Unnamed: 0,name,scientificName,otherVariables,percentCover
0,BARR_003.basePlot.div.32.2.1,Dupontia fisheri R. Br.,,0.5
1,BARR_003.basePlot.div.40.3.1,,moss,95.0
2,BARR_003.basePlot.div.32.4.1,Eriophorum sp.,,0.5


In [28]:
# `scientificName` holds species; `otherVariables` holds broad categories like "moss".
# Keep both in one label column: take the scientific name where present and fall
# back to the broad category otherwise.
fcover = (
    fcover
    .assign(speciesName=fcover['scientificName'].combine_first(fcover['otherVariables']))
    .loc[:, ['name', 'speciesName', 'percentCover']]
    .rename(columns={'name': 'plotName', 'speciesName': 'datasetSpeciesName'})
)
fcover.head(3)

Unnamed: 0,plotName,datasetSpeciesName,percentCover
0,BARR_003.basePlot.div.32.2.1,Dupontia fisheri R. Br.,0.5
1,BARR_003.basePlot.div.40.3.1,moss,95.0
2,BARR_003.basePlot.div.32.4.1,Eriophorum sp.,0.5


In [29]:
# Collect the distinct species labels, printing the count before and after
# discarding missing labels, then wrap the survivors in a one-column frame.
all_labels = fcover['datasetSpeciesName'].unique()
print(len(all_labels))
u_species_names = [label for label in all_labels if not pd.isnull(label)]
print(len(u_species_names))
species_names_df = pd.DataFrame(u_species_names, columns=['datasetSpeciesName'])

123
123


---
## 1.2. Join fcover to species checklist table <a name="joinchecklist"></a>

In [30]:
# load species checklist and prep for joining
checklist_path = f'../etc/akveg_species_checklist.csv'
checklist_df = spf.checklist_df(checklist_path)
checklist_df.head(3)

Unnamed: 0,nameCode,checklistSpeciesName,nameStatus,nameAccepted,nameFamily,acceptedNameSource,nameLevel,speciesForm,speciesHabit
0,abroth,Abrothallus De Not.,accepted,Abrothallus De Not.,Abrothallaceae,CNALH,genus,lichen,lichen
1,abrpar,Abrothallus parmeliarum (Sommerf.) Arnold,accepted,Abrothallus parmeliarum (Sommerf.) Arnold,Abrothallaceae,CNALH,species,lichen,lichen
2,abrpey,Abrothallus peyritschii (Stein) Kotte,accepted,Abrothallus peyritschii (Stein) Kotte,Abrothallaceae,Dillman et al. 2012,species,lichen,lichen


In [31]:
species_names_df.head(3)

Unnamed: 0,datasetSpeciesName
0,Dupontia fisheri R. Br.
1,moss
2,Eriophorum sp.


In [32]:
# get first 2 words (genus-species) from checklist accepted name and data species name
checklist_df['joinKey'] = checklist_df['checklistSpeciesName'].apply(spf.get_substrings)
species_names_df['joinKey'] = species_names_df['datasetSpeciesName'].apply(spf.get_substrings)
habits = spf.join_to_checklist(unique_species=species_names_df, 
                               checklist=checklist_df, 
                               u_name='datasetSpeciesName', 
                               c_unofficial_name='checklistSpeciesName', 
                               c_official_name='nameAccepted', 
                               mapping_name='joinKey',
                               habit='speciesHabit')
habits.head(3)

49 species are missing habits.
26 species still missing habits.
15 species still missing habits.
14 species still missing habits.


Unnamed: 0_level_0,datasetSpeciesName,joinKey,speciesHabit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Dupontia fisheri R. Br.,Dupontia fisheri,graminoid
1,moss,moss,
2,Eriophorum sp.,Eriophorum sp.,graminoid


In [33]:
species_names_df.head(3)

Unnamed: 0,datasetSpeciesName,joinKey
0,Dupontia fisheri R. Br.,Dupontia fisheri
1,moss,moss
2,Eriophorum sp.,Eriophorum sp.


In [34]:
# Left-join the habit lookup onto the full list of unique species names so every
# species is kept even when no habit was found. Both frames carry a `joinKey`
# column: the left one keeps its name, the right one is suffixed with `_1`.
habits = (
    species_names_df
    .merge(habits, how='left', on='datasetSpeciesName', suffixes=(None, '_1'))
    .loc[:, ['joinKey', 'datasetSpeciesName', 'speciesHabit']]
)

---
## 1.3. Join fcover to leaf retention table <a name="joinleafretention"></a>

In [35]:
leaf_df = spf.leaf_retention_df('../etc/macander_leaf_retention.csv')
leaf_df.head(3)

Unnamed: 0,leafRetention,retentionSpeciesName
0,deciduous,Salix
1,deciduous,Betula nana
2,deciduous,Vaccinium uliginosum


In [36]:
# add leaf retention column
habits_wleaf = spf.add_leaf_retention(habits, leaf_df, 'leafRetention')
habits_wleaf = habits_wleaf[['datasetSpeciesName', 'joinKey', 'speciesHabit', 'leafRetention']]
habits_wleaf.head(3)

Unnamed: 0,datasetSpeciesName,joinKey,speciesHabit,leafRetention
0,Dupontia fisheri R. Br.,Dupontia fisheri,graminoid,
1,moss,moss,,
2,Eriophorum sp.,Eriophorum sp.,graminoid,


In [37]:
# apply function
habits_wleaf['speciesHabit'] = habits_wleaf['speciesHabit'].apply(spf.clean_shrub_habits)
list(habits_wleaf['speciesHabit'].unique())

['graminoid', nan, 'forb', 'shrub', 'spore-bearing']

---
## 1.4. Export species-PFT tables <a name="exporthabits"></a>

In [38]:
# generate 3 files and dataframe of habit types
shrubs, nonshrubs, null = spf.export_habit_files(habits_wleaf,
                                                 'temp_data', 
                                                 source, 
                                                 'speciesHabit')

In [39]:
# check to make sure the files make sense
len(habits_wleaf) == (len(null) + len(nonshrubs) + len(shrubs))

True

---
## 1.5. Clean exported species-PFT tables <a name="cleanhabits"></a>
Here, we manually assessed each file. The nonshrubs were usually correct and did not have to be adjusted. Some shrubs were missing leaf retention information, so we consulted with arctic ecologists to determine it. For unknown (null) habits, we consulted arctic ecology expert, Amy Breen, to determine the issue and correct habit. After the tables were complete, all species names had a single associated habit.
- null habits
    - assign missing habits and consult ecologist when needed
- non-shrubs
    - assign leaf habit to all trees (ctrl-f to find all trees in excel)
        - coniferous tree = evergreen
        - deciduous tree  = deciduous
    - remove leaf habit if misapplied
- shrubs
    - check current leaf habits and correct if necessary
    - fill empty leaf habits
    - choose leaf habit if both evergreen & deciduous listed
    - add Stature column (dwarf shrub or not)

---
---
# 2. Standardization <a name="standardization"></a>

---
## 2.1. Standardize PFT names <a name="pftnames"></a>
deciduous shrub, evergreen shrub, graminoid, forb, litter, non-vascular (lichen, bryophyte)

In [40]:
# Visually assessed and cleaned habit files (non-shrubs, shrubs, originally-null habits)
all_habit_types = [
    f'temp_data/{source}_nonshrubs_01.csv',
    f'temp_data/{source}_shrubs_01.csv',
    f'temp_data/{source}_nullhabit_01.csv',
]

# Read each file (first CSV column is the index) and stack them into one table
habit_dfs = [pd.read_csv(habit_path, index_col=0) for habit_path in all_habit_types]
all_habits = pd.concat(habit_dfs)

In [41]:
# show unique habit names we're working with
all_habits['speciesHabit'].unique().tolist()

['graminoid',
 'forb',
 'spore-bearing',
 'shrub',
 'lichen',
 'litter',
 'moss',
 'other',
 'fungus',
 'rock',
 'scat',
 'soil',
 'water']

In [42]:
# akveg-specific habit name standardization function
def standardize_habit(habitrow, leafrow):
    """Map raw species habits to standardized plant functional type (PFT) names.

    Parameters
    ----------
    habitrow : pandas.Series
        Raw habit label per species (e.g. 'moss', 'shrub', 'graminoid').
    leafrow : pandas.Series
        Leaf retention per species, in the same order as ``habitrow``. Only
        consulted for shrubs, where it becomes the prefix of the PFT name.

    Returns
    -------
    list
        One entry per input row, in input order. String habits are lower-cased
        and mapped by substring (first match wins); habits matching no rule
        are returned lower-cased. Non-string habits (e.g. NaN for a species
        whose habit was never filled in) are passed through unchanged instead
        of raising on ``.lower()``, so they stay detectable downstream.
    """
    habit_values = habitrow.to_numpy()
    leaf_values = leafrow.to_numpy()
    new_row = []

    for habit, leaf in zip(habit_values, leaf_values):

        # Missing habit: nothing to standardize, keep the missing marker
        if not isinstance(habit, str):
            new_row.append(habit)
            continue

        habit = habit.lower()
        # non-shrubs
        if 'algae' in habit:
            new_habit = 'other'
        elif 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte'
        elif 'spore-bearing' in habit:
            new_habit = 'forb'
        elif 'litter' in habit or 'scat' in habit:
            new_habit = 'litter'
        elif 'bare ground' in habit or 'crust' in habit or 'mineral' in habit or 'soil' in habit:
            new_habit = 'bare ground'

        # shrubs: PFT is "<leaf retention> shrub"
        # NOTE(review): a shrub whose leaf retention is missing becomes 'nan shrub';
        # the cleaned habit files are expected to have it filled in -- confirm.
        elif 'shrub' in habit:
            new_habit = f'{leaf} shrub'

        # anything else keeps its (lower-cased) habit name
        else:
            new_habit = f'{habit}'
        new_row.append(new_habit)

    return new_row

In [43]:
# standardized PFT
standard_habit_col = standardize_habit(all_habits['speciesHabit'], 
                                       all_habits['leafRetention'])
all_habits['standardHabit'] = standard_habit_col
print(len(all_habits))
all_habits.head(3)

123


Unnamed: 0,datasetSpeciesName,joinKey,speciesHabit,leafRetention,speciesStature,standardHabit
36,Alopecurus magellanicus Lam.,Alopecurus magellanicus,graminoid,,,graminoid
104,Anemone parviflora Michx.,Anemone parviflora,forb,,,forb
29,Arctagrostis latifolia (R. Br.) Griseb.,Arctagrostis latifolia,graminoid,,,graminoid


---
## 2.2. Sum species fcover to PFT-level <a name="sumfcover"></a>

In [44]:
# Attach each species' standardized PFT to its cover records.
all_habits = all_habits[['datasetSpeciesName', 'standardHabit']]
# Inner join (pandas default): cover records whose species has no habit entry are dropped.
# `validate` makes the merge fail loudly if a species appears more than once in the
# habit lookup; such duplicates would silently duplicate cover rows and inflate the
# per-PFT sums computed from this table.
species_fcover = fcover.merge(all_habits,
                              on='datasetSpeciesName',
                              validate='many_to_one')
# keep only records with an actual, non-zero cover value
species_fcover = species_fcover[species_fcover['percentCover'].notna() & (species_fcover['percentCover'] != 0)]
species_fcover = species_fcover[['plotName', 'datasetSpeciesName', 'standardHabit', 'percentCover']]
# fungus has no PFT of its own; count it as 'other'
# NOTE(review): 'rock' is not remapped here and is not in the standard PFT list used
# for the table layout, so its cover appears to be dropped there -- confirm intended.
species_fcover['standardHabit'] = species_fcover['standardHabit'].replace('fungus', 'other')
species_fcover.head(3)

Unnamed: 0,plotName,datasetSpeciesName,standardHabit,percentCover
0,BARR_003.basePlot.div.32.2.1,Dupontia fisheri R. Br.,graminoid,0.5
1,BARR_003.basePlot.div.32.4.1,Dupontia fisheri R. Br.,graminoid,0.5
2,BARR_006.basePlot.div.31.1.1,Dupontia fisheri R. Br.,graminoid,7.0


In [45]:
# export
species_fcover.to_csv(f'output_data/{source}_nonstandard_species_fcover.csv', index=False)

In [46]:
# Sum percentCover within each (plotName, standardHabit) pair, then spread the
# PFTs into columns; plot/PFT combinations with no records are filled with 0.
temp = species_fcover.groupby(['plotName', 'standardHabit'], as_index=False)['percentCover'].sum()
pft_fcover = temp.pivot(index='plotName', columns='standardHabit', values='percentCover').fillna(0)

In [47]:
# clean up
pft_fcover.columns.name = None
pft_fcover.index.name = 'plotName'
pft_fcover.index = pft_fcover.index.astype(str)
pft_fcover.head(3)

Unnamed: 0_level_0,bare ground,bryophyte,deciduous shrub,evergreen shrub,forb,graminoid,lichen,litter,other,rock,water
plotName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BARR_001.basePlot.div.31.1.1,0.0,97.0,0.0,0.0,6.5,8.0,0.5,6.5,12.5,0.0,0.0
BARR_001.basePlot.div.32.2.1,0.0,96.0,9.0,0.0,1.5,9.0,1.0,8.5,8.0,0.0,0.0
BARR_001.basePlot.div.32.4.1,0.5,15.0,0.0,0.0,0.0,13.5,0.0,78.0,9.5,0.0,1.0


---
## 2.3. Standardize table layout <a name="tablelayout"></a>

In [48]:
# list of necessary PFTs
pfts = ['deciduous shrub', 'deciduous tree',
        'evergreen shrub', 'evergreen tree',
        'forb', 'graminoid', 'non-vascular',
        'bryophyte', 'lichen', 'litter',
        'bare ground', 'water', 'other']

In [49]:
# create non-vascular
pft_fcover['non-vascular'] = pft_fcover['bryophyte'] + pft_fcover['lichen']

In [50]:
# create standard columns
pft_fcover = spf.add_standard_cols(pft_fcover, pfts)
pft_fcover = pft_fcover[pfts]
pft_fcover.head(3)

Unnamed: 0_level_0,deciduous shrub,deciduous tree,evergreen shrub,evergreen tree,forb,graminoid,non-vascular,bryophyte,lichen,litter,bare ground,water,other
plotName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
BARR_001.basePlot.div.31.1.1,0.0,,0.0,,6.5,8.0,97.5,97.0,0.5,6.5,0.0,0.0,12.5
BARR_001.basePlot.div.32.2.1,9.0,,0.0,,1.5,9.0,97.0,96.0,1.0,8.5,0.0,0.0,8.0
BARR_001.basePlot.div.32.4.1,0.0,,0.0,,0.0,13.5,15.0,15.0,0.0,78.0,0.5,1.0,9.5


---
## 2.4. Standardize auxiliary columns <a name="aux"></a>

Plot information:
- [ ] UID
- [x] plotName
- [x] dataSource
- [x] dataSubsource
- [x] fcoverScale
- [x] surveyMethod
- [x] surveyPurpose
- [x] duplicatedCoords
- [x] duplicatedDate
- [x] surveyYear
- [x] surveyMonth
- [x] surveyDay

Geographical information:
- [x] bioclimateSubzone
- [x] adminCountry
- [x] adminUnit
- [x] fireYears
- [x] coordEpsg
- [x] georefSource
- [x] georefAccuracy
- [x] plotRadius
- [x] longitudeX
- [x] latitudeY

In [51]:
# load auxiliary data
aux = neon.copy()
# sampling methods, coverscale, data source
methods = pd.read_csv('../etc/sampling_methods.csv')

### 2.4.1. Basic plot information <a name="basics"></a>

In [52]:
def get_subsource(row):
    """Return the data subsource label for a NEON plot name.

    Parameters
    ----------
    row : str
        Plot name; the site is identified by the substring 'TOOL' or 'BARR'
        (checked in that order).

    Returns
    -------
    str or float
        'Toolik Field Station NEON' or 'Utqiagvik-Barrow NEON'; NaN when the
        name matches neither site or is not a string (e.g. a missing name).
    """
    # a missing plot name is a float NaN, on which `in` would raise TypeError
    if not isinstance(row, str):
        return np.nan
    if 'TOOL' in row:
        return 'Toolik Field Station NEON'
    if 'BARR' in row:
        return 'Utqiagvik-Barrow NEON'
    return np.nan

In [53]:
# Parse the survey end date once and reuse its datetime accessor for the date parts.
aux['surveyDate'] = pd.to_datetime(aux['endDate'], format='mixed')
survey_date = aux['surveyDate'].dt

# One row per record in `aux` (same index); scalar entries are repeated on every row.
new_aux = pd.DataFrame({
    # date columns
    'surveyYear': survey_date.year.astype(int),
    'surveyMonth': survey_date.month.astype(int),
    'surveyDay': survey_date.day.astype(int),
    # plot size
    'plotRadius': 1,
    # geographical information
    'latitudeY': aux['subplot_lat'].astype('Float32'),
    'longitudeX': aux['subplot_lon'].astype('Float32'),
    'georefSource': 'GPS',
    'georefAccuracy': 0.25,
    'coordEPSG': 'EPSG:4326',
    # main
    'plotName': aux['name'],
})

# Derive the subsource from the plot name and pull in the matching columns
# from the sampling-methods table.
new_aux['dataSubsource'] = new_aux['plotName'].apply(get_subsource)
new_aux = new_aux.merge(methods, how='left', on='dataSubsource')

# Collapse to one row per plot (first non-null value of each column) and index by
# plot name; `plotName` is deliberately kept as a regular column as well.
new_aux = new_aux.groupby('plotName')[new_aux.columns].first()
new_aux = new_aux.set_index(new_aux['plotName'])

### 2.4.2. Geographical intersections <a name="geo"></a>

In [54]:
# set geo info
geo_paths = ['../etc/gaul1/gaul1_asap.shp',
             '../etc/fire/InterAgencyFirePerimeterHistory_All_Years_View.shp',
             '../etc/bioclim/bioclimate_la_latlon.shp']
geo_names = ['gaul1',
             'fire',
             'bioclim']
geo_cols = [['name1', 'name0', 'geometry'],
            ['FIRE_YEAR', 'geometry'],
            ['zone', 'geometry']]

In [55]:
# Coordinate reference systems: plot coordinates are stored in WGS84, while the
# intersections are run in the projected CRS below.
fcover_epsg = 'EPSG:4326' # wgs84
intersect_epsg = 'EPSG:5936' # alaska projected datum

# Combine cover and auxiliary columns for plots present in both tables
fcover_and_aux = pd.concat([pft_fcover, new_aux], join='inner', axis=1)

# Build point geometries from the plot coordinates, then reproject for intersecting
plot_points = gpd.points_from_xy(fcover_and_aux['longitudeX'], fcover_and_aux['latitudeY'])
fcover_and_aux = gpd.GeoDataFrame(fcover_and_aux, geometry=plot_points, crs=fcover_epsg)
fcover_and_aux = fcover_and_aux.to_crs(intersect_epsg)

# Add the attributes of the intersecting admin, fire and bioclimate layers
fcover_and_aux = spf.add_geospatial_aux(fcover_and_aux, geo_paths, geo_names, geo_cols, intersect_epsg)

# Drop unnecessary columns: the per-layer index columns and the duplicate plotName
# column (the plot name remains available as the index)
helper_columns = ['index_gaul1', 'index_fire', 'index_bioclim', 'plotName']
fcover_and_aux = fcover_and_aux.drop(columns=helper_columns)

In [56]:
# Map the source-layer attribute names onto the standardized auxiliary column names
geo_column_names = {
    'name1': 'adminUnit',
    'name0': 'adminCountry',
    'FIRE_YEAR': 'fireYears',
    'zone': 'bioclimSubzone',
}
fcover_and_aux.rename(columns=geo_column_names, inplace=True)

### 2.4.3. Duplicate coordinates <a name="dupes"></a>
Duplicate coordinates indicate that a plot may have been revisited, that two plots were too close together to be separated at the recorded GPS accuracy, or that there was a human error.

In [57]:
# duplicate coord column: start empty (NaN) and let the helper fill it in
coords = ['longitudeX', 'latitudeY']
fcover_and_aux['duplicatedCoords'] = np.nan
fcover_and_aux = spf.find_duplicates(fcover_and_aux, coords, 'duplicatedCoords')

# duplicate date column
# (the list is called `date_cols`, not `date`, so it does not overwrite the
# `date` class imported from `datetime` at the top of the notebook)
date_cols = ['surveyYear', 'surveyMonth', 'surveyDay']
fcover_and_aux['duplicatedDate'] = np.nan
fcover_and_aux = spf.find_duplicates(fcover_and_aux, date_cols, 'duplicatedDate')

no duplicates found
duplicates found


---
# 3. Export <a name="export"></a>

In [58]:
# clean up
# Standard PFT name -> exported cover column name. Kept under its own name rather
# than rebinding `pfts`, which is the *list* of PFT names the table-layout cell
# relies on; overwriting it with a dict would break re-running that cell.
pft_cover_names = {'deciduous shrub':'deciduousShrubCover',
                   'evergreen shrub':'evergreenShrubCover',
                   'deciduous tree':'deciduousTreeCover',
                   'evergreen tree':'evergreenTreeCover',
                   'forb':'forbCover',
                   'graminoid':'graminoidCover',
                   'non-vascular':'nonvascularSumCover',
                   'bryophyte':'bryophyteCover',
                   'lichen':'lichenCover',
                   'litter':'litterCover',
                   'bare ground':'baregroundCover',
                   'water':'waterCover',
                   'other':'otherCover'}

# rename columns and replace NaN cover with 0
fcover_and_aux.rename(columns=pft_cover_names, inplace=True)
fcover_cols = list(pft_cover_names.values())
fcover_and_aux[fcover_cols] = fcover_and_aux[fcover_cols].fillna(0.0)

# reproject back to WGS84 and set cover data type
fcover_and_aux = fcover_and_aux.to_crs('EPSG:4326')
fcover_and_aux[fcover_cols] = fcover_and_aux[fcover_cols].astype(np.float32)
fcover_and_aux.index.name = 'plotName'

In [59]:
fcover_and_aux.columns

Index(['deciduousShrubCover', 'deciduousTreeCover', 'evergreenShrubCover',
       'evergreenTreeCover', 'forbCover', 'graminoidCover',
       'nonvascularSumCover', 'bryophyteCover', 'lichenCover', 'litterCover',
       'baregroundCover', 'waterCover', 'otherCover', 'surveyYear',
       'surveyMonth', 'surveyDay', 'plotRadius', 'latitudeY', 'longitudeX',
       'georefSource', 'georefAccuracy', 'coordEPSG', 'dataSubsource',
       'dataSource', 'surveyMethod', 'fcoverScale', 'surveyPurpose',
       'geometry', 'adminUnit', 'adminCountry', 'fireYears', 'bioclimSubzone',
       'duplicatedCoords', 'duplicatedDate'],
      dtype='object')

In [60]:
# Split the combined table by column name: any column whose name contains 'Cover'
# is a PFT cover column, everything else is plot information.
covercols, auxcols = [], []
for col in fcover_and_aux.columns:
    if 'Cover' in col:
        covercols.append(col)
    else:
        auxcols.append(col)

# Export cover data
pft_fcover = fcover_and_aux[covercols]
pft_fcover.to_csv(f'output_data/{source}_standard_pft_fcover.csv')

# Export aux data
pft_aux = fcover_and_aux[auxcols]
pft_aux.to_csv(f'output_data/{source}_plot_info.csv')