# NGA Standardization Notebook
---
---

# Table of contents
1. [PRE-PROCESSING](#preprocess)
    1. [Load fcover data](#loadfcover)
    2. [Join fcover to species checklist table](#joinchecklist)
    3. [Join fcover to leaf retention table](#joinleafretention)
    4. [Export species-PFT tables](#exporthabits)
    5. [Clean exported species-PFT tables](#cleanhabits)
2. [STANDARDIZATION](#standardization)
    1. [Standardize PFT names](#pftnames)
    2. [Sum species fcover to PFT-level](#sumfcover)
    3. [Standardize table layout](#tablelayout)
    4. [Standardize auxiliary columns](#aux)
        1. [Basic plot information](#basics)
        2. [Geographical intersections](#geo)
        3. [Duplicate coordinates](#dupes)
3. [EXPORT](#export)

# Load required packages

In [42]:
import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import date, timedelta
from pyogrio import read_dataframe
import glob
import geopandas as gpd
import sys
import os
import chardet
import tarfile
from urllib.request import urlretrieve
import regex as re
from shapely.validation import make_valid

In [43]:
# import local functions
current_dir = os.path.abspath('..')
if current_dir not in sys.path:
    sys.path.append(current_dir)
import standardize_pft_funcs as spf

---
---
# 1. Pre-processing <a name="preprocess"></a>
After visually inspecting each species-level fcover table for obvious errors, we saved the "cleaned" tables to a folder that we then standardized here.

---
## 1.1. Load fcover data <a name="loadfcover"></a>

In [44]:
source = 'nga'

In [45]:
# get paths to tables
species_csv_paths = sorted(glob.glob('input_data/nga_cover_tables/*_spp.csv'))
print(species_csv_paths)

['input_data/nga_cover_tables/ngee_arctic_seward_peninsula_council_perc_spp.csv', 'input_data/nga_cover_tables/ngee_arctic_seward_peninsula_kougarok_perc_spp.csv', 'input_data/nga_cover_tables/ngee_arctic_seward_peninsula_teller_perc_spp.csv']


In [46]:
# read data from 3 sites
# load tables in pandas; not all tables have known encoding
fcover_dfs = []
for path in species_csv_paths:
    with open(path, 'rb') as file:
        # dynamically detect file encoding because it is not consistent
        result = chardet.detect(file.read())
        df = pd.read_csv(path, encoding=result['encoding'],
                         skiprows=[1],
                         na_values=[-9, -9.0, '-9', '-9.0', 'None', 'NONE', None])
    fcover_dfs.append(df)
fcover_dfs[0].head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,ooo,ppp,qqq,rrr,sss,ttt,uuu,...,nnnn,oooo,pppp,qqqq,rrrr,ssss,tttt,uuuu,vvvv,wwww
0,Dicranum groenlandicum,Dicranum groenlandicum,Dicranum groenlandicum,,,,,,,,...,1.0,1.0,3.0,0.0,0.0,,,,,
1,Orthocaulis attenuates,Orthocaulis attenuates,Orthocaulis attenuates,,,,,,,,...,,,,,,,,,,
2,Aulacomnium palustre var. imbricatum,Aulacomnium palustre var. imbricatum,Aulacomnium palustre var. imbricatum,,,,,,,,...,0.0,3.0,2.0,0.0,3.0,,,,,


In [47]:
# get unique species names from ALL tables (not just post-2000)
species_names = []
for df in fcover_dfs: 
    names = df.iloc[:, 0] # first col without author
    for name in names:
        if pd.notnull(name):
            name = name.strip('[]')
            name = name.strip()
            species_names.append(name)
u_species_names = list(set(species_names))
print(len(u_species_names))

# remove null species names
u_species_names = [x for x in u_species_names if pd.notnull(x)]
print(len(u_species_names))
species_names_df = pd.DataFrame(u_species_names, columns=['datasetSpeciesName'])

307
307


---
## 1.2. Join fcover to species checklist table <a name="joinchecklist"></a>

In [48]:
# load species checklist and prep for joining
checklist_path = '../etc/akveg_species_checklist.csv'
checklist_df = spf.checklist_df(checklist_path)
checklist_df.head(3)

Unnamed: 0,nameCode,checklistSpeciesName,nameStatus,nameAccepted,nameFamily,acceptedNameSource,nameLevel,speciesForm,speciesHabit
0,abroth,Abrothallus De Not.,accepted,Abrothallus De Not.,Abrothallaceae,CNALH,genus,lichen,lichen
1,abrpar,Abrothallus parmeliarum (Sommerf.) Arnold,accepted,Abrothallus parmeliarum (Sommerf.) Arnold,Abrothallaceae,CNALH,species,lichen,lichen
2,abrpey,Abrothallus peyritschii (Stein) Kotte,accepted,Abrothallus peyritschii (Stein) Kotte,Abrothallaceae,Dillman et al. 2012,species,lichen,lichen


In [49]:
species_names_df.head(3)

Unnamed: 0,datasetSpeciesName
0,Unknown forb
1,Mertensia paniculata
2,Sciuro-hypnum reflexum


In [50]:
# get first 2 words (genus-species) from checklist accepted name and data species name
checklist_df['joinKey'] = checklist_df['checklistSpeciesName'].apply(spf.get_substrings)
species_names_df['joinKey'] = species_names_df['datasetSpeciesName'].apply(spf.get_substrings)
habits = spf.join_to_checklist(unique_species=species_names_df, 
                               checklist=checklist_df, 
                               u_name='datasetSpeciesName', 
                               c_unofficial_name='checklistSpeciesName', 
                               c_official_name='nameAccepted', 
                               mapping_name='joinKey',
                               habit='speciesHabit')
habits.head(3)

117 species are missing habits.
69 species still missing habits.
23 species still missing habits.
22 species still missing habits.


Unnamed: 0_level_0,datasetSpeciesName,joinKey,speciesHabit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Unknown forb,forb,
1,Mertensia paniculata,Mertensia paniculata,forb
2,Sciuro-hypnum reflexum,Sciuro-hypnum reflexum,moss


In [51]:
species_names_df.head(3)

Unnamed: 0,datasetSpeciesName,joinKey
0,Unknown forb,forb
1,Mertensia paniculata,Mertensia paniculata
2,Sciuro-hypnum reflexum,Sciuro-hypnum reflexum


In [52]:
habits = species_names_df.merge(habits, how='left', left_on='datasetSpeciesName', right_on='datasetSpeciesName', suffixes=(None, '_1'))
habits = habits[['joinKey', 'datasetSpeciesName', 'speciesHabit']]

---
## 1.3. Join fcover to leaf retention table <a name="joinleafretention"></a>

In [53]:
# load leaf retention table and clean
path = '../etc/macander_leaf_retention.csv'
leaf_df = spf.leaf_retention_df(path)
leaf_df.head(3)

Unnamed: 0,leafRetention,retentionSpeciesName
0,deciduous,Salix
1,deciduous,Betula nana
2,deciduous,Vaccinium uliginosum


In [54]:
# add leaf retention column
habits_wleaf = spf.add_leaf_retention(habits, leaf_df, 'leafRetention')
habits_wleaf = habits_wleaf[['datasetSpeciesName', 'joinKey', 'speciesHabit', 'leafRetention']]
habits_wleaf.head(5)

Unnamed: 0,datasetSpeciesName,joinKey,speciesHabit,leafRetention
0,Unknown forb,forb,,
1,Mertensia paniculata,Mertensia paniculata,forb,
2,Sciuro-hypnum reflexum,Sciuro-hypnum reflexum,moss,
3,Plagiothecium laetum,Plagiothecium laetum,moss,
4,Alnus viridis s. fruticosa,Alnus viridis,shrub,deciduous


In [55]:
# apply function
habits_wleaf['speciesHabit'] = habits_wleaf['speciesHabit'].apply(spf.clean_shrub_habits)
list(habits_wleaf['speciesHabit'].unique())

[nan,
 'forb',
 'moss',
 'shrub',
 'lichen',
 'liverwort',
 'graminoid',
 'spore-bearing',
 'coniferous tree']

---
## 1.4. Export species-PFT tables <a name="exporthabits"></a>

In [56]:
# generate 3 files and dataframe of habit types
shrubs, nonshrubs, null = spf.export_habit_files(habits_wleaf,
                                                 'temp_data', 
                                                 source, 
                                                 'speciesHabit')

In [57]:
# check to make sure the files make sense
len(habits_wleaf) == (len(null) + len(nonshrubs) + len(shrubs))

True

---
## 1.5. Clean exported species-PFT tables <a name="cleanhabits"></a>
Here, we manually assessed each file. The nonshrubs were usually correct and did not have to be adjusted. Some shrubs were missing leaf retention information, so we consulted with arctic ecologists to determine it. For unknown (null) habits, we consulted arctic ecology expert, Amy Breen, to determine the issue and correct habit. After the tables were complete, all species names had a single associated habit.
- null habits
    - assign missing habits and consult ecologist when needed
- non-shrubs
    - assign leaf habit to all trees (ctrl-f to find all trees in excel)
        - coniferous tree = evergreen
        - deciduous tree  = deciduous
    - remove leaf habit if misapplied
- shrubs
    - check current leaf habits and correct if necessary
    - fill empty leaf habits
    - choose leaf habit if both evergreen & deciduous listed
    - add Stature column (dwarf shrub or not)

---
---
# 2. Standardization <a name="standardization"></a>

---
## 2.1. Standardize PFT names <a name="pftnames"></a>
deciduous shrub, evergreen shrub, graminoid, forb, litter, non-vascular (lichen, brophyte)

In [58]:
# paths to visually assessed and cleaned habit files
all_habit_types = [f'temp_data/{source}_nonshrubs_01.csv',
                   f'temp_data/{source}_shrubs_01.csv',
                   f'temp_data/{source}_nullhabit_01.csv']

# create list of habit dataframes
habit_dfs = []
for path in all_habit_types:
    habit_df = pd.read_csv(path, index_col=0)
    habit_dfs.append(habit_df)
all_habits = pd.concat(habit_dfs)

In [59]:
# show unique habit names we're working with
all_habits['speciesHabit'].unique().tolist()

['forb',
 'lichen',
 'graminoid',
 'moss',
 'liverwort',
 'spore-bearing',
 'coniferous tree',
 'shrub',
 'bryophyte',
 'bare rock',
 'bare soil',
 'bare water',
 'litter']

In [60]:
# ava-specific habit name standardization function
def standardize_habit(habitrow, leafrow):
    
    habitrow = habitrow.to_numpy()
    leafrow = leafrow.to_numpy()
    new_row = []
    
    for habit, leaf in zip(habitrow, leafrow):
        
        habit = habit.lower()
        # non-shrubs
        if 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte'
        elif 'spore-bearing' in habit:
            new_habit = 'forb'
            
        # shrubs and trees
        elif 'shrub' in habit:
            new_habit = f'{leaf} shrub'
        elif 'tree' in habit:
            new_habit = f'{leaf} tree'
            
        # etc
        elif 'soil' in habit or 'rock' in habit:
            new_habit = 'bare ground'
        elif 'water' in habit:
            new_habit = 'water'

        else:
            new_habit = f'{habit}'
        new_row.append(new_habit)
    
    return new_row

In [61]:
# standardized PFT
standard_habit_col = standardize_habit(all_habits['speciesHabit'], 
                                       all_habits['leafRetention'])
all_habits['standardHabit'] = standard_habit_col
all_habits.rename(columns={'speciesHabit':'nonstandardHabit'}, inplace=True)
print(len(all_habits))
all_habits.head(3)

307


Unnamed: 0,datasetSpeciesName,joinKey,nonstandardHabit,leafRetention,speciesStature,standardHabit
244,Achillea sibirica,Achillea sibirica,forb,,,forb
236,Aconitum delphiniifolium,Aconitum delphiniifolium,forb,,,forb
96,Alectoria ochroleuca,Alectoria ochroleuca,lichen,,,lichen


In [62]:
# add accepted name and standard habit to species level fcover
all_habits = all_habits[['datasetSpeciesName', 'standardHabit', 'nonstandardHabit']]
fcover = pd.concat(fcover_dfs)
fcover = fcover.drop(columns=['Unnamed: 1', 'Unnamed: 2'])
fcover = fcover.rename(columns={'Unnamed: 0': 'datasetSpeciesName'})
species_fcover = fcover.merge(all_habits, 
                               left_on='datasetSpeciesName', 
                               right_on='datasetSpeciesName')
species_fcover.head(3)

Unnamed: 0,datasetSpeciesName,ooo,ppp,qqq,rrr,sss,ttt,uuu,vvv,www,...,cc,dd,ee,ff,gg,hh,ii,jj,standardHabit,nonstandardHabit
0,Dicranum groenlandicum,,,,,,,,,,...,,,,,,,,,bryophyte,moss
1,Orthocaulis attenuates,,,,,,,,,,...,,,,,,,,,bryophyte,liverwort
2,Aulacomnium palustre var. imbricatum,,,,,,,,,,...,,,,,,,,,bryophyte,moss


In [63]:
# Use pd.melt to unpivot the dataframe
melted_df = pd.melt(species_fcover, 
                    id_vars=['datasetSpeciesName', 'standardHabit', 'nonstandardHabit'], 
                    var_name='plotVisit', 
                    value_name='percentCover')
species_fcover = melted_df[['plotVisit', 'datasetSpeciesName', 'standardHabit', 
                            'nonstandardHabit', 'percentCover']]
species_fcover.head(3)

Unnamed: 0,plotVisit,datasetSpeciesName,standardHabit,nonstandardHabit,percentCover
0,ooo,Dicranum groenlandicum,bryophyte,moss,
1,ooo,Orthocaulis attenuates,bryophyte,liverwort,
2,ooo,Aulacomnium palustre var. imbricatum,bryophyte,moss,


In [64]:
species_fcover.to_csv(f'output_data/{source}_nonstandard_species_fcover.csv', index=False)

---
## 2.2. Sum species fcover to PFT-level <a name="sumfcover"></a>

In [65]:
# ava-specific function to clean fcover values
from IPython.display import display, clear_output
import time
def get_fcover_percent(dfs, all_habits, habit_col, species_col):
    
    pft_fcover_tables = []
    fcover_tables = []
    
    # loop through each fcover table
    for df in dfs:

        # rename columns
        cols = df.columns.to_numpy()
        cols[0] = 'speciesNameNoAuth'
        cols[1] = 'speciesNameAccepted'
        cols[2] = 'speciesName'
        df.columns = cols

        # get list of plot IDs
        plot_id_cols = df.columns.to_list()[3:]

        # get associated habit for each species
        df = df.merge(all_habits, left_on='speciesNameNoAuth', right_on=species_col)
        df = df.astype(str)

        # clean % data
        fcover = df.drop(columns=['speciesNameNoAuth', 'speciesNameAccepted', 'speciesName', 
                                  'nonstandardHabit', species_col])
        fcover = fcover.loc[:, fcover.columns != habit_col].replace(' ', np.nan, regex=True)
        fcover = fcover.loc[:, fcover.columns != habit_col].replace('', np.nan, regex=True)
        fcover = fcover.loc[:, fcover.columns != habit_col].replace('\*', np.nan, regex=True)
        fcover = fcover.loc[:, fcover.columns != habit_col].astype(float)

        # add back the standard habit column
        fcover = fcover.copy()
        fcover[habit_col] = df[habit_col] # replace codes with new fcover %
        
        # create species-level fcover table
        species_fcover = fcover.copy()
        # add species and leaf col
        species_fcover[species_col] = df[species_col]
        species_fcover[species_col] = species_fcover[species_col].fillna(species_fcover[habit_col])
        # restructure table
        species_fcover = species_fcover.melt(id_vars=[species_col, habit_col], var_name='plotVisit', value_name='speciesCover')
        fcover_tables.append(species_fcover)

        # loop through each plot and sum fcover per pft
        pft_fcover = fcover.groupby(by=[habit_col]).sum().T
        pft_fcover_tables.append(pft_fcover)
        
    return fcover_tables, pft_fcover_tables

In [66]:
# this section is working on ALL plot data
fcover_dfs, pft_fcover_dfs = get_fcover_percent(fcover_dfs, all_habits, 'standardHabit', 'datasetSpeciesName')

In [67]:
pft_fcover = pd.concat(pft_fcover_dfs, axis=0)
pft_fcover.index.name = 'plotVisit'
pft_fcover.index = pft_fcover.index.astype(str)
pft_fcover.columns.name = None
pft_fcover.head(3)

Unnamed: 0_level_0,bare ground,bryophyte,deciduous shrub,evergreen shrub,evergreen tree,forb,graminoid,lichen,litter,water
plotVisit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ooo,1.0,9.0,15.0,51.0,0.0,0.0,3.0,43.0,2.0,0.0
ppp,3.0,5.0,6.0,42.0,0.0,2.0,4.0,77.0,2.0,0.0
qqq,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
len(pft_fcover.index.unique())

101

---
## 2.3. Standardize table layout <a name="tablelayout"></a>

In [69]:
# list of necessary PFTs
pfts = ['deciduous shrub', 'deciduous tree',
        'evergreen shrub', 'evergreen tree',
        'forb', 'graminoid', 'non-vascular',
        'bryophyte', 'lichen', 'litter',
        'bare ground', 'water', 'other']

In [70]:
# create non-vascular
pft_fcover['non-vascular'] = pft_fcover[['bryophyte', 'lichen']].sum(axis=1, skipna=True)

In [71]:
# create standard columns
pft_fcover = spf.add_standard_cols(pft_fcover, pfts)
pft_fcover = pft_fcover[pfts]
pft_fcover.head(3)

Unnamed: 0_level_0,deciduous shrub,deciduous tree,evergreen shrub,evergreen tree,forb,graminoid,non-vascular,bryophyte,lichen,litter,bare ground,water,other
plotVisit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ooo,15.0,,51.0,0.0,0.0,3.0,52.0,9.0,43.0,2.0,1.0,0.0,
ppp,6.0,,42.0,0.0,2.0,4.0,82.0,5.0,77.0,2.0,3.0,0.0,
qqq,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


---
## 2.4. Standardize auxiliary columns <a name="aux"></a>

Plot information:
- [ ] UID
- [x] plotName
- [x] dataSource
- [x] dataSubsource
- [x] fcoverScale
- [x] surveyMethod
- [x] surveyPurpose
- [x] duplicatedCoords
- [x] duplicatedDate
- [x] surveyYear
- [x] surveyMonth
- [x] surveyDay

Geographical information:
- [x] bioclimateSubzone
- [x] adminCountry
- [x] adminUnit
- [x] fireYears
- [x] coordEpsg
- [x] georefSource
- [x] georefAccuracy
- [x] plotRadius
- [x] longitueX
- [x] latitudeY

In [72]:
# load auxiliary data
aux = pd.read_csv(f'input_data/{source}_ancillary.csv')
# sampling methods, coverscale, data source
methods = pd.read_csv('../etc/sampling_methods.csv')

In [73]:
aux.columns

Index(['TID', 'Site Code', 'plot_radius_m', 'date', 'latitude', 'longitude',
       'year', 'source', 'dataset'],
      dtype='object')

### 2.4.1. Basic plot information <a name="basics"></a>

In [74]:
new_aux = pd.DataFrame()

# date columns
aux['date'] = (aux['date'].fillna(-1).astype(int)).astype(str)
aux['surveyDate'] = pd.to_datetime(aux['date'], format='mixed', errors='coerce')
new_aux['surveyYear'] = aux['surveyDate'].dt.year
new_aux['surveyMonth'] = aux['surveyDate'].dt.month
new_aux['surveyDay'] = aux['surveyDate'].dt.day

# plot size
new_aux['plotArea'] = aux['plot_radius_m'].apply(lambda r: np.pi * (float(r)**2) if pd.notna(r) else np.NaN)
new_aux['plotShape'] = 'circle'

# geographical information
new_aux['latitudeY'] = aux['latitude']
new_aux['longitudeX'] = aux['longitude']
new_aux['georefSource'] = 'GPS'
new_aux['georefAccuracy'] = 0.25
new_aux['coordEPSG'] = 'EPSG:4326'

# main
new_aux['plotName'] = aux['Site Code']
new_aux['plotVisit'] = aux['TID']
new_aux['dataSubsource'] = 'NGEE-Arctic Amy Breen (pub 2020)'
new_aux = new_aux.merge(methods, how='left', left_on='dataSubsource', right_on='dataSubsource')
new_aux['dataSource'] = 'NGA'
new_aux = new_aux.set_index(new_aux['plotVisit'])
new_aux

Unnamed: 0_level_0,surveyYear,surveyMonth,surveyDay,plotArea,plotShape,latitudeY,longitudeX,georefSource,georefAccuracy,coordEPSG,plotName,plotVisit,dataSubsource,dataSource,dataSourceHtml,dataSubsourceCitation,surveyMethod,fcoverScale,surveyPurpose
plotVisit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
kk,2016.0,7.0,23.0,19.634954,circle,65.164850,-164.823195,GPS,0.25,EPSG:4326,KG_AS1_VgComp,kk,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...
ll,2016.0,7.0,23.0,19.634954,circle,65.158620,-164.822328,GPS,0.25,EPSG:4326,KG_AS2_VgComp,ll,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...
mm,2016.0,7.0,23.0,19.634954,circle,65.161775,-164.831645,GPS,0.25,EPSG:4326,KG_AS3_VgComp,mm,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...
nn,2016.0,7.0,23.0,19.634954,circle,65.164293,-164.823757,GPS,0.25,EPSG:4326,KG_AS4_VgComp,nn,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...
oo,2016.0,7.0,23.0,19.634954,circle,65.157525,-164.825048,GPS,0.25,EPSG:4326,KG_AS5_VgComp,oo,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ssss,2018.0,8.0,1.0,4.908739,circle,64.859265,-163.690300,GPS,0.25,EPSG:4326,CL_WAC-5-1_Vg_Comp,ssss,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...
tttt,2018.0,8.0,1.0,4.908739,circle,64.860260,-163.697757,GPS,0.25,EPSG:4326,CL_WAC2_Vg_Comp,tttt,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...
uuuu,2018.0,8.0,1.0,4.908739,circle,64.856472,-163.689893,GPS,0.25,EPSG:4326,CL_WAC3_Vg_Comp,uuuu,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...
vvvv,2018.0,8.0,1.0,4.908739,circle,64.859385,-163.703835,GPS,0.25,EPSG:4326,CL_WAC4_Vg_Comp,vvvv,NGEE-Arctic Amy Breen (pub 2020),NGA,https://data.ngee-arctic.ornl.gov/data/pages/N...,"Breen, A. et al. NGEE Arctic plant traits: pla...",simple plot,percent,Improve predictability of carbon-rich Arctic s...


### 2.4.2. Geographical intersections <a name="geo"></a>

In [75]:
# set geo info
geo_paths = ['../etc/gaul1/gaul1_asap.shp',
             '../etc/fire/InterAgencyFirePerimeterHistory_All_Years_View.shp',
             '../etc/bioclim/bioclimate_la_latlon.shp']
geo_names = ['gaul1',
             'fire',
             'bioclim']
geo_cols = [['name1', 'name0', 'geometry'],
            ['FIRE_YEAR', 'geometry'],
            ['zone', 'geometry']]

In [76]:
# create geospatial information
fcover_epsg = 'EPSG:4326' # wgs 84
intersect_epsg = 'EPSG:5936' # alaska projected datum
fcover_and_aux = pd.concat([pft_fcover, new_aux], join='inner', axis=1)
fcover_and_aux = gpd.GeoDataFrame(fcover_and_aux, 
                                  geometry=gpd.points_from_xy(fcover_and_aux['longitudeX'],
                                                              fcover_and_aux['latitudeY']),
                                  crs=fcover_epsg)
fcover_and_aux = fcover_and_aux.to_crs(intersect_epsg)

# add new geospatial intersections
fcover_and_aux = spf.add_geospatial_aux(fcover_and_aux, 
                                        geo_paths, 
                                        geo_names, 
                                        geo_cols,
                                        intersect_epsg)

# drop unneccesary columns
fcover_and_aux = fcover_and_aux.drop(columns=['index_gaul1', 
                                              'index_fire', 
                                              'index_bioclim', 
                                              'plotVisit'])

In [77]:
# rename new columns
fcover_and_aux.rename(columns={'name1':'adminUnit',
                               'name0':'adminCountry',
                               'FIRE_YEAR':'fireYears',
                               'zone':'bioclimSubzone'}, inplace=True)

In [78]:
fcover_and_aux.columns

Index(['deciduous shrub', 'deciduous tree', 'evergreen shrub',
       'evergreen tree', 'forb', 'graminoid', 'non-vascular', 'bryophyte',
       'lichen', 'litter', 'bare ground', 'water', 'other', 'surveyYear',
       'surveyMonth', 'surveyDay', 'plotArea', 'plotShape', 'latitudeY',
       'longitudeX', 'georefSource', 'georefAccuracy', 'coordEPSG', 'plotName',
       'dataSubsource', 'dataSource', 'dataSourceHtml',
       'dataSubsourceCitation', 'surveyMethod', 'fcoverScale', 'surveyPurpose',
       'geometry', 'adminUnit', 'adminCountry', 'fireYears', 'bioclimSubzone'],
      dtype='object')

### 2.4.3. Duplicate coordinates <a name="dupes"></a>
This indicates that plots may have been revisited, they were very close and GPS accuracy was too low, or there is human error

In [79]:
# find duplicates
fcover_and_aux = spf.find_duplicates(fcover_and_aux, ['longitudeX', 'latitudeY'], 'duplicatedCoords')
fcover_and_aux = spf.find_duplicates(fcover_and_aux, ['surveyYear', 'surveyMonth', 'surveyDay'], 'duplicatedDate')

  group_ids = df.groupby(subset, sort=False).grouper.group_info[0]
  group_ids = df.groupby(subset, sort=False).grouper.group_info[0]


---
# 3. Export <a name="export"></a>

In [80]:
# clean up
pfts = {'deciduous shrub':'deciduousShrubCover',
        'evergreen shrub':'evergreenShrubCover',
        'deciduous tree':'deciduousTreeCover',
        'evergreen tree':'evergreenTreeCover',
        'forb':'forbCover',
        'graminoid':'graminoidCover',
        'non-vascular':'nonvascularSumCover',
        'bryophyte':'bryophyteCover',
        'lichen':'lichenCover',
        'litter':'litterCover',
        'bare ground':'baregroundCover',
        'water':'waterCover',
        'other':'otherCover'}

# rename columns
fcover_and_aux.rename(columns=pfts, inplace=True)
fcover_cols = list(pfts.values())

# reproject and set cover data type
fcover_and_aux = fcover_and_aux.to_crs('EPSG:4326')
fcover_and_aux[fcover_cols] = fcover_and_aux[fcover_cols].astype(np.float32)
fcover_and_aux.index.name = 'plotVisit'

In [81]:
fcover_and_aux.columns

Index(['deciduousShrubCover', 'deciduousTreeCover', 'evergreenShrubCover',
       'evergreenTreeCover', 'forbCover', 'graminoidCover',
       'nonvascularSumCover', 'bryophyteCover', 'lichenCover', 'litterCover',
       'baregroundCover', 'waterCover', 'otherCover', 'surveyYear',
       'surveyMonth', 'surveyDay', 'plotArea', 'plotShape', 'latitudeY',
       'longitudeX', 'georefSource', 'georefAccuracy', 'coordEPSG', 'plotName',
       'dataSubsource', 'dataSource', 'dataSourceHtml',
       'dataSubsourceCitation', 'surveyMethod', 'fcoverScale', 'surveyPurpose',
       'geometry', 'adminUnit', 'adminCountry', 'fireYears', 'bioclimSubzone',
       'duplicatedCoords', 'duplicatedDate'],
      dtype='object')

In [82]:
# Export cover data
covercols = [col for col in fcover_and_aux.columns if 'Cover' in col]
auxcols = [col for col in fcover_and_aux.columns if 'Cover' not in col]
pft_fcover = fcover_and_aux[covercols]
pft_fcover.to_csv(f'output_data/{source}_standard_pft_fcover.csv', encoding='utf-8-sig')

# Export aux data
pft_aux = fcover_and_aux[auxcols]
pft_aux.to_csv(f'output_data/{source}_plot_info.csv', encoding='utf-8-sig')