# AKVEG Standardization Notebook
---
---

# Table of contents
1. [PRE-PROCESSING](#preprocess)
    1. [Load fcover data](#loadfcover)
    2. [Join fcover to species checklist table](#joinchecklist)
    3. [Join fcover to leaf retention table](#joinleafretention)
    4. [Export species-PFT tables](#exporthabits)
    5. [Clean exported species-PFT tables](#cleanhabits)
2. [STANDARDIZATION](#standardization)
    1. [Standardize PFT names](#pftnames)
    2. [Sum species fcover to PFT-level](#sumfcover)
    3. [Standardize table layout](#tablelayout)
    4. [Standardize auxiliary columns](#aux)
        1. [Basic plot information](#basics)
        2. [Geographical intersections](#geo)
        3. [Duplicate coordinates](#dupes)
3. [EXPORT](#export)

# Load required packages

In [133]:
import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import date, timedelta
from pyogrio import read_dataframe
import glob
import geopandas as gpd
import sys
import os
import chardet
import tarfile
from urllib.request import urlretrieve
import regex as re
from shapely.validation import make_valid

In [134]:
# import local functions
current_dir = os.path.abspath('..')
if current_dir not in sys.path:
    sys.path.append(current_dir)
import standardize_pft_funcs as spf

---
---
# 1. Pre-processing <a name="preprocess"></a>
After visually inspecting each species-level fcover table for obvious errors, we saved the "cleaned" tables to a folder. Those cleaned tables are standardized in this notebook.

---
## 1.1. Load fcover data <a name="loadfcover"></a>

In [135]:
source = 'akveg'

In [136]:
# read the species-level foliar cover table for this source
# (the first CSV column becomes the row index)
fcover_path = f'input_data/{source}_foliar_cover.csv'
fcover = pd.read_csv(fcover_path, index_col=0)
fcover.head(3)

Unnamed: 0_level_0,Project,Site Code,Date,Observer,Recorder,Cover Type,Accepted Name,Cover
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,AIM NPR-A,AB-1B,2012-08-01,Tina Boucher,,total cover,Arnica lessingii,0.0
2,AIM NPR-A,AB-1B,2012-08-01,Tina Boucher,,total cover,Artemisia arctica,0.0
3,AIM NPR-A,AB-1B,2012-08-01,Tina Boucher,,total cover,Cardamine bellidifolia,0.0


In [137]:
len(fcover['Accepted Name'].unique())

535

In [138]:
len(fcover['Site Code'].unique())

185

In [139]:
# 0 means a trace amount was found; convert to 0.05 like "r" in BB.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the column selection: the chained in-place
# form acts on an intermediate object, raises a FutureWarning in pandas 2.x
# and no longer updates `temp` once Copy-on-Write is enabled.
temp = fcover.copy()
temp['Cover'] = temp['Cover'].replace(0, 0.05)
# records without a cover value cannot contribute to the PFT sums
fcover = temp.dropna(subset=['Cover'])
# every column except the last one
aux = fcover.iloc[:, :-1]
# keep only the columns needed for the species -> PFT aggregation
fcover = fcover[['Site Code', 'Accepted Name', 'Cover']]

In [140]:
# keep only complete records whose cover value is not zero
complete_rows = fcover.dropna(how='any')
nonzero_cover = complete_rows.loc[complete_rows['Cover'].ne(0)]

In [141]:
# unique species names in the narrowed-down table, before and after
# discarding missing names (both counts are printed for comparison)
all_species_names = nonzero_cover['Accepted Name'].unique()
print(len(all_species_names))
u_species_names = [name for name in all_species_names if not pd.isnull(name)]
print(len(u_species_names))
species_names_df = pd.DataFrame({'datasetSpeciesName': u_species_names},
                                columns=['datasetSpeciesName'])

535
535


---
## 1.2. Join fcover to species checklist table <a name="joinchecklist"></a>

In [142]:
# load species checklist and prep for joining
checklist_path = f'../etc/akveg_species_checklist.csv'
checklist_df = spf.checklist_df(checklist_path)
checklist_df.head(3)

Unnamed: 0,nameCode,checklistSpeciesName,nameStatus,nameAccepted,nameFamily,acceptedNameSource,nameLevel,speciesForm,speciesHabit
0,abroth,Abrothallus De Not.,accepted,Abrothallus De Not.,Abrothallaceae,CNALH,genus,lichen,lichen
1,abrpar,Abrothallus parmeliarum (Sommerf.) Arnold,accepted,Abrothallus parmeliarum (Sommerf.) Arnold,Abrothallaceae,CNALH,species,lichen,lichen
2,abrpey,Abrothallus peyritschii (Stein) Kotte,accepted,Abrothallus peyritschii (Stein) Kotte,Abrothallaceae,Dillman et al. 2012,species,lichen,lichen


In [143]:
species_names_df.head(3)

Unnamed: 0,datasetSpeciesName
0,Arnica lessingii
1,Artemisia arctica
2,Cardamine bellidifolia


In [144]:
# get first 2 words (genus-species) from checklist accepted name and data species name
# NOTE(review): the "first 2 words" behavior is implemented in spf.get_substrings,
# which is not defined in this notebook -- confirm against the helper module.
checklist_df['joinKey'] = checklist_df['checklistSpeciesName'].apply(spf.get_substrings)
species_names_df['joinKey'] = species_names_df['datasetSpeciesName'].apply(spf.get_substrings)
# look up a habit for every dataset species in the checklist; the helper prints
# how many species are still missing a habit as it goes
habits = spf.join_to_checklist(unique_species=species_names_df, 
                               checklist=checklist_df, 
                               u_name='datasetSpeciesName', 
                               c_unofficial_name='checklistSpeciesName', 
                               c_official_name='nameAccepted', 
                               mapping_name='joinKey',
                               habit='speciesHabit')

67 species are missing habits.
62 species still missing habits.
7 species still missing habits.
7 species still missing habits.


In [145]:
habits.head(3)

Unnamed: 0_level_0,datasetSpeciesName,joinKey,speciesHabit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Arnica lessingii,Arnica lessingii,forb
1,Artemisia arctica,Artemisia arctica,forb
2,Cardamine bellidifolia,Cardamine bellidifolia,forb


In [146]:
species_names_df.head(3)

Unnamed: 0,datasetSpeciesName,joinKey
0,Arnica lessingii,Arnica lessingii
1,Artemisia arctica,Artemisia arctica
2,Cardamine bellidifolia,Cardamine bellidifolia


In [147]:
# re-attach the habits to the full species list; the left join keeps every
# species, and the overlapping 'joinKey' column coming from the habit table
# receives the '_1' suffix so the species-list version is the one retained
habits = species_names_df.merge(habits,
                                how='left',
                                on='datasetSpeciesName',
                                suffixes=(None, '_1'))
habits = habits.loc[:, ['joinKey', 'datasetSpeciesName', 'speciesHabit']]

---
## 1.3. Join fcover to leaf retention table <a name="joinleafretention"></a>

In [148]:
leaf_df = spf.leaf_retention_df('../etc/macander_leaf_retention.csv')
leaf_df.head(3)

Unnamed: 0,leafRetention,retentionSpeciesName
0,deciduous,Salix
1,deciduous,Betula nana
2,deciduous,Vaccinium uliginosum


In [149]:
# add leaf retention column
habits_wleaf = spf.add_leaf_retention(habits, leaf_df, 'leafRetention')
habits_wleaf = habits_wleaf[['datasetSpeciesName', 'joinKey', 'speciesHabit', 'leafRetention']]
habits_wleaf.head(3)

Unnamed: 0,datasetSpeciesName,joinKey,speciesHabit,leafRetention
0,Arnica lessingii,Arnica lessingii,forb,
1,Artemisia arctica,Artemisia arctica,forb,
2,Cardamine bellidifolia,Cardamine bellidifolia,forb,


In [150]:
# apply function to remove any extra words with shrub
habits_wleaf['speciesHabit'] = habits_wleaf['speciesHabit'].apply(spf.clean_shrub_habits)
list(habits_wleaf['speciesHabit'].unique())

['forb',
 'graminoid',
 'shrub',
 'spore-bearing',
 'lichen',
 'moss',
 'liverwort',
 nan]

---
## 1.4. Export species-PFT tables <a name="exporthabits"></a>

In [151]:
# generate 3 files and dataframe of habit types
shrubs, nonshrubs, null = spf.export_habit_files(habits_wleaf,
                                                 'temp_data', 
                                                 source, 
                                                 'speciesHabit')

In [152]:
# check to make sure the files make sense
len(habits_wleaf) == (len(null) + len(nonshrubs) + len(shrubs))

True

---
## 1.5. Clean exported species-PFT tables <a name="cleanhabits"></a>
Here, we manually assessed each file. The nonshrubs were usually correct and did not have to be adjusted. Some shrubs were missing leaf retention information, so we consulted with arctic ecologists to determine it. For unknown (null) habits, we consulted arctic ecology expert, Amy Breen, to determine the issue and correct habit. After the tables were complete, all species names had a single associated habit.
- null habits
    - assign missing habits and consult ecologist when needed
- non-shrubs
    - assign leaf habit to all trees (ctrl-f to find all trees in excel)
        - coniferous tree = evergreen
        - deciduous tree  = deciduous
    - remove leaf habit if misapplied
- shrubs
    - check current leaf habits and correct if necessary
    - fill empty leaf habits
    - choose leaf habit if both evergreen & deciduous listed
    - add Stature column (dwarf shrub or not)

---
---
# 2. Standardization <a name="standardization"></a>

---
## 2.1. Standardize PFT names <a name="pftnames"></a>
deciduous shrub, evergreen shrub, graminoid, forb, litter, non-vascular (lichen, bryophyte)

In [153]:
# paths to visually assessed and cleaned habit files
all_habit_types = [
    f'temp_data/{source}_nonshrubs_01.csv',
    f'temp_data/{source}_shrubs_01.csv',
    f'temp_data/{source}_nullhabit_01.csv',
]

# read every habit file and stack them into a single table
habit_dfs = [pd.read_csv(habit_path, index_col=0) for habit_path in all_habit_types]
all_habits = pd.concat(habit_dfs)

In [154]:
# show unique habit names we're working with
all_habits['speciesHabit'].unique().tolist()

['forb',
 'graminoid',
 'spore-bearing',
 'lichen',
 'moss',
 'liverwort',
 'shrub',
 'algae',
 'fungus']

In [155]:
# akveg-specific habit name standardization function
def standardize_habit(habitrow, leafrow):
    """Translate AKVEG species habits into standardized PFT names.

    Matching is case-insensitive and substring based:

    - habits containing 'algae' become 'other'
    - habits containing 'moss' or 'liverwort' become 'bryophyte'
    - habits containing 'spore-bearing' become 'forb'
    - habits containing 'shrub' become '<leaf retention> shrub'
    - any other habit is returned lower-cased and otherwise unchanged

    Parameters
    ----------
    habitrow : pandas.Series
        Habit name of every species.
    leafrow : pandas.Series
        Leaf retention of every species, aligned by position with
        ``habitrow``. Only read for shrubs.

    Returns
    -------
    list of str
        One standardized habit per input row, in input order.

    Raises
    ------
    ValueError
        If a habit is missing, or if a shrub has no leaf retention. Without
        this check a shrub would silently be labelled 'nan shrub' and its
        cover would not be counted under any standard shrub PFT.
    """
    habits = habitrow.to_numpy()
    leaves = leafrow.to_numpy()
    new_row = []

    for position, (habit, leaf) in enumerate(zip(habits, leaves)):

        # a missing habit (NaN/None) is not a string and cannot be classified
        if not isinstance(habit, str):
            raise ValueError(
                f'missing or non-text habit {habit!r} at position {position}')

        habit = habit.lower()
        # non-shrubs
        if 'algae' in habit:
            new_habit = 'other'
        elif 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte'
        elif 'spore-bearing' in habit:
            new_habit = 'forb'

        # shrubs: the PFT name is built from the leaf retention
        elif 'shrub' in habit:
            if not isinstance(leaf, str) or not leaf.strip():
                raise ValueError(
                    f'shrub at position {position} has no leaf retention '
                    f'(got {leaf!r})')
            new_habit = f'{leaf} shrub'

        # everything else keeps its (lower-cased) habit name
        else:
            new_habit = f'{habit}'
        new_row.append(new_habit)

    return new_row

In [156]:
# derive the standardized PFT of every species; the original habit column is
# kept under the name 'nonstandardHabit'
standard_habit_col = standardize_habit(habitrow=all_habits['speciesHabit'],
                                       leafrow=all_habits['leafRetention'])
all_habits = (
    all_habits
    .assign(standardHabit=standard_habit_col)
    .rename(columns={'speciesHabit':'nonstandardHabit'})
)
print(len(all_habits))
all_habits.head(3)

535


Unnamed: 0,datasetSpeciesName,joinKey,nonstandardHabit,leafRetention,speciesStature,standardHabit
0,Arnica lessingii,Arnica lessingii,forb,,,forb
1,Artemisia arctica,Artemisia arctica,forb,,,forb
2,Cardamine bellidifolia,Cardamine bellidifolia,forb,,,forb


---
## 2.2. Sum species fcover to PFT-level <a name="sumfcover"></a>

In [157]:
# attach the standardized habit of each species to its cover records and
# switch to the standard column names
all_habits = all_habits[['datasetSpeciesName', 'standardHabit', 'nonstandardHabit']]
species_fcover = (
    fcover
    .merge(all_habits, left_on='Accepted Name', right_on='datasetSpeciesName')
    .rename(columns={'Cover':'percentCover', 'Site Code':'plotName'})
    .drop(columns=['Accepted Name'])
)
species_fcover.head(3)

Unnamed: 0,plotName,percentCover,datasetSpeciesName,standardHabit,nonstandardHabit
0,AB-1B,0.05,Arnica lessingii,forb,forb
1,AB-5B,0.7,Arnica lessingii,forb,forb
2,ADST-76,1.3,Arnica lessingii,forb,forb


In [158]:
# keep records with an actual (non-missing, non-zero) cover value and put the
# columns in their export order
has_cover = species_fcover['percentCover'].notna() & species_fcover['percentCover'].ne(0)
export_order = ['plotName', 'datasetSpeciesName', 'standardHabit',
                'nonstandardHabit', 'percentCover']
species_fcover = species_fcover.loc[has_cover, export_order]
# fungi are reported under the 'other' PFT
species_fcover['standardHabit'] = species_fcover['standardHabit'].replace('fungus', 'other')
species_fcover.head(3)

Unnamed: 0,plotName,datasetSpeciesName,standardHabit,nonstandardHabit,percentCover
0,AB-1B,Arnica lessingii,forb,forb,0.05
1,AB-5B,Arnica lessingii,forb,forb,0.7
2,ADST-76,Arnica lessingii,forb,forb,1.3


In [159]:
# export
species_fcover.to_csv(f'output_data/{source}_nonstandard_species_fcover.csv', index=False)

In [160]:
# sum 'percentCover' per plot ('plotName') and PFT ('standardHabit'), then
# spread the PFTs into columns; a PFT absent from a plot gets a cover of 0
temp = (
    species_fcover
    .groupby(['plotName', 'standardHabit'])['percentCover']
    .sum()
    .reset_index()
)
pft_fcover = (
    temp
    .pivot(index='plotName', columns='standardHabit', values='percentCover')
    .fillna(0)
)

In [161]:
# tidy the axis labels: no name on the columns, 'plotName' on the index,
# and plot names stored as strings
pft_fcover = pft_fcover.rename_axis(index='plotName', columns=None)
pft_fcover.index = pft_fcover.index.astype(str)
pft_fcover.head(3)

Unnamed: 0_level_0,bryophyte,deciduous shrub,evergreen shrub,forb,graminoid,lichen,other
plotName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AB-1B,0.0,0.05,0.0,2.05,1.1,12.7,0.0
AB-5B,2.7,14.75,22.0,3.4,7.05,14.1,0.0
ADST-2,2.6,10.05,18.7,5.05,3.6,7.5,0.0


---
## 2.3. Standardize table layout <a name="tablelayout"></a>

In [162]:
# list of necessary PFTs
pfts = ['deciduous shrub', 'deciduous tree',
        'evergreen shrub', 'evergreen tree',
        'forb', 'graminoid', 'non-vascular',
        'bryophyte', 'lichen', 'other']

In [163]:
# load non-vegetation (ground cover) data
nonveg_raw = pd.read_csv('input_data/akveg_nonfoliar_cover.csv')

# Build the standardized non-vegetation table explicitly.
# Rock and bare soil are both reported as 'bare ground', so the two columns
# are added together row by row. This avoids creating duplicate column names
# and collapsing them with `groupby(..., axis=1)`, which is deprecated since
# pandas 2.1 and would also run the plot-name column through a sum.
# Missing values are treated as 0 cover in every column.
nonveg = pd.DataFrame({
    'plotName': nonveg_raw['PlotID'],
    'bare ground': nonveg_raw[['FH_RockCover', 'BareSoilCover']].sum(axis=1),
    'litter': nonveg_raw['AH_TotalLitterCover'].fillna(0),
    'water': nonveg_raw['FH_WaterCover'].fillna(0),
})
nonveg = nonveg.set_index('plotName')
nonveg.head(3)

Unnamed: 0_level_0,bare ground,litter,water
plotName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FMTT-5B,0.0,40.816327,0.0
FWMM-5B,0.0,48.0,4.0
FWMM-1,0.0,65.333333,2.0


In [164]:
# add non-vegetation information (left join on the plotName index, so every
# plot in pft_fcover is kept even when it has no non-vegetation record)
# NOTE(review): the columns added here are removed again by the later
# `pft_fcover = pft_fcover[pfts]` selection and `nonveg` is merged a second
# time right after it -- this first merge looks redundant; confirm before removing.
pft_fcover = pft_fcover.merge(nonveg,
                              how='left',
                              left_index=True,
                              right_index=True)

In [165]:
# create non-vascular
pft_fcover['non-vascular'] = pft_fcover['bryophyte'] + pft_fcover['lichen']

In [166]:
# create standard columns
# NOTE(review): spf.add_standard_cols is defined outside this notebook;
# presumably it adds any PFT column from `pfts` that is missing (the tree
# columns show up as NaN in the output below) -- confirm against the helper.
pft_fcover = spf.add_standard_cols(pft_fcover, pfts)
# keep only the standard PFT columns, in the order given by `pfts`
pft_fcover = pft_fcover[pfts]
# append the non-vegetation columns after the PFT columns (left join on the
# plotName index keeps every plot)
pft_fcover = pft_fcover.merge(nonveg, how='left', left_index=True, right_index=True)
pft_fcover.head(3)

Unnamed: 0_level_0,deciduous shrub,deciduous tree,evergreen shrub,evergreen tree,forb,graminoid,non-vascular,bryophyte,lichen,other,bare ground,litter,water
plotName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AB-1B,0.05,,0.0,,2.05,1.1,12.7,0.0,12.7,0.0,20.666667,0.0,0.0
AB-5B,14.75,,22.0,,3.4,7.05,16.8,2.7,14.1,0.0,14.0,16.0,0.0
ADST-2,10.05,,18.7,,5.05,3.6,10.1,2.6,7.5,0.0,14.0,10.0,0.0


In [167]:
len(pft_fcover.index.unique())

185

---
## 2.4. Standardize auxiliary columns <a name="aux"></a>

Plot information:
- [ ] UID
- [x] plotName
- [x] dataSource
- [x] dataSubsource
- [x] fcoverScale
- [x] surveyMethod
- [x] surveyPurpose
- [x] duplicatedCoords
- [x] duplicatedDate
- [x] surveyYear
- [x] surveyMonth
- [x] surveyDay

Geographical information:
- [x] bioclimateSubzone
- [x] adminCountry
- [x] adminUnit
- [x] fireYears
- [x] coordEpsg
- [x] georefSource
- [x] georefAccuracy
- [x] plotRadius
- [x] longitudeX
- [x] latitudeY

In [168]:
# load auxiliary data
# the foliar cover table is read again in full; one row per site is enough
# here, so the first record of each 'Site Code' is kept
fcover_aux = pd.read_csv(f'input_data/akveg_foliar_cover.csv', index_col=0)
fcover_aux = fcover_aux.groupby('Site Code').first()
# site-level ancillary table; this rebinds the `aux` name used earlier in the notebook
aux = pd.read_csv(f'input_data/akveg_ancillary.csv', index_col=0)
# sampling methods, coverscale, data source
methods = pd.read_csv('../etc/sampling_methods.csv')

In [169]:
# combine the per-site cover metadata with the ancillary table; the left join
# keeps every site found in the cover data
aux = fcover_aux.merge(aux, how='left', on='Site Code')

### 2.4.1. Basic plot information <a name="basics"></a>

In [170]:
# parse the survey date once; year, month and day become integer columns
aux['surveyDate'] = pd.to_datetime(aux['Date'], format='mixed')
survey_date = aux['surveyDate'].dt

# plot size: the first whole number found in the plot dimension text
plot_radius = aux['Plot Dimensions'].apply(
    lambda dims: re.findall(r'\b\d+\b', dims)[0]
).astype(int)

# the insertion order below is the column order of the exported table
new_aux = pd.DataFrame({
    # date columns
    'surveyYear': survey_date.year.astype(int),
    'surveyMonth': survey_date.month.astype(int),
    'surveyDay': survey_date.day.astype(int),
    # plot size
    'plotRadius': plot_radius,
    # geographical information
    'latitudeY': aux['Latitude'].astype('Float32'),
    'longitudeX': aux['Longitude'].astype('Float32'),
    'georefSource': 'GPS',
    'georefAccuracy': aux['Uncertainty'].astype('Float32'),
    'coordEPSG': 'EPSG:4326',  # will need to convert from EPSG:4269
    # main
    'plotName': aux['Site Code'],
    'dataSubsource': 'AIM NPR-A Nawrocki (pub 2020)',
})

# sampling-method details are looked up by sub-source, then the table is
# indexed by plot name (the plotName column itself is kept as well)
new_aux = new_aux.merge(methods, how='left', on='dataSubsource')
new_aux = new_aux.set_index('plotName', drop=False)

### 2.4.2. Geographical intersections <a name="geo"></a>

In [171]:
# set geo info
geo_paths = ['../etc/gaul1/gaul1_asap.shp',
             '../etc/fire/InterAgencyFirePerimeterHistory_All_Years_View.shp',
             '../etc/bioclim/bioclimate_la_latlon.shp']
geo_names = ['gaul1',
             'fire',
             'bioclim']
geo_cols = [['name1', 'name0', 'geometry'],
            ['FIRE_YEAR', 'geometry'],
            ['zone', 'geometry']]

In [172]:
# create geospatial information
fcover_epsg = 'EPSG:4269' # latlon NAD83
intersect_epsg = 'EPSG:5936' # alaska projected datum
# align cover and plot information on their shared plotName index;
# join='inner' keeps only plots that are present in both tables
fcover_and_aux = pd.concat([pft_fcover, new_aux], join='inner', axis=1)
# build point geometries from the plot coordinates, declared in the source CRS
fcover_and_aux = gpd.GeoDataFrame(fcover_and_aux, 
                                  geometry=gpd.points_from_xy(fcover_and_aux['longitudeX'],
                                                              fcover_and_aux['latitudeY']),
                                  crs=fcover_epsg)
# reproject the points to the CRS used for the intersections
fcover_and_aux = fcover_and_aux.to_crs(intersect_epsg)

# add new geospatial intersections
# NOTE(review): spf.add_geospatial_aux is defined outside this notebook;
# presumably it intersects the plots with each layer in geo_paths and adds the
# columns listed in geo_cols -- confirm against the helper.
fcover_and_aux = spf.add_geospatial_aux(fcover_and_aux, 
                                        geo_paths, 
                                        geo_names, 
                                        geo_cols,
                                        intersect_epsg)

# drop unnecessary columns: the per-layer index columns and the plotName
# column, which duplicates the plotName index
fcover_and_aux = fcover_and_aux.drop(columns=['index_gaul1', 
                                              'index_fire', 
                                              'index_bioclim', 
                                              'plotName'])

In [173]:
# give the intersected attributes their standard column names
geo_column_names = {
    'name1': 'adminUnit',
    'name0': 'adminCountry',
    'FIRE_YEAR': 'fireYears',
    'zone': 'bioclimSubzone',
}
fcover_and_aux = fcover_and_aux.rename(columns=geo_column_names)

### 2.4.3. Duplicate coordinates <a name="dupes"></a>
Duplicate coordinates may indicate that a plot was revisited, that neighboring plots were too close together to be told apart at the available GPS accuracy, or that a recording error occurred.

In [174]:
# duplicate coord column: start with an empty flag column, then let the
# helper fill it in for plots that share the same coordinates
coords = ['longitudeX', 'latitudeY']
fcover_and_aux['duplicatedCoords'] = np.nan
fcover_and_aux = spf.find_duplicates(fcover_and_aux, coords, 'duplicatedCoords')

# duplicate date column: same procedure for plots surveyed on the same day.
# The column list is named `date_cols` so that it does not shadow the `date`
# class imported from `datetime` at the top of the notebook.
date_cols = ['surveyYear', 'surveyMonth', 'surveyDay']
fcover_and_aux['duplicatedDate'] = np.nan
fcover_and_aux = spf.find_duplicates(fcover_and_aux, date_cols, 'duplicatedDate')

no duplicates found
duplicates found


---
# 3. Export <a name="export"></a>

In [175]:
# clean up
# Mapping from the working PFT / ground-cover names to the exported column
# names. It is stored under its own name instead of rebinding `pfts`: `pfts`
# is the list used by the table-layout cells, and turning it into a dict here
# would break those cells when they are re-run in the same kernel.
pft_column_names = {'deciduous shrub':'deciduousShrubCover',
                    'evergreen shrub':'evergreenShrubCover',
                    'deciduous tree':'deciduousTreeCover',
                    'evergreen tree':'evergreenTreeCover',
                    'forb':'forbCover',
                    'graminoid':'graminoidCover',
                    'non-vascular':'nonvascularSumCover',
                    'bryophyte':'bryophyteCover',
                    'lichen':'lichenCover',
                    'litter':'litterCover',
                    'bare ground':'baregroundCover',
                    'water':'waterCover',
                    'other':'otherCover'}

# rename columns and replace NaN cover with 0
fcover_and_aux = fcover_and_aux.rename(columns=pft_column_names)
fcover_cols = list(pft_column_names.values())
fcover_and_aux[fcover_cols] = fcover_and_aux[fcover_cols].fillna(0.0)

# reproject and set cover data type
# NOTE(review): only the geometry is reprojected here; the latitudeY /
# longitudeX columns still hold the values the geometry was built from
# (declared as EPSG:4269) while coordEPSG says EPSG:4326 -- confirm whether
# they should be refreshed from the reprojected geometry.
fcover_and_aux = fcover_and_aux.to_crs('EPSG:4326')
fcover_and_aux[fcover_cols] = fcover_and_aux[fcover_cols].astype(np.float32)
fcover_and_aux.index.name = 'plotName'

In [176]:
fcover_and_aux.columns

Index(['deciduousShrubCover', 'deciduousTreeCover', 'evergreenShrubCover',
       'evergreenTreeCover', 'forbCover', 'graminoidCover',
       'nonvascularSumCover', 'bryophyteCover', 'lichenCover', 'otherCover',
       'baregroundCover', 'litterCover', 'waterCover', 'surveyYear',
       'surveyMonth', 'surveyDay', 'plotRadius', 'latitudeY', 'longitudeX',
       'georefSource', 'georefAccuracy', 'coordEPSG', 'dataSubsource',
       'dataSource', 'dataSourceHtml', 'dataSubsourceCitation', 'surveyMethod',
       'fcoverScale', 'surveyPurpose', 'geometry', 'adminUnit', 'adminCountry',
       'fireYears', 'bioclimSubzone', 'duplicatedCoords', 'duplicatedDate'],
      dtype='object')

In [177]:
# Split the columns: anything with 'Cover' in its name is cover data,
# everything else is plot information
covercols, auxcols = [], []
for col in fcover_and_aux.columns:
    if 'Cover' in col:
        covercols.append(col)
    else:
        auxcols.append(col)

# Export cover data
pft_fcover = fcover_and_aux[covercols]
pft_fcover.to_csv(f'output_data/{source}_standard_pft_fcover.csv')

# Export aux data
pft_aux = fcover_and_aux[auxcols]
pft_aux.to_csv(f'output_data/{source}_plot_info.csv')