# AKVEG Standardization Notebook
---
---

# Table of contents
1. [PRE-PROCESSING](#preprocess)
    1. [Load fcover data](#loadfcover)
    2. [Join fcover to species checklist table](#joinchecklist)
    3. [Join fcover to leaf retention table](#joinleafretention)
    4. [Export species-PFT tables](#exporthabits)
    5. [Clean exported species-PFT tables](#cleanhabits)
2. [STANDARDIZATION](#standardization)
    1. [Standardize PFT names](#pftnames)
    2. [Sum species fcover to PFT-level](#sumfcover)
    3. [Standardize table layout](#tablelayout)
    4. [Standardize auxiliary columns](#aux)
        1. [Basic plot information](#basics)
        2. [Geographical intersections](#geo)
        3. [Duplicate coordinates](#dupes)
3. [EXPORT](#export)

# Load required packages

In [148]:
import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import date, timedelta
from pyogrio import read_dataframe
import glob
import geopandas as gpd
import sys
import os
import chardet
import tarfile
from urllib.request import urlretrieve
import regex as re
from shapely.validation import make_valid

In [149]:
# import local functions
current_dir = os.path.abspath('..')
if current_dir not in sys.path:
    sys.path.append(current_dir)
import standardize_pft_funcs as spf

---
---
# 1. Pre-processing <a name="preprocess"></a>
After visually inspecting each species-level fcover table for obvious errors, we saved the "cleaned" tables to a folder that we then standardized here.

---
## 1.1. Load fcover data <a name="loadfcover"></a>

In [150]:
source = 'akveg'

In [151]:
# read akveg npr-a data, checklist, and ancillary data
fcover_path = f'input_data/{source}_foliar_cover.csv'
fcover = pd.read_csv(fcover_path, index_col=0)
fcover.head(3)

Unnamed: 0_level_0,Project,Site Code,Date,Observer,Recorder,Cover Type,Accepted Name,Cover
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,AIM NPR-A,AB-1B,2012-08-01,Tina Boucher,,total cover,Arnica lessingii,0.0
2,AIM NPR-A,AB-1B,2012-08-01,Tina Boucher,,total cover,Artemisia arctica,0.0
3,AIM NPR-A,AB-1B,2012-08-01,Tina Boucher,,total cover,Cardamine bellidifolia,0.0


In [152]:
len(fcover['Accepted Name'].unique())

535

In [153]:
len(fcover['Site Code'].unique())

185

In [154]:
# 0 means a trace amount was found; convert to 0.05 like "r" in BB
temp = fcover.copy()
temp['Cover'].replace(0, 0.05, inplace=True)
fcover = temp.dropna(subset=['Cover'])
fcover['plotVisit'] = fcover['Project'].str.replace(' ', '-') + '_' + fcover['Site Code'].str.replace(' ', '-') + '_' + fcover['Date'].str.replace(' ', '-')
fcover = fcover[['plotVisit', 'Accepted Name', 'Cover']]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  temp['Cover'].replace(0, 0.05, inplace=True)


In [155]:
fcover

Unnamed: 0_level_0,plotVisit,Accepted Name,Cover
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,AIM-NPR-A_AB-1B_2012-08-01,Arnica lessingii,0.05
2,AIM-NPR-A_AB-1B_2012-08-01,Artemisia arctica,0.05
3,AIM-NPR-A_AB-1B_2012-08-01,Cardamine bellidifolia,0.05
4,AIM-NPR-A_AB-1B_2012-08-01,Carex microchaeta,0.70
5,AIM-NPR-A_AB-1B_2012-08-01,Draba,0.05
...,...,...,...
5694,AIM-NPR-A_TMCD-3_2012-08-05,Alopecurus,0.05
5695,AIM-NPR-A_TMCW-1_2012-08-07,Pedicularis,0.05
5696,AIM-NPR-A_TMCW-53_2014-07-25,Alopecurus,0.05
5697,AIM-NPR-A_TMCW-58_2014-07-25,Alopecurus,0.05


In [156]:
# get unique species names from narrowed down list
u_species_names = fcover['Accepted Name'].unique()
print(len(u_species_names))
u_species_names = [x for x in u_species_names if pd.notnull(x)]
print(len(u_species_names))
species_names_df = pd.DataFrame(u_species_names, columns=['datasetSpeciesName'])

535
535


---
## 1.2. Join fcover to species checklist table <a name="joinchecklist"></a>

In [157]:
# load species checklist and prep for joining
checklist_path = f'../etc/akveg_species_checklist.csv'
checklist_df = spf.checklist_df(checklist_path)
checklist_df.head(3)

Unnamed: 0,nameCode,checklistSpeciesName,nameStatus,nameAccepted,nameFamily,acceptedNameSource,nameLevel,speciesForm,speciesHabit
0,abroth,Abrothallus De Not.,accepted,Abrothallus De Not.,Abrothallaceae,CNALH,genus,lichen,lichen
1,abrpar,Abrothallus parmeliarum (Sommerf.) Arnold,accepted,Abrothallus parmeliarum (Sommerf.) Arnold,Abrothallaceae,CNALH,species,lichen,lichen
2,abrpey,Abrothallus peyritschii (Stein) Kotte,accepted,Abrothallus peyritschii (Stein) Kotte,Abrothallaceae,Dillman et al. 2012,species,lichen,lichen


In [158]:
species_names_df.head(3)

Unnamed: 0,datasetSpeciesName
0,Arnica lessingii
1,Artemisia arctica
2,Cardamine bellidifolia


In [159]:
# get first 2 words (genus-species) from checklist accepted name and data species name
checklist_df['joinKey'] = checklist_df['checklistSpeciesName'].apply(spf.get_substrings)
species_names_df['joinKey'] = species_names_df['datasetSpeciesName'].apply(spf.get_substrings)
habits = spf.join_to_checklist(unique_species=species_names_df, 
                               checklist=checklist_df, 
                               u_name='datasetSpeciesName', 
                               c_unofficial_name='checklistSpeciesName', 
                               c_official_name='nameAccepted', 
                               mapping_name='joinKey',
                               habit='speciesHabit')

67 species are missing habits.
62 species still missing habits.
7 species still missing habits.
7 species still missing habits.


In [160]:
habits.head(3)

Unnamed: 0_level_0,datasetSpeciesName,joinKey,speciesHabit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Arnica lessingii,Arnica lessingii,forb
1,Artemisia arctica,Artemisia arctica,forb
2,Cardamine bellidifolia,Cardamine bellidifolia,forb


In [161]:
species_names_df.head(3)

Unnamed: 0,datasetSpeciesName,joinKey
0,Arnica lessingii,Arnica lessingii
1,Artemisia arctica,Artemisia arctica
2,Cardamine bellidifolia,Cardamine bellidifolia


In [162]:
habits = species_names_df.merge(habits, how='left', left_on='datasetSpeciesName', right_on='datasetSpeciesName', suffixes=(None, '_1'))
habits = habits[['joinKey', 'datasetSpeciesName', 'speciesHabit']]

---
## 1.3. Join fcover to leaf retention table <a name="joinleafretention"></a>

In [163]:
leaf_df = spf.leaf_retention_df('../etc/macander_leaf_retention.csv')
leaf_df.head(3)

Unnamed: 0,leafRetention,retentionSpeciesName
0,deciduous,Salix
1,deciduous,Betula nana
2,deciduous,Vaccinium uliginosum


In [164]:
# add leaf retention column
habits_wleaf = spf.add_leaf_retention(habits, leaf_df, 'leafRetention')
habits_wleaf = habits_wleaf[['datasetSpeciesName', 'joinKey', 'speciesHabit', 'leafRetention']]
habits_wleaf.head(3)

Unnamed: 0,datasetSpeciesName,joinKey,speciesHabit,leafRetention
0,Arnica lessingii,Arnica lessingii,forb,
1,Artemisia arctica,Artemisia arctica,forb,
2,Cardamine bellidifolia,Cardamine bellidifolia,forb,


In [165]:
# apply function to remove any extra words with shrub
habits_wleaf['speciesHabit'] = habits_wleaf['speciesHabit'].apply(spf.clean_shrub_habits)
list(habits_wleaf['speciesHabit'].unique())

['forb',
 'graminoid',
 'shrub',
 'spore-bearing',
 'lichen',
 'moss',
 'liverwort',
 nan]

---
## 1.4. Export species-PFT tables <a name="exporthabits"></a>

In [166]:
# generate 3 files and dataframe of habit types
shrubs, nonshrubs, null = spf.export_habit_files(habits_wleaf,
                                                 'temp_data', 
                                                 source, 
                                                 'speciesHabit')

In [167]:
# check to make sure the files make sense
len(habits_wleaf) == (len(null) + len(nonshrubs) + len(shrubs))

True

---
## 1.5. Clean exported species-PFT tables <a name="cleanhabits"></a>
Here, we manually assessed each file. The nonshrubs were usually correct and did not have to be adjusted. Some shrubs were missing leaf retention information, so we consulted with arctic ecologists to determine it. For unknown (null) habits, we consulted arctic ecology expert, Amy Breen, to determine the issue and correct habit. After the tables were complete, all species names had a single associated habit.
- null habits
    - assign missing habits and consult ecologist when needed
- non-shrubs
    - assign leaf habit to all trees (ctrl-f to find all trees in excel)
        - coniferous tree = evergreen
        - deciduous tree  = deciduous
    - remove leaf habit if misapplied
- shrubs
    - check current leaf habits and correct if necessary
    - fill empty leaf habits
    - choose leaf habit if both evergreen & deciduous listed
    - add Stature column (dwarf shrub or not)

---
---
# 2. Standardization <a name="standardization"></a>

---
## 2.1. Standardize PFT names <a name="pftnames"></a>
deciduous shrub, evergreen shrub, graminoid, forb, litter, non-vascular (lichen, brophyte)

In [168]:
# paths to visually assessed and cleaned habit files
all_habit_types = [f'temp_data/{source}_nonshrubs_01.csv',
                   f'temp_data/{source}_shrubs_01.csv',
                   f'temp_data/{source}_nullhabit_01.csv']

# create list of habit dataframes
habit_dfs = []
for path in all_habit_types:
    habit_df = pd.read_csv(path, index_col=0)
    habit_dfs.append(habit_df)
all_habits = pd.concat(habit_dfs)

In [169]:
# show unique habit names we're working with
all_habits['speciesHabit'].unique().tolist()

['forb',
 'graminoid',
 'spore-bearing',
 'lichen',
 'moss',
 'liverwort',
 'shrub',
 'algae',
 'fungus']

In [170]:
# akveg-specific habit name standardization function
def standardize_habit(habitrow, leafrow):
    
    habitrow = habitrow.to_numpy()
    leafrow = leafrow.to_numpy()
    new_row = []
    
    for habit, leaf in zip(habitrow, leafrow):
        
        habit = habit.lower()
        # non-shrubs
        if 'algae' in habit:
            new_habit = 'other'
        elif 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte'
        elif 'spore-bearing' in habit:
            new_habit = 'forb'
            
        # shrubs and trees
        elif 'shrub' in habit:
            new_habit = f'{leaf} shrub'
            
        else:
            new_habit = f'{habit}'
        new_row.append(new_habit)
    
    return new_row

In [171]:
# standardized PFT
standard_habit_col = standardize_habit(all_habits['speciesHabit'], 
                                       all_habits['leafRetention'])
all_habits['standardHabit'] = standard_habit_col
all_habits.rename(columns={'speciesHabit':'nonstandardHabit'}, inplace=True)
print(len(all_habits))
all_habits.head(3)

535


Unnamed: 0,datasetSpeciesName,joinKey,nonstandardHabit,leafRetention,speciesStature,standardHabit
0,Arnica lessingii,Arnica lessingii,forb,,,forb
1,Artemisia arctica,Artemisia arctica,forb,,,forb
2,Cardamine bellidifolia,Cardamine bellidifolia,forb,,,forb


---
## 2.2. Sum species fcover to PFT-level <a name="sumfcover"></a>

In [172]:
# loop through each plot and sum fcover per pft
all_habits = all_habits[['datasetSpeciesName', 'standardHabit', 'nonstandardHabit']]
species_fcover = fcover.merge(all_habits, 
                               left_on='Accepted Name', 
                               right_on='datasetSpeciesName')
species_fcover = species_fcover.rename(columns={'Cover':'percentCover'})
species_fcover = species_fcover.drop(columns=['Accepted Name'])
species_fcover.head(3)

Unnamed: 0,plotVisit,percentCover,datasetSpeciesName,standardHabit,nonstandardHabit
0,AIM-NPR-A_AB-1B_2012-08-01,0.05,Arnica lessingii,forb,forb
1,AIM-NPR-A_AB-1B_2012-08-01,0.05,Artemisia arctica,forb,forb
2,AIM-NPR-A_AB-1B_2012-08-01,0.05,Cardamine bellidifolia,forb,forb


In [173]:
species_fcover = species_fcover[species_fcover['percentCover'].notna() & (species_fcover['percentCover'] != 0)]
species_fcover = species_fcover[['plotVisit', 'datasetSpeciesName', 'standardHabit', 
                                 'nonstandardHabit', 'percentCover']]
species_fcover['standardHabit'] = species_fcover['standardHabit'].replace('fungus', 'other')
species_fcover.head(3)

Unnamed: 0,plotVisit,datasetSpeciesName,standardHabit,nonstandardHabit,percentCover
0,AIM-NPR-A_AB-1B_2012-08-01,Arnica lessingii,forb,forb,0.05
1,AIM-NPR-A_AB-1B_2012-08-01,Artemisia arctica,forb,forb,0.05
2,AIM-NPR-A_AB-1B_2012-08-01,Cardamine bellidifolia,forb,forb,0.05


In [174]:
# export
species_fcover.to_csv(f'output_data/{source}_nonstandard_species_fcover.csv', index=False)

In [175]:
# group by 'Site Code' and 'standardHabit', then sum the 'Cover'
temp = species_fcover.groupby(['plotVisit', 'standardHabit'])['percentCover'].sum().reset_index()
pft_fcover = temp.pivot(index='plotVisit', columns='standardHabit', values='percentCover')
pft_fcover = pft_fcover.fillna(0)

In [176]:
# clean up
pft_fcover.columns.name = None
pft_fcover.index.name = 'plotVisit'
pft_fcover.index = pft_fcover.index.astype(str)
pft_fcover.head(3)

Unnamed: 0_level_0,bryophyte,deciduous shrub,evergreen shrub,forb,graminoid,lichen,other
plotVisit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AIM-NPR-A_AB-1B_2012-08-01,0.0,0.05,0.0,2.05,1.1,12.7,0.0
AIM-NPR-A_AB-5B_2012-08-01,2.7,14.75,22.0,3.4,7.05,14.1,0.0
AIM-NPR-A_ADST-21_2013-07-30,26.5,19.5,52.6,6.55,6.25,41.3,0.0


---
## 2.3. Standardize table layout <a name="tablelayout"></a>

In [177]:
# list of necessary PFTs
pfts = ['deciduous shrub', 'deciduous tree',
        'evergreen shrub', 'evergreen tree',
        'forb', 'graminoid', 'non-vascular',
        'bryophyte', 'lichen', 'other']

In [178]:
# load non-vegetation data and format plotVisit
nonveg = pd.read_csv(f'input_data/akveg_nonfoliar_cover.csv')

# 1. Update 'ProjectName' if it contains 'NPR-A'
nonveg['ProjectName'] = nonveg['ProjectName'].apply(
    lambda x: 'AIM-NPR-A' if 'NPR-A' in str(x) else x
)

# 2. Convert 'DateVisited' to YYYY-MM-DD
nonveg['DateVisited'] = pd.to_datetime(nonveg['DateVisited']).dt.strftime('%Y-%m-%d')

# If 'ProjectName' is NaN, set it to 'AIM-NPR-A'
nonveg['ProjectName'] = nonveg['ProjectName'].fillna('AIM-NPR-A')

# Now create 'plotVisit' while ignoring other NaNs
nonveg['plotVisit'] = nonveg[['ProjectName', 'PlotID', 'DateVisited']].apply(
    lambda x: '_'.join(x.dropna().astype(str)), axis=1
)

In [179]:
nonveg = nonveg[['AH_TotalLitterCover', 'FH_RockCover', 
                 'FH_WaterCover', 'BareSoilCover', 'plotVisit']]
nonveg

Unnamed: 0,AH_TotalLitterCover,FH_RockCover,FH_WaterCover,BareSoilCover,plotVisit
0,40.816327,0.000000,0.00,0.00,AIM-NPR-A_FMTT-5B_2012-08-03
1,48.000000,0.000000,4.00,0.00,AIM-NPR-A_FWMM-5B_2012-07-31
2,65.333333,0.000000,2.00,0.00,AIM-NPR-A_FWMM-1_2012-07-31
3,22.000000,0.000000,0.00,0.00,AIM-NPR-A_ADST-4_2012-08-01
4,28.000000,2.666667,0.00,0.00,AIM-NPR-A_ADST-1_2012-08-01
...,...,...,...,...,...
263,29.630000,0.000000,10.37,0.00,AIM-NPR-A_CPHCP-78_2017-07-21
264,16.670000,28.670000,0.00,4.67,AIM-NPR-A_ADST-75_2017-07-22
265,37.330000,0.000000,0.00,0.00,AIM-NPR-A_ADST-76_2017-07-22
266,16.000000,28.000000,0.00,0.00,AIM-NPR-A_ADST-79_2017-07-22


In [180]:
nonveg = nonveg.rename(columns=dict(AH_TotalLitterCover='litter',
                                    FH_RockCover='bare ground',
                                    FH_WaterCover='water',
                                    BareSoilCover='bare ground'))
nonveg = nonveg.groupby(nonveg.columns, axis=1).sum()
nonveg = nonveg.set_index('plotVisit')
nonveg.head(3)

  nonveg = nonveg.groupby(nonveg.columns, axis=1).sum()


Unnamed: 0_level_0,bare ground,litter,water
plotVisit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AIM-NPR-A_FMTT-5B_2012-08-03,0.0,40.816327,0.0
AIM-NPR-A_FWMM-5B_2012-07-31,0.0,48.0,4.0
AIM-NPR-A_FWMM-1_2012-07-31,0.0,65.333333,2.0


In [181]:
# add non-vegetation information
pft_fcover = pft_fcover.merge(nonveg,
                              how='left',
                              left_index=True,
                              right_index=True)

In [182]:
pft_fcover

Unnamed: 0_level_0,bryophyte,deciduous shrub,evergreen shrub,forb,graminoid,lichen,other,bare ground,litter,water
plotVisit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AIM-NPR-A_AB-1B_2012-08-01,0.0,0.05,0.00,2.05,1.10,12.7,0.0,20.666667,0.000000,0.000000
AIM-NPR-A_AB-5B_2012-08-01,2.7,14.75,22.00,3.40,7.05,14.1,0.0,14.000000,16.000000,0.000000
AIM-NPR-A_ADST-21_2013-07-30,26.5,19.50,52.60,6.55,6.25,41.3,0.0,14.666667,44.666667,0.000000
AIM-NPR-A_ADST-2_2012-08-02,2.6,10.05,18.70,5.05,3.60,7.5,0.0,14.000000,10.000000,0.000000
AIM-NPR-A_ADST-33_2013-08-01,8.1,14.70,30.05,28.70,7.70,7.5,0.0,27.333333,21.333333,0.000000
...,...,...,...,...,...,...,...,...,...,...
AIM-NPR-A_TMCW-67_2015-07-30,0.0,0.00,0.00,6.80,58.75,0.0,0.0,46.000000,14.666667,0.000000
AIM-NPR-A_TMCW-68_2015-07-30,21.3,18.05,0.00,2.35,76.05,3.4,0.0,0.666667,64.000000,0.666667
AIM-NPR-A_TMCW-71_2017-07-30,0.7,0.70,0.00,1.40,44.75,0.0,0.0,29.330000,17.330000,23.330000
AIM-NPR-A_TMCW-72_2017-07-30,1.3,0.00,0.00,17.45,85.30,0.0,64.0,0.000000,14.000000,0.670000


In [183]:
# create non-vascular
pft_fcover['non-vascular'] = pft_fcover[['bryophyte', 'lichen']].sum(axis=1, skipna=True)

In [184]:
# create standard columns
pft_fcover = spf.add_standard_cols(pft_fcover, pfts)
pft_fcover = pft_fcover[pfts]
pft_fcover = pft_fcover.merge(nonveg, how='left', left_index=True, right_index=True)
pft_fcover.head(3)

Unnamed: 0_level_0,deciduous shrub,deciduous tree,evergreen shrub,evergreen tree,forb,graminoid,non-vascular,bryophyte,lichen,other,bare ground,litter,water
plotVisit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AIM-NPR-A_AB-1B_2012-08-01,0.05,,0.0,,2.05,1.1,12.7,0.0,12.7,0.0,20.666667,0.0,0.0
AIM-NPR-A_AB-5B_2012-08-01,14.75,,22.0,,3.4,7.05,16.8,2.7,14.1,0.0,14.0,16.0,0.0
AIM-NPR-A_ADST-21_2013-07-30,19.5,,52.6,,6.55,6.25,67.8,26.5,41.3,0.0,14.666667,44.666667,0.0


In [185]:
len(pft_fcover.index.unique())

185

---
## 2.4. Standardize auxiliary columns <a name="aux"></a>

Plot information:
- [ ] UID
- [x] plotName
- [x] dataSource
- [x] dataSubsource
- [x] fcoverScale
- [x] surveyMethod
- [x] surveyPurpose
- [x] duplicatedCoords
- [x] duplicatedDate
- [x] surveyYear
- [x] surveyMonth
- [x] surveyDay

Geographical information:
- [x] bioclimateSubzone
- [x] adminCountry
- [x] adminUnit
- [x] fireYears
- [x] coordEpsg
- [x] georefSource
- [x] georefAccuracy
- [x] plotRadius
- [x] longitueX
- [x] latitudeY

In [186]:
# load auxiliary data from fcover tables
fcover_aux = pd.read_csv(f'input_data/akveg_foliar_cover.csv', index_col=0)
fcover_aux['plotVisit'] = fcover_aux['Project'].str.replace(' ', '-') + '_' + fcover_aux['Site Code'].str.replace(' ', '-') + '_' + fcover_aux['Date'].str.replace(' ', '-')
fcover_aux['joinKey'] = fcover_aux['Project'].str.replace(' ', '-') + '_' + fcover_aux['Site Code'].str.replace(' ', '-')
fcover_aux = fcover_aux.groupby('plotVisit').first()
fcover_aux.reset_index(inplace=True)

In [187]:

# load auxiliary data provided by akveg (only has Site Code and Initial Project)
aux = pd.read_csv(f'input_data/akveg_ancillary.csv', index_col=0)
aux['joinKey'] = aux['Initial Project'].str.replace(' ', '-') + '_' + aux['Site Code'].str.replace(' ', '-')

# load sampling methods, coverscale, data source
methods = pd.read_csv('../etc/sampling_methods.csv')

In [188]:
# merge aux tables into one
aux = fcover_aux.merge(aux, how='left', 
                       left_on='joinKey', 
                       right_on='joinKey')

In [189]:
aux.columns

Index(['plotVisit', 'Project', 'Site Code_x', 'Date', 'Observer', 'Recorder',
       'Cover Type', 'Accepted Name', 'Cover', 'joinKey', 'Site Code_y',
       'Initial Project', 'Perspective', 'Cover Method', 'Scope Vascular',
       'Scope Bryophyte', 'Scope Lichen', 'Plot Dimensions', 'Datum',
       'Latitude', 'Longitude', 'Uncertainty'],
      dtype='object')

### 2.4.1. Basic plot information <a name="basics"></a>

In [190]:
new_aux = pd.DataFrame()

# date columns
aux['Date'] = (aux['Date'].fillna(-1)).astype(str)
aux['surveyDate'] = pd.to_datetime(aux['Date'], format='mixed', errors='coerce') # convert -1 to NaT
new_aux['surveyYear'] = aux['surveyDate'].dt.year.astype(int)
new_aux['surveyMonth'] = aux['surveyDate'].dt.month.astype(int)
new_aux['surveyDay'] = aux['surveyDate'].dt.day.astype(int)

# plot size
new_aux['plotArea'] = aux['Plot Dimensions']

# Extract plot shape and calculate plot size
def process_plot_dimensions(dimension):
    if 'radius' in dimension:
        # Extract radius and calculate area of the circle
        try:
            radius = float(dimension.split(' radius')[0].strip())
            area = np.pi * (radius**2)
            return {'plotShape': 'circle', 'plotArea': area}
        except ValueError:
            return {'plotShape': 'unknown', 'plotArea': np.NaN}
    elif 'x' in dimension:
        # Extract dimensions and calculate rectangular or square area
        try:
            dimensions = dimension.split('x')
            length = float(dimensions[0].strip())
            width = float(dimensions[1].strip())
            area = length * width
            shape = 'square' if length == width else 'rectangle'
            return {'plotShape': shape, 'plotArea': area}
        except (ValueError, IndexError):
            return {'plotShape': np.NaN, 'plotArea': np.NaN}
    else:
        # Unknown dimension format
        return {'plotShape': 'unknown', 'plotArea': dimension}

# Apply function and update new_aux
processed_dimensions = aux['Plot Dimensions'].apply(process_plot_dimensions)
new_aux['plotShape'] = processed_dimensions.apply(lambda x: x['plotShape'])
new_aux['plotArea'] = processed_dimensions.apply(lambda x: x['plotArea'])

# geographical information
new_aux['latitudeY'] = aux['Latitude'].astype('Float32')
new_aux['longitudeX'] = aux['Longitude'].astype('Float32')
new_aux['georefSource'] = 'GPS'
new_aux['georefAccuracy'] = aux['Uncertainty'].astype('Float32')
new_aux['coordEPSG'] = 'EPSG:4326' # will need to convert from EPSG:4269

# main
new_aux['plotVisit'] = aux['plotVisit']
new_aux['plotName'] = aux['Site Code_x']
# new_aux['dataSource'] = 'AKVEG'
new_aux['dataSubsource'] = 'AIM NPR-A Nawrocki (pub 2020)'
new_aux = new_aux.merge(methods, how='left', left_on='dataSubsource', right_on='dataSubsource')
new_aux = new_aux.set_index(new_aux['plotVisit'])

### 2.4.2. Geographical intersections <a name="geo"></a>

In [191]:
# set geo info
geo_paths = ['../etc/gaul1/gaul1_asap.shp',
             '../etc/fire/InterAgencyFirePerimeterHistory_All_Years_View.shp',
             '../etc/bioclim/bioclimate_la_latlon.shp']
geo_names = ['gaul1',
             'fire',
             'bioclim']
geo_cols = [['name1', 'name0', 'geometry'],
            ['FIRE_YEAR', 'geometry'],
            ['zone', 'geometry']]

In [192]:
# create geospatial information
fcover_epsg = 'EPSG:4269' # latlon NAD83
intersect_epsg = 'EPSG:5936' # alaska projected datum
fcover_and_aux = pd.concat([pft_fcover, new_aux], join='inner', axis=1)
fcover_and_aux = gpd.GeoDataFrame(fcover_and_aux, 
                                  geometry=gpd.points_from_xy(fcover_and_aux['longitudeX'],
                                                              fcover_and_aux['latitudeY']),
                                  crs=fcover_epsg)
fcover_and_aux = fcover_and_aux.to_crs(intersect_epsg)

# add new geospatial intersections
fcover_and_aux = spf.add_geospatial_aux(fcover_and_aux, 
                                        geo_paths, 
                                        geo_names, 
                                        geo_cols,
                                        intersect_epsg)

# drop unneccesary columns
fcover_and_aux = fcover_and_aux.drop(columns=['index_gaul1', 
                                              'index_fire', 
                                              'index_bioclim', 
                                              'plotVisit'])

In [193]:
# rename new columns
fcover_and_aux.rename(columns={'name1':'adminUnit',
                               'name0':'adminCountry',
                               'FIRE_YEAR':'fireYears',
                               'zone':'bioclimSubzone'}, inplace=True)

### 2.4.3. Duplicate coordinates <a name="dupes"></a>
This indicates that plots may have been revisited, they were very close and GPS accuracy was too low, or there is human error

In [194]:
# find duplicates
fcover_and_aux = spf.find_duplicates(fcover_and_aux, ['longitudeX', 'latitudeY'], 'duplicatedCoords')
fcover_and_aux = spf.find_duplicates(fcover_and_aux, ['surveyYear', 'surveyMonth', 'surveyDay'], 'duplicatedDate')

  group_ids = df.groupby(subset, sort=False).grouper.group_info[0]
  group_ids = df.groupby(subset, sort=False).grouper.group_info[0]


---
# 3. Export <a name="export"></a>

In [195]:
# clean up
pfts = {'deciduous shrub':'deciduousShrubCover',
        'evergreen shrub':'evergreenShrubCover',
        'deciduous tree':'deciduousTreeCover',
        'evergreen tree':'evergreenTreeCover',
        'forb':'forbCover',
        'graminoid':'graminoidCover',
        'non-vascular':'nonvascularSumCover',
        'bryophyte':'bryophyteCover',
        'lichen':'lichenCover',
        'litter':'litterCover',
        'bare ground':'baregroundCover',
        'water':'waterCover',
        'other':'otherCover'}

# rename columns and replace NaN cover with 0
fcover_and_aux.rename(columns=pfts, inplace=True)
fcover_cols = list(pfts.values())

# reproject and set cover data type
fcover_and_aux = fcover_and_aux.to_crs('EPSG:4326')
fcover_and_aux[fcover_cols] = fcover_and_aux[fcover_cols].astype(np.float32)
fcover_and_aux.index.name = 'plotVisit'

In [196]:
fcover_and_aux.columns

Index(['deciduousShrubCover', 'deciduousTreeCover', 'evergreenShrubCover',
       'evergreenTreeCover', 'forbCover', 'graminoidCover',
       'nonvascularSumCover', 'bryophyteCover', 'lichenCover', 'otherCover',
       'baregroundCover', 'litterCover', 'waterCover', 'surveyYear',
       'surveyMonth', 'surveyDay', 'plotArea', 'plotShape', 'latitudeY',
       'longitudeX', 'georefSource', 'georefAccuracy', 'coordEPSG', 'plotName',
       'dataSubsource', 'dataSource', 'dataSourceHtml',
       'dataSubsourceCitation', 'surveyMethod', 'fcoverScale', 'surveyPurpose',
       'geometry', 'adminUnit', 'adminCountry', 'fireYears', 'bioclimSubzone',
       'duplicatedCoords', 'duplicatedDate'],
      dtype='object')

In [197]:
# Export cover data
covercols = [col for col in fcover_and_aux.columns if 'Cover' in col]
auxcols = [col for col in fcover_and_aux.columns if 'Cover' not in col]
pft_fcover = fcover_and_aux[covercols]
pft_fcover.to_csv(f'output_data/{source}_standard_pft_fcover.csv')

# Export aux data
pft_aux = fcover_and_aux[auxcols]
pft_aux.to_csv(f'output_data/{source}_plot_info.csv')