In [1]:
import pandas as pd
import numpy as np
import standardize_pft_funcs as spf
import geopandas as gpd

In [2]:
# Directory handed to spf.get_unique_species (OUTP) for the saved unique-species list.
out_path = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'

# Read fcover, checklist, and ancillary data for AKVEG NPR-A

In [3]:
# Read the AKVEG NPR-A inputs from one directory:
#   checklist - species checklist (used below to look up habits)
#   cover     - foliar cover records; first CSV column becomes the index
#   ancillary - ancillary table; first CSV column becomes the index
#   nonveg    - BLM AIM TerrADat export (used below for non-vegetated top cover)
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS'
checklist = pd.read_csv(f'{p}/AKVEG_species_checklist.csv')
cover = pd.read_csv(f'{p}/AKVEG_foliar_cover.csv', index_col=0)
ancillary = pd.read_csv(f'{p}/AKVEG_ancillary.csv', index_col=0)
nonveg = pd.read_csv(f'{p}/BLM_Natl_AIM_TerrADat_Hub.csv')

In [4]:
# number of distinct site codes in the cover table (a missing code counts once)
cover['Site Code'].nunique(dropna=False)

185

In [5]:
# Remove species records with 0 cover: turn zeros into NaN, then drop every
# row whose cover is missing.
nonnan_cover = cover.copy()
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify the frame when pandas copy-on-write is active.
nonnan_cover['Cover'] = nonnan_cover['Cover'].replace(0, np.nan)
nonnan_cover = nonnan_cover.dropna(subset=['Cover'])

## Get unique species names

In [31]:
# Unique 'Accepted Name' values among the non-zero cover records; SAVE=True
# with OUTP=out_path also writes the list to disk (see the printed message).
species = spf.get_unique_species(
    nonnan_cover,
    'Accepted Name',
    'AKVEG',
    SAVE=True,
    OUTP=out_path,
)

Saved unique species list to /mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS//AKVEG_unique_species.csv.


# Begin standardization process

## Get habit information

In [32]:
# Derive a 'Mapping Name' for both tables by running spf.get_substrings over
# each accepted name (first two words, i.e. genus + species).
for _table in (checklist, species):
    _table['Mapping Name'] = _table['Accepted Name'].apply(spf.get_substrings)

In [33]:
# Look up a habit for every unique species from the checklist, telling the
# helper which column plays which role in each table.
habits = spf.fill_habits(
    unique_species=species,
    checklist=checklist,
    u_name='Accepted Name',
    c_unofficial_name='Name',
    c_official_name='Accepted Name',
    mapping_name='Mapping Name',
    habit='Habit',
)

53 species are missing habits.
50 species still missing habits.
6 species still missing habits.
6 species still missing habits.


## Add evergreen/deciduous

In [34]:
# Evergreen/deciduous lookup: the CSV has no header row, so name its two
# columns explicitly before handing it to spf.add_leaf_habit.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AK-AVA_post2000/'
f = 'evergreendecid_macander2022.csv'
evergrndecid = pd.read_csv(f'{p}{f}', header=None)
evergrndecid.columns = ['evergreendecid', 'species']
final = spf.add_leaf_habit(habits, evergrndecid)

In [35]:
# Export the species/habit table. `p` is left pointing at the AKVEG_ACCS
# directory because the export cells that follow reuse it.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'
f = 'AKVEG_species_habit_00.csv'
final.to_csv(f'{p}{f}')

In [36]:
# distinct 'Potential Habit' values, including NaN for unresolved species
final['Potential Habit'].unique().tolist()

['moss',
 'forb',
 'lichen',
 nan,
 'dwarf shrub',
 'graminoid',
 'liverwort',
 'shrub',
 'spore-bearing',
 ' shrub,dwarf shrub',
 ' tree, shrub,dwarf shrub, deciduous tree, dwarf shrub']

## Export shrubs, nonshrubs, and null habits separately

In [37]:
# Shrub species: rows with a non-missing 'Potential Habit' that mentions 'shrub'.
finalnonnull = final[final['Potential Habit'].notna()]
shrubs = finalnonnull.loc[finalnonnull['Potential Habit'].str.contains('shrub')]
shrubs.to_csv(f'{p}AKVEG_shrubs_00.csv')

In [38]:
# Non-shrub species: non-missing 'Potential Habit' that does not mention 'shrub'.
nonshrubs = finalnonnull.loc[~finalnonnull['Potential Habit'].str.contains('shrub')]
nonshrubs.to_csv(f'{p}AKVEG_nonshrubs_00.csv')

In [39]:
# Species whose 'Potential Habit' is still missing.
finalnull = final.loc[final['Potential Habit'].isna()]
finalnull.to_csv(f'{p}AKVEG_nullhabit_00.csv')

In [40]:
# sanity check: the three exported subsets together should match len(final)
sum(len(part) for part in (finalnull, nonshrubs, shrubs))

384

In [41]:
# total number of species rows, for comparison with the subset total above
final.shape[0]

384

## HERE: Manually add data from shrub05 file

## Standardize PFT Schema

In [6]:
# Read the three habit tables back in. The null-habit and shrub files carry
# the _01 suffix, while the non-shrub file is the _00 export written above.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'
nonshrub = 'AKVEG_nonshrubs_00.csv'
null = 'AKVEG_nullhabit_01.csv'
shrub = 'AKVEG_shrubs_01.csv'

alltypes = [nonshrub, shrub, null]

# one DataFrame per file, in the order listed in `alltypes`
d = []
for t in alltypes:
    habits = pd.read_csv(f'{p}{t}', index_col=0)
    d += [habits]

In [7]:
# stack the three habit tables and list the distinct habit labels
completehabits = pd.concat(d)
list(completehabits['Habit'].unique())

['moss',
 'forb',
 'lichen',
 'graminoid',
 'liverwort',
 'spore-bearing',
 'dwarf shrub',
 'shrub',
 'shrub, tree',
 'algae',
 'fungus']

In [8]:
def standardize_habit_nonstandard(habitrow, leafrow, heightrow):
    """Build one '<label> cover (%)' string per row, keeping the source habits.

    Only shrub habits are rewritten (using leaf retention and height); every
    other habit is passed through as '<habit> cover (%)'.

    Parameters
    ----------
    habitrow, leafrow, heightrow : objects exposing ``to_numpy()``
        Habit, leaf-retention and height values, iterated in parallel.

    Returns
    -------
    list of str
        Lower-cased labels with runs of whitespace collapsed to one space.
    """

    def _label(raw_habit, leaf, height):
        habit = raw_habit.lower()
        if habit == 'shrub, tree':
            text = f'{leaf} dwarf to tree cover (%)'
        elif habit == 'dwarf shrub':
            text = f'{leaf} {habit} cover (%)'
        elif 'shrub' in habit:
            # any other shrub: the height class supplies the name
            text = f'{leaf} {height} cover (%)'
        else:
            text = f'{habit} cover (%)'
        # collapse repeated/leading/trailing whitespace, then lower-case
        return " ".join(text.split()).lower()

    habits = habitrow.to_numpy()
    leaves = leafrow.to_numpy()
    heights = heightrow.to_numpy()
    return [_label(h, lf, ht) for h, lf, ht in zip(habits, leaves, heights)]

In [9]:
def standardize_habit(habitrow, leafrow, heightrow):
    """Map each row's habit onto the standardized PFT label '<pft> cover (%)'.

    Parameters
    ----------
    habitrow, leafrow, heightrow : objects exposing ``to_numpy()``
        Habit, leaf-retention and height values, iterated in parallel.

    Returns
    -------
    list of str
        Lower-cased labels with runs of whitespace collapsed to one space.
        Rows whose habit is missing (NaN/None) or contains 'unknown' are
        labelled 'unknown cover (%)'.
    """

    habitrow = habitrow.to_numpy()
    leafrow = leafrow.to_numpy()
    heightrow = heightrow.to_numpy()
    new_row = []

    for habit, leaf, height in zip(habitrow, leafrow, heightrow):
        # A missing habit arrives as float NaN (or None). It must be caught
        # before .lower(), and with pd.isna: `habit == np.nan` is always False.
        if pd.isna(habit):
            new_row.append('unknown cover (%)')
            continue

        habit = habit.lower()
        if 'algae' in habit:
            new_habit = 'lichen cover (%)'
        elif 'unknown' in habit:
            new_habit = 'unknown cover (%)'
        elif 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte cover (%)'
        elif 'spore-bearing' in habit:
            new_habit = 'forb cover (%)'
        elif habit == 'grass':
            new_habit = 'graminoid cover (%)'

        # shrubs: the label is built from leaf retention plus a size class
        elif habit == 'shrub, tree':
            new_habit = f'{leaf} dwarf to tree cover (%)'
        elif habit == 'dwarf shrub':
            new_habit = f'{leaf} {habit} cover (%)'
        elif 'shrub' in habit:
            new_habit = f'{leaf} {height} cover (%)'

        # non-vegetated classes
        elif 'litter' in habit or 'scat' in habit:
            new_habit = 'litter cover (%)'
        elif 'bare ground' in habit or 'crust' in habit or 'mineral' in habit:
            new_habit = 'bare ground cover (%)'
        else:
            new_habit = f'{habit} cover (%)'
        # collapse repeated whitespace and lower-case the final label
        new_row.append(" ".join(new_habit.split()).lower())

    return new_row

In [10]:
# standardized PFT label for every species row
standard_habit_col = standardize_habit(
    completehabits['Habit'],
    completehabits['Leaf Retention'],
    completehabits['Height'],
)

In [11]:
# non-standardized PFT label (source habits kept, only shrubs rewritten)
nonstandard_habit_col = standardize_habit_nonstandard(
    completehabits['Habit'],
    completehabits['Leaf Retention'],
    completehabits['Height'],
)

In [12]:
# attach both label lists as new columns (row order matches completehabits)
completehabits = completehabits.assign(**{
    'Standard Habit': standard_habit_col,
    'NonStandard Habit': nonstandard_habit_col,
})

In [13]:
# write the species table with both habit label columns
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'
f = 'AKVEG_species_habit_standardized.csv'
completehabits.to_csv(f'{p}{f}')

## Join habit to fcover data

In [14]:
# add vegetation fcover and sum to PFT level
def add_cover(cover_df, pft_groups, habit_col):
    """Attach a habit label to every cover record and total 'Cover' per site and label.

    Parameters
    ----------
    cover_df : DataFrame
        Cover records with 'Site Code', 'Accepted Name' and 'Cover' columns.
    pft_groups : DataFrame
        Species table with a 'Name' column and the ``habit_col`` column.
    habit_col : str
        Name of the label column in ``pft_groups`` to group by.

    Returns
    -------
    DataFrame
        One 'Cover' column indexed by ('Site Code', habit_col).
    """
    # left join: every cover record is kept, matched on accepted name -> Name
    joined = cover_df.reset_index().merge(
        pft_groups, how='left', left_on='Accepted Name', right_on='Name')
    per_site_and_label = joined.groupby(['Site Code', habit_col], group_keys=True)
    return per_site_and_label.agg({'Cover': 'sum'})

In [15]:
# flatten multilevel
def flatten_multilevel(grouped):
    """Keep only the first level of the column labels and move the index into columns.

    The column labels of ``grouped`` are replaced on the passed-in frame
    itself; the returned frame is the reset-index result.
    """
    first_level = grouped.columns.get_level_values(0)
    grouped.columns = first_level
    return grouped.reset_index()

In [16]:
# transpose
def transpose_df(grouped, habit_col):
    """Reshape long (site, habit, value) rows into one row per site, one column per habit.

    Parameters
    ----------
    grouped : DataFrame
        Frame with 'Site Code', ``habit_col`` and the value column(s).
    habit_col : str
        Column whose values become the output column labels.

    Returns
    -------
    DataFrame
        'Site Code' as a regular column followed by one column per habit label.
    """
    long_form = grouped.set_index(['Site Code', habit_col]).stack()
    # move the habit level (1) and the stacked column level (2) into the columns
    wide = long_form.unstack([1, 2])
    wide.columns = wide.columns.get_level_values(0)
    return wide.reset_index()

In [17]:
def add_standard_cols(groups):
    """Ensure every standard PFT cover column exists, adding absent ones filled with NaN.

    The columns are added to ``groups`` itself, and that same object is returned.
    """

    # required columns
    necessary_cols = ['deciduous dwarf shrub cover (%)',
                      'deciduous dwarf to low shrub cover (%)',
                      'deciduous dwarf to tall shrub cover (%)',
                      'deciduous dwarf to tree cover (%)',
                      'deciduous tree cover (%)',
                      'evergreen dwarf shrub cover (%)',
                      'evergreen dwarf to low shrub cover (%)',
                      'evergreen dwarf to tall shrub cover (%)',
                      'evergreen dwarf to tree cover (%)',
                      'evergreen tree cover (%)',
                      'bryophyte cover (%)',
                      'forb cover (%)',
                      'graminoid cover (%)',
                      'lichen cover (%)']

    # add missing columns and fill with nan
    present = groups.columns.tolist()
    addcols = [name for name in necessary_cols if name not in present]
    groups[addcols] = np.nan
    return groups

In [18]:
# Standard-PFT table: sum cover per site and label, flatten, pivot to one row
# per site, then make sure every standard PFT column is present.
grouped_standard = (
    add_cover(cover, completehabits, 'Standard Habit')
    .pipe(flatten_multilevel)
    .pipe(transpose_df, 'Standard Habit')
)
standard_groups = add_standard_cols(grouped_standard)

In [19]:
# Same pipeline for the non-standard labels (no standard columns are added).
grouped_nonstand = (
    add_cover(cover, completehabits, 'NonStandard Habit')
    .pipe(flatten_multilevel)
    .pipe(transpose_df, 'NonStandard Habit')
)

## Read ancillary table

In [20]:
anc = ancillary.copy()
# year: first 'Date' recorded per site in the cover table. Sites with no cover
# record get no date, hence the nullable Int64 dtype.
date = cover.groupby('Site Code')['Date'].first().reset_index()
anc = anc.merge(date, on='Site Code', how='left')
anc['year'] = pd.DatetimeIndex(anc['Date']).year
anc['year'] = anc['year'].astype('Int64')
# plot size: the first run of digits found in 'Plot Dimensions'.
# The pattern is a raw string so that '\d' reaches the regex engine as-is;
# in a plain string it is an invalid escape sequence that newer Python
# versions warn about. expand=False returns a Series for the single group.
anc['plot_radius_m'] = anc['Plot Dimensions'].str.extract(r'(\d+)', expand=False).astype(int)
# dataset source
anc['source'] = 'AKVEG_AIM_NPRA'
anc = anc.rename(columns={'Latitude':'latitude','Longitude':'longitude'})

In [21]:
# keep only the columns that go into the exported tables
anc = anc.loc[:, ['Site Code', 'latitude', 'longitude', 'year', 'plot_radius_m', 'source']]

In [22]:
# preview the assembled ancillary table
anc

Unnamed: 0,Site Code,latitude,longitude,year,plot_radius_m,source
0,AB-1B,68.294218,-156.035416,2012,55,AKVEG_AIM_NPRA
1,AB-5B,68.328156,-156.216436,2012,55,AKVEG_AIM_NPRA
2,ADST-2,68.242462,-156.117486,2012,30,AKVEG_AIM_NPRA
3,ADST-21,68.721845,-156.470093,2013,30,AKVEG_AIM_NPRA
4,ADST-33,68.707473,-156.588315,2013,30,AKVEG_AIM_NPRA
...,...,...,...,...,...,...
253,GMT2-140,70.219950,-151.589810,,30,AKVEG_AIM_NPRA
254,GMT2-144,70.172090,-151.643930,,30,AKVEG_AIM_NPRA
255,GMT2-147,70.288100,-151.289060,,30,AKVEG_AIM_NPRA
256,GMT2-151,70.289540,-151.276000,,30,AKVEG_AIM_NPRA


## Create total cover file (non-merged)

In [23]:
# Start from the non-standard table: drop the fungus column, index by site,
# clear the leftover columns name, then order the columns alphabetically.
totcover_nm = (
    grouped_nonstand
    .drop(columns=['fungus cover (%)'])
    .set_index('Site Code')
    .rename_axis(columns=None)
)
totcover_nm = totcover_nm[sorted(totcover_nm.columns.tolist())]

In [24]:
# turn '<pft> cover (%)' into '<pft> total cover (%)' in every column name
cols = [x.replace('cover (%)' , 'total cover (%)') for x in totcover_nm.columns.tolist()]
totcover_nm.columns = cols

In [25]:
# add ancillary columns, keeping every site of the cover table (left join)
totcover_nm = pd.merge(totcover_nm, anc, on='Site Code', how='left')

In [26]:
# index the merged table by site again
totcover_nm = totcover_nm.set_index('Site Code')

In [27]:
# export the non-merged (original habits) total cover table
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'AKVEG_AIM_NPRA_fcover_total_orig.csv'
totcover_nm.to_csv(f'{p}{f}')

## Create total cover file (merged)

In [28]:
# Start from the standard-PFT table: drop the fungus column, index by site,
# clear the leftover columns name, then order the columns alphabetically.
totcover = (
    standard_groups
    .drop(columns=['fungus cover (%)'])
    .set_index('Site Code')
    .rename_axis(columns=None)
)
totcover = totcover[sorted(totcover.columns.tolist())]

In [29]:
# turn '<pft> cover (%)' into '<pft> total cover (%)' in every column name
cols = [x.replace('cover (%)' , 'total cover (%)') for x in totcover.columns.tolist()]
totcover.columns = cols

In [30]:
# add ancillary columns, keeping every site of the cover table (left join)
totcover = pd.merge(totcover, anc, on='Site Code', how='left')

In [31]:
# index the merged table by site again
totcover = totcover.set_index('Site Code')

In [33]:
# export the merged (standard PFT) total cover table
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'AKVEG_AIM_NPRA_fcover_total_merged.csv'
totcover.to_csv(f'{p}{f}')

## Create top cover file from existing summary

In [34]:
# rename columns and set index
# Bare soil and rock are both renamed to 'bare ground top cover (%)', so that
# label appears twice until the duplicate columns are summed below.
topcover = nonveg.rename(columns={'BareSoilCover':'bare ground top cover (%)',
                                  'FH_TotalLitterCover':'litter top cover (%)',
                                  'FH_RockCover':'bare ground top cover (%)',
                                  'FH_WaterCover':'water top cover (%)',
                                  'PlotID':'Site Code'})
topcover = topcover.set_index('Site Code')

# sort and sum duplicate columns
topcover = topcover[sorted(['bare ground top cover (%)',
                            'litter top cover (%)',
                            'water top cover (%)'])]
# groupby(..., axis=1) is deprecated in pandas 2.1 and removed in 3.0, so
# group the transposed frame by column label and transpose back.
topcover = topcover.T.groupby(level=0).sum().T

In [35]:
# Attach the non-vegetated top cover to the sites of the merged total-cover
# table (left join keeps every totcover site), keep only the three non-PFT
# columns, then attach the ancillary columns.
temp = totcover.merge(topcover, on='Site Code', how='left')
nonvegtop = temp[['bare ground top cover (%)', 'litter top cover (%)', 'water top cover (%)']]
# NOTE(review): no `how` is given, so this is pandas' default inner join,
# unlike the how='left' merges used for the total-cover tables -- confirm.
topcover = nonvegtop.merge(anc, on='Site Code')

In [36]:
# index the top-cover table by site again
topcover = topcover.set_index('Site Code')

In [37]:
# export the non-PFT top cover table
# NOTE(review): this filename says 'NRPA' while the two total-cover exports
# and the `source` value say 'NPRA' -- likely a typo. Left unchanged here
# because downstream readers may expect this exact name.
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'AKVEG_AIM_NRPA_fcover_top_nonPFT.csv'
topcover.to_csv(f'{p}{f}')

## Misc

In [129]:
# # adjust GMT-2 site code names
# nonull_proj = nonveg[~nonveg['ProjectName'].isna()]
# gmt2_plots = nonull_proj[nonull_proj['ProjectName'].str.contains('GMT2')]
# gmt2_plots = gmt2_plots.copy()
# gmt2_plots['PlotID2'] = 'GMT2-' + gmt2_plots['PlotID'].astype(str)
# new_gmt_ids = gmt2_plots[['PlotID', 'PlotID2']]

# # add new IDs to original non-veg data
# nonveg2 = nonveg.merge(new_gmt_ids, on='PlotID', how='left')
# nonveg2['PlotID2'].fillna(nonveg2['PlotID'], inplace=True)
# nonveg2['PlotID'] = nonveg2['PlotID2']