In [1]:
import pandas as pd
import numpy as np
import standardize_pft_funcs as spf
import geopandas as gpd

In [2]:
# Directory passed as OUTP to spf.get_unique_species below.
out_path = "/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/"

# Read fcover, checklist, and ancillary data for AKVEG NPR-A

In [3]:
# Load the AKVEG NPR-A inputs: species checklist, foliar cover, ancillary
# site table, and the BLM AIM TerrADat table (source of the non-PFT cover
# columns further down).
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS'
checklist = pd.read_csv(p + '/AKVEG_species_checklist.csv')
cover = pd.read_csv(p + '/AKVEG_foliar_cover.csv', index_col=0)
ancillary = pd.read_csv(p + '/AKVEG_ancillary.csv', index_col=0)
nonveg = pd.read_csv(p + '/BLM_Natl_AIM_TerrADat_Hub.csv')

In [4]:
# number of distinct site codes in the foliar cover table (NaN counted, as with unique())
cover['Site Code'].nunique(dropna=False)

185

In [5]:
# remove species names with 0 cover
# Work on a copy so the raw `cover` table stays untouched.
nonnan_cover = cover.copy()
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: under pandas Copy-on-Write the chained in-place call
# modifies a temporary and leaves the frame unchanged.
nonnan_cover['Cover'] = nonnan_cover['Cover'].replace(0, np.nan)
# Rows with zero cover (now NaN) and rows that were already NaN are dropped.
nonnan_cover = nonnan_cover.dropna(subset=['Cover'])

## Get unique species names

In [6]:
# Unique accepted names among records with non-zero cover; SAVE=True also
# writes the list under out_path (see the message printed below).
species = spf.get_unique_species(
    nonnan_cover,
    'Accepted Name',
    'AKVEG',
    SAVE=True,
    OUTP=out_path,
)

Saved unique species list to /mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS//AKVEG_unique_species.csv.


## Get habit information

In [7]:
# Build the same 'Mapping Name' key (first 2 words, genus-species, of the
# accepted name) on both the checklist and the unique-species table.
for frame in (checklist, species):
    frame['Mapping Name'] = frame['Accepted Name'].apply(spf.get_substrings)

In [8]:
# Look up a habit for every unique species from the checklist.
habits = spf.fill_habits(
    unique_species=species,
    checklist=checklist,
    u_name='Accepted Name',
    c_unofficial_name='Name',
    c_official_name='Accepted Name',
    mapping_name='Mapping Name',
    habit='Habit',
)

53 species are missing habits.
50 species still missing habits.
6 species still missing habits.
6 species still missing habits.


## Add evergreen/deciduous

In [9]:
# Headerless two-column lookup table; columns are named here before the
# leaf-habit information is attached to the habit table.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AK-AVA_post2000/'
f = 'evergreendecid_macander2022.csv'
evergrndecid = pd.read_csv(f'{p}{f}', header=None)
evergrndecid.columns = ['evergreendecid', 'species']
final = spf.add_leaf_habit(habits, evergrndecid)

In [10]:
# export the full species/habit table
# `p` set here is also the output directory for the shrub / non-shrub /
# null-habit exports in the following cells.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'
f = 'AKVEG_species_habit_00.csv'
final.to_csv(f'{p}{f}')

In [11]:
# distinct 'Potential Habit' values, including NaN for species without one
final['Potential Habit'].unique().tolist()

['moss',
 'forb',
 'lichen',
 nan,
 'dwarf shrub',
 'graminoid',
 'liverwort',
 'shrub',
 'spore-bearing',
 'dwarf shrub, shrub',
 'shrub, tree, dwarf shrub, deciduous tree, shrub']

## Export shrubs, nonshrubs, null habit separately

In [12]:
# get all shrub species
# Null habits are filtered out first so the string match below never sees NaN.
has_habit = final['Potential Habit'].notnull()
finalnonnull = final[has_habit]
is_shrub = finalnonnull['Potential Habit'].str.contains('shrub')
shrubs = finalnonnull[is_shrub]
shrubs.to_csv(f'{p}AKVEG_shrubs_00.csv')

In [13]:
# get all non-shrub species (habit present but does not mention 'shrub')
not_shrub = ~finalnonnull['Potential Habit'].str.contains('shrub')
nonshrubs = finalnonnull[not_shrub]
nonshrubs.to_csv(f'{p}AKVEG_nonshrubs_00.csv')

In [14]:
# get null habits (species for which no habit could be filled)
missing_habit = final['Potential Habit'].isna()
finalnull = final[missing_habit]
finalnull.to_csv(f'{p}AKVEG_nullhabit_00.csv')

In [15]:
# the three exported subsets together should account for every row of `final`
sum(len(part) for part in (finalnull, nonshrubs, shrubs))

384

In [16]:
# total number of rows in the species/habit table, for comparison with the sum above
final.shape[0]

384

## HERE: Manually add data from shrub05 file

## Standardize PFT Schema

In [17]:
# Re-read the habit tables. The shrub and null-habit files carry the `_01`
# suffix, while the non-shrub file is the `_00` export written above.
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'
nonshrub = 'AKVEG_nonshrubs_00.csv'
null = 'AKVEG_nullhabit_01.csv'
shrub = 'AKVEG_shrubs_01.csv'

alltypes = [nonshrub, shrub, null]

# One frame per file, in `alltypes` order. A comprehension is used so that no
# notebook-level name is rebound: the previous loop reassigned `habits`, which
# silently replaced the frame returned by spf.fill_habits and made the
# leaf-habit cell produce different results when re-run.
d = [pd.read_csv(p + t, index_col=0) for t in alltypes]

In [18]:
# stack the three habit tables row-wise, then list the distinct habits present
completehabits = pd.concat(d, axis=0)
list(completehabits['Habit'].unique())

['moss',
 'forb',
 'lichen',
 'graminoid',
 'liverwort',
 'spore-bearing',
 'dwarf shrub',
 'shrub',
 'shrub, tree',
 'algae',
 'fungus']

In [19]:
def standardize_habit_nonstandard(habitrow, leafrow, heightrow):
    """Build a '<label> cover (%)' column name per species, keeping the
    original habit names for everything that is not a shrub.

    Parameters
    ----------
    habitrow, leafrow, heightrow : pandas.Series
        Parallel series of habit, leaf retention and height class. Every
        habit must be a string because ``.lower()`` is called on it.

    Returns
    -------
    list of str
        One lower-cased, whitespace-normalized label per input element:
        'shrub, tree'  -> '<leaf> dwarf to tree cover (%)'
        'dwarf shrub'  -> '<leaf> dwarf shrub cover (%)'
        other shrubs   -> '<leaf> <height> cover (%)'
        anything else  -> '<habit> cover (%)'
    """

    def _label(raw_habit, leaf, height):
        name = raw_habit.lower()
        if name == 'shrub, tree':
            text = f'{leaf} dwarf to tree cover (%)'
        elif name == 'dwarf shrub':
            text = f'{leaf} {name} cover (%)'
        elif 'shrub' in name:
            text = f'{leaf} {height} cover (%)'
        else:
            text = f'{name} cover (%)'
        # collapse repeated whitespace and normalize case
        return " ".join(text.split()).lower()

    habits = habitrow.to_numpy()
    leaves = leafrow.to_numpy()
    heights = heightrow.to_numpy()
    return [_label(h, lf, ht) for h, lf, ht in zip(habits, leaves, heights)]

In [20]:
def standardize_habit(habitrow, leafrow, heightrow):
    """Map each species' habit to a standardized PFT column name.

    Parameters
    ----------
    habitrow, leafrow, heightrow : pandas.Series
        Parallel series of habit, leaf retention and height class.

    Returns
    -------
    list of str
        One lower-cased, whitespace-normalized '<pft> cover (%)' label per
        input element. Missing habits (NaN/None) and habits containing
        'unknown' map to 'unknown cover (%)'.

    Notes
    -----
    ``leaf`` and ``height`` are interpolated as-is for shrub habits, so a
    missing value there ends up as the text 'nan' in the label.
    """

    habitrow = habitrow.to_numpy()
    leafrow = leafrow.to_numpy()
    heightrow = heightrow.to_numpy()
    new_row = []

    for habit, leaf, height in zip(habitrow, leafrow, heightrow):
        # A missing habit must be detected before calling .lower(): NaN is a
        # float (no .lower()), and `habit == np.nan` is always False.
        if pd.isna(habit):
            new_row.append('unknown cover (%)')
            continue

        habit = habit.lower()
        if 'algae' in habit:
            new_habit = 'lichen cover (%)'
        elif 'unknown' in habit:
            new_habit = 'unknown cover (%)'
        elif 'moss' in habit or 'liverwort' in habit:
            new_habit = 'bryophyte cover (%)'
        elif 'spore-bearing' in habit:
            new_habit = 'forb cover (%)'
        elif habit == 'grass':
            new_habit = 'graminoid cover (%)'

        # shrubs: exact matches first, then any remaining habit mentioning 'shrub'
        elif habit == 'shrub, tree':
            new_habit = f'{leaf} dwarf to tree cover (%)'
        elif habit == 'dwarf shrub':
            new_habit = f'{leaf} {habit} cover (%)'
        elif 'shrub' in habit:
            new_habit = f'{leaf} {height} cover (%)'

        # non-vegetation classes
        elif 'litter' in habit or 'scat' in habit:
            new_habit = 'litter cover (%)'
        elif 'bare ground' in habit or 'crust' in habit or 'mineral' in habit:
            new_habit = 'bare ground cover (%)'
        else:
            new_habit = f'{habit} cover (%)'
        # collapse repeated whitespace and normalize case
        new_row.append(" ".join(new_habit.split()).lower())

    return new_row

In [21]:
# standardized PFT label for every row of the combined habit table
standard_habit_col = standardize_habit(
    habitrow=completehabits['Habit'],
    leafrow=completehabits['Leaf Retention'],
    heightrow=completehabits['Height'],
)

In [22]:
# non-standardized PFT label (original habit names kept) for every row
nonstandard_habit_col = standardize_habit_nonstandard(
    habitrow=completehabits['Habit'],
    leafrow=completehabits['Leaf Retention'],
    heightrow=completehabits['Height'],
)

In [23]:
# attach both label lists as new columns (lists are aligned by position)
for col_name, labels in (('Standard Habit', standard_habit_col),
                         ('NonStandard Habit', nonstandard_habit_col)):
    completehabits[col_name] = labels

In [24]:
# export the habit table with both label columns
p = '/mnt/poseidon/remotesensing/arctic/data/vectors/AKVEG_ACCS/'
f = 'AKVEG_species_habit_standardized.csv'
completehabits.to_csv(f'{p}{f}')

## Join habit to fcover data

In [25]:
# add vegetation fcover and sum to PFT level
def add_cover(cover_df, pft_groups, habit_col):
    """Attach PFT labels to species cover records and total cover per site/PFT.

    Parameters
    ----------
    cover_df : pandas.DataFrame
        Cover records with 'Site Code', 'Accepted Name' and 'Cover' columns.
    pft_groups : pandas.DataFrame
        Species table with a 'Name' column and the ``habit_col`` column.
    habit_col : str
        Name of the label column in ``pft_groups`` to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('Site Code', habit_col) with a single 'Cover' column
        holding the summed cover.
    """
    records = cover_df.reset_index()
    # left join: every cover record is kept, matched on accepted name -> Name
    labelled = records.merge(
        pft_groups,
        how='left',
        left_on='Accepted Name',
        right_on='Name',
    )
    per_site_pft = labelled.groupby(['Site Code', habit_col], group_keys=True)
    return per_site_pft.agg({'Cover': 'sum'})

In [26]:
# flatten multilevel
def flatten_multilevel(grouped):
    """Return ``grouped`` with single-level columns and its index as columns.

    Only the first level of the column labels is kept, then the row index
    is moved into ordinary columns via ``reset_index()``.

    The input frame is left untouched: the work is done on a copy, so the
    caller's column labels are no longer overwritten as a side effect.

    Parameters
    ----------
    grouped : pandas.DataFrame
        Typically the output of a groupby aggregation.

    Returns
    -------
    pandas.DataFrame
        New frame with flat columns and a default integer index.
    """
    flat = grouped.copy()
    flat.columns = flat.columns.get_level_values(0)
    return flat.reset_index()

In [27]:
# transpose
def transpose_df(grouped, habit_col):
    """Reshape long (site, label, cover) rows into one row per site.

    Parameters
    ----------
    grouped : pandas.DataFrame
        Frame with 'Site Code', ``habit_col`` and value column(s).
    habit_col : str
        Column whose values become the output column names.

    Returns
    -------
    pandas.DataFrame
        One row per 'Site Code' (as a regular column) and one column per
        distinct ``habit_col`` value; combinations absent from the input
        are NaN.
    """
    keyed = grouped.set_index(['Site Code', habit_col])
    # stack the value column(s) into a third index level, then move the
    # label level and that value level out to the columns
    long_form = keyed.stack()
    wide = long_form.unstack([1, 2])
    # keep only the label level as the column names
    wide.columns = wide.columns.get_level_values(0)
    return wide.reset_index()

In [28]:
def add_standard_cols(groups):
    """Ensure every standard PFT cover column exists on ``groups``.

    Columns from the standard list that are missing are appended (in list
    order) and filled with NaN. The frame is modified in place and the
    same object is returned.

    Parameters
    ----------
    groups : pandas.DataFrame
        Per-site cover table with one column per PFT label.

    Returns
    -------
    pandas.DataFrame
        ``groups`` itself, now containing all standard columns.
    """

    # required columns
    necessary_cols = [
        'deciduous dwarf shrub cover (%)',
        'deciduous dwarf to low shrub cover (%)',
        'deciduous dwarf to tall shrub cover (%)',
        'deciduous dwarf to tree cover (%)',
        'deciduous tree cover (%)',
        'evergreen dwarf shrub cover (%)',
        'evergreen dwarf to low shrub cover (%)',
        'evergreen dwarf to tall shrub cover (%)',
        'evergreen dwarf to tree cover (%)',
        'evergreen tree cover (%)',
        'bryophyte cover (%)',
        'forb cover (%)',
        'graminoid cover (%)',
        'lichen cover (%)',
    ]

    # add missing columns and fill with nan
    present = groups.columns.tolist()
    missing = [name for name in necessary_cols if name not in present]
    groups[missing] = np.nan
    return groups

In [29]:
# Standardized PFTs: sum cover per site/PFT, reshape to one row per site,
# then make sure every standard column exists.
grouped_standard = (
    add_cover(cover, completehabits, 'Standard Habit')
    .pipe(flatten_multilevel)
    .pipe(transpose_df, 'Standard Habit')
)
# add_standard_cols works in place and returns the same frame
standard_groups = add_standard_cols(grouped_standard)

In [30]:
# Non-standardized PFTs: same pipeline without the standard-column padding.
grouped_nonstand = (
    add_cover(cover, completehabits, 'NonStandard Habit')
    .pipe(flatten_multilevel)
    .pipe(transpose_df, 'NonStandard Habit')
)

## Read ancillary table

In [44]:
# Build the per-site ancillary table (coordinates, year, plot size, source).
anc = ancillary.copy()
# year: take the first recorded date of each site from the cover table
date = cover.groupby('Site Code')['Date'].first().reset_index()
anc = anc.merge(date, on='Site Code', how='left')
anc['year'] = pd.DatetimeIndex(anc['Date']).year
# nullable integer so sites without a date stay missing instead of becoming float
anc['year'] = anc['year'].astype('Int64')
# plot size: first run of digits found in 'Plot Dimensions'
# NOTE(review): the value is stored as a radius in metres — confirm that is
# what the leading number in 'Plot Dimensions' represents.
anc['plot_radius_m'] = anc['Plot Dimensions']
# raw string avoids the invalid "\d" escape warning; expand=False returns a
# Series (one capture group) rather than a one-column DataFrame
anc['plot_radius_m'] = anc['plot_radius_m'].str.extract(r'(\d+)', expand=False).astype(int)
# dataset source
anc['source'] = 'AKVEG_AIM_NPRA'
anc = anc.rename(columns={'Latitude':'latitude','Longitude':'longitude'})

In [45]:
# keep only the site key and the standardized ancillary fields
anc = anc.loc[:, ['Site Code', 'latitude', 'longitude', 'year', 'plot_radius_m', 'source']]

In [46]:
# index by site so the table can be aligned with the cover tables via concat
anc = anc.set_index('Site Code')

## Create top cover file from existing summary

In [47]:
# rename columns and set index
# Bare soil and rock are deliberately given the same label so they can be
# summed into a single bare-ground column below.
topcover = (
    nonveg
    .rename(columns={'BareSoilCover':'bare ground top cover (%)',
                     'FH_TotalLitterCover':'litter cover (%)',
                     'FH_RockCover':'bare ground top cover (%)',
                     'FH_WaterCover':'water top cover (%)',
                     'PlotID':'Site Code'})
    .set_index('Site Code')
)

# sort and sum duplicate columns
topcover = topcover[sorted(['bare ground top cover (%)',
                            'litter cover (%)',
                            'water top cover (%)'])]
# groupby(..., axis=1) is deprecated in pandas; grouping the transposed frame
# by its row labels and transposing back gives the same per-label sums.
topcover = topcover.T.groupby(level=0).sum().T

In [51]:
# top-cover table without litter, restricted to sites present in `anc`
top_only = topcover.drop(columns=['litter cover (%)'])
topcover1 = pd.concat([top_only, anc], axis=1, join='inner')
topcover1.shape[0]

185

In [52]:
# export
# NOTE(review): this file name spells "NRPA" while the total-cover exports and
# the `source` label use "NPRA" — confirm which spelling downstream readers expect.
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'AKVEG_AIM_NRPA_fcover_top_nonPFT.csv'
topcover1.to_csv(f'{p}{f}')

## Create total cover file (non-merged)

In [53]:
# set index and sort columns
# drop/set_index return new frames, so grouped_nonstand itself is not modified
totcover_nm = (
    grouped_nonstand
    .drop(columns=['fungus cover (%)'])
    .set_index('Site Code')
)
totcover_nm.columns.name = None

# add litter
litter = topcover[['litter cover (%)']]
totcover_nm = pd.concat([litter, totcover_nm], axis=1)
totcover_nm = totcover_nm.loc[:, sorted(totcover_nm.columns)]

In [54]:
# add "total cover" to column names
# every '... cover (%)' label becomes '... total cover (%)'
cols = [name.replace('cover (%)', 'total cover (%)') for name in totcover_nm.columns]
totcover_nm.columns = cols

In [56]:
# add ancillary (inner join keeps only sites present in both tables)
totcover_nm = pd.concat([totcover_nm, anc], join='inner', axis=1)
totcover_nm.shape[0]

185

In [57]:
# export the non-merged total cover table
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'AKVEG_AIM_NPRA_fcover_total_orig.csv'
totcover_nm.to_csv(f'{p}{f}')

## Create total cover file (merged)

In [58]:
# set index and sort columns
# drop/set_index return new frames, so standard_groups itself is not modified
totcover = (
    standard_groups
    .drop(columns=['fungus cover (%)'])
    .set_index('Site Code')
)
totcover.columns.name = None

# add litter
litter = topcover[['litter cover (%)']]
totcover = pd.concat([litter, totcover], axis=1)
totcover = totcover.loc[:, sorted(totcover.columns)]

In [59]:
# add "total cover" to column names
# every '... cover (%)' label becomes '... total cover (%)'
cols = [name.replace('cover (%)', 'total cover (%)') for name in totcover.columns]
totcover.columns = cols

In [60]:
# add ancillary (inner join keeps only sites present in both tables)
totcover = pd.concat([totcover, anc], join='inner', axis=1)
totcover.shape[0]

185

In [62]:
# export the merged (standardized) total cover table
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_04/original_files/'
f = 'AKVEG_AIM_NPRA_fcover_total_merged.csv'
totcover.to_csv(f'{p}{f}')

## combine top and total for test 05

In [67]:
# Side-by-side join of total and top cover for the sites present in both.
# Both inputs already carry the ancillary columns, so those appear twice here.
allcover = pd.concat([totcover, topcover1], axis=1, join='inner')
# cover columns first (alphabetical), then the remaining columns (alphabetical)
c = sorted([col for col in allcover.columns if 'cover (%)' in col])
o = sorted([col for col in allcover.columns if 'cover (%)' not in col])
# Dedicated names are used for the two slices: the previous code bound the
# first one to `cover`, overwriting the raw foliar-cover table that the
# add_cover and ancillary cells read, so re-running them failed.
cover_cols = allcover[c]
other_cols = allcover[o]
final_allcover = pd.concat([cover_cols, other_cols], axis=1)
# keep the first occurrence of each duplicated column name
final_allcover = final_allcover.loc[:,~final_allcover.columns.duplicated()].copy()
final_allcover

Unnamed: 0_level_0,bare ground top cover (%),bryophyte total cover (%),deciduous dwarf shrub total cover (%),deciduous dwarf to low shrub total cover (%),deciduous dwarf to tall shrub total cover (%),deciduous dwarf to tree total cover (%),deciduous tree total cover (%),evergreen dwarf shrub total cover (%),evergreen dwarf to low shrub total cover (%),evergreen dwarf to tall shrub total cover (%),...,forb total cover (%),graminoid total cover (%),lichen total cover (%),litter total cover (%),water top cover (%),latitude,longitude,plot_radius_m,source,year
Site Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FMTT-5B,0.000000,3.0,0.7,13.5,,,,21.8,21.7,,...,2.3,37.4,15.2,4.081633,0.00,68.873887,-156.403748,30,AKVEG_AIM_NPRA,2012
FWMM-5B,0.000000,2.1,,14.0,,,,0.7,1.3,,...,2.7,76.8,1.3,11.333333,4.00,68.821651,-156.121144,30,AKVEG_AIM_NPRA,2012
AB-1B,20.666667,,,0.0,,,,,,,...,1.4,0.7,12.7,0.000000,0.00,68.294218,-156.035416,55,AKVEG_AIM_NPRA,2012
FMTT-1,0.000000,3.4,,15.0,14.3,,,25.9,11.6,,...,4.7,34.7,0.7,2.040816,0.00,69.357297,-155.814138,30,AKVEG_AIM_NPRA,2012
AFMM-5,0.000000,2.0,0.7,0.0,1.3,,,3.3,0.0,,...,32.7,28.0,,0.000000,4.00,69.827883,-154.564411,30,AKVEG_AIM_NPRA,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CPHCP-78,0.000000,21.5,,3.0,8.1,,,26.6,23.7,,...,1.5,63.0,18.4,0.000000,10.37,70.124101,-160.501601,30,AKVEG_AIM_NPRA,2017
ADST-75,33.340000,3.4,,,0.0,,,40.0,0.0,,...,8.1,5.3,22.8,2.670000,0.00,69.292420,-159.875810,30,AKVEG_AIM_NPRA,2017
ADST-76,0.000000,72.6,8.0,0.7,,,,35.3,,,...,16.0,46.6,4.1,0.670000,0.00,69.210250,-159.875870,30,AKVEG_AIM_NPRA,2017
ADST-79,28.000000,14.7,8.7,2.0,,,,42.6,0.0,,...,16.8,4.0,18.7,3.330000,0.00,69.233760,-159.988460,30,AKVEG_AIM_NPRA,2017


In [69]:
# The same combined table is written under both the parent and child file names.
p = '/mnt/poseidon/remotesensing/arctic/data/training/Test_05/temp_fcover/'
f = 'VEG_fcover_parent.csv'
f1 = 'VEG_fcover_child.csv'
for fname in (f, f1):
    final_allcover.to_csv(p + fname)

## Misc

In [129]:
# # adjust GMT-2 site code names
# nonull_proj = nonveg[~nonveg['ProjectName'].isna()]
# gmt2_plots = nonull_proj[nonull_proj['ProjectName'].str.contains('GMT2')]
# gmt2_plots = gmt2_plots.copy()
# gmt2_plots['PlotID2'] = 'GMT2-' + gmt2_plots['PlotID'].astype(str)
# new_gmt_ids = gmt2_plots[['PlotID', 'PlotID2']]

# # add new IDs to original non-veg data
# nonveg2 = nonveg.merge(new_gmt_ids, on='PlotID', how='left')
# nonveg2['PlotID2'].fillna(nonveg2['PlotID'], inplace=True)
# nonveg2['PlotID'] = nonveg2['PlotID2']