In [15]:
import pandas as pd
from rapidfuzz import process, fuzz
import re
import sys
import importlib
import os
import csv
from pathlib import Path

# Treat the parent of the current working directory as the project root so the
# helper package that lives there can be imported.
project_root = Path.cwd().parent

# Put the root at the front of the import search path (only once).
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Import the helper module, then reload it so edits made to the file since the
# first import are picked up without restarting the kernel.
import pavc_funcs.standardize_pft_funcs as spf
importlib.reload(spf)

<module 'pavc_funcs.standardize_pft_funcs' from '/mnt/poseidon/remotesensing/arctic/alaska_pft_fcover_harmonization/pavc_funcs/standardize_pft_funcs.py'>

In [16]:
# data sources to combine; each name is used to build the per-source CSV path when loading
sources = ['abr', 'akveg', 'ava', 'neon', 'nga']

In [17]:
# Read every source's non-standardized species fcover table, then stack the
# tables into a single frame (row labels from each file are kept as-is).
dfs = [
    pd.read_csv(f'../data/plot_data/{source}/output_data/{source}_nonstandard_species_fcover.csv')
    for source in sources
]
fcover_data = pd.concat(dfs)
fcover_data.head(3)

Unnamed: 0,plotVisit,datasetSpeciesName,standardHabit,nonstandardHabit,percentCover
0,u165edeef6,Litter,litter,litter,18.54
1,u165edeef6,Hylocomium splendens,bryophyte,moss,15.89
2,u165edeef6,Carex bigelowii,graminoid,graminoid,11.26


In [18]:
# The aux table lists the plot visits to keep; any fcover row whose plotVisit
# is not among its visit_id values is dropped.
aux = pd.read_csv('../pavc/survey_unit_information.csv')
keep_visit = fcover_data['plotVisit'].isin(aux['visit_id'])
fcover_data_subsel = fcover_data[keep_visit]

# number of distinct plot visits that remain
len(fcover_data_subsel['plotVisit'].unique())

976

In [19]:
# preview the rows retained after filtering to plot visits listed in the aux table
fcover_data_subsel.head(3)

Unnamed: 0,plotVisit,datasetSpeciesName,standardHabit,nonstandardHabit,percentCover
0,u165edeef6,Litter,litter,litter,18.54
1,u165edeef6,Hylocomium splendens,bryophyte,moss,15.89
2,u165edeef6,Carex bigelowii,graminoid,graminoid,11.26


In [20]:
# load the species-to-PFT checklist, used to assign accepted species names to dataset species names
checklist = pd.read_csv('../pavc/species_pft_checklist.csv')
# show the available columns so the ones needed for the name lookup can be picked
checklist.columns

Index(['dataset_species_name', 'accepted_species_name',
       'accepted_species_name_author', 'visit_id', 'data_source',
       'data_subsource', 'taxon_rank', 'naming_authority', 'category', 'habit',
       'pft', 'nonstandard_pft'],
      dtype='object')

In [21]:
# Build the dataset-name -> accepted-name lookup.
# The checklist also carries per-visit / per-source columns, so the same
# (dataset name, accepted name) pair can sit on many rows. Without
# de-duplication the left merge below would repeat each matching fcover record
# once per duplicate checklist row, inflating the output table.
checklist = checklist[['dataset_species_name', 'accepted_species_name']].drop_duplicates()

# NOTE(review): if a single dataset name maps to several *different* accepted
# names, rows will still fan out here; consider merge(..., validate='many_to_one')
# once the checklist is confirmed to be unambiguous.
species_fcover = (
    fcover_data_subsel
    # left join keeps every fcover row, even when no accepted name is found
    .merge(checklist, how='left', left_on='datasetSpeciesName', right_on='dataset_species_name')
    # switch to the column names used by the synthesized table
    .rename(columns={'plotVisit': 'visit_id', 'percentCover': 'fcover'})
    # keep only the columns that are published
    [['visit_id', 'accepted_species_name', 'fcover']]
)

In [22]:
# count distinct plot visits after the checklist merge (the merge is a left join, so no visit should be lost)
len(species_fcover['visit_id'].unique())

976

In [23]:
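# Disabled one-off scaffold: prints an empty unit/definition template for every
# column of species_fcover (presumably used to draft the `dd_info` dictionary).
# import json
# UNITS_AND_DEFS = { col: {"unit": "", "definition": ""} for col in species_fcover.columns }
# print(json.dumps(UNITS_AND_DEFS, indent=4))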

In [24]:
# Data-dictionary metadata keyed by column name; each entry supplies the
# "unit" and "definition" written to the data dictionary for that column.
# NOTE(review): the integer 1 presumably marks a unitless column — confirm
# against the data dictionary convention in use.
dd_info = {
    "visit_id": {
        "unit": 1,
        "definition": "A unique identifier representing the survey unit visitation at a particular survey unit; some survey units are visited more than once"
    },
    "accepted_species_name": {
        "unit": 1,
        "definition": "From Nawrocki et al. (2020) Checklist of Vascular Plants, Bryophytes, Lichens, and Lichenicolous Fungi of Alaska, an accepted name of a species determined by a naming authority"
    },
    "fcover": {
        "unit": "percent",
        "definition": "Percent total cover of a vegetation type or species, or percent top cover of non-vegetation"
    }
}

In [25]:
import pandas as pd
import types
import typing
from typing import get_origin, get_args

from pavc_funcs.schemas import SCHEMAS

# Schema model describing the columns of the synthesized species fcover table.
SpeciesFcoverSchema = SCHEMAS["synthesized_species_fcover"]
SynthesizedAux = SpeciesFcoverSchema  # original (misleading) name kept as an alias

# Optional[X] / Union[X, None] report typing.Union as their origin, while the
# PEP 604 spelling `X | None` reports types.UnionType on Python 3.10+.
# Accept both so that either spelling in the schema maps to the same dtype.
_UNION_ORIGINS = tuple(
    u for u in (typing.Union, getattr(types, "UnionType", None)) if u is not None
)


def _pandas_dtype_for(annotation):
    """Return the pandas dtype name to cast a schema field to.

    Parameters
    ----------
    annotation
        The type annotation of a schema field (e.g. ``str``,
        ``Optional[int]``, ``float | None``, ``List[str]``).

    Returns
    -------
    str
        ``"string"``, ``"float64"``, ``"Int64"`` or ``"object"``.
    """
    # 1) Explicit primitives
    if annotation is str:
        return "string"
    if annotation is float:
        return "float64"
    if annotation is int:
        return "Int64"  # nullable integer, so missing values survive the cast

    origin = get_origin(annotation)

    # 2) Optionals: drop NoneType and map whatever single type remains
    if any(origin is u for u in _UNION_ORIGINS):
        members = [a for a in get_args(annotation) if a is not type(None)]
        if len(members) == 1:
            return _pandas_dtype_for(members[0])
        return "string"  # genuine multi-type unions are stored as text

    # 3) Lists stay object
    if origin is list:
        return "object"

    # 4) Everything else → string
    return "string"


# Target dtype for every field declared on the schema.
dtype_map = {
    name: _pandas_dtype_for(field.annotation)
    for name, field in SpeciesFcoverSchema.model_fields.items()
}

# cast in one go
species_fcover = species_fcover.astype(dtype_map)

# Build the data dictionary: one row per column, with unit/definition taken
# from dd_info (blank when a column has no entry) and the dtype read back from
# the frame after casting.
_blank_meta = {"unit": "", "definition": ""}
rows = []
for col in species_fcover.columns:
    meta = dd_info.get(col, _blank_meta)
    rows.append({
        "column_name":      col,
        "unit":             meta["unit"],
        "definition":       meta["definition"],
        "column_long_name": col,
        "data_type":        species_fcover[col].dtype.name
    })

dd_df = pd.DataFrame(rows, columns=[
    "column_name","unit","definition","column_long_name","data_type"
])

In [29]:
# preview the first rows of the data dictionary
dd_df.head(3)

Unnamed: 0,column_name,unit,definition,column_long_name,data_type
0,visit_id,1,A unique identifier representing the survey un...,visit_id,string
1,accepted_species_name,1,From Nawrocki et al. (2020) Checklist of Vascu...,accepted_species_name,string
2,fcover,percent,Percent total cover of a vegetation type or sp...,fcover,float64


In [26]:
# pass the table through the project's validator and keep whatever it returns
# NOTE(review): presumably checks the frame against the synthesized_species_fcover schema — confirm in pavc_funcs.standardize_pft_funcs
species_fcover = spf.validate_synthesized_species_fcover(species_fcover)

In [27]:
# Make visit_id the row index (rebinding the name rather than mutating in place).
species_fcover = species_fcover.set_index('visit_id')
species_fcover.head(3)

Unnamed: 0_level_0,accepted_species_name,fcover
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
u165edeef6,litter,18.54
u165edeef6,Hylocomium splendens,15.89
u165edeef6,Carex bigelowii,11.26


In [28]:
# Export the synthesized table: UTF-8 with a byte-order mark, every field quoted.
# The index is written too (to_csv default), so it forms the first CSV column.
species_fcover.to_csv(
    '../pavc/synthesized_species_fcover.csv',
    encoding='utf-8-sig',
    quoting=csv.QUOTE_ALL,
)

In [30]:
# Export the data dictionary with the same encoding and quoting as the data
# file; the positional row index carries no information, so it is left out.
dd_df.to_csv(
    '../pavc/synthesized_species_fcover_dd.csv',
    encoding='utf-8-sig',
    quoting=csv.QUOTE_ALL,
    index=False,
)