In [29]:
import pandas as pd
from rapidfuzz import process, fuzz
import re
import sys
import importlib
import os
import csv
from pathlib import Path

# 1) Locate the helper scripts: the project root is taken to be the parent of the
#    current working directory (assumes the notebook is launched from a folder
#    directly under the project root -- TODO confirm)
project_root = Path.cwd().parent

# 2) Add that root to sys.path (only once) so the `pavc_funcs` package can be imported
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
import pavc_funcs.standardize_pft_funcs as spf
# reload so that edits to the helper module are picked up without restarting the kernel
importlib.reload(spf)

<module 'pavc_funcs.standardize_pft_funcs' from '/mnt/poseidon/remotesensing/arctic/alaska_pft_fcover_harmonization/pavc_funcs/standardize_pft_funcs.py'>

In [30]:
# data-source identifiers; each one is substituted into the input CSV path when the cover data are loaded
sources = ['abr', 'akveg', 'ava', 'neon', 'nga']

In [31]:
# read the nonstandard fcover table of every source and stack them into one frame
dfs = [
    pd.read_csv(f'../data/plot_data/{source}/output_data/{source}_nonstandard_species_fcover.csv')
    for source in sources
]
fcover_data = pd.concat(dfs)
fcover_data.head(3)

Unnamed: 0,plotVisit,datasetSpeciesName,standardHabit,nonstandardHabit,percentCover
0,u165edeef6,Litter,litter,litter,18.54
1,u165edeef6,Hylocomium splendens,bryophyte,moss,15.89
2,u165edeef6,Carex bigelowii,graminoid,graminoid,11.26


In [32]:
# one row per unique dataset species name: keep the first habit labels seen for
# the name and collect every plot visit in which it was recorded
species_aggregations = {
    'standardHabit': 'first',
    'nonstandardHabit': 'first',
    'plotVisit': list,
}
cover_without_values = fcover_data.drop(columns='percentCover')  # cover values are not needed here
source_names = (
    cover_without_values
    .groupby('datasetSpeciesName', as_index=False)
    .agg(species_aggregations)
)
source_names.head(3)

Unnamed: 0,datasetSpeciesName,standardHabit,nonstandardHabit,plotVisit
0,!Oncophorus demetrii,bryophyte,bryophyte,"[u86f7e437f, ue9d71f5ee, u84a516841, u3c363836..."
1,Abietinella abietina,bryophyte,moss,"[ue971a0c9e, uaa73cfd9b, u35d95359b, uf0f69fc6..."
2,Acarospora schleicheri,lichen,lichen,"[u3e41cd323, ud0ef714b4, u00dda602e, u5e173091..."


In [33]:
# load the akveg checklist; later cells use its Name, Accepted Name and Level
# columns for matching and its remaining columns as attributes of the accepted names
checklist = pd.read_csv('../data/supporting_data/akveg_species_checklist.csv')
checklist.head(3)

Unnamed: 0,Code,Name,Status,Accepted Name,Family,Name Source,Level,Category,Habit
0,abroth,Abrothallus De Not.,accepted,Abrothallus De Not.,Abrothallaceae,CNALH,genus,lichen,lichen
1,abrpar,Abrothallus parmeliarum (Sommerf.) Arnold,accepted,Abrothallus parmeliarum (Sommerf.) Arnold,Abrothallaceae,CNALH,species,lichen,lichen
2,abrpey,Abrothallus peyritschii (Stein) Kotte,accepted,Abrothallus peyritschii (Stein) Kotte,Abrothallaceae,Dillman et al. 2012,species,lichen,lichen


In [34]:
# —————————————————————————————————————————— 1) Split taxon vs author ——————————————————————————————————————————
def split_taxon_author(name):
    """Separate a scientific name into its taxon part and its author citation.

    The name is cut at the first whitespace that is followed by one of the
    author markers below; the markers are tried in this order and the first
    one that produces a split wins:

      1. a parenthetical citation, e.g. "(Piper) M. Peck"
      2. the sensu lato marker "s.l."
      3. multi-letter uppercase initials, e.g. "DC."
      4. a capitalised abbreviation ending in a period, e.g. "Nutt."
      5. a single-letter initial, e.g. "L."

    Returns:
        (taxon, author) with surrounding whitespace stripped; author is ''
        when no marker is found.
    """
    author_start_patterns = (
        r'\s(?=\()',
        r'\s(?=s\.l\.?\b)',
        r'\s(?=[A-Z]{2,}\.)',
        r'\s(?=[A-Z][a-z]+\.)',
        r'\s(?=[A-Z]\.)',
    )
    for pattern in author_start_patterns:
        pieces = re.split(pattern, name, maxsplit=1)
        if len(pieces) == 2:
            taxon, author = pieces
            return taxon.strip(), author.strip()
    # no author marker detected
    return name.strip(), ''

# —————————————————————————————————————————— 2) Clean & normalize ——————————————————————————————————————————
def clean_species_name(name):
    """Reduce a name to lowercase a-z words separated by single spaces.

    After lowercasing, parenthesised text is removed, then every character
    that is not a-z or whitespace is removed, then runs of whitespace are
    collapsed to one space and the ends are trimmed.
    """
    substitutions = (
        (r'\(.*?\)', ''),    # parenthesised text
        (r'[^a-z\s]', ''),   # anything that is not a-z or whitespace
        (r'\s+', ' '),       # runs of whitespace
    )
    cleaned = name.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def normalize_by_rank(name, rank):
    """Clean a name and keep only as many leading words as its rank calls for.

    Args:
        name: the name to normalize; it is cleaned with clean_species_name.
        rank: taxon rank, matched case-insensitively. family/genus keep one
            word, species keeps two, subspecies/variety/form keep four; any
            other rank keeps every word.

    Returns:
        The retained words joined by single spaces.
    """
    words_kept_per_rank = {
        'family':     1,
        'genus':      1,
        'species':    2,
        'subspecies': 4,
        'variety':    4,
        'form':       4
    }
    tokens = clean_species_name(name).split()
    # an unknown rank yields None, and slicing with [:None] keeps all tokens
    limit = words_kept_per_rank.get(rank.lower())
    return ' '.join(tokens[:limit])

# —————————————————————————————————————————— 3) Improved rank inference ——————————————————————————————————————————
def infer_rank_from_name(name):
    """Guess the taxon rank of a name from its rank markers and word count.

    The lowercased name is scanned for purely alphabetic a-z words. The first
    rank marker found decides the result: when it is one of the first two
    words the name is treated as a genus; otherwise 'var' gives 'variety' and
    every other marker gives 'subspecies'. Without a marker, zero or one word
    is a genus, two words a species, and three or more a subspecies.

    Returns:
        One of 'genus', 'species', 'subspecies' or 'variety'.
    """
    # rank reported when the marker appears as the third word or later
    marker_ranks = {
        'var': 'variety',
        'ssp': 'subspecies',
        'subsp': 'subspecies',
        'spp': 'subspecies',
        'species': 'subspecies',
        'sp': 'subspecies',
        's': 'subspecies',
    }
    tokens = re.findall(r'\b[a-z]+\b', name.lower())
    for position, token in enumerate(tokens):
        if token in marker_ranks:
            return marker_ranks[token] if position >= 2 else 'genus'

    # no marker: fall back on the number of words
    word_count = len(tokens)
    if word_count <= 1:
        return 'genus'
    return 'species' if word_count == 2 else 'subspecies'

# —————————————————————————————————————————— 4) Fuzzy‐match ——————————————————————————————————————————
def fallback_fuzzy_match(normalized_key, raw_taxon, ref_df, min_score=85):
    """Match one dataset taxon against the reference checklist, rank by rank.

    Candidate checklist rows are restricted to the dataset name's genus (first
    cleaned word) and are searched one rank at a time, starting with the rank
    inferred for the dataset name and then falling back through related ranks.
    Within a rank an exact match on the normalized name wins outright;
    otherwise the best rapidfuzz partial_ratio candidate is accepted when it
    reaches min_score.

    Args:
        normalized_key: cleaned dataset name that is compared with the
            'normalized' column of ref_df.
        raw_taxon: dataset name without its author; used to derive the genus
            and the inferred rank.
        ref_df: reference frame with the columns 'name_rank', 'taxon_part',
            'normalized', 'accepted_name' and 'dataset_species_name'.
        min_score: minimum fuzzy score (0-100) for a candidate to be accepted.

    Returns:
        (accepted_name, score, matched_rank, matched_checklist_name, audit).
        audit lists (accepted_name, score, rank) for every fuzzy candidate
        examined; it is empty for an exact match. When nothing qualifies the
        result is (None, 0, None, None, audit).
    """
    dataset_tokens = clean_species_name(raw_taxon).split()
    if not dataset_tokens:
        # nothing alphabetic survives cleaning (e.g. "?" or "123"), so there is
        # no genus to filter on; report "no match" instead of raising IndexError
        return None, 0, None, None, []
    dataset_genus = dataset_tokens[0]
    inferred = infer_rank_from_name(raw_taxon)
    rank_priority = {
        'subspecies': ['subspecies','variety','species','genus','family'],
        'variety':    ['variety','subspecies','species','genus','family'],
        'species':    ['species','subspecies','variety','genus','family'],
        'genus':      ['genus','family'],
        'family':     ['family'],
    }

    # The tie-break below compares final epithets. Take the dataset epithet from
    # the normalized key so both sides are lowercase and punctuation-free; the
    # raw name would fail to match on capitalisation or stray punctuation.
    key_tokens = normalized_key.split()
    is_infraspecific = inferred in ('subspecies','variety')

    # the genus of every reference row is the same for all ranks, so compute it once
    same_genus = ref_df['taxon_part'].str.split().str[0].str.lower() == dataset_genus

    audit = []
    for rank in rank_priority.get(inferred, []):
        # enforce same genus for species+ inputs, and genus-only for genus
        subset = ref_df[(ref_df['name_rank'] == rank) & same_genus]
        if subset.empty:
            continue

        ref_map  = dict(zip(subset['normalized'], subset['accepted_name']))
        orig_map = dict(zip(subset['normalized'], subset['dataset_species_name']))

        if normalized_key in ref_map:
            return ref_map[normalized_key], 100, rank, orig_map[normalized_key], []

        cands = process.extract(
            normalized_key, list(ref_map),
            scorer=fuzz.partial_ratio, limit=5
        )
        audit += [(ref_map[n], s, rank) for n, s, _ in cands]
        if not cands or cands[0][1] < min_score:
            continue

        best_score = cands[0][1]
        best = [c for c in cands if c[1] == best_score]

        if is_infraspecific and key_tokens:
            # among equally scored candidates prefer the one with the same final epithet
            for norm, score, _ in best:
                if norm.split()[-1:] == key_tokens[-1:]:
                    return ref_map[norm], score, rank, orig_map[norm], audit
        norm, score, _ = best[0]
        return ref_map[norm], score, rank, orig_map[norm], audit

    return None, 0, None, None, audit

# —————————————————————————————————————————— 5) Main pipeline ——————————————————————————————————————————
def match_species_pipeline(df1, df2, score_threshold=85):
    """Match every dataset species name in df1 to an accepted checklist name.

    Args:
        df1: frame with a 'datasetSpeciesName' column.
        df2: checklist frame with 'Name', 'Accepted Name' and 'Level' columns.
        score_threshold: minimum fuzzy score passed on to fallback_fuzzy_match.

    Returns:
        A copy of df1 with the columns taxon_part, author_part, normalized,
        inferred_rank, accepted_name, match_score, matched_rank,
        matched_checklist_name, audit_candidates and rank_match added.
    """
    def _split_to_series(full_name):
        # (taxon, author) as a Series so that .apply expands it into two columns
        return pd.Series(split_taxon_author(full_name))

    # --- prepare the reference checklist ---
    reference = df2.rename(columns={
        'Name':           'dataset_species_name',
        'Accepted Name':  'accepted_name',
        'Level':          'accepted_rank'
    }).copy()
    reference[['taxon_part','author_part']] = (
        reference['dataset_species_name'].apply(_split_to_series)
    )
    reference['name_rank'] = reference['taxon_part'].apply(infer_rank_from_name)
    reference['normalized'] = reference.apply(
        lambda row: normalize_by_rank(row['taxon_part'], row['name_rank']),
        axis=1
    )

    # --- prepare the dataset names ---
    matched = df1.copy()
    matched[['taxon_part','author_part']] = (
        matched['datasetSpeciesName'].apply(_split_to_series)
    )
    matched['normalized']    = matched['taxon_part'].apply(clean_species_name)
    matched['inferred_rank'] = matched['taxon_part'].apply(infer_rank_from_name)

    # --- match row by row; each row yields a 5-tuple ---
    match_tuples = matched.apply(
        lambda row: fallback_fuzzy_match(
            row['normalized'], row['taxon_part'], reference,
            min_score=score_threshold
        ),
        axis=1
    )
    result_columns = [
        'accepted_name',
        'match_score',
        'matched_rank',
        'matched_checklist_name',
        'audit_candidates'
    ]
    matched[result_columns] = pd.DataFrame(match_tuples.tolist(), index=matched.index)

    # flag whether the match was found at the rank inferred for the dataset name
    matched['rank_match'] = matched['matched_rank'] == matched['inferred_rank']
    return matched

In [35]:
# Example run
# df1: the unique dataset species names (with their standard habit) to be matched
# df2: the checklist columns that match_species_pipeline renames and matches against
df1 = source_names[['datasetSpeciesName', 'standardHabit']]
df2 = checklist[['Name', 'Accepted Name', 'Level']]
result = match_species_pipeline(df1, df2)

In [36]:
# save the automated matches to a temporary file; the DataFrame index is written
# as an unnamed first column (to_csv default); utf-8-sig adds a byte-order mark
result.to_csv('../data/supporting_data/accepted_name_mapping_temp.csv', encoding='utf-8-sig')

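### Create `accepted_name_mapping.csv` from `accepted_name_mapping_temp.csv` so that every dataset species name has an associated accepted name
- No cell in this notebook writes `accepted_name_mapping.csv`; it has to be prepared from the temporary file before the next cell is run
- Should have two columns: `accepted_name` and `datasetSpeciesName` (the names the next cell merges on)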

In [37]:
# Start from the curated mapping (dataset species name -> accepted name), then attach
# the plot visits / habits from source_names and the checklist attributes of each accepted name
pft_checklist = pd.read_csv('../data/supporting_data/accepted_name_mapping.csv', encoding='utf-8', index_col=0)
pft_checklist = pft_checklist.merge(source_names, how='left', left_on='datasetSpeciesName', right_on='datasetSpeciesName')

# Several checklist rows may share one 'Accepted Name' (the accepted name itself plus
# names that resolve to it). Joining on that column directly would multiply rows and let
# the attributes (Code, Level, Name Source, ...) of whichever row comes first win in the
# 'first' aggregation below. Keep exactly one checklist row per accepted name, preferring
# the row whose Name *is* the accepted name.
accepted_rows = (
    checklist
    .assign(_is_synonym=checklist['Name'].ne(checklist['Accepted Name']))
    .sort_values('_is_synonym', kind='stable')   # self-describing rows first, original order otherwise
    .drop_duplicates(subset='Accepted Name')
    .drop(columns='_is_synonym')
)
pft_checklist = pft_checklist.merge(
    accepted_rows, how='left',
    left_on='accepted_name', right_on='Accepted Name',
    validate='many_to_one'   # fail loudly if the right side is ever not unique per accepted name
)
# one row per dataset species name (first non-null value of every column)
pft_checklist = pft_checklist.groupby('datasetSpeciesName').agg('first')
pft_checklist = pft_checklist.drop(columns=['Name', 'Status', 'Accepted Name'])
pft_checklist.head(3)

Unnamed: 0_level_0,accepted_name,standardHabit,nonstandardHabit,plotVisit,Code,Family,Name Source,Level,Category,Habit
datasetSpeciesName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
!Oncophorus demetrii,Oncophorus (Brid.) Brid.,bryophyte,bryophyte,"[u86f7e437f, ue9d71f5ee, u84a516841, u3c363836...",oncoph,Rhabdoweisiaceae,Flora of North America,genus,moss,moss
Abietinella abietina,Abietinella abietina (Hedw.) Fleisch.,bryophyte,moss,"[ue971a0c9e, uaa73cfd9b, u35d95359b, uf0f69fc6...",abiabi,Thuidiaceae,Flora of North America,species,moss,moss
Acarospora schleicheri,Acarospora schleicheri (Ach.) A. Massal.,lichen,lichen,"[u3e41cd323, ud0ef714b4, u00dda602e, u5e173091...",acasch,Acarosporaceae,CNALH,species,lichen,lichen


In [38]:
# list the unique accepted names that received no Level from the checklist
level_is_missing = pft_checklist['Level'].isna()
nan_level = pft_checklist[level_is_missing]
nan_level['accepted_name'].unique()

array(['algae', 'bare ground', 'bryophyte', 'forb', 'other', 'lichen',
       'graminoid', 'Lepraria membranacea (Dicks.) Vain.', 'litter',
       'Pedicularis arctoeuropaea (Hultén) Molau & D.F. Murray',
       'Polyblastia bryophila Lonnr.',
       'Rhizocarpon disporum (Naeg. ex Hepp) Müll. Arg.',
       'Salix brachycarpa Nutt.',
       'Schistidium holmenianum Steere & Brassard', 'water',
       'Xanthoria (Fr.) Th. Fr.'], dtype=object)

In [39]:
# Levels assigned by hand, grouped by the level they receive; every row whose
# 'accepted_name' is one of these names gets that level
type_names = ['algae', 'bare ground', 'bryophyte', 'forb', 'other', 'lichen',
              'graminoid', 'litter', 'water']
species_names = ['Lepraria membranacea (Dicks.) Vain.',
                 'Pedicularis arctoeuropaea (Hultén) Molau & D.F. Murray',
                 'Polyblastia bryophila Lonnr.',
                 'Rhizocarpon disporum (Naeg. ex Hepp) Müll. Arg.',
                 'Salix brachycarpa Nutt.',
                 'Schistidium holmenianum Steere & Brassard']
genus_names = ['Xanthoria (Fr.) Th. Fr.']
level_specs = {
    **dict.fromkeys(type_names, 'type'),
    **dict.fromkeys(species_names, 'species'),
    **dict.fromkeys(genus_names, 'genus'),
}

# replace Level with the hand-assigned value wherever the accepted name is listed
mask = pft_checklist['accepted_name'].isin(level_specs)
assigned_levels = pft_checklist['accepted_name'].map(level_specs)
pft_checklist['Level'] = pft_checklist['Level'].mask(mask, assigned_levels)

In [40]:
# ensure there are no NaN levels: report the check and stop here if any remain,
# because the author split that follows groups rows by Level
has_missing_level = pft_checklist['Level'].isna().any()
print(has_missing_level)
assert not has_missing_level, 'some accepted names still have no Level; add them to level_specs'

False


In [41]:
# the distinct taxon ranks (Level) present; the author split below iterates over them
taxon_ranks = sorted(pft_checklist['Level'].unique())
taxon_ranks

['genus', 'species', 'subspecies', 'type', 'variety']

In [42]:
# Split every accepted name into the bare taxon name and its author citation.
# How many leading words belong to the name depends on the taxon rank (Level).
# NOTE: 'accepted_name' is overwritten in place, so this cell is not idempotent --
# rebuild pft_checklist before running it a second time.

# Ensure the author column exists as object dtype
pft_checklist['accepted_species_name_author'] = pd.Series(dtype='object', index=pft_checklist.index)
pft_checklist_by_taxon = pft_checklist.groupby('Level', group_keys=False)

# ranks whose accepted name carries no author citation
ranks_without_author = ('family', 'type')
# number of leading words that make up the taxon name; the remaining words are the author
# (four words for subspecies/variety: presumably genus, epithet, rank marker, epithet)
name_word_count = {'genus': 1, 'species': 2, 'subspecies': 4, 'variety': 4}

for rank in taxon_ranks:
    if rank not in pft_checklist_by_taxon.groups:
        continue
    # snapshot of the rows at this rank, taken before any of them is rewritten
    group = pft_checklist_by_taxon.get_group(rank)

    if rank in ranks_without_author:
        # No author for family or type
        pft_checklist.loc[group.index, 'accepted_species_name_author'] = None
        continue

    n_words = name_word_count.get(rank)
    if n_words is None:
        print(f'{rank} is an unidentified taxon rank.')
        continue

    full_names = group['accepted_name']
    pft_checklist.loc[group.index, 'accepted_name'] = full_names.apply(
        lambda x: " ".join(x.split()[:n_words]) if pd.notna(x) else x
    )
    pft_checklist.loc[group.index, 'accepted_species_name_author'] = full_names.apply(
        lambda x: " ".join(x.split()[n_words:]) if pd.notna(x) else None
    )

In [43]:
# sanity check: number of distinct dataset species names (pft_checklist is indexed by datasetSpeciesName)
len(pft_checklist.index.unique())

2242

In [44]:
# one row per (dataset species name, plot visit); datasetSpeciesName moves from
# the index back into a regular column
pft_exploded = pft_checklist.explode('plotVisit').reset_index()

In [45]:
# attach the data source / subsource of every plot visit; the right join keeps
# every exploded checklist row, including visits that are absent from aux
aux = pd.read_csv('../pavc/survey_unit_information.csv')
visit_info = aux[['visit_id', 'data_source', 'data_subsource']]
checklist_w_info = (
    visit_info
    .merge(pft_exploded, how='right', left_on='visit_id', right_on='plotVisit')
    .drop(columns=['plotVisit', 'Code', 'Family'])
)
checklist_w_info.head(3)

Unnamed: 0,visit_id,data_source,data_subsource,datasetSpeciesName,accepted_name,standardHabit,nonstandardHabit,Name Source,Level,Category,Habit,accepted_species_name_author
0,u86f7e437f,NGA,NGEE-Arctic Amy Breen (pub 2020),!Oncophorus demetrii,Oncophorus,bryophyte,bryophyte,Flora of North America,genus,moss,moss,(Brid.) Brid.
1,ue9d71f5ee,NGA,NGEE-Arctic Amy Breen (pub 2020),!Oncophorus demetrii,Oncophorus,bryophyte,bryophyte,Flora of North America,genus,moss,moss,(Brid.) Brid.
2,u84a516841,NGA,NGEE-Arctic Amy Breen (pub 2020),!Oncophorus demetrii,Oncophorus,bryophyte,bryophyte,Flora of North America,genus,moss,moss,(Brid.) Brid.


In [46]:
# final column names for the checklist
col_names = {'accepted_name':'accepted_species_name',
             'standardHabit':'pft',
             'nonstandardHabit':'nonstandard_pft',
             'Family':'taxon_family',
             'Name Source':'naming_authority',
             'Level':'taxon_rank',
             'Category':'category',
             'Habit':'habit',
             'datasetSpeciesName':'dataset_species_name'}

checklist_w_info = (
    checklist_w_info
    .rename(columns=col_names)
    # drop species names derived from plots that are not included in the PAVC
    .dropna(subset=['visit_id'])
)

In [47]:
# condense to one row per dataset species name: visit_id, data_source and
# data_subsource become lists of their unique values, every other column
# keeps its first value
def _unique_as_list(values):
    return list(values.unique())

checklist_w_info_02 = (
    checklist_w_info
    .groupby('dataset_species_name')
    .agg({
        'accepted_species_name': 'first',
        'accepted_species_name_author': 'first',
        'visit_id': _unique_as_list,
        'data_source': _unique_as_list,
        'data_subsource': _unique_as_list,
        'taxon_rank': 'first',
        'naming_authority': 'first',
        'category': 'first',
        'habit': 'first',
        'pft': 'first',
        'nonstandard_pft': 'first'
    })
    .reset_index()
)

In [48]:
# number of dataset species names left after dropping rows without a visit_id
len(checklist_w_info_02)

1152

In [49]:
# put the columns in their final order
column_order = [
    'dataset_species_name', 'accepted_species_name', 'accepted_species_name_author',
    'visit_id', 'data_source', 'data_subsource',
    'taxon_rank', 'naming_authority',
    'category', 'habit', 'pft', 'nonstandard_pft',
]
checklist_w_info_02 = checklist_w_info_02[column_order]
checklist_w_info_02.head(3)

Unnamed: 0,dataset_species_name,accepted_species_name,accepted_species_name_author,visit_id,data_source,data_subsource,taxon_rank,naming_authority,category,habit,pft,nonstandard_pft
0,!Oncophorus demetrii,Oncophorus,(Brid.) Brid.,"[u86f7e437f, ue9d71f5ee, u84a516841, u3c363836...",[NGA],[NGEE-Arctic Amy Breen (pub 2020)],genus,Flora of North America,moss,moss,bryophyte,bryophyte
1,Abietinella abietina,Abietinella abietina,(Hedw.) Fleisch.,"[ue971a0c9e, uaa73cfd9b, u35d95359b, uf0f69fc6...","[AKVEG, AVA]","[AIM NPR-A Nawrocki (pub 2020), Atqasuk Villar...",species,Flora of North America,moss,moss,bryophyte,moss
2,Achillea lanulosa,Achillea millefolium,L.,"[u9ffbd5716, u0dd9597d7, u1b3577145, u35b471d5...",[AVA],[Atqasuk Villarreal (pub 2013)],species,FNA,eudicot,forb,forb,forb


In [50]:
# ensure the checklist is formatted properly
# NOTE(review): validate_species_pft_checklist lives in pavc_funcs; what it checks
# or changes is not visible from this notebook
species_pft_checklist = spf.validate_species_pft_checklist(checklist_w_info_02)
# index by dataset species name (in place) for display; the cell that casts
# dtypes later moves it back into a column
species_pft_checklist.set_index('dataset_species_name', inplace=True)
species_pft_checklist.head(3)

Unnamed: 0_level_0,accepted_species_name,accepted_species_name_author,visit_id,data_source,data_subsource,taxon_rank,naming_authority,category,habit,pft,nonstandard_pft
dataset_species_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
!Oncophorus demetrii,Oncophorus,(Brid.) Brid.,"[u86f7e437f, ue9d71f5ee, u84a516841, u3c363836...",[NGA],[NGEE-Arctic Amy Breen (pub 2020)],genus,Flora of North America,moss,moss,bryophyte,bryophyte
Abietinella abietina,Abietinella abietina,(Hedw.) Fleisch.,"[ue971a0c9e, uaa73cfd9b, u35d95359b, uf0f69fc6...","[AKVEG, AVA]","[AIM NPR-A Nawrocki (pub 2020), Atqasuk Villar...",species,Flora of North America,moss,moss,bryophyte,moss
Achillea lanulosa,Achillea millefolium,L.,"[u9ffbd5716, u0dd9597d7, u1b3577145, u35b471d5...",[AVA],[Atqasuk Villarreal (pub 2013)],species,FNA,eudicot,forb,forb,forb


In [51]:
# species_pft_checklist.to_csv('species_pft_checklist.csv', encoding='utf-8-sig', quoting=csv.QUOTE_ALL)

### Create `species_pft_checklist_dd.csv`

In [52]:
# import json
# UNITS_AND_DEFS = { col: {"unit": "", "definition": ""} for col in species_pft_checklist.columns }
# print(json.dumps(UNITS_AND_DEFS, indent=4))

In [53]:
# Data-dictionary entries keyed by column name of the species PFT checklist;
# every entry carries the 'unit' and 'definition' written to the data dictionary CSV.
dd_info = {
    "dataset_species_name": {
        "unit": 1,
        "definition": "The unaccepted species name derived from source survey unit data. These unaccepted names include misspellings, disparate syntax, extra punctuation, encoding issues, etc., that were semi-automatically assigned an accepted species name"
    },
    "accepted_species_name": {
        "unit": 1,
        "definition": "From Nawrocki et al. (2020) Checklist of Vascular Plants, Bryophytes, Lichens, and Lichenicolous Fungi of Alaska, an accepted name of a species determined by a naming authority"
    },
    "accepted_species_name_author": {
        "unit": 1,
        "definition": "From Nawrocki et al. (2020) Checklist of Vascular Plants, Bryophytes, Lichens, and Lichenicolous Fungi of Alaska, the author associated with an accepted species name"
    },
    "visit_id": {
        "unit": 1,
        "definition": "A python list of unique identifiers representing the survey unit visitation at a particular survey unit; some survey units are visited more than once; in Python, the lists in this column can be 'exploded' using explode() into rows"
    },
    "data_source": {
        "unit": 1,
        "definition": "A python list of data_source strings associated with the visit_ids; this should be dropped when using explode() on visit_id"
    },
    "data_subsource": {
        "unit": 1,
        # the exploded column is visit_id (there is no unit_id column in the checklist)
        "definition": "A python list of data_subsource strings associated with the visit_ids; this should be dropped when using explode() on visit_id"
    },
    "taxon_rank": {
        "unit": 1,
        "definition": "Indicates the taxon rank of the species name (family, genus, species, subspecies, variety, and type); type is assigned when the dataset author only indicated a PFT for the species"
    },
    "naming_authority": {
        "unit": 1,
        "definition": "From Nawrocki et al. (2020) Checklist of Vascular Plants, Bryophytes, Lichens, and Lichenicolous Fungi of Alaska, the naming authority that decided if a species name is accepted; if there was not a match in the Checklist, a species expert was consulted"
    },
    "category": {
        "unit": 1,
        "definition": "From Nawrocki et al. (2020) Checklist of Vascular Plants, Bryophytes, Lichens, and Lichenicolous Fungi of Alaska, the vegetation category or categories that are associated with an accepted species name"
    },
    "habit": {
        "unit": 1,
        "definition": "From Nawrocki et al. (2020) Checklist of Vascular Plants, Bryophytes, Lichens, and Lichenicolous Fungi of Alaska, the growth habit(s) at maturity in the Arctic that is associated with an accepted species name"
    },
    "pft": {
        "unit": 1,
        "definition": "Our PFT assignment based on a species' growth habit that is most likely associated with the arctic species at maturity; derived partly from Nawrocki et al. (2020) Checklist of Vascular Plants, Bryophytes, Lichens, and Lichenicolous Fungi of Alaska, as well as Macander et al. (2020) Table S4, the Plant Functional Type (PFT) and leaf retention descriptor for shrubs and trees; if there was not a match in the Nawrocki Checklist and/or Table S4, a species expert was consulted"
    },
    "nonstandard_pft": {
        "unit": 1,
        "definition": "The intermediate PFT designation that came from the source data AND did not have a match in the Nawrocki et al. (2020) Checklist of Vascular Plants, Bryophytes, Lichens, and Lichenicolous Fungi of Alaska"
    }
}

In [54]:
import pandas as pd
import types
import typing
from typing import get_origin, get_args

from pavc_funcs.schemas import SCHEMAS
SynthesizedAux = SCHEMAS["species_pft_checklist"]

# typing.Optional[X] / typing.Union[...] report typing.Union as their origin, whereas the
# PEP 604 spelling `X | None` (Python 3.10+) reports types.UnionType; accept both so
# that e.g. `int | None` is not silently cast to "string".
_UNION_ORIGINS = (typing.Union, getattr(types, "UnionType", typing.Union))
# pandas dtype for every primitive annotation ("Int64" is the nullable integer dtype)
_PRIMITIVE_DTYPES = ((str, "string"), (float, "float64"), (int, "Int64"))


def _pandas_dtype_for(annotation):
    """Return the pandas dtype name used to cast a column with this schema annotation."""
    origin = get_origin(annotation)
    args = get_args(annotation)

    for primitive, pd_type in _PRIMITIVE_DTYPES:
        # 1) Explicit primitives
        if annotation is primitive:
            return pd_type
        # 2) Optionals of a single primitive
        if origin in _UNION_ORIGINS and set(args) == {primitive, type(None)}:
            return pd_type

    # 3) Lists stay object
    if origin in (list, typing.List):
        return "object"

    # 4) Everything else → string
    return "string"


dtype_map = {
    name: _pandas_dtype_for(field.annotation)
    for name, field in SynthesizedAux.model_fields.items()
}

# cast in one go; only move dataset_species_name out of the index when it is still
# there, so that re-running this cell does not add a stray 'index' column
if species_pft_checklist.index.name == 'dataset_species_name':
    species_pft_checklist = species_pft_checklist.reset_index()
species_pft_checklist = species_pft_checklist.astype(dtype_map)

# columns without a dd_info entry get a blank unit/definition; say so instead of
# leaving the gap unnoticed
undocumented = [col for col in species_pft_checklist.columns if col not in dd_info]
if undocumented:
    print(f'No data-dictionary entry for: {undocumented}')

# build the data dictionary: one row per checklist column
blank_meta = {"unit": "", "definition": ""}
rows = [
    {
        "column_name":      col,
        "unit":             dd_info.get(col, blank_meta)["unit"],
        "definition":       dd_info.get(col, blank_meta)["definition"],
        "column_long_name": col,
        "data_type":        species_pft_checklist[col].dtype.name
    }
    for col in species_pft_checklist.columns
]

dd_df = pd.DataFrame(rows, columns=[
    "column_name","unit","definition","column_long_name","data_type"
])

In [55]:
# display the data dictionary (one row per checklist column) for a visual check before writing it
dd_df

Unnamed: 0,column_name,unit,definition,column_long_name,data_type
0,dataset_species_name,1,The unaccepted species name derived from sourc...,dataset_species_name,string
1,accepted_species_name,1,From Nawrocki et al. (2020) Checklist of Vascu...,accepted_species_name,string
2,accepted_species_name_author,1,From Nawrocki et al. (2020) Checklist of Vascu...,accepted_species_name_author,string
3,visit_id,1,A python list of unique identifiers representi...,visit_id,object
4,data_source,1,A python list of data_source strings associate...,data_source,object
5,data_subsource,1,A python list of data_subsource strings associ...,data_subsource,object
6,taxon_rank,1,Indicates the taxon rank of the species name (...,taxon_rank,string
7,naming_authority,1,From Nawrocki et al. (2020) Checklist of Vascu...,naming_authority,string
8,category,1,From Nawrocki et al. (2020) Checklist of Vascu...,category,string
9,habit,1,From Nawrocki et al. (2020) Checklist of Vascu...,habit,string


In [57]:
# write the final checklist: every field quoted, no index column, UTF-8 with a byte-order mark
species_pft_checklist.to_csv('../pavc/species_pft_checklist.csv', encoding='utf-8-sig', quoting=csv.QUOTE_ALL, index=False)

In [58]:
# write the data dictionary next to the checklist, using the same CSV settings
dd_df.to_csv('../pavc/species_pft_checklist_dd.csv', encoding='utf-8-sig', quoting=csv.QUOTE_ALL, index=False)