In [1]:
import pandas as pd
import geopandas as gpd
import re
import os
from osgeo import gdal
import geopandas as gpd
import rasterio
import sys
import importlib
import re
import pandas as pd
import csv
import numpy as np
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from rasterio.transform import from_bounds
from rasterio.features import rasterize
from pathlib import Path
import json

import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300

# find helper scripts
project_root = Path().resolve().parent
parent = project_root.parent

# 2) Add that root to sys.path
if str(parent) not in sys.path:
    sys.path.insert(0, str(parent))
import alaska_pft_fcover_harmonization.pavc_funcs.standardize_pft_funcs as spf
importlib.reload(spf)

<module 'alaska_pft_fcover_harmonization.pavc_funcs.standardize_pft_funcs' from '/mnt/poseidon/remotesensing/arctic/alaska_pft_fcover_harmonization/pavc_funcs/standardize_pft_funcs.py'>

---
# Helper functions

In [2]:
# rename columns for ESS-Dive
def camel_to_snake(name):
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', name)
    return name.lower()

# throughout the script this will help keep my indices matching
def normalize_to_int_str(val):
    try:
        f = float(val)
        i = int(f)
        if f == i:
            return str(i)
    except (ValueError, TypeError):
        # now also catches the int(inf) case
        pass
    return str(val)

---
# 1. Create `synthesized_pft_fcover.csv`

In [3]:
# load aux data
aux = pd.read_csv('../pavc/survey_unit_information.csv', index_col=0)
aux.index = aux.index.astype(str)

In [4]:
# pft_fcover
sources = ['abr', 'akveg', 'ava', 'neon', 'nga']
pft_dfs = []
all_num_visits = []
for src in sources:
    df = pd.read_csv(f'../data/plot_data/{src}/output_data/{src}_standard_pft_fcover.csv',
                     index_col=0, encoding='utf-8-sig')
    df.columns = [camel_to_snake(c) for c in df.columns]
    df.index.name = 'visit_id'
    df.index = df.index.astype(str)
    pft_dfs.append(df)
    all_num_visits.append(len(df.index))
print(f'Table-wise visits: {sum(all_num_visits)}')

# adjust column names and index
pft_fcover = pd.concat(pft_dfs)
print(f'PFT-fcover table visits: {len(pft_fcover)}')

# subselect pft_fcover data collected before 2010
subset_pft_fcover = pft_fcover.loc[aux.index]
print(f'PFT-fcover table visits on or after 2010: {len(subset_pft_fcover)}')

Table-wise visits: 3247
PFT-fcover table visits: 3247
PFT-fcover table visits on or after 2010: 977


In [5]:
# ensure the data are formatted correctly and makes sense
# pft_fcover_df = spf.validate_synthesized_pft_fcover(subset_pft_fcover)
pft_fcover_df = subset_pft_fcover
# pft_fcover_df.set_index('visit_id', drop=True, inplace=True)

## Create `synthesized_pft_fcover_dd.csv`

In [6]:
# UNITS_AND_DEFS = { col: {"unit": "", "definition": ""} for col in pft_fcover_df.columns }
# print(json.dumps(UNITS_AND_DEFS, indent=4))

In [7]:
dd_info = {
    "visit_id": {
        "unit": 1,
        "definition": "A unique identifier representing the survey unit visitation at a particular survey unit; some survey units are visited more than once"
    },
    "deciduous_shrub_cover": {
        "unit": "percent",
        "definition": "Percent total cover of deciduous shrubs"
    },
    "deciduous_tree_cover": {
        "unit": "percent",
        "definition": "Percent total cover of deciduous trees"
    },
    "evergreen_shrub_cover": {
        "unit": "percent",
        "definition": "Percent total cover of evergreen shrubs"
    },
    "evergreen_tree_cover": {
        "unit": "percent",
        "definition": "Percent total cover of evergreen trees"
    },
    "forb_cover": {
        "unit": "percent",
        "definition": "Percent total cover of forbs (herbaceous plants and spore-bearing plants)"
    },
    "graminoid_cover": {
        "unit": "percent",
        "definition": "Percent total cover of graminoids (grasses, sedges)"
    },
    "nonvascular_sum_cover": {
        "unit": "percent",
        "definition": "Percent total cover of bryophytes and lichens summed"
    },
    "bryophyte_cover": {
        "unit": "percent",
        "definition": "Percent total cover of bryophytes (mosses and liverworts)"
    },
    "lichen_cover": {
        "unit": "percent",
        "definition": "Percent total cover of lichens"
    },
    "litter_cover": {
        "unit": "percent",
        "definition": "Percent total cover of litter (includes scat; varying litter definitions; not all studies collected litter information)"
    },
    "bareground_cover": {
        "unit": "percent",
        "definition": "Percent top cover of bare ground (sum of rock, pebbles, crust, bare soil, etc.)"
    },
    "water_cover": {
        "unit": "percent",
        "definition": "Percent top cover of water (assumption made for AVA quadrat plots; studies did not always specify if they collected top or total water cover)"
    },
    "other_cover": {
        "unit": "percent",
        "definition": "Percent total cover of other vegetation types summed together (fungus, cyanobacteria, algae, standing dead vegetation, unknown vegetation, etc.)"
    }
}

In [8]:
import pandas as pd
import typing
from typing import get_origin, get_args

from pavc_funcs.schemas import SCHEMAS
SynthesizedAux = SCHEMAS["synthesized_pft_fcover"]

dtype_map = {}
for name, field in SynthesizedAux.model_fields.items():
    t      = field.annotation
    origin = get_origin(t)
    args   = get_args(t)

    # 1) Explicit primitives
    if t is str:
        pd_type = "string"
    elif t is float:
        pd_type = "float64"
    elif t is int:
        pd_type = "Int64"            # nullable int

    # 2) Optionals
    elif origin is typing.Union and set(args) == {int, type(None)}:
        pd_type = "Int64"
    elif origin is typing.Union and set(args) == {str, type(None)}:
        pd_type = "string"
    elif origin is typing.Union and set(args) == {float, type(None)}:
        pd_type = "float64"

    # 3) Lists stay object
    elif origin in (list, typing.List):
        pd_type = "object"

    # 4) Everything else â†’ string
    else:
        pd_type = "string"

    dtype_map[name] = pd_type

# cast in one go
pft_fcover_df.reset_index(inplace=True)
pft_fcover_df = pft_fcover_df.astype(dtype_map)

# build your DD exactly as before
rows = []
for col in pft_fcover_df.columns:
    meta = dd_info.get(col, {"unit": "", "definition": ""})
    rows.append({
        "column_name":      col,
        "unit":             meta["unit"],
        "definition":       meta["definition"],
        "column_long_name": col,
        "data_type":        pft_fcover_df[col].dtype.name
    })

dd_df = pd.DataFrame(rows, columns=[
    "column_name","unit","definition","column_long_name","data_type"
])

In [9]:
dd_df

Unnamed: 0,column_name,unit,definition,column_long_name,data_type
0,visit_id,1,A unique identifier representing the survey un...,visit_id,string
1,deciduous_shrub_cover,percent,Percent total cover of deciduous shrubs,deciduous_shrub_cover,float64
2,deciduous_tree_cover,percent,Percent total cover of deciduous trees,deciduous_tree_cover,float64
3,evergreen_shrub_cover,percent,Percent total cover of evergreen shrubs,evergreen_shrub_cover,float64
4,evergreen_tree_cover,percent,Percent total cover of evergreen trees,evergreen_tree_cover,float64
5,forb_cover,percent,Percent total cover of forbs (herbaceous plant...,forb_cover,float64
6,graminoid_cover,percent,"Percent total cover of graminoids (grasses, se...",graminoid_cover,float64
7,nonvascular_sum_cover,percent,Percent total cover of bryophytes and lichens ...,nonvascular_sum_cover,float64
8,bryophyte_cover,percent,Percent total cover of bryophytes (mosses and ...,bryophyte_cover,float64
9,lichen_cover,percent,Percent total cover of lichens,lichen_cover,float64


In [8]:
pft_fcover_df.head(3)

Unnamed: 0_level_0,deciduous_shrub_cover,deciduous_tree_cover,evergreen_shrub_cover,evergreen_tree_cover,forb_cover,graminoid_cover,nonvascular_sum_cover,bryophyte_cover,lichen_cover,litter_cover,bareground_cover,water_cover,other_cover
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
120715b1,17.44,,24.83,,9.39,14.77,28.83,22.12,6.71,75.17,0.67,3.36,
120815a1,47.72,,,,40.28,56.05,39.68,39.68,,79.37,1.59,0.0,
120727b1,40.66,,,,21.34,20.01,24.67,24.67,,79.33,6.0,0.0,


In [9]:
# export pft_fcover
pft_fcover_df = pft_fcover_df.replace({None: np.nan, '':np.nan})
pft_fcover_df.to_csv('../pavc/synthesized_pft_fcover.csv', encoding='utf-8-sig', quoting=csv.QUOTE_ALL)

In [12]:
dd_df.to_csv('../pavc/synthesized_pft_fcover_dd.csv', encoding='utf-8-sig', quoting=csv.QUOTE_ALL, index=False)

## Address all the pydantic warnings

In [10]:
# address coverage less than 100%
temp = pft_fcover_df.merge(aux, how='left', left_index=True, right_index=True)

# columns to exclude in summing
exclude = {'nonvascular_sum_cover'}

# identify *_cover cols, minus nonvascular_sum_cover
cover_cols = [
    c for c in df.columns
    if c.endswith('_cover') and c not in exclude
]

# get rows less summing than 100
row_sums = df[cover_cols].sum(axis=1)
mask = row_sums < 100
df_under100 = (
    df
    .loc[mask]            # all columns, only rows where sum<100
    .copy()               # avoid SettingWithCopyWarning
    .assign(total_sum=row_sums[mask])
)

In [11]:
# determine the culprit of data summing to <100
df_under100.merge(aux[['data_source', 'data_subsource']], 
                  how='left', left_index=True, right_index=True)

Unnamed: 0_level_0,deciduous_shrub_cover,deciduous_tree_cover,evergreen_shrub_cover,evergreen_tree_cover,forb_cover,graminoid_cover,nonvascular_sum_cover,bryophyte_cover,lichen_cover,litter_cover,bareground_cover,water_cover,other_cover,total_sum,data_source,data_subsource
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
u3c363836c,26.0,,41.0,,6.0,11.0,13.0,13.0,,0.0,0.0,0.0,,97.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
ue9d71f5ee,29.0,,47.0,,2.0,14.0,6.0,6.0,,0.0,0.0,0.0,,98.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
u1c4f0c6eb,16.0,,28.0,,1.0,4.0,37.0,3.0,34.0,0.0,5.0,0.0,,91.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
u7e240de74,0.0,,65.0,,4.0,0.0,29.0,2.0,27.0,0.0,0.0,0.0,,98.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
ud7dacae2c,5.0,,57.0,,2.0,2.5,19.5,0.0,19.5,0.0,0.0,0.0,,86.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
u6f457e97a,13.0,,30.0,,1.0,4.0,31.5,1.5,30.0,0.0,3.0,0.0,,82.5,NGA,NGEE-Arctic Amy Breen (pub 2020)
ubf9661def,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
u857817355,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
ua056c8d05,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
u782337220,1.0,,6.0,0.0,2.0,31.0,30.0,28.0,2.0,8.0,0.0,0.0,,78.0,NGA,NGEE-Arctic Amy Breen (pub 2020)
