In [30]:
import numpy as np
import pandas as pd

In [31]:
veg_df = pd.read_csv('../data/neon_sjer_site/plant_presence/NEON_presence-cover-plant/NEON.D17.SJER.DP1.10058.001.2023-03.basic.20250129T000730Z.RELEASE-2025/NEON.D17.SJER.DP1.10058.001.div_10m2Data100m2Data.2023-03.basic.20241118T190814Z.csv')

In [32]:
veg_df

Unnamed: 0,uid,namedLocation,domainID,siteID,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertainty,elevation,elevationUncertainty,...,morphospeciesIDRemarks,identificationReferences,identificationHistoryID,additionalSpecies,remarks,measuredBy,recordedBy,samplingImpractical,samplingImpracticalRemarks,biophysicalCriteria
0,e82886d6-6131-4d67-98fc-bc0407a07ef9,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,...,,,,Y,,0000-0002-1733-4950,0000-0001-7298-9805,OK,,OK - no known exceptions
1,f9700725-14ae-4776-baf3-112c54e4885c,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,...,,,,Y,,0000-0002-1733-4950,0000-0001-7298-9805,OK,,OK - no known exceptions
2,9c7d73b9-f0d0-49e0-a271-353b958a3080,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,...,,,,Y,,0000-0002-1733-4950,0000-0001-7298-9805,OK,,OK - no known exceptions
3,ea797576-5431-4023-aa46-84703c84244d,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,...,,,,Y,,0000-0002-1733-4950,0000-0001-7298-9805,OK,,OK - no known exceptions
4,52c9902e-dafa-43a0-8225-1a23c3f2271f,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,...,,,,Y,,0000-0002-1733-4950,0000-0001-7298-9805,OK,,OK - no known exceptions
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1406,abfbf5f1-300b-4490-a072-2482a5120380,SJER_021.basePlot.div,D17,SJER,37.110743,-119.750784,WGS84,20.10,405.6,0.1,...,,,,Y,,0000-0002-1733-4950,0000-0002-1733-4950,OK,,OK - no known exceptions
1407,f276b189-5298-4fb6-a07a-49f90ee1a8b4,SJER_021.basePlot.div,D17,SJER,37.110743,-119.750784,WGS84,20.10,405.6,0.1,...,,,,Y,,0000-0002-1733-4950,0000-0002-1733-4950,OK,,OK - no known exceptions
1408,845e80fe-c46f-4acb-b319-e1a337abcffd,SJER_021.basePlot.div,D17,SJER,37.110743,-119.750784,WGS84,20.10,405.6,0.1,...,,,,Y,,0000-0002-1733-4950,0000-0002-1733-4950,OK,,OK - no known exceptions
1409,30b0ceac-7e11-457c-ba45-04faf68a46ca,SJER_021.basePlot.div,D17,SJER,37.110743,-119.750784,WGS84,20.10,405.6,0.1,...,,,,Y,,0000-0002-1733-4950,0000-0002-1733-4950,OK,,OK - no known exceptions


In [33]:
import pandas as pd
import numpy as np
import re

def reformat_subplot_id(x):
    parts = x.split('.')
    if len(parts) == 3 and parts[2] in {'1', '10', '100', '400'}:
        return f"{parts[0]}_{parts[2]}_{parts[1]}"
    return x

def stack_plant_presence(div_data_dict, total_sampled_area_filter=None):
    assert 'div_1m2Data' in div_data_dict and 'div_10m2Data100m2Data' in div_data_dict, \
        "Input dict must contain 'div_1m2Data' and 'div_10m2Data100m2Data'"

    div_1m = div_data_dict['div_1m2Data'].copy()
    div_10_100 = div_data_dict['div_10m2Data100m2Data'].copy()

    # Standardize subplotID formatting
    for df in [div_1m, div_10_100]:
        df['subplotID_old'] = df['subplotID']
        df['subplotID'] = df['subplotID'].astype(str).apply(
            lambda x: f"{x}_100" if len(x) == 2 else
            reformat_subplot_id(x) if '.' in x else x
        )

    # Create eventID and filter to only plant species
    for df in [div_1m, div_10_100]:
        df['year'] = df['endDate'].str[:4]
        df['eventID'] = np.where(
            df['eventID'].isna() | df['eventID'].str.len() > 11,
            df['siteID'] + "." + df['boutNumber'].astype(str) + "." + df['year'],
            df['eventID']
        )
        df.drop(columns='year', inplace=True)

    div_1m = div_1m[div_1m['divDataType'] == 'plantSpecies']

    # Keep relevant columns
    cols_to_keep = ["namedLocation", "domainID", "siteID", "decimalLatitude", "decimalLongitude", "geodeticDatum",
                    "coordinateUncertainty", "elevation", "elevationUncertainty", "nlcdClass", "eventID", "plotType",
                    "plotID", "subplotID", "boutNumber", "targetTaxaPresent", "taxonID", "scientificName",
                    "taxonRank", "family", "nativeStatusCode", "identificationQualifier", "morphospeciesID",
                    "samplingImpractical", "samplingImpracticalRemarks", "biophysicalCriteria", "publicationDate", "release"]

    div_1m = div_1m.loc[:, div_1m.columns.intersection(cols_to_keep)]
    div_10_100 = div_10_100.loc[:, div_10_100.columns.intersection(cols_to_keep)]

    # Separate event IDs
    small_event_ids = div_1m['eventID'].dropna().unique()
    big_event_ids = div_10_100['eventID'].dropna().unique()

    small_out_ids = np.setdiff1d(small_event_ids, big_event_ids)
    merge_ids = np.intersect1d(small_event_ids, big_event_ids)

    div_1m_out = div_1m[div_1m['eventID'].isin(small_out_ids)]
    div_1m = div_1m[div_1m['eventID'].isin(merge_ids)]

    # Build 10m2 data
    data_10 = div_10_100[div_10_100['subplotID'].str.contains('_10_')]
    data_10_build = div_1m.copy()
    data_10_build['subplotID'] = data_10_build['subplotID'].str.replace('_1_', '_10_')
    data_10 = pd.concat([data_10, data_10_build], ignore_index=True)

    # Build 100m2 data
    data_100 = div_10_100[div_10_100['subplotID'].str.endswith('_100')]
    data_100_build = data_10.copy()
    data_100_build['subplotID'] = data_100_build['subplotID'].str.replace(r'_10_[0-9]', '_100', regex=True)
    data_100 = pd.concat([data_100, data_100_build], ignore_index=True)

    # Build 400m2 data
    data_400 = data_100.copy()
    data_400['subplotID'] = '400'

    # Recombine and assign scale
    div_1m = pd.concat([div_1m, div_1m_out], ignore_index=True)

    div_1m['totalSampledArea'] = 1
    data_10['totalSampledArea'] = 10
    data_100['totalSampledArea'] = 100
    data_400['totalSampledArea'] = 400

    all_data = pd.concat([div_1m, data_10, data_100, data_400], ignore_index=True).drop_duplicates()

    # Filter invalid targetTaxaPresent
    group_cols = ['eventID', 'plotID', 'subplotID']
    all_data['tmp_count'] = all_data.groupby(group_cols)['targetTaxaPresent'].transform('count')
    all_data = all_data[
        ~((all_data['targetTaxaPresent'] == 'Y') & all_data['scientificName'].isna() & all_data['taxonID'].isna())
    ]
    all_data = all_data[
        ((all_data['tmp_count'] > 1) & (all_data['targetTaxaPresent'] != 'N')) | (all_data['tmp_count'] <= 1)
    ]
    all_data.drop(columns='tmp_count', inplace=True)

    # Filter if needed
    if total_sampled_area_filter in {1, 10, 100, 400}:
        all_data = all_data[all_data['totalSampledArea'] == total_sampled_area_filter]

    return all_data


In [34]:
import os
import pandas as pd
from glob import glob

def load_neon_presence_data(base_dir):
    """
    Load and combine NEON div_1m2Data and div_10m2Data100m2Data CSVs from local subdirectories.
    """
    all_1m2_files = glob(os.path.join(base_dir, "**", "*div_1m2Data*.csv"), recursive=True)
    all_10_100m2_files = glob(os.path.join(base_dir, "**", "*div_10m2Data100m2Data*.csv"), recursive=True)

    dfs_1m2 = [pd.read_csv(f, low_memory=False) for f in all_1m2_files]
    dfs_10_100 = [pd.read_csv(f, low_memory=False) for f in all_10_100m2_files]

    df_1m2 = pd.concat(dfs_1m2, ignore_index=True)
    df_10_100 = pd.concat(dfs_10_100, ignore_index=True)

    return {
        "div_1m2Data": df_1m2,
        "div_10m2Data100m2Data": df_10_100
    }


In [35]:
# Set this to your base directory
base_dir = '../data/neon_sjer_site/plant_presence/NEON_presence-cover-plant'

# Load data from local CSVs
div_data_dict = load_neon_presence_data(base_dir)

# Run the stacking function, e.g. for 10m²
stacked_df_10m2 = stack_plant_presence(div_data_dict, total_sampled_area_filter=10)

# Or for 100m²
stacked_df_100m2 = stack_plant_presence(div_data_dict, total_sampled_area_filter=100)


In [36]:
stacked_df_100m2

Unnamed: 0,namedLocation,domainID,siteID,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertainty,elevation,elevationUncertainty,plotType,...,scientificName,taxonRank,family,nativeStatusCode,identificationQualifier,morphospeciesID,samplingImpractical,samplingImpracticalRemarks,biophysicalCriteria,totalSampledArea
28912,SJER_012.basePlot.div,D17,SJER,37.083680,-119.717046,WGS84,20.12,321.0,0.1,distributed,...,Avena barbata Pott ex Link,species,Poaceae,I,,,,,,100
28913,SJER_012.basePlot.div,D17,SJER,37.083680,-119.717046,WGS84,20.12,321.0,0.1,distributed,...,Micropus californicus Fisch. & C.A. Mey.,species,Asteraceae,N,,,,,,100
28914,SJER_012.basePlot.div,D17,SJER,37.083680,-119.717046,WGS84,20.12,321.0,0.1,distributed,...,Thysanocarpus curvipes Hook.,species,Brassicaceae,N,,,,,,100
28915,SJER_012.basePlot.div,D17,SJER,37.083680,-119.717046,WGS84,20.12,321.0,0.1,distributed,...,Hordeum murinum L. ssp. leporinum (Link) Arcang.,subspecies,Poaceae,I,,,,,,100
28916,SJER_012.basePlot.div,D17,SJER,37.083680,-119.717046,WGS84,20.12,321.0,0.1,distributed,...,Stellaria nitens Nutt.,species,Caryophyllaceae,N,,,,,,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48094,SJER_004.basePlot.div,D17,SJER,37.108972,-119.746305,WGS84,20.10,383.1,0.1,distributed,...,Oxalis radicosa A. Rich.,species,Oxalidaceae,I,,,,,,100
48095,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,tower,...,Avena barbata Pott ex Link,species,Poaceae,I,,,,,,100
48096,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,tower,...,Bromus hordeaceus L.,species,Poaceae,I,,,,,,100
48098,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,tower,...,Cerastium glomeratum Thuill.,species,Caryophyllaceae,I,,,,,,100


In [37]:
stacked_df_100m2.columns

Index(['namedLocation', 'domainID', 'siteID', 'decimalLatitude',
       'decimalLongitude', 'geodeticDatum', 'coordinateUncertainty',
       'elevation', 'elevationUncertainty', 'plotType', 'nlcdClass', 'plotID',
       'subplotID', 'boutNumber', 'eventID', 'targetTaxaPresent', 'taxonID',
       'scientificName', 'taxonRank', 'family', 'nativeStatusCode',
       'identificationQualifier', 'morphospeciesID', 'samplingImpractical',
       'samplingImpracticalRemarks', 'biophysicalCriteria',
       'totalSampledArea'],
      dtype='object')

In [38]:
stacked_df_100m2[['taxonID', 'scientificName', 'targetTaxaPresent']]

Unnamed: 0,taxonID,scientificName,targetTaxaPresent
28912,AVBA,Avena barbata Pott ex Link,Y
28913,MICA,Micropus californicus Fisch. & C.A. Mey.,Y
28914,THCU,Thysanocarpus curvipes Hook.,Y
28915,HOMUL,Hordeum murinum L. ssp. leporinum (Link) Arcang.,Y
28916,STNI,Stellaria nitens Nutt.,Y
...,...,...,...
48094,OXRA,Oxalis radicosa A. Rich.,Y
48095,AVBA,Avena barbata Pott ex Link,Y
48096,BRHO2,Bromus hordeaceus L.,Y
48098,CEGL2,Cerastium glomeratum Thuill.,Y


In [48]:
stacked_df_100m2.scientificName.unique()

array(['Avena barbata Pott ex Link',
       'Micropus californicus Fisch. & C.A. Mey.',
       'Thysanocarpus curvipes Hook.',
       'Hordeum murinum L. ssp. leporinum (Link) Arcang.',
       'Stellaria nitens Nutt.', 'Gilia tricolor Benth.',
       'Dichelostemma capitatum (Benth.) Alph. Wood ssp. capitatum',
       "Erodium cicutarium (L.) L'Hér. ex Aiton", 'Senecio vulgaris L.',
       'Vulpia sp.', 'Capsella bursa-pastoris (L.) Medik.',
       'Claytonia parviflora Douglas ex Hook.',
       'Phacelia cicutaria Greene',
       'Clarkia purpurea (W. Curtis) A. Nelson & J.F. Macbr.',
       'Crassula connata (Ruiz & Pav.) A. Berger var. connata',
       'Eleocharis macrostachya Britton', 'Bromus arenarius Labill.',
       'Trifolium willdenovii Spreng.', 'Galium aparine L.',
       'Trifolium ciliolatum Benth.', 'Stellaria media (L.) Vill.',
       'Lupinus bicolor Lindl.', 'Carduus pycnocephalus L.',
       'Aira caryophyllea L.', 'Erodium brachycarpum (Godr.) Thell.',
       'Litho

In [57]:
stacked_df_100m2[stacked_df_100m2.scientificName == 'Pinus sabiniana Douglas ex Douglas']

Unnamed: 0,namedLocation,domainID,siteID,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertainty,elevation,elevationUncertainty,plotType,...,scientificName,taxonRank,family,nativeStatusCode,identificationQualifier,morphospeciesID,samplingImpractical,samplingImpracticalRemarks,biophysicalCriteria,totalSampledArea
29122,SJER_003.basePlot.div,D17,SJER,37.116632,-119.730379,WGS84,20.12,437.5,0.2,distributed,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,,,,100
29164,SJER_008.basePlot.div,D17,SJER,37.107458,-119.717536,WGS84,20.15,400.2,0.2,distributed,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,,,,100
29329,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,tower,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,OK,,OK - no known exceptions,100
29402,SJER_008.basePlot.div,D17,SJER,37.107458,-119.717536,WGS84,20.15,400.2,0.2,distributed,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,OK,,OK - no known exceptions,100
29644,SJER_010.basePlot.div,D17,SJER,37.128635,-119.749395,WGS84,20.1,409.2,0.1,distributed,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,OK,,OK - no known exceptions,100
29690,SJER_016.basePlot.div,D17,SJER,37.077219,-119.716137,WGS84,20.1,271.0,0.1,distributed,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,OK,,OK - no known exceptions,100
29846,SJER_017.basePlot.div,D17,SJER,37.126821,-119.745937,WGS84,20.2,472.4,0.3,distributed,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,OK,,OK - no known exceptions,100
29941,SJER_003.basePlot.div,D17,SJER,37.116632,-119.730379,WGS84,20.12,437.5,0.2,distributed,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,,,,100
29949,SJER_046.basePlot.div,D17,SJER,37.111276,-119.728168,WGS84,20.11,431.8,0.2,tower,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,,,,100
29971,SJER_003.basePlot.div,D17,SJER,37.116632,-119.730379,WGS84,20.12,437.5,0.2,distributed,...,Pinus sabiniana Douglas ex Douglas,species,Pinaceae,N,,,,,,100


In [39]:
stacked_df_100m2.targetTaxaPresent.value_counts()

targetTaxaPresent
Y    14979
Name: count, dtype: int64

In [43]:
def create_presence_absence_matrix(df):
    """
    Create a presence/absence matrix with 1 if a species is present, 0 otherwise.
    Rows are uniquely identified by plotID + subplotID + eventID.
    Columns are scientific names (not multi-indexed).
    """
    # Ensure no missing species names or key IDs
    df = df.dropna(subset=["plotID", "subplotID", "eventID", "scientificName"])

    # Assign presence = 1
    df['presence'] = 1

    # Pivot: subplots as rows, species as columns
    pa_matrix = (
        df.pivot_table(
            index=["plotID", "subplotID", "eventID"],
            columns="scientificName",
            values="presence",
            fill_value=0,
            aggfunc="max"
        )
        .reset_index()
    )

    # Remove the columns axis name ('scientificName') so species names are clean headers
    pa_matrix.columns.name = None

    # Optional: reorder with ID fields first
    id_cols = ["plotID", "subplotID", "eventID"]
    species_cols = sorted([col for col in pa_matrix.columns if col not in id_cols])
    pa_matrix = pa_matrix[id_cols + species_cols]

    return pa_matrix


In [44]:

# Generate presence/absence matrix
pa_matrix = create_presence_absence_matrix(stacked_df_100m2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['presence'] = 1


In [45]:
pa_matrix

Unnamed: 0,plotID,subplotID,eventID,Achillea millefolium L.,Acorus americanus (Raf.) Raf.,Aesculus californica (Spach) Nutt.,Agoseris grandiflora (Nutt.) Greene,Agoseris heterophylla (Nutt.) Greene,Agoseris sp.,Aira caryophyllea L.,...,Triteleia sp.,Tropidocarpum gracile Hook.,Unknown plant,Urticaceae sp.,Veronica sp.,Vulpia microstachys (Nutt.) Munro,Vulpia microstachys (Nutt.) Munro var. ciliata (Beal) Lonard & Gould,Vulpia microstachys/Vulpia myuros,Vulpia myuros (L.) C.C. Gmel.,Vulpia sp.
0,SJER_001,31_100,SJER.1.2017,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,SJER_001,31_100,SJER.1.2018,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,SJER_001,31_100,SJER.1.2019,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,SJER_001,31_100,SJER.1.2021,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,SJER_001,31_100,SJER.1.2023,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,SJER_047,41_100,SJER.1.2017,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
548,SJER_047,41_100,SJER.1.2018,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
549,SJER_047,41_100,SJER.1.2019,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
550,SJER_047,41_100,SJER.1.2021,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0


In [58]:
pa_matrix.head(10)

Unnamed: 0,plotID,subplotID,eventID,Achillea millefolium L.,Acorus americanus (Raf.) Raf.,Aesculus californica (Spach) Nutt.,Agoseris grandiflora (Nutt.) Greene,Agoseris heterophylla (Nutt.) Greene,Agoseris sp.,Aira caryophyllea L.,...,Triteleia sp.,Tropidocarpum gracile Hook.,Unknown plant,Urticaceae sp.,Veronica sp.,Vulpia microstachys (Nutt.) Munro,Vulpia microstachys (Nutt.) Munro var. ciliata (Beal) Lonard & Gould,Vulpia microstachys/Vulpia myuros,Vulpia myuros (L.) C.C. Gmel.,Vulpia sp.
0,SJER_001,31_100,SJER.1.2017,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,SJER_001,31_100,SJER.1.2018,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,SJER_001,31_100,SJER.1.2019,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,SJER_001,31_100,SJER.1.2021,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,SJER_001,31_100,SJER.1.2023,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5,SJER_001,32_100,SJER.1.2017,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
6,SJER_001,32_100,SJER.1.2018,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
7,SJER_001,32_100,SJER.1.2019,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
8,SJER_001,32_100,SJER.1.2021,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
9,SJER_001,32_100,SJER.1.2023,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [59]:
pa_matrix.columns.

Index(['plotID', 'subplotID', 'eventID', 'Achillea millefolium L.',
       'Acorus americanus (Raf.) Raf.', 'Aesculus californica (Spach) Nutt.',
       'Agoseris grandiflora (Nutt.) Greene',
       'Agoseris heterophylla (Nutt.) Greene', 'Agoseris sp.',
       'Aira caryophyllea L.',
       ...
       'Triteleia sp.', 'Tropidocarpum gracile Hook.', 'Unknown plant',
       'Urticaceae sp.', 'Veronica sp.', 'Vulpia microstachys (Nutt.) Munro',
       'Vulpia microstachys (Nutt.) Munro var. ciliata (Beal) Lonard & Gould',
       'Vulpia microstachys/Vulpia myuros', 'Vulpia myuros (L.) C.C. Gmel.',
       'Vulpia sp.'],
      dtype='object', length=357)

In [61]:
def get_dominant_tree_species_fuzzy(pa_matrix, known_tree_binomials):
    """
    Identify dominant tree species by matching binomial species names.
    
    Parameters:
    pa_matrix (pd.DataFrame): Output of create_presence_absence_matrix()
    known_tree_binomials (list of str): List like ['Quercus douglasii', 'Pinus sabiniana']
    
    Returns:
    pd.DataFrame: Tree species with counts, sorted descending
    """
    from collections import defaultdict

    # Extract only species columns (exclude IDs)
    species_cols = [col for col in pa_matrix.columns if col not in ["plotID", "subplotID", "eventID"]]

    # Build a mapping of full column name to binomial name
    col_to_binomial = {
        col: " ".join(col.split()[:2]) for col in species_cols
    }

    # Sum presence across all subplots
    counts = pa_matrix[species_cols].sum()

    # Aggregate counts by binomial name
    binomial_counts = defaultdict(int)
    for full_name, count in counts.items():
        binomial = col_to_binomial[full_name]
        if binomial in known_tree_binomials:
            binomial_counts[binomial] += count

    # Convert to DataFrame and sort
    df = pd.DataFrame(list(binomial_counts.items()), columns=["scientificName", "count"])
    df = df.sort_values(by="count", ascending=False).reset_index(drop=True)
    return df


In [63]:
# Define your known trees at SJER (can expand this list)
tree_species_sjer = [
    "Quercus chrysolepis", # Canyon Live Oak
    "Quercus douglasii", # Douglas Pine
    "Quercus wislizeni", # Interior Live Oak
    "Pinus sabiniana", # Gray Pine
    "Frangula californica", # Coffeeberry
    "Frangula californica subsp. cuspidata", # Sierra hoary coffeeberry
    "Rhamnus ilicifolia", # Hollyleaf redberry
    "Ceanothus cuneatus", # Buckbrush
    "Ceanothus leucodermis", # Chaparral Whitethorn
    "Arctostaphylos viscida", # Whiteleaf Manzanita
    "Arctostaphylos viscida subsp. mariposa" # Mariposa Manzanita
]

dominant_trees = get_dominant_tree_species_fuzzy(pa_matrix, tree_species_sjer)
print(dominant_trees)


           scientificName  count
0       Quercus wislizeni    118
1      Ceanothus cuneatus     58
2         Pinus sabiniana     56
3       Quercus douglasii     56
4   Ceanothus leucodermis     47
5      Rhamnus ilicifolia     34
6    Frangula californica     11
7  Arctostaphylos viscida      8


In [65]:
# Example: find all columns containing "Quercus"
[col for col in pa_matrix.columns if "chrysolepis" in col]


[]