# Generate Presence/Absence Species List with following columns 
- array name 
- year 
- longitude (longitude of the centroid of all sites in the same camera array)
- latitude (latitude of the centroid of all sites in the same camera array)
- radius 
- habitats  
- sum of all group sizes detected 
- species columns: one column for each species found in the SnapShotUSA dataset (130). Each column has the value 1 (species present) or 0 (species absent)

Notes: 
- The table will have 120 columns, of which 113 are species columns, one for each of the 113 species common to Snapshot and IUCN
- For the 5 km radius used for the camera-trap array sites, we found the centroid of all sites in the array and then used the longitude and latitude of this centroid

In [26]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import os

from warnings import filterwarnings
# Silence every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides deprecation warnings from the libraries used below — consider narrowing.
filterwarnings('ignore')

In [27]:
# Input location, joined later as <base_folder>/<snapshot_folder>/<SNAPSHOT_CSV>.
base_folder = "data"
snapshot_folder = "ssusa"
SNAPSHOT_CSV = "cleaned_snapshot_usa_iucn.csv"

In [None]:

# Load the cleaned Snapshot USA records into a DataFrame.
snapshot_df = pd.read_csv(
    os.path.join(base_folder, snapshot_folder, SNAPSHOT_CSV)
)

# Quick look at the key columns (uncomment to display):
#snapshot_df[['Camera_Trap_Array', 'Deployment_ID', 'Sci_Name']].head(25)

Unnamed: 0,Camera_Trap_Array,Deployment_ID,Sci_Name
0,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
1,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
2,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
3,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
4,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
5,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
6,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
7,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
8,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos
9,Crupi,AK_Forest_Chilkat_Preserve_1,ursus arctos


### Summarize by Camera Trap array and generate presence absence data

In [29]:
# Wrap a copy of the records in a GeoDataFrame with one point per row, built
# from the Longitude/Latitude columns and tagged as WGS84 (EPSG:4326).
gdf = gpd.GeoDataFrame(
    data=snapshot_df.copy(),
    geometry=gpd.points_from_xy(x=snapshot_df['Longitude'], y=snapshot_df['Latitude']),
    crs="EPSG:4326",
)

# Reproject to EPSG:5070 (NAD83 / Conus Albers) so that distances are in meters.
# NOTE(review): this projection targets the contiguous USA; some deployment IDs
# look like Alaska sites (e.g. "AK_..."), where distances may be distorted — confirm.
gdf_proj = gdf.to_crs(epsg=5070)

In [30]:
# Compute per-array summaries
def summarize_array(group):
    """Summarise one (Camera_Trap_Array, Year) group of projected records.

    Parameters
    ----------
    group : geopandas.GeoDataFrame
        The rows of ``gdf_proj`` belonging to a single group. The function
        reads its geometry column and its ``Habitat`` and ``Group_Size``
        columns.

    Returns
    -------
    pandas.Series
        ``mean_longitude`` / ``mean_latitude``: centroid of the group's points,
        converted back to EPSG:4326.
        ``radius_km``: largest distance from that centroid to any point of the
        group, divided by 1000.
        ``sum_group_size``: sum of ``Group_Size`` over the group.
        ``habitats_list``: sorted list of the distinct non-null ``Habitat``
        values.
    """
    points = group.geometry

    # GeoPandas 1.0 deprecated ``unary_union`` in favour of ``union_all()``.
    # Prefer the new API when it exists and fall back on older releases, so
    # the notebook keeps working once the deprecated attribute is removed.
    if hasattr(points, "union_all"):
        merged_points = points.union_all()
    else:
        merged_points = points.unary_union

    # Centroid of all camera points, still in the projected CRS.
    # NOTE(review): the union presumably collapses coincident points, so
    # repeated records at one location would count once — confirm.
    centroid = merged_points.centroid

    # Convert the centroid back to geographic coordinates for reporting.
    centroid_geo = (
        gpd.GeoSeries([centroid], crs=gdf_proj.crs)
        .to_crs(epsg=4326)
        .iloc[0]
    )

    # Radius = distance from the centroid to the farthest point (meters → km).
    radius_km = points.distance(centroid).max() / 1000.0

    # Distinct habitats recorded for the group, in sorted order.
    habitats = sorted(group['Habitat'].dropna().unique().tolist())

    return pd.Series({
        'mean_longitude': centroid_geo.x,
        'mean_latitude': centroid_geo.y,
        'radius_km': radius_km,
        'sum_group_size': group['Group_Size'].sum(),
        'habitats_list': habitats
    })

# Run summarize_array once per (array, year) group; reset_index() turns the
# group keys back into ordinary columns.
array_summary = (
    gdf_proj
    .groupby(by=['Camera_Trap_Array', 'Year'])
    .apply(summarize_array)
    .reset_index()
)


In [35]:
# Presence/absence matrix: one row per (array, year), one column per species.
# Every record is flagged with Present=1; 'max' collapses repeated records to a
# single 1, and species with no record in a group are filled with 0.
presence_absence = pd.pivot_table(
    snapshot_df.assign(Present=1),
    values='Present',
    index=['Camera_Trap_Array', 'Year'],
    columns='Sci_Name',
    aggfunc='max',
    fill_value=0,
).reset_index()

# Join the per-(array, year) summaries with the presence/absence matrix on
# their shared keys; 'outer' keeps a key even if it exists in only one table.
final_df = pd.merge(array_summary, presence_absence, on=['Camera_Trap_Array', 'Year'], how='outer')

# Rename the key and centroid columns for the exported table. Columns whose
# names do not change (radius_km, habitats_list, sum_group_size) need no entry.
final_df = final_df.rename(columns={
    'Camera_Trap_Array': 'array_name',
    'Year': 'year',
    'mean_longitude': 'longitude',
    'mean_latitude': 'latitude',
})

In [38]:
# Print a structural summary of the merged table (entries, columns, dtypes, memory).
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Columns: 120 entries, array_name to zapus hudsonius
dtypes: float64(3), int64(115), object(2)
memory usage: 547.6+ KB


In [39]:
# Preview the first rows of the merged table.
final_df.head()

Unnamed: 0,array_name,year,longitude,latitude,radius_km,sum_group_size,habitats_list,alces alces,ammospermophilus harrisii,ammospermophilus leucurus,...,urocitellus richardsonii,urocyon cinereoargenteus,ursus americanus,ursus arctos,vulpes macrotis,vulpes velox,vulpes vulpes,xerospermophilus tereticaudus,zalophus californianus,zapus hudsonius
0,ARNWR,2020,-75.901024,35.805063,7.763368,383,[wetland],0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,ARNWR,2021,-75.898379,35.827427,9.3307,828,[wetland],0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,ARNWR,2022,-75.894423,35.830377,9.114756,787,[wetland],0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,AandM,2022,-97.427305,26.220872,1.039181,829,[forest],0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Abilene,2019,-99.882862,32.239334,0.951132,883,[grassland],0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [40]:
# Export the final table next to the input data; the row index is not written.
# NOTE(review): habitats_list holds Python lists, which end up in the CSV as
# their string form — confirm that downstream readers expect that.
out_filename = os.path.join(base_folder, snapshot_folder, "species_list_5km.csv")
final_df.to_csv(path_or_buf=out_filename, index=False)


### Validating data in Presence Absence Species List With Snapshot data

In [None]:
# Both tables should contain the same number of distinct camera-trap arrays.
arrays_in_snapshot = snapshot_df['Camera_Trap_Array'].nunique(dropna=True)
arrays_in_presence = presence_absence['Camera_Trap_Array'].nunique(dropna=True)

print(f"Unique arrays in snapshot_df: {arrays_in_snapshot}")
print(f"Unique arrays in presence_absence: {arrays_in_presence}")

Unique arrays in snapshot_df: 261
Unique arrays in presence_absence: 261


In [None]:
def test_species_presence(species_name):
    """
    Verify that a species' flags in the presence_absence matrix match its
    records in snapshot_df.

    Two comparisons are made: per camera-trap array, and per (array, year)
    pair, which is the granularity at which presence_absence rows are keyed.
    Comparing arrays alone would miss a flag set in the wrong year of an
    array that recorded the species in some other year.

    Parameters
    ----------
    species_name : str
        Scientific name. Surrounding whitespace is stripped and the name is
        lower-cased before matching.

    Returns
    -------
    None
        A report is printed. If the species has no column in
        presence_absence, a notice is printed and the function returns
        without comparing anything.

    Raises
    ------
    AssertionError
        If the sets of arrays, or the sets of (array, year) pairs, differ
        between the two tables.
    """
    species_name = species_name.strip().lower()

    # Without a column for the species there is nothing to compare.
    if species_name not in presence_absence.columns:
        print(f" Species '{species_name}' not found as a column in presence_absence.")
        return

    key_cols = ['Camera_Trap_Array', 'Year']

    # --- Rows where the species appears in snapshot_df ---
    detected = snapshot_df.loc[
        snapshot_df['Sci_Name'].str.lower() == species_name, key_cols
    ]

    # --- Rows where the species is marked as 1 in presence_absence ---
    flagged = presence_absence.loc[presence_absence[species_name] == 1, key_cols]

    # Array-level sets, built once and reused below.
    arrays_in_snapshot = set(detected['Camera_Trap_Array'].dropna().unique())
    arrays_in_presence = set(flagged['Camera_Trap_Array'].dropna().unique())

    # (array, year) sets; rows with a missing key are ignored on both sides.
    pairs_in_snapshot = set(detected.dropna().itertuples(index=False, name=None))
    pairs_in_presence = set(flagged.dropna().itertuples(index=False, name=None))

    # --- Compare sets ---
    missing_in_presence = sorted(arrays_in_snapshot - arrays_in_presence)
    extra_in_presence = sorted(arrays_in_presence - arrays_in_snapshot)
    missing_pairs = sorted(pairs_in_snapshot - pairs_in_presence)
    extra_pairs = sorted(pairs_in_presence - pairs_in_snapshot)

    # --- Summary ---
    print(f"üîç Testing species: {species_name}")
    print(f"Arrays detected in snapshot_df: {len(arrays_in_snapshot)}")
    print(f"Arrays marked '1' in presence_absence: {len(arrays_in_presence)}")
    print(f" Matched arrays: {len(arrays_in_snapshot & arrays_in_presence)}")
    print(f"Missing arrays in presence_absence: {missing_in_presence}")
    print(f"Extra arrays marked as present (not found in snapshot_df): {extra_in_presence}")
    print(f"Missing (array, year) pairs in presence_absence: {missing_pairs}")
    print(f"Extra (array, year) pairs marked as present: {extra_pairs}")

    # Optional assertions for pipeline testing
    assert arrays_in_snapshot == arrays_in_presence, \
        f"Mismatch in presence data for species: {species_name}"
    assert pairs_in_snapshot == pairs_in_presence, \
        f"Mismatch in (array, year) presence data for species: {species_name}"

In [None]:
# Run the consistency check for a single species.
test_species_presence("sus scrofa")