# Calculating Zonal Statistics 
MS 263 Final Project

Caroline Daley | Moss Landing Marine Laboratories 

May 2025

In [1]:
import geopandas as gpd
import pandas as pd
import rasterio
from rasterio.features import geometry_mask
import glob
import os
import xdem
import numpy as np
from shapely.geometry import box
from rasterio.mask import mask

## Import & Format Spatial Data

Connect to local database to access habitat metric files created in the ['Calculate Habitat Metrics with xDEM'](http://localhost:8888/lab/tree/Project/ms263-main%202/Calculate%20Habitat%20Metrics%20with%20xDEM.ipynb) notebook, as well as a shapefile of CCFRP gridcells. Make sure the habitat metric rasters and gridcell shapefile are in the same spatial reference system. 

In [4]:
# Grid-cell shapefile, given relative to the root of this repository
shapefile_path = "Shapefiles/CCFRP_Grid_Cells_2021.shp"

# Folder holding the habitat metric rasters -- this absolute path is specific
# to your local machine, so edit it before running the notebook elsewhere
raster_folder = r"C:\Users\FELAB\Documents\MS263\Python Repository\xDEM_Habitat_Metrics"

Make sure that the spatial reference of the shapefile matches the rasters of habitat metrics. 

In [5]:
# Reproject the CCFRP grid cells into the CRS of the habitat metric rasters and
# save the result as a new shapefile next to the original.

# Insert the suffix before the extension ("..._2021_reprojected.shp") rather
# than appending to the full name, which produced "..._2021.shp_reprojected.shp"
shapefile_stem, shapefile_ext = os.path.splitext(shapefile_path)
reprojected_shapefile_path = f"{shapefile_stem}_reprojected{shapefile_ext}"

gridcells = gpd.read_file(shapefile_path)

# --- Get CRS from first habitat metric raster ---
# glob returns files in arbitrary order; sort so "first" is reproducible
raster_files = sorted(glob.glob(os.path.join(raster_folder, '*.tif')))
if not raster_files:
    # Fail with a clear message instead of an IndexError on raster_files[0]
    raise FileNotFoundError(f"No .tif rasters found in '{raster_folder}'")
first_raster = raster_files[0]

# Only the metadata is needed, so the raster is closed again immediately
with rasterio.open(first_raster) as src:
    common_crs = src.crs  # Define common crs for project

# Reproject the grid cells and write them out for the zonal statistics step
gridcells = gridcells.to_crs(common_crs)
gridcells.to_file(reprojected_shapefile_path)

NameError: name 'raster_folder' is not defined

## Perform Zonal Statistics

I relied on the ['Introduction to Geospatial Raster and Vector Data with Python'](https://carpentries-incubator.github.io/geospatial-python/10-zonal-statistics.html) lesson to build the following zonal statistics code, and on [ChatGPT](https://chatgpt.com/) to help turn it into a loop over every grid cell and habitat metric raster.

This code uses zonal statistics to calculate the mean of each habitat metric within each CCFRP-sampled grid cell. The output of this code is a .csv file of mean habitat metrics within each gridcell by its 'Site' (MPA versus reference site) and 'Location' (name of the MPA being sampled).

In [14]:
# Compute the mean of every habitat metric raster inside every grid cell and
# save the table (one row per grid cell, one column per metric) as a CSV.

# Locate newly reprojected shapefile
gridcells = gpd.read_file(reprojected_shapefile_path)

# Identify relevant attributes
location_field = 'Location'   # 'Location' names the area sampled (e.g. Point Lobos)
site_field = 'Site'           # 'Site' notes whether the gridcell is in an MPA or REF

# Redefine where habitat metric rasters are located
# (sorted so the metric columns come out in a reproducible order)
raster_files = sorted(glob.glob(os.path.join(raster_folder, '*.tif')))
if not raster_files:
    # Without rasters the CSV would silently contain no metric columns
    raise FileNotFoundError(f"No .tif rasters found in '{raster_folder}'")

# Build a container for results: one record per grid cell, in shapefile order
results = [
    {
        "Site": site,
        "Location": location if pd.notna(location) else 'Unknown',
    }
    for site, location in zip(gridcells[site_field], gridcells[location_field])
]

# Loop over each habitat metric file (outer loop so each raster opens only once)
for filepath in raster_files:
    filename = os.path.basename(filepath)
    # Metric name is the last '_'-separated token of the file name, minus extension
    habitat_metric = os.path.splitext(filename)[0].split('_')[-1]

    with rasterio.open(filepath) as src:
        # Reproject a copy of the grid cells when this raster's CRS differs.
        # Reassigning `gridcells` inside a row loop never changed the row
        # geometry already being processed, so the reprojection had no effect.
        if gridcells.crs != src.crs:
            cells_in_raster_crs = gridcells.to_crs(src.crs)
        else:
            cells_in_raster_crs = gridcells

        raster_extent = box(*src.bounds)

        # Loop over all grid cells; `results` is in the same row order
        for grid_cell_metrics, (idx, geometry) in zip(
            results, cells_in_raster_crs.geometry.items()
        ):
            try:
                # Skip grid cells that fall outside this raster entirely
                if not geometry.intersects(raster_extent):
                    continue

                # filled=False returns a masked array in which pixels outside
                # the polygon and nodata pixels are masked, instead of being
                # filled with a value (0 when the raster declares no nodata)
                # that would then be averaged into the mean.
                out_image, out_transform = mask(
                    src, [geometry], crop=True, filled=False
                )

                # Use the first band, and also mask NaN/inf values that are
                # not declared as nodata
                data = np.ma.masked_invalid(out_image[0])

                # Calculate the mean value of the habitat metric for the grid cell
                mean_val = float(data.mean()) if data.count() > 0 else np.nan

                # Add the mean value for the current habitat metric
                grid_cell_metrics[habitat_metric] = mean_val

            except Exception as e:
                # Best effort: report the failure and continue with the other cells
                print(f"Failed to process {filename} for grid cell {idx}: {e}")
                grid_cell_metrics[habitat_metric] = np.nan  # If any error, assign NaN

# Convert the results into a DataFrame
df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
df.to_csv("habitat_metrics_summary.csv", index=False)
print("Summary saved to 'habitat_metrics_summary.csv'")

Summary saved to 'habitat_metrics_summary.csv'
