In [4]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import matplotlib.colors as mcolors
from datetime import datetime, timedelta
from matplotlib.patches import Patch
import json

#### Code to load gridded erosion data and cluster-ID grids from the cloud server and stack them into parallel data cubes of erosion metrics and cluster IDs

In [5]:
# loading data from reefbreak by location
def _find_result_csvs(location, kind, erosion=True, res_cm=10, base=None):
    """
    Shared directory walk behind find_csv_files / find_cluster_csv_files.

    Scans <base>/<location>/{erosion,deposition}/<subdir>/ for files whose
    name ends with ``_{kind}_{res_cm}cm_filled.csv`` and contains ``_ero_``
    (erosion) or ``_dep_`` (deposition).

    Parameters
    ----------
    location : str
        Sub-directory of ``base`` to search.
    kind : str
        File-name token identifying the product, e.g. "grid" or "clusters".
    erosion : bool
        True selects the "erosion" folder and "_ero_" files; False selects
        the "deposition" folder and "_dep_" files.
    res_cm : int
        Resolution token expected in the file name.
    base : str or None
        Results root. None falls back to the shared LiDAR results volume.

    Returns
    -------
    list of str
        Full paths ordered by the first 8-digit run in each file name
        (presumably a YYYYMMDD date; it is not validated), ties broken by
        path. Files without an 8-digit run are skipped.

    Raises
    ------
    FileNotFoundError
        If the location/mode directory does not exist (from os.listdir).
    """
    if base is None:
        base = "/Volumes/group/LiDAR/LidarProcessing/LidarProcessingCliffs/results"
    mode = "erosion" if erosion else "deposition"
    tag = "_ero_" if erosion else "_dep_"
    suffix = f"_{kind}_{res_cm}cm_filled.csv"
    root = os.path.join(base, location, mode)

    date_re = re.compile(r"(\d{8})")
    found = []
    for date in os.listdir(root):
        ddir = os.path.join(root, date)
        if not os.path.isdir(ddir):
            continue
        for fn in os.listdir(ddir):
            if not fn.endswith(suffix) or tag not in fn:
                continue
            m = date_re.search(fn)
            if m:
                found.append((m.group(1), os.path.join(ddir, fn)))

    # os.listdir returns entries in arbitrary order, so sort on the whole
    # (date, path) tuple: same-date files then come back in a reproducible order.
    found.sort()
    return [path for (_, path) in found]


def find_csv_files(location, erosion=True, res_cm=10, cleaned=False, base=None):
    """
    Returns a sorted list of paths to all *_grid_{res_cm}cm_filled.csv
    in <base>/<location>/{erosion,deposition}/<date>/.

    ``cleaned`` is accepted for backward compatibility but has no effect:
    both of its branches have always matched the same ``_filled.csv`` suffix.
    NOTE(review): if cleaned outputs use a different suffix, wire it in here.

    ``base`` overrides the results root (default: the shared LiDAR volume).
    """
    return _find_result_csvs(location, "grid", erosion=erosion, res_cm=res_cm, base=base)


def find_cluster_csv_files(location, erosion=True, res_cm=10, cleaned=False, base=None):
    """
    Returns a sorted list of paths to all *_clusters_{res_cm}cm_filled.csv files
    in <base>/<location>/{erosion,deposition}/<date>/.

    ``cleaned`` is accepted for backward compatibility but has no effect:
    both of its branches have always matched the same ``_filled.csv`` suffix.
    NOTE(review): if cleaned outputs use a different suffix, wire it in here.

    ``base`` overrides the results root (default: the shared LiDAR volume).
    """
    return _find_result_csvs(location, "clusters", erosion=erosion, res_cm=res_cm, base=base)

# Load both grid and cluster data for a location
def load_grid_and_cluster_cubes(location, erosion=True, res_cm=10, cleaned=False):
    """
    Load both erosion/deposition grid data and cluster data for a location.

    The grid and cluster file lists are discovered and loaded independently,
    and load_csv_to_numpy drops shape-mismatched files from each list on its
    own. The two cubes are therefore only parallel when the same files
    survive on both sides; a warning is printed when their shapes differ so
    the mismatch is not silent.

    Parameters
    ----------
    location : str
        Location name passed through to the file finders.
    erosion : bool
        True for erosion products, False for deposition.
    res_cm : int
        Grid resolution token used in the file names.
    cleaned : bool
        Forwarded unchanged to the file finders.

    Returns
    -------
    grid_cube, grid_files, cluster_cube, cluster_files
        Each cube is the stacked array from load_csv_to_numpy; each file list
        holds the paths that actually went into the corresponding cube.

    Raises
    ------
    ValueError
        Propagated from load_csv_to_numpy when a file list is empty.
    """
    # Load grid data
    grid_files = find_csv_files(location, erosion=erosion, res_cm=res_cm, cleaned=cleaned)
    grid_cube, valid_grid_files = load_csv_to_numpy(grid_files)

    # Load cluster data
    cluster_files = find_cluster_csv_files(location, erosion=erosion, res_cm=res_cm, cleaned=cleaned)
    cluster_cube, valid_cluster_files = load_csv_to_numpy(cluster_files)

    print(f"Grid cube shape: {grid_cube.shape}")
    print(f"Cluster cube shape: {cluster_cube.shape}")

    # Downstream code indexes both cubes with the same (time, row, col)
    # position, so flag any case where they do not line up.
    if grid_cube.shape != cluster_cube.shape:
        print(
            f"⚠️ Grid and cluster cubes for {location} are not aligned: "
            f"{grid_cube.shape} vs {cluster_cube.shape} "
            f"({len(valid_grid_files)} grid files, {len(valid_cluster_files)} cluster files)"
        )

    return grid_cube, valid_grid_files, cluster_cube, valid_cluster_files

# turn csv grids into numpy arrays
def load_csv_to_numpy(file_list):
    """
    Loads CSV grids into a 3D NumPy array, with error checking for shape mismatches.
    Mismatched grids are omitted from the final cube.

    Each file is read with its first column as the index. The shape of the
    first file is the reference; any later file with a different shape is
    reported and left out.

    Parameters
    ----------
    file_list : list of str
        CSV paths, in the order they should appear along axis 0.

    Returns
    -------
    (numpy.ndarray, list of str)
        The stacked grids (axis 0 follows the order of the kept files) and
        the paths of the files that were kept.

    Raises
    ------
    ValueError
        If ``file_list`` is empty.
    """
    ref_shape = None
    valid_grids = []
    valid_files_final = []
    mismatches = []

    # Single pass: a grid is either kept or recorded as a mismatch, so
    # arrays that will not be stacked are not held in memory.
    for fp in file_list:
        grid = pd.read_csv(fp, index_col=0).values
        if ref_shape is None:
            # Reference shape comes from the first file
            ref_shape = grid.shape
        if grid.shape == ref_shape:
            valid_grids.append(grid)
            valid_files_final.append(fp)
        else:
            mismatches.append((fp, grid.shape))

    # The first file always matches its own shape, so an empty list here can
    # only mean that no files were supplied.
    if not valid_grids:
        raise ValueError("No valid CSV files found.")

    if mismatches:
        print(f"\n⚠️ Omitting {len(mismatches)} files with mismatched grid shapes:")
        for fp, shape in mismatches:
            print(f"  {os.path.basename(fp)} — shape = {shape}, expected = {ref_shape}")
        print(f"\n✅ Using {len(valid_grids)} files with consistent shape {ref_shape}")

    return np.stack(valid_grids, axis=0), valid_files_final

In [None]:
# Locations handled by the batch run below, in processing order
locations = [
    "Delmar",
    "Torrey",
    "Solana",
    "Encinitas",
    "SanElijo",
    "Blacks",
]

def process_location(location):
    """
    Process a single location: load data, create cubes, and save files.

    Loads the 10 cm erosion grid and cluster cubes for ``location`` and
    writes four files into <results>/<location>/data_cubes/:
    ``cube_ero_10cm_filled.npz``, ``cube_clusters_ero_10cm_filled.npz``
    (each with the array stored under the key ``data``), ``files_ero.json``
    and ``files_clusters_ero.json`` (the source CSV paths of each cube).

    Parameters
    ----------
    location : str
        Location name, used both to find the inputs and to build the output
        directory.

    Returns
    -------
    bool
        True when everything was written; False when any exception was
        raised. The error is printed rather than re-raised, so a batch over
        many locations keeps going.
    """

    print(f"\n{'='*50}")
    print(f"Processing {location}...")
    print(f"{'='*50}")

    try:
        # Load first: if the inputs are missing or unreadable, no empty
        # data_cubes directory is left behind for this location.
        grid_cube, grid_files, cluster_cube, cluster_files = load_grid_and_cluster_cubes(
            location, erosion=True, res_cm=10, cleaned=False
        )

        # Create data_cubes directory if it doesn't exist
        output_dir = f"/Volumes/group/LiDAR/LidarProcessing/LidarProcessingCliffs/results/{location}/data_cubes"
        os.makedirs(output_dir, exist_ok=True)

        # Save both cubes
        grid_path = os.path.join(output_dir, "cube_ero_10cm_filled.npz")
        cluster_path = os.path.join(output_dir, "cube_clusters_ero_10cm_filled.npz")

        np.savez_compressed(grid_path, data=grid_cube)
        np.savez_compressed(cluster_path, data=cluster_cube)

        print(f"✅ Saved grid cube: {grid_path}")
        print(f"✅ Saved cluster cube: {cluster_path}")

        # Save file lists
        grid_files_path = os.path.join(output_dir, "files_ero.json")
        cluster_files_path = os.path.join(output_dir, "files_clusters_ero.json")

        with open(grid_files_path, "w") as f:
            json.dump(grid_files, f, indent=2)

        with open(cluster_files_path, "w") as f:
            json.dump(cluster_files, f, indent=2)

        print(f"✅ Saved grid file list: {grid_files_path}")
        print(f"✅ Saved cluster file list: {cluster_files_path}")

        # The .npz files are compressed, so the size on disk differs from the
        # array's in-memory nbytes; report both instead of labelling nbytes
        # as the file size.
        gib = 1024**3
        grid_disk_gb = os.path.getsize(grid_path) / gib
        cluster_disk_gb = os.path.getsize(cluster_path) / gib

        # Print summary
        print(f"\n📊 {location} Summary:")
        print(f"   Grid cube shape: {grid_cube.shape}")
        print(f"   Cluster cube shape: {cluster_cube.shape}")
        print(f"   Number of time steps: {len(grid_files)}")
        print(f"   Grid file size: {grid_disk_gb:.2f} GB on disk ({grid_cube.nbytes / gib:.2f} GB in memory)")
        print(f"   Cluster file size: {cluster_disk_gb:.2f} GB on disk ({cluster_cube.nbytes / gib:.2f} GB in memory)")

        return True

    except Exception as e:
        # Deliberate best-effort: one bad location must not stop the batch.
        # Include the exception type, since str(e) alone can be uninformative
        # (e.g. a bare key for KeyError).
        print(f"❌ Error processing {location}: {type(e).__name__}: {e}")
        return False

# Run every location in turn and bucket the names by outcome
successful_locations = []
failed_locations = []

print("Starting batch processing of all locations...")
print(f"Locations to process: {', '.join(locations)}")

for location in locations:
    success = process_location(location)
    # process_location reports failure through its return value, not by raising
    (successful_locations if success else failed_locations).append(location)

# Closing report: banner, successes, then failures (or the all-clear message)
print(f"\n{'='*60}")
print("BATCH PROCESSING COMPLETE")
print(f"{'='*60}")

print(f"\n✅ Successfully processed ({len(successful_locations)}/{len(locations)}):")
for loc in successful_locations:
    print(f"   - {loc}")

if not failed_locations:
    print(f"\n🎉 All {len(locations)} locations processed successfully!")
else:
    print(f"\n❌ Failed to process ({len(failed_locations)}/{len(locations)}):")
    for loc in failed_locations:
        print(f"   - {loc}")

# Point the reader at the output directory of each successful location
print(f"\nData cubes saved to:")
for loc in successful_locations:
    print(f"   {loc}: /Volumes/group/LiDAR/LidarProcessing/LidarProcessingCliffs/results/{loc}/data_cubes/")