In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from snowpylot import caaml_parser


In [None]:
def parse_pits(folder_path):
    """
    Parse every CAAML ``.xml`` file in ``folder_path``, skipping files that fail.

    Each file is first checked for XML well-formedness with ``ElementTree`` and
    is then handed to ``caaml_parser``. A file that fails either step is
    recorded and skipped, so a single bad file does not abort the whole run.
    A summary (and up to the first 10 failures) is printed at the end.

    Parameters
    ----------
    folder_path : str
        Directory to scan. Only entries whose name ends in ".xml" are
        processed. A trailing path separator is allowed.

    Returns
    -------
    tuple
        ``(pits_list, failed_files)`` where ``pits_list`` holds the objects
        returned by ``caaml_parser`` for the files that parsed, and
        ``failed_files`` holds ``(file_name, error_message)`` tuples for the
        files that were skipped. Both follow sorted file-name order.
    """
    import xml.etree.ElementTree as ET

    # os.listdir returns names in arbitrary order; sort so the order of the
    # returned lists is reproducible from run to run.
    files = sorted(f for f in os.listdir(folder_path) if f.endswith(".xml"))

    pits_list = []
    failed_files = []

    print(f"Found {len(files)} XML files to process...")

    for file in files:
        # os.path.join avoids a doubled separator when folder_path already
        # ends with one.
        file_path = os.path.join(folder_path, file)
        try:
            # Cheap well-formedness check so malformed XML is reported as a
            # ParseError before the CAAML parser is invoked.
            ET.parse(file_path)
            pit = caaml_parser(file_path)
        except ET.ParseError as e:
            failed_files.append((file, f"XML ParseError: {e}"))
            print(f"⚠️ Skipping {file}: XML ParseError - {e}")
        except Exception as e:
            # Deliberately broad: any other failure is recorded and the file
            # is skipped rather than stopping the batch.
            failed_files.append((file, f"Other error: {e}"))
            print(f"⚠️ Skipping {file}: {type(e).__name__} - {e}")
        else:
            pits_list.append(pit)

    print(f"✅ Successfully parsed {len(pits_list)} files")
    print(f"⚠️ Failed to parse {len(failed_files)} files")

    if failed_files:
        print("Failed files:")
        for file, error in failed_files[:10]:  # Show first 10 failed files
            print(f"  - {file}: {error}")
        if len(failed_files) > 10:
            print(f"  ... and {len(failed_files) - 10} more")

    return pits_list, failed_files


In [3]:
# Run the parser over the combined CAAML folder; files that fail are
# returned in `failed_files` instead of raising.
all_pits, failed_files = parse_pits("../snowpits/combined_caaml_files/")

# Emit the three summary lines with a single print call.
print(
    "\nDataset summary:",
    f"Total successfully parsed pits: {len(all_pits)}",
    f"Total failed files: {len(failed_files)}",
    sep="\n",
)


Found 50279 XML files to process...
Processed 1000/50279 files...
Processed 2000/50279 files...
Processed 3000/50279 files...
Processed 4000/50279 files...
Processed 5000/50279 files...
Processed 6000/50279 files...
Processed 7000/50279 files...
Processed 8000/50279 files...
Processed 9000/50279 files...
Processed 10000/50279 files...
Processed 11000/50279 files...
Processed 12000/50279 files...
Processed 13000/50279 files...
Processed 14000/50279 files...
Processed 15000/50279 files...
Processed 16000/50279 files...
Processed 17000/50279 files...
Processed 18000/50279 files...
Processed 19000/50279 files...
Processed 20000/50279 files...
Processed 21000/50279 files...
Processed 22000/50279 files...
Processed 23000/50279 files...
Processed 24000/50279 files...
Processed 25000/50279 files...
Processed 26000/50279 files...
Processed 27000/50279 files...
Processed 28000/50279 files...
Processed 29000/50279 files...
Processed 30000/50279 files...
Processed 31000/50279 files...
Processed 32

In [4]:
# Summary Stats
# Collect one summary record per parsed pit, then build a single DataFrame.

pit_info_list = []

for pit in all_pits:
    num_primary_grain_form = 0  # layers that have a primary grain form
    num_primary_grain_size = 0  # layers whose primary grain form has an average size
    for layer in pit.snow_profile.layers:
        if layer.grain_form_primary is not None:
            num_primary_grain_form += 1
            # grain_size_avg is read from the primary grain form object, so it
            # can only be checked when that object exists.
            if layer.grain_form_primary.grain_size_avg is not None:
                num_primary_grain_size += 1

    pit_info_dict = {
        # Metadata
        "PitID": pit.core_info.pit_id,
        "Date": pit.core_info.date,
        # User
        "SnowPilot Username": pit.core_info.user.username,
        "Professional": pit.core_info.user.professional,
        "Operation Name": pit.core_info.user.operation_name,
        # Location
        "Latitude": pit.core_info.location.latitude,
        "Longitude": pit.core_info.location.longitude,
        "Elevation": pit.core_info.location.elevation,
        "Aspect": pit.core_info.location.aspect,
        "Slope Angle": pit.core_info.location.slope_angle,
        "Country": pit.core_info.location.country,
        "Region": pit.core_info.location.region,
        "Pit Near Avalanche": pit.core_info.location.pit_near_avalanche,
        "Pit Near Avalanche Location": pit.core_info.location.pit_near_avalanche_location,
        # Snow Profile
        "HS": pit.snow_profile.hs,
        # Layers
        "Num Layers": len(pit.snow_profile.layers),
        "num Layers wPrimary Grain Form": num_primary_grain_form,
        "num Layers wPrimary Grain Size": num_primary_grain_size,
        # Stability Tests
        "Num ECT": len(pit.stability_tests.ECT),
        "Num CT": len(pit.stability_tests.CT),
        "Num RBlock": len(pit.stability_tests.RBlock),
        "Num PST": len(pit.stability_tests.PST),
    }
    pit_info_list.append(pit_info_dict)

# Build the DataFrame once, after the loop. Constructing it inside the loop
# re-creates the frame from the whole list on every iteration (quadratic
# work) and leaves pit_info_df undefined when all_pits is empty.
pit_info_df = pd.DataFrame(pit_info_list)


KeyboardInterrupt: 