In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from snowpylot import caaml_parser


In [7]:
def parse_pits(folder_path):
    """
    Function to parse CAAML files in the specified folder with error handling
    """
    import xml.etree.ElementTree as ET
    
    files = [
        f for f in os.listdir(folder_path) if f.endswith(".xml")
    ]  # List of all .xml files in the folder

    pits_list = []
    failed_files = []
    
    print(f"Found {len(files)} XML files to process...")

    for i, file in enumerate(files):  # iterate through each file in the folder
        try:
            file_path = folder_path + "/" + file  # create the file path
            
            # First, do a quick XML validation check
            ET.parse(file_path).getroot()
            
            # If XML is valid, parse with caaml_parser
            pit = caaml_parser(file_path)  # parse the file
            pits_list.append(pit)
                
        except ET.ParseError as e:
            failed_files.append((file, f"XML ParseError: {e}"))
            print(f"⚠️ Skipping {file}: XML ParseError - {e}")
            
        except Exception as e:
            failed_files.append((file, f"Other error: {e}"))
            print(f"⚠️ Skipping {file}: {type(e).__name__} - {e}")
    
    print(f"✅ Successfully parsed {len(pits_list)} files")
    print(f"⚠️ Failed to parse {len(failed_files)} files")
    
    if failed_files:
        print("Failed files:")
        for file, error in failed_files[:10]:  # Show first 10 failed files
            print(f"  - {file}: {error}")
        if len(failed_files) > 10:
            print(f"  ... and {len(failed_files) - 10} more")

    return pits_list, failed_files


In [8]:
# Parse all pits with error handling
all_pits, failed_files = parse_pits("../snowpits/combined_caaml_files/")

print(f"\nDataset summary:")
print(f"Total successfully parsed pits: {len(all_pits)}")
print(f"Total failed files: {len(failed_files)}")


Found 50278 XML files to process...
✅ Successfully parsed 50278 files
⚠️ Failed to parse 0 files

Dataset summary:
Total successfully parsed pits: 50278
Total failed files: 0


In [18]:
# Create summary statistics DataFrame efficiently
print("Creating summary statistics...")

pit_info_list = []

for i, pit in enumerate(all_pits):
    # layer info
    num_primary_grain_form = sum(1 for layer in pit.snow_profile.layers 
                                if layer.grain_form_primary is not None)
    num_primary_grain_size = sum(1 for layer in pit.snow_profile.layers 
                                if (layer.grain_form_primary is not None and 
                                    layer.grain_form_primary.grain_size_avg is not None))
    num_hand_hardness = sum(1 for layer in pit.snow_profile.layers 
                                if (layer.hardness is not None))

    pit_info_dict = {
        # Metadata
        "PitID": pit.core_info.pit_id,
        "Date": pit.core_info.date,
        # User
        "SnowPilot Username": pit.core_info.user.username,
        "Professional": pit.core_info.user.professional,
        "Operation Name": pit.core_info.user.operation_name,
        # Location
        "Latitude": pit.core_info.location.latitude,
        "Longitude": pit.core_info.location.longitude,
        "Elevation": pit.core_info.location.elevation,
        "Aspect": pit.core_info.location.aspect,
        "Slope Angle": pit.core_info.location.slope_angle,
        "Country": pit.core_info.location.country,
        "Region": pit.core_info.location.region,
        "Pit Near Avalanche": pit.core_info.location.pit_near_avalanche,
        "Pit Near Avalanche Location": pit.core_info.location.pit_near_avalanche_location,
        # Snow Profile
        "HS": pit.snow_profile.hs,
        # Layers
        "Num Layers": len(pit.snow_profile.layers),
        "num Layers wPrimary Grain Form": num_primary_grain_form,
        "num Layers wPrimary Grain Size": num_primary_grain_size,
        "num Layers wHand Hardness": num_hand_hardness,
        # Stability Tests
        "Num ECT": len(pit.stability_tests.ECT),
        "Num CT": len(pit.stability_tests.CT),
        "Num RBlock": len(pit.stability_tests.RBlock),
        "Num PST": len(pit.stability_tests.PST),
    }
    pit_info_list.append(pit_info_dict)

# Create DataFrame only once after collecting all data
print("Creating DataFrame...")
pit_info_df = pd.DataFrame(pit_info_list)

print(f"✅ Successfully created DataFrame with {len(pit_info_df)} rows and {len(pit_info_df.columns)} columns")
print(f"DataFrame shape: {pit_info_df.shape}")


Creating summary statistics...
Creating DataFrame...
✅ Successfully created DataFrame with 50278 rows and 23 columns
DataFrame shape: (50278, 23)


In [26]:
# Display Summary Stats

# Summary statistics for pit_info_df 
print("=== Summary Statistics of Full Dataset ===\n")

# Helper function to extract numeric values from [value, unit] lists
def extract_numeric_value(data_series):
    """Extract numeric values from lists of format [value, unit]"""
    numeric_values = []
    for item in data_series:
        if item is not None and isinstance(item, list) and len(item) >= 1:
            try:
                numeric_values.append(float(item[0]))
            except (ValueError, TypeError):
                continue
        elif item is not None:
            try:
                numeric_values.append(float(item))
            except (ValueError, TypeError):
                continue
    return pd.Series(numeric_values)

# Basic dataset info
print(f"Total number of pits: {len(pit_info_df)}")

# Handle dates safely
try:
    date_min = pit_info_df['Date'].min()
    date_max = pit_info_df['Date'].max()
    print(f"Date range: {date_min} to {date_max}")
except:
    print("Date range: Unable to determine (mixed data types)")
print()

# Layer information
print("=== LAYER INFORMATION ===")
try:
    layers_numeric = pd.to_numeric(pit_info_df['Num Layers'], errors='coerce')
    layers_clean = layers_numeric.dropna()

    
    # Grain form and size data
    grain_form_numeric = pd.to_numeric(pit_info_df['num Layers wPrimary Grain Form'], errors='coerce')
    grain_size_numeric = pd.to_numeric(pit_info_df['num Layers wPrimary Grain Size'], errors='coerce')
    hardness_numeric = pd.to_numeric(pit_info_df['num Layers wHand Hardness'], errors='coerce')

    pits_with_grain_form = (grain_form_numeric > 0).sum()
    pits_with_grain_size = (grain_size_numeric > 0).sum()
    pits_with_hardness = (hardness_numeric > 0).sum()

    print(f"Pits with primary grain form data: {pits_with_grain_form} ({pits_with_grain_form/len(pit_info_df)*100:.1f}%)")
    print(f"Pits with primary grain size data: {pits_with_grain_size} ({pits_with_grain_size/len(pit_info_df)*100:.1f}%)")
    print(f"Pits with hand hardness data: {pits_with_hardness} ({pits_with_hardness/len(pit_info_df)*100:.1f}%)")

    print()

    # Total Layers
    total_layers = layers_clean.sum()
    layers_with_grain_form = grain_form_numeric.sum()
    layers_with_grain_size = grain_size_numeric.sum()
    layers_with_hardness = hardness_numeric.sum()

    print(f"Total number of layers across all pits: {total_layers}")
    print(f"Total number of layers with primary grain form data: {layers_with_grain_form} ({layers_with_grain_form/total_layers*100:.1f}%)")
    print(f"Total number of layers with primary grain size data: {layers_with_grain_size} ({layers_with_grain_size/total_layers*100:.1f}%)")
    print(f"Total number of layers with hand hardness data: {layers_with_hardness} ({layers_with_hardness/total_layers*100:.1f}%)")

except Exception as e:
    print(f"Layer data: Unable to process ({str(e)})")
print()

# Stability tests
print("=== STABILITY TESTS ===")
try:
    ect_numeric = pd.to_numeric(pit_info_df['Num ECT'], errors='coerce')
    ct_numeric = pd.to_numeric(pit_info_df['Num CT'], errors='coerce')
    rblock_numeric = pd.to_numeric(pit_info_df['Num RBlock'], errors='coerce')
    pst_numeric = pd.to_numeric(pit_info_df['Num PST'], errors='coerce')
    
    pits_with_ect = (ect_numeric > 0).sum()
    pits_with_ct = (ct_numeric > 0).sum()
    pits_with_rblock = (rblock_numeric > 0).sum()
    pits_with_pst = (pst_numeric > 0).sum()
    
    print(f"Pits with ECT tests: {pits_with_ect} ({pits_with_ect/len(pit_info_df)*100:.1f}%)")
    print(f"Pits with CT tests: {pits_with_ct} ({pits_with_ct/len(pit_info_df)*100:.1f}%)")
    print(f"Pits with RBlock tests: {pits_with_rblock} ({pits_with_rblock/len(pit_info_df)*100:.1f}%)")
    print(f"Pits with PST tests: {pits_with_pst} ({pits_with_pst/len(pit_info_df)*100:.1f}%)")

    print()

    total_ect = ect_numeric.sum()
    total_ct = ct_numeric.sum()
    total_rblock = rblock_numeric.sum()
    total_pst = pst_numeric.sum()

    print(f"Total ECT tests: {total_ect}")
    print(f"Total CT tests: {total_ct}")
    print(f"Total RBlock tests: {total_rblock}")
    print(f"Total PST tests: {total_pst}")

    print()

    print(f"Total number of stability test results: {total_ect + total_ct + total_rblock + total_pst}")
except Exception as e:
    print(f"Stability test data: Unable to process ({str(e)})")
print()

# User information
print("=== USER INFORMATION ===")
try:
    print(f"Unique users: {pit_info_df['SnowPilot Username'].nunique()}")
    
    # Handle professional status
    professional_data = pit_info_df['Professional']
    professional_count = professional_data.sum()
    print(f"Professional users: {professional_count} ({professional_count/len(pit_info_df)*100:.1f}%)")
    
    print(f"Unique operations: {pit_info_df['Operation Name'].nunique()}")
except Exception as e:
    print(f"User data: Unable to process ({str(e)})")
print()

# Pit near avalanche location breakdown
print("=== PITS NEAR AVALANCHE ===")
try:
    pits_near_avalanche = pit_info_df['Pit Near Avalanche'].sum()
    print(f"Total Pits near avalanche: {pits_near_avalanche}")
    print()
    location_counts = pit_info_df['Pit Near Avalanche Location'].value_counts()
    print(location_counts)
    print()

    print()
except Exception as e:
    print(f"Avalanche location data: Unable to process ({str(e)})")
print()

# Data completeness
print("=== DATA COMPLETENESS ===")
try:
    missing_data = pit_info_df.isnull().sum()
    missing_percent = (missing_data / len(pit_info_df)) * 100
    completeness_df = pd.DataFrame({
        'Missing Count': missing_data,
        'Missing %': missing_percent.round(1)
    }).sort_values('Missing %', ascending=False)
    
    # Only show columns with missing data
    incomplete_data = completeness_df[completeness_df['Missing Count'] > 0]
    if len(incomplete_data) > 0:
        print(incomplete_data)
    else:
        print("No missing data found!")
except Exception as e:
    print(f"Data completeness analysis: Unable to process ({str(e)})")


=== Summary Statistics of Full Dataset ===

Total number of pits: 50278
Date range: Unable to determine (mixed data types)

=== LAYER INFORMATION ===
Pits with primary grain form data: 47238 (94.0%)
Pits with primary grain size data: 35541 (70.7%)
Pits with hand hardness data: 49701 (98.9%)

Total number of layers across all pits: 371429
Total number of layers with primary grain form data: 305811 (82.3%)
Total number of layers with primary grain size data: 176044 (47.4%)
Total number of layers with hand hardness data: 336888 (90.7%)

=== STABILITY TESTS ===
Pits with ECT tests: 34327 (68.3%)
Pits with CT tests: 28959 (57.6%)
Pits with RBlock tests: 215 (0.4%)
Pits with PST tests: 5154 (10.3%)

Total ECT tests: 47684
Total CT tests: 51599
Total RBlock tests: 241
Total PST tests: 6213

Total number of stability test results: 105737

=== USER INFORMATION ===
Unique users: 5380
Professional users: 32707 (65.1%)
Unique operations: 561

=== PITS NEAR AVALANCHE ===
Total Pits near avalanche: 