In [87]:
# Import Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from snowpylot import caaml_parser
from geldsetzer_utils import get_density, convert_grain_form


In [88]:
def parse_pits(folder_path):
    """
    Function to parse CAAML files in the specified folder
    """
    files = [
        f for f in os.listdir(folder_path) if f.endswith(".xml")
    ]  # List of all .xml files in the folder

    pits_list = []

    for file in files:  # iterate through each file in the folder
        file_path = os.path.join(folder_path, file)  # create the file path
        try:
            pit = caaml_parser(file_path)  # parse the file
            pits_list.append(pit)
        except Exception as e:
            print(f"Error parsing {file}: {str(e)}")

    return pits_list


In [89]:
# Find all directories in new_data (these are the extracted folders)
new_data_dir = "../snowpits/new_data"
all_pits = []

# Get all items in new_data directory
items_in_new_data = os.listdir(new_data_dir)

# Filter to only directories (ignore tar.gz files)
extracted_directories = [
    item for item in items_in_new_data 
    if os.path.isdir(os.path.join(new_data_dir, item)) and not item.startswith('.')
]

print(f"Found {len(extracted_directories)} extracted directories to process:")
for directory in sorted(extracted_directories):
    print(f"  📁 {directory}")

print("\nParsing snowpit files from each directory...")

# Parse pits from each extracted directory
total_pits_parsed = 0
for directory in extracted_directories:
    directory_path = os.path.join(new_data_dir, directory)
    print(f"\nProcessing: {directory}")
    
    try:
        # Parse all pits in this directory
        pits_from_directory = parse_pits(directory_path)
        directory_pit_count = len(pits_from_directory)
        
        # Add to the master list
        all_pits.extend(pits_from_directory)
        total_pits_parsed += directory_pit_count
        
        print(f"  ✅ Parsed {directory_pit_count} snowpits")
        
    except Exception as e:
        print(f"  ❌ Error processing directory {directory}: {str(e)}")

print("\n🎉 Parsing complete!")
print(f"Total directories processed: {len(extracted_directories)}")
print(f"Total snowpits parsed: {total_pits_parsed}")
print(f"Length of all_pits list: {len(all_pits)}")

# Display some basic info about the parsed data
if all_pits:
    print("\nSample pit information:")
    sample_pit = all_pits[0]
    print(f"  First pit ID: {sample_pit.core_info.pit_id}")
    print(f"  Date: {sample_pit.core_info.date}")
    print(f"  Number of layers: {len(sample_pit.snow_profile.layers)}")
    
    # Show distribution of pits by some basic metadata if available
    pit_ids = [pit.core_info.pit_id for pit in all_pits if pit.core_info.pit_id]
    print(f"  Unique pit IDs: {len(set(pit_ids))}")
else:
    print("⚠️  No pits were successfully parsed")


Found 45 extracted directories to process:
  📁 sverplanck-snowpits-25-07-21-07-03
  📁 sverplanck-snowpits-25-07-21-07-04
  📁 sverplanck-snowpits-25-07-21-07-05
  📁 sverplanck-snowpits-25-07-21-07-06
  📁 sverplanck-snowpits-25-07-21-07-07
  📁 sverplanck-snowpits-25-07-21-07-10
  📁 sverplanck-snowpits-25-07-21-07-11
  📁 sverplanck-snowpits-25-07-21-07-12
  📁 sverplanck-snowpits-25-07-21-07-13
  📁 sverplanck-snowpits-25-07-21-07-14
  📁 sverplanck-snowpits-25-07-21-07-15
  📁 sverplanck-snowpits-25-07-21-07-16
  📁 sverplanck-snowpits-25-07-21-07-17
  📁 sverplanck-snowpits-25-07-21-07-18
  📁 sverplanck-snowpits-25-07-21-07-19
  📁 sverplanck-snowpits-25-07-21-07-20
  📁 sverplanck-snowpits-25-07-21-07-21
  📁 sverplanck-snowpits-25-07-21-07-22
  📁 sverplanck-snowpits-25-07-21-07-23
  📁 sverplanck-snowpits-25-07-21-07-24
  📁 sverplanck-snowpits-25-07-21-07-25
  📁 sverplanck-snowpits-25-07-21-07-26
  📁 sverplanck-snowpits-25-07-21-07-27
  📁 sverplanck-snowpits-25-07-21-07-28
  📁 sverplanck-snowpi

In [90]:
# Data Summary

pit_info = []
layer_info = []
density_info = []
geldsetzer_info = []

for pit in all_pits:
    pit_dict = {
        'pit_id': pit.core_info.pit_id,
        'layer_count': len(pit.snow_profile.layers),
        'density_count': len(pit.snow_profile.density_profile)
    }
    pit_info.append(pit_dict)

    for density in pit.snow_profile.density_profile:
        density_dict = {
            'pit_id': pit.core_info.pit_id,
            'depth_top': density.depth_top,
            'thickness': density.thickness,
            'density': density.density,
        }
        density_info.append(density_dict)

    for layer in pit.snow_profile.layers:
        layer_dict = {
            'pit_id': pit.core_info.pit_id,
            'depth_top': layer.depth_top,
            'thickness': layer.thickness,
            'hand_hardness': layer.hardness,
            'wetness': layer.wetness,
            'layer_of_concern': layer.layer_of_concern,
            'grain_form_primary': layer.grain_form_primary,
            'grain_form_secondary': layer.grain_form_secondary,
        }
        layer_info.append(layer_dict)

    for density in pit.snow_profile.density_profile:
        # NOTE: Update to capture density measurements where density layer falls within a profile layer, but the depth_top is not the same
        for layer in pit.snow_profile.layers:
            # Check if we have required layer properties
            if not (layer.grain_form_primary and layer.hardness):
                continue
                
            # Handle potential None values for thickness and extract numeric values
            try:
                # Extract numeric depth values (handle [value, unit] format)
                density_depth = float(density.depth_top[0] if isinstance(density.depth_top, list) else density.depth_top)
                layer_depth = float(layer.depth_top[0] if isinstance(layer.depth_top, list) else layer.depth_top)
                
                # Extract thickness values, defaulting to 0 if not available
                density_thickness = getattr(density, 'thickness', 0) or 0
                layer_thickness = getattr(layer, 'thickness', 0) or 0
                density_thickness = float(density_thickness[0] if isinstance(density_thickness, list) else density_thickness)
                layer_thickness = float(layer_thickness[0] if isinstance(layer_thickness, list) else layer_thickness)
                
            except (TypeError, IndexError, AttributeError, ValueError):
                # Skip if we can't extract numeric depth values
                continue
            
            # Calculate depth ranges (bottom = top + thickness)
            density_bottom = density_depth + density_thickness
            layer_bottom = layer_depth + layer_thickness
            
            # Check for overlap: density range overlaps with layer range
            # Ranges overlap if: density_top <= layer_bottom AND density_bottom >= layer_top
            # Using <= and >= to properly handle exact matches when thickness = 0
            if (density_depth <= layer_bottom) and (density_bottom >= layer_depth):
                # Determine match type and appropriate thickness to use
                is_exact_match = density_depth == layer_depth
                match_type = 'exact' if is_exact_match else 'overlap'
                # Use layer thickness for exact matches, density thickness for overlap matches
                thickness_to_use = layer_thickness if is_exact_match else density_thickness
                
                geldsetzer_dict = {
                    'pit_id': pit.core_info.pit_id,
                    'depth_top': density.depth_top,
                    'density': density.density[0], # in kg/m^3
                    'hand_hardness': layer.hardness,
                    'grain_form': layer.grain_form_primary.grain_form,
                    'basic_grain_class_code': layer.grain_form_primary.basic_grain_class_code,
                    'sub_grain_class_code': layer.grain_form_primary.sub_grain_class_code,
                    'geldsetzer_grain_form': convert_grain_form(layer),
                    # Add additional fields to track the match type and thickness info
                    'layer_depth_top': layer.depth_top,
                    'layer_thickness': layer_thickness,
                    'density_thickness': density_thickness,
                    'thickness': thickness_to_use,  # Primary thickness field based on match type
                    'match_type': match_type
                }
                geldsetzer_info.append(geldsetzer_dict)
    

In [91]:
# Diagnostic: Compare old exact-match logic vs new overlap-based logic

exact_matches = 0
overlap_matches = 0
total_density_measurements = 0
total_layer_measurements = 0

for pit in all_pits:
    total_density_measurements += len(pit.snow_profile.density_profile)
    total_layer_measurements += len(pit.snow_profile.layers)
    
    for density in pit.snow_profile.density_profile:
        for layer in pit.snow_profile.layers:
            if not (layer.grain_form_primary and layer.hardness):
                continue
                
            try:
                # Extract numeric depth values
                density_depth = float(density.depth_top[0] if isinstance(density.depth_top, list) else density.depth_top)
                layer_depth = float(layer.depth_top[0] if isinstance(layer.depth_top, list) else layer.depth_top)
                
                density_thickness = getattr(density, 'thickness', 0) or 0
                layer_thickness = getattr(layer, 'thickness', 0) or 0
                density_thickness = float(density_thickness[0] if isinstance(density_thickness, list) else density_thickness)
                layer_thickness = float(layer_thickness[0] if isinstance(layer_thickness, list) else layer_thickness)
                
            except (TypeError, IndexError, AttributeError, ValueError):
                continue
            
            # Old logic: exact match only
            if density_depth == layer_depth:
                exact_matches += 1
            
            # New logic: overlap detection (corrected to handle exact matches with zero thickness)
            density_bottom = density_depth + density_thickness
            layer_bottom = layer_depth + layer_thickness
            overlaps = (density_depth <= layer_bottom) and (density_bottom >= layer_depth)
            
            if overlaps:
                overlap_matches += 1

print("=== Matching Logic Comparison ===")
print(f"Total density measurements across all pits: {total_density_measurements:,}")
print(f"Total layer measurements across all pits: {total_layer_measurements:,}")
print(f"Matches with old exact-depth logic: {exact_matches:,}")
print(f"Matches with new overlap logic: {overlap_matches:,}")
print(f"Additional matches captured: {overlap_matches - exact_matches:,}")
print(f"Percentage increase: {((overlap_matches - exact_matches) / exact_matches * 100):.1f}%")


=== Matching Logic Comparison ===
Total density measurements across all pits: 14,538
Total layer measurements across all pits: 254,387
Matches with old exact-depth logic: 7,186
Matches with new overlap logic: 24,004
Additional matches captured: 16,818
Percentage increase: 234.0%


In [92]:
# Summary stats for pit_info
pit_info_df = pd.DataFrame(pit_info)
total_pits = len(pit_info_df)
print(f"Total pits: {total_pits:,}")

# Calculate number of pits with density measurements
pits_with_density = (pit_info_df['density_count'] > 0).sum()
percentage_with_density = (pits_with_density / total_pits) * 100

print(f"Pits with density measurements: {pits_with_density:,}")
print(f"Percentage of pits with density data: {percentage_with_density:.1f}%")

# Summary stats for density_info
density_info_df = pd.DataFrame(density_info)
print("\ndensity_info_df")
for column in density_info_df.columns:
        # Count non-null values
        non_null_count = density_info_df[column].notna().sum()
        print(f"Number of values for {column:25}: {non_null_count:8,}")

# Summary stats for layer_info
layer_info_df = pd.DataFrame(layer_info)
print("\nlayer_info_df")
for column in layer_info_df.columns:
        # Count non-null values
        non_null_count = layer_info_df[column].notna().sum()
        print(f"Number of values for {column:25}: {non_null_count:8,}")


Total pits: 34,010
Pits with density measurements: 2,007
Percentage of pits with density data: 5.9%

density_info_df
Number of values for pit_id                   :   14,538
Number of values for depth_top                :   14,538
Number of values for thickness                :    7,021
Number of values for density                  :   14,538

layer_info_df
Number of values for pit_id                   :  254,387
Number of values for depth_top                :  254,387
Number of values for thickness                :  254,387
Number of values for hand_hardness            :  230,434
Number of values for wetness                  :   60,228
Number of values for layer_of_concern         :   24,628
Number of values for grain_form_primary       :  208,388
Number of values for grain_form_secondary     :   28,436


In [93]:
# Summary stats for geldsetzer_info
geldsetzer_info_df = pd.DataFrame(geldsetzer_info)
print("geldsetzer_info_df")
for column in geldsetzer_info_df.columns:
        # Count non-null values
        non_null_count = geldsetzer_info_df[column].notna().sum()
        print(f"Number of values for {column:25}: {non_null_count:8,}")

geldsetzer_info_df.head()


geldsetzer_info_df
Number of values for pit_id                   :   24,004
Number of values for depth_top                :   24,004
Number of values for density                  :   24,004
Number of values for hand_hardness            :   24,004
Number of values for grain_form               :   23,795
Number of values for basic_grain_class_code   :   23,795
Number of values for sub_grain_class_code     :   14,239
Number of values for geldsetzer_grain_form    :   23,795
Number of values for layer_depth_top          :   24,004
Number of values for layer_thickness          :   24,004
Number of values for density_thickness        :   24,004
Number of values for thickness                :   24,004
Number of values for match_type               :   24,004


Unnamed: 0,pit_id,depth_top,density,hand_hardness,grain_form,basic_grain_class_code,sub_grain_class_code,geldsetzer_grain_form,layer_depth_top,layer_thickness,density_thickness,thickness,match_type
0,26539,"[13.5, cm]",180.0,4F,RGlr,RG,RGlr,RG,"[1.0, cm]",20.5,0.0,0.0,overlap
1,26539,"[43.5, cm]",250.0,1F-,RGlr,RG,RGlr,RG,"[25.5, cm]",27.0,0.0,0.0,overlap
2,26539,"[168.5, cm]",280.0,1F+,RGlr,RG,RGlr,RG,"[160.5, cm]",16.0,0.0,0.0,overlap
3,26319,"[0.0, cm]",250.0,1F,DFdc,DF,DFdc,DF,"[0.0, cm]",47.0,47.0,47.0,exact
4,26411,"[11.0, cm]",151.0,F+,DF,DF,,DF,"[4.0, cm]",19.0,0.0,0.0,overlap


In [94]:
# Add columns for grain form and hardness group combinations

geldsetzer_info_df["geldsetzer_group"] = geldsetzer_info_df["hand_hardness"] + " / " + geldsetzer_info_df["geldsetzer_grain_form"]
geldsetzer_info_df.head()


Unnamed: 0,pit_id,depth_top,density,hand_hardness,grain_form,basic_grain_class_code,sub_grain_class_code,geldsetzer_grain_form,layer_depth_top,layer_thickness,density_thickness,thickness,match_type,geldsetzer_group
0,26539,"[13.5, cm]",180.0,4F,RGlr,RG,RGlr,RG,"[1.0, cm]",20.5,0.0,0.0,overlap,4F / RG
1,26539,"[43.5, cm]",250.0,1F-,RGlr,RG,RGlr,RG,"[25.5, cm]",27.0,0.0,0.0,overlap,1F- / RG
2,26539,"[168.5, cm]",280.0,1F+,RGlr,RG,RGlr,RG,"[160.5, cm]",16.0,0.0,0.0,overlap,1F+ / RG
3,26319,"[0.0, cm]",250.0,1F,DFdc,DF,DFdc,DF,"[0.0, cm]",47.0,47.0,47.0,exact,1F / DF
4,26411,"[11.0, cm]",151.0,F+,DF,DF,,DF,"[4.0, cm]",19.0,0.0,0.0,overlap,F+ / DF


In [95]:
# Create a comprehensive density statistics dataframe by geldsetzer_group
density_stats_df = geldsetzer_info_df.dropna(subset=['density', 'geldsetzer_group']).groupby('geldsetzer_group')['density'].agg([
    'count',
    'mean', 
    'median',
    'std',
    'min',
    'max',
    lambda x: x.quantile(0.25),  # Q1
    lambda x: x.quantile(0.75),  # Q3
]).round(2)

# Rename the lambda columns to more readable names
density_stats_df.columns = ['count', 'mean', 'median', 'std', 'min', 'max', 'Q1', 'Q3']

# Add interquartile range
density_stats_df['IQR'] = (density_stats_df['Q3'] - density_stats_df['Q1']).round(2)

# Get unique mappings of geldsetzer_group to hardness_group and geldsetzer_grain_form from the original dataframe
group_mappings = geldsetzer_info_df[['geldsetzer_group', 'hand_hardness', 'geldsetzer_grain_form']].drop_duplicates(subset=['geldsetzer_group'])

# Merge these mappings with the density_stats_df
density_stats_df = density_stats_df.merge(group_mappings, on='geldsetzer_group', how='left')

# Add geldsetzer_density to stats df - apply function row by row
density_stats_df['geldsetzer_density'] = density_stats_df.apply(
    lambda row: get_density(row['hand_hardness'], row['geldsetzer_grain_form']), 
    axis=1
)

# Add diff column
density_stats_df['diff'] = density_stats_df['geldsetzer_density'] - density_stats_df['mean']
density_stats_df['diff_percent'] = density_stats_df['diff'] / density_stats_df['mean'] * 100

# Sort by count (most observations first)
density_stats_df = density_stats_df.sort_values('count', ascending=False)

# Reset index to make geldsetzer_group a column
density_stats_df = density_stats_df.reset_index()

# Display the dataframe
print("Density Statistics by Geldsetzer Group:")
print("="*50)
display(density_stats_df)
density_stats_df.to_csv('density_stats_df.csv')

# Show summary information
print(f"\nSummary:")
print(f"Total groups: {len(density_stats_df)}")
print(f"Total observations: {density_stats_df['count'].sum():,}")
print(f"Groups with >100 observations: {(density_stats_df['count'] > 100).sum()}")
print(f"Groups with >50 observations: {(density_stats_df['count'] > 50).sum()}")


Density Statistics by Geldsetzer Group:


Unnamed: 0,index,geldsetzer_group,count,mean,median,std,min,max,Q1,Q3,IQR,hand_hardness,geldsetzer_grain_form,geldsetzer_density,diff,diff_percent
0,2,1F / FC,1702,301.77,300.0,98.30,0.00,949.71,250.00,350.00,100.00,1F,FC,250.0,-51.77,-17.155450
1,7,1F / RG,1576,295.60,290.0,116.59,0.00,934.00,228.60,376.00,147.40,1F,RG,202.0,-93.60,-31.664411
2,112,P / RG,1548,336.28,350.0,110.36,0.00,720.00,280.00,412.00,132.00,P,RG,273.0,-63.28,-18.817652
3,30,4F / FC,1367,284.04,286.0,107.99,0.00,918.00,230.00,331.96,101.96,4F,FC,204.0,-80.04,-28.179130
4,109,P / MF,1244,350.02,340.0,149.14,0.05,973.43,250.00,450.00,200.00,P,MF,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,99,K+ / SH,2,502.50,502.5,17.68,490.00,515.00,496.25,508.75,12.50,K+,SH,,,
126,127,P- / PPgp,2,330.00,330.0,28.28,310.00,350.00,320.00,340.00,20.00,P-,PPgp,219.0,-111.00,-33.636364
127,73,F- / IF,2,110.00,110.0,127.28,20.00,200.00,65.00,155.00,90.00,F-,IF,,,
128,80,I / PP,1,370.00,370.0,,370.00,370.00,370.00,370.00,0.00,I,PP,,,



Summary:
Total groups: 130
Total observations: 23,795
Groups with >100 observations: 43
Groups with >50 observations: 56
