In [47]:
# Import Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from snowpylot import caaml_parser
from geldsetzer_utils import get_density, convert_grain_form


In [48]:
def parse_pits(folder_path):
    """
    Function to parse CAAML files in the specified folder
    """
    files = [
        f for f in os.listdir(folder_path) if f.endswith(".xml")
    ]  # List of all .xml files in the folder

    pits_list = []

    for file in files:  # iterate through each file in the folder
        file_path = os.path.join(folder_path, file)  # create the file path
        try:
            pit = caaml_parser(file_path)  # parse the file
            pits_list.append(pit)
        except Exception as e:
            print(f"Error parsing {file}: {str(e)}")

    return pits_list


In [49]:
# Find all directories in new_data (these are the extracted folders)
new_data_dir = "../snowpits/new_data"
all_pits = []

# Get all items in new_data directory
items_in_new_data = os.listdir(new_data_dir)

# Filter to only directories (ignore tar.gz files)
extracted_directories = [
    item for item in items_in_new_data 
    if os.path.isdir(os.path.join(new_data_dir, item)) and not item.startswith('.')
]

print(f"Found {len(extracted_directories)} extracted directories to process:")
for directory in sorted(extracted_directories):
    print(f"  📁 {directory}")

print("\nParsing snowpit files from each directory...")

# Parse pits from each extracted directory
total_pits_parsed = 0
for directory in extracted_directories:
    directory_path = os.path.join(new_data_dir, directory)
    print(f"\nProcessing: {directory}")
    
    try:
        # Parse all pits in this directory
        pits_from_directory = parse_pits(directory_path)
        directory_pit_count = len(pits_from_directory)
        
        # Add to the master list
        all_pits.extend(pits_from_directory)
        total_pits_parsed += directory_pit_count
        
        print(f"  ✅ Parsed {directory_pit_count} snowpits")
        
    except Exception as e:
        print(f"  ❌ Error processing directory {directory}: {str(e)}")

print("\n🎉 Parsing complete!")
print(f"Total directories processed: {len(extracted_directories)}")
print(f"Total snowpits parsed: {total_pits_parsed}")
print(f"Length of all_pits list: {len(all_pits)}")

# Display some basic info about the parsed data
if all_pits:
    print("\nSample pit information:")
    sample_pit = all_pits[0]
    print(f"  First pit ID: {sample_pit.core_info.pit_id}")
    print(f"  Date: {sample_pit.core_info.date}")
    print(f"  Number of layers: {len(sample_pit.snow_profile.layers)}")
    
    # Show distribution of pits by some basic metadata if available
    pit_ids = [pit.core_info.pit_id for pit in all_pits if pit.core_info.pit_id]
    print(f"  Unique pit IDs: {len(set(pit_ids))}")
else:
    print("⚠️  No pits were successfully parsed")


Found 45 extracted directories to process:
  📁 sverplanck-snowpits-25-07-21-07-03
  📁 sverplanck-snowpits-25-07-21-07-04
  📁 sverplanck-snowpits-25-07-21-07-05
  📁 sverplanck-snowpits-25-07-21-07-06
  📁 sverplanck-snowpits-25-07-21-07-07
  📁 sverplanck-snowpits-25-07-21-07-10
  📁 sverplanck-snowpits-25-07-21-07-11
  📁 sverplanck-snowpits-25-07-21-07-12
  📁 sverplanck-snowpits-25-07-21-07-13
  📁 sverplanck-snowpits-25-07-21-07-14
  📁 sverplanck-snowpits-25-07-21-07-15
  📁 sverplanck-snowpits-25-07-21-07-16
  📁 sverplanck-snowpits-25-07-21-07-17
  📁 sverplanck-snowpits-25-07-21-07-18
  📁 sverplanck-snowpits-25-07-21-07-19
  📁 sverplanck-snowpits-25-07-21-07-20
  📁 sverplanck-snowpits-25-07-21-07-21
  📁 sverplanck-snowpits-25-07-21-07-22
  📁 sverplanck-snowpits-25-07-21-07-23
  📁 sverplanck-snowpits-25-07-21-07-24
  📁 sverplanck-snowpits-25-07-21-07-25
  📁 sverplanck-snowpits-25-07-21-07-26
  📁 sverplanck-snowpits-25-07-21-07-27
  📁 sverplanck-snowpits-25-07-21-07-28
  📁 sverplanck-snowpi

In [None]:
# Data Summary

pit_info = []
layer_info = []
density_info = []
geldsetzer_info = []

for pit in all_pits:
    pit_dict = {
        'pit_id': pit.core_info.pit_id,
        'layer_count': len(pit.snow_profile.layers),
        'density_count': len(pit.snow_profile.density_profile)
    }
    pit_info.append(pit_dict)

    for density in pit.snow_profile.density_profile:
        density_dict = {
            'pit_id': pit.core_info.pit_id,
            'depth_top': density.depth_top,
            'thickness': density.thickness,
            'density': density.density,
        }
        density_info.append(density_dict)

    for layer in pit.snow_profile.layers:
        layer_dict = {
            'pit_id': pit.core_info.pit_id,
            'depth_top': layer.depth_top,
            'thickness': layer.thickness,
            'hand_hardness': layer.hardness,
            'wetness': layer.wetness,
            'layer_of_concern': layer.layer_of_concern,
            'grain_form_primary': layer.grain_form_primary,
            'grain_form_secondary': layer.grain_form_secondary,
        }
        layer_info.append(layer_dict)

    for density in pit.snow_profile.density_profile:
        # NOTE: Update to capture density measurements where density layer falls within a profile layer, but the depth_top is not the same
        for layer in pit.snow_profile.layers:
            if density.depth_top == layer.depth_top and layer.grain_form_primary and layer.hardness:
                geldsetzer_dict = {
                    'pit_id': pit.core_info.pit_id,
                    'depth_top': density.depth_top,
                    'density': density.density[0], # in kg/m^3
                    'hand_hardness': layer.hardness,
                    'grain_form': layer.grain_form_primary.grain_form,
                    'basic_grain_class_code': layer.grain_form_primary.basic_grain_class_code,
                    'sub_grain_class_code': layer.grain_form_primary.sub_grain_class_code,
                    'geldsetzer_grain_form': convert_grain_form(layer),
                }
                geldsetzer_info.append(geldsetzer_dict)
    

In [51]:
# Summary stats for pit_info
pit_info_df = pd.DataFrame(pit_info)
total_pits = len(pit_info_df)
print(f"Total pits: {total_pits:,}")

# Calculate number of pits with density measurements
pits_with_density = (pit_info_df['density_count'] > 0).sum()
percentage_with_density = (pits_with_density / total_pits) * 100

print(f"Pits with density measurements: {pits_with_density:,}")
print(f"Percentage of pits with density data: {percentage_with_density:.1f}%")

# Summary stats for density_info
density_info_df = pd.DataFrame(density_info)
print("density_info_df")
for column in density_info_df.columns:
        # Count non-null values
        non_null_count = density_info_df[column].notna().sum()
        print(f"Number of values for {column:25}: {non_null_count:8,}")

# Summary stats for layer_info
layer_info_df = pd.DataFrame(layer_info)
print("layer_info_df")
for column in layer_info_df.columns:
        # Count non-null values
        non_null_count = layer_info_df[column].notna().sum()
        print(f"Number of values for {column:25}: {non_null_count:8,}")


Total pits: 34,010
Pits with density measurements: 2,007
Percentage of pits with density data: 5.9%
density_info_df
Number of values for pit_id                   :   14,538
Number of values for depth_top                :   14,538
Number of values for thickness                :    7,021
Number of values for density                  :   14,538
layer_info_df
Number of values for pit_id                   :  254,387
Number of values for depth_top                :  254,387
Number of values for thickness                :  254,387
Number of values for hand_hardness            :  230,434
Number of values for wetness                  :   60,228
Number of values for layer_of_concern         :   24,628
Number of values for grain_form_primary       :  208,388
Number of values for grain_form_secondary     :   28,436


In [52]:
# Summary stats for geldsetzer_info
geldsetzer_info_df = pd.DataFrame(geldsetzer_info)
print("geldsetzer_info_df")
for column in geldsetzer_info_df.columns:
        # Count non-null values
        non_null_count = geldsetzer_info_df[column].notna().sum()
        print(f"Number of values for {column:25}: {non_null_count:8,}")

geldsetzer_info_df.head()


geldsetzer_info_df
Number of values for pit_id                   :    7,186
Number of values for depth_top                :    7,186
Number of values for density                  :    7,186
Number of values for hand_hardness            :    7,186
Number of values for grain_form               :    7,114
Number of values for basic_grain_class_code   :    7,114
Number of values for sub_grain_class_code     :    3,907
Number of values for geldsetzer_grain_form    :    7,114


Unnamed: 0,pit_id,depth_top,density,hand_hardness,grain_form,basic_grain_class_code,sub_grain_class_code,geldsetzer_grain_form
0,26319,"[0.0, cm]",250.0,1F,DFdc,DF,DFdc,DF
1,26411,"[31.0, cm]",270.0,F,FCxr,FC,FCxr,FC
2,26808,"[0.0, cm]",120.0,F,DFdc,DF,DFdc,DF
3,26808,"[8.0, cm]",440.0,K,MFpc,MF,MFpc,MF
4,26808,"[34.0, cm]",420.0,P-,MFpc,MF,MFpc,MF


In [53]:
# Map Hardness to group

hardness_to_group = {
    "F-": "F",
    "F": "F",
    "F+": "F",
    "4F-": "4F",
    "4F": "4F",
    "4F+": "4F",
    "1F-": "1F",
    "1F": "1F",
    "1F+": "1F",
    "P-": "P",
    "P": "P",
    "P+": "P",
    "K-": "K",
    "K": "K",
    "K+": "K",
    "I-": "I",
    "I": "I",
    "I+": "I",
}

# Add the new column using the mapping
geldsetzer_info_df["hardness_group"] = geldsetzer_info_df["hand_hardness"].map(hardness_to_group)
geldsetzer_info_df.head()


Unnamed: 0,pit_id,depth_top,density,hand_hardness,grain_form,basic_grain_class_code,sub_grain_class_code,geldsetzer_grain_form,hardness_group
0,26319,"[0.0, cm]",250.0,1F,DFdc,DF,DFdc,DF,1F
1,26411,"[31.0, cm]",270.0,F,FCxr,FC,FCxr,FC,F
2,26808,"[0.0, cm]",120.0,F,DFdc,DF,DFdc,DF,F
3,26808,"[8.0, cm]",440.0,K,MFpc,MF,MFpc,MF,K
4,26808,"[34.0, cm]",420.0,P-,MFpc,MF,MFpc,MF,P


In [54]:
# Add columns for grain form and hardness group combinations

geldsetzer_info_df["geldsetzer_group"] = geldsetzer_info_df["hand_hardness"] + " / " + geldsetzer_info_df["geldsetzer_grain_form"]
geldsetzer_info_df.head()


Unnamed: 0,pit_id,depth_top,density,hand_hardness,grain_form,basic_grain_class_code,sub_grain_class_code,geldsetzer_grain_form,hardness_group,geldsetzer_group
0,26319,"[0.0, cm]",250.0,1F,DFdc,DF,DFdc,DF,1F,1F / DF
1,26411,"[31.0, cm]",270.0,F,FCxr,FC,FCxr,FC,F,F / FC
2,26808,"[0.0, cm]",120.0,F,DFdc,DF,DFdc,DF,F,F / DF
3,26808,"[8.0, cm]",440.0,K,MFpc,MF,MFpc,MF,K,K / MF
4,26808,"[34.0, cm]",420.0,P-,MFpc,MF,MFpc,MF,P,P- / MF


In [55]:
# Create a comprehensive density statistics dataframe by geldsetzer_group
density_stats_df = geldsetzer_info_df.dropna(subset=['density', 'geldsetzer_group']).groupby('geldsetzer_group')['density'].agg([
    'count',
    'mean', 
    'median',
    'std',
    'min',
    'max',
    lambda x: x.quantile(0.25),  # Q1
    lambda x: x.quantile(0.75),  # Q3
]).round(2)

# Rename the lambda columns to more readable names
density_stats_df.columns = ['count', 'mean', 'median', 'std', 'min', 'max', 'Q1', 'Q3']

# Add interquartile range
density_stats_df['IQR'] = (density_stats_df['Q3'] - density_stats_df['Q1']).round(2)

# Get unique mappings of geldsetzer_group to hardness_group and geldsetzer_grain_form from the original dataframe
group_mappings = geldsetzer_info_df[['geldsetzer_group', 'hand_hardness', 'geldsetzer_grain_form']].drop_duplicates(subset=['geldsetzer_group'])

# Merge these mappings with the density_stats_df
density_stats_df = density_stats_df.merge(group_mappings, on='geldsetzer_group', how='left')

# Add geldsetzer_density to stats df - apply function row by row
density_stats_df['geldsetzer_density'] = density_stats_df.apply(
    lambda row: get_density(row['hand_hardness'], row['geldsetzer_grain_form']), 
    axis=1
)

# Add diff column
density_stats_df['diff'] = density_stats_df['geldsetzer_density'] - density_stats_df['mean']
density_stats_df['diff_percent'] = density_stats_df['diff'] / density_stats_df['mean'] * 100

# Sort by count (most observations first)
density_stats_df = density_stats_df.sort_values('count', ascending=False)

# Reset index to make geldsetzer_group a column
density_stats_df = density_stats_df.reset_index()

# Display the dataframe
print("Density Statistics by Geldsetzer Group:")
print("="*50)
display(density_stats_df)
density_stats_df.to_csv('density_stats_df.csv')

# Show summary information
print(f"\nSummary:")
print(f"Total groups: {len(density_stats_df)}")
print(f"Total observations: {density_stats_df['count'].sum():,}")
print(f"Groups with >100 observations: {(density_stats_df['count'] > 100).sum()}")
print(f"Groups with >50 observations: {(density_stats_df['count'] > 50).sum()}")


Density Statistics by Geldsetzer Group:


Unnamed: 0,index,geldsetzer_group,count,mean,median,std,min,max,Q1,Q3,IQR,hand_hardness,geldsetzer_grain_form,geldsetzer_density,diff,diff_percent
0,2,1F / FC,521,298.20,288.0,104.40,0.0,918.0,250.00,342.86,92.86,1F,FC,250.0,-48.20,-16.163649
1,102,P / RG,454,329.23,330.0,108.02,0.0,650.0,280.00,396.50,116.50,P,RG,273.0,-56.23,-17.079246
2,7,1F / RG,453,281.94,273.0,112.90,0.0,596.0,220.00,350.00,130.00,1F,RG,202.0,-79.94,-28.353550
3,25,4F / FC,376,267.14,250.0,112.16,0.0,897.0,208.75,328.50,119.75,4F,FC,204.0,-63.14,-23.635547
4,97,P / FC,348,339.40,330.0,78.55,0.0,600.0,300.00,380.00,80.00,P,FC,296.0,-43.40,-12.787272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,26,4F / IF,1,269.00,269.0,,269.0,269.0,269.00,269.00,0.00,4F,IF,,,
114,35,4F+ / IF,1,490.00,490.0,,490.0,490.0,490.00,490.00,0.00,4F+,IF,,,
115,87,K+ / IF,1,370.00,370.0,,370.0,370.0,370.00,370.00,0.00,K+,IF,,,
116,45,4F- / PPgp,1,210.00,210.0,,210.0,210.0,210.00,210.00,0.00,4F-,PPgp,145.0,-65.00,-30.952381



Summary:
Total groups: 118
Total observations: 7,114
Groups with >100 observations: 22
Groups with >50 observations: 33
