In [2]:
# Import Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from snowpylot import caaml_parser


In [5]:
def parse_pits(folder_path):
    """
    Parse every CAAML ``.xml`` entry found directly inside ``folder_path``.

    Entries are visited in sorted filename order so the returned list is
    reproducible from run to run; ``os.listdir`` on its own returns names in
    arbitrary order.

    Parameters
    ----------
    folder_path : str
        Directory to scan. Only its immediate entries are considered.

    Returns
    -------
    list
        The value returned by ``caaml_parser`` for each entry that parsed
        without raising, in sorted filename order. Empty when the folder
        holds no ``.xml`` entries or none of them parsed.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``folder_path`` cannot be listed
        (for example, it does not exist).
    """
    # Sort up front: downstream cells index into the combined list (e.g. the
    # "first pit" sample), so the order must not depend on the filesystem.
    xml_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".xml"))

    pits_list = []

    for file in xml_files:
        file_path = os.path.join(folder_path, file)
        try:
            pits_list.append(caaml_parser(file_path))
        except Exception as e:
            # Deliberate best-effort: one unreadable file is reported and
            # skipped instead of aborting the whole folder.
            print(f"Error parsing {file}: {e}")

    return pits_list


In [6]:
# Locate the extracted snowpit folders under new_data and parse every pit in them.
new_data_dir = "../snowpits/new_data"
all_pits = []  # rebuilt from scratch on each run, so re-executing the cell never double-counts

# Get all items in new_data directory
items_in_new_data = os.listdir(new_data_dir)

# Keep only visible directories (drops non-directory entries such as the
# tar.gz archives, and anything whose name starts with a dot).
# Sorted once here so the listing printed below and the parsing loop share the
# same reproducible order; os.listdir returns names in arbitrary order.
extracted_directories = sorted(
    item
    for item in items_in_new_data
    if os.path.isdir(os.path.join(new_data_dir, item)) and not item.startswith(".")
)

print(f"Found {len(extracted_directories)} extracted directories to process:")
for directory in extracted_directories:
    print(f"  📁 {directory}")

print("\nParsing snowpit files from each directory...")

# Parse pits from each extracted directory, in the same order as listed above
total_pits_parsed = 0
for directory in extracted_directories:
    directory_path = os.path.join(new_data_dir, directory)
    print(f"\nProcessing: {directory}")

    try:
        # Parse all pits in this directory
        pits_from_directory = parse_pits(directory_path)
        directory_pit_count = len(pits_from_directory)

        # Add to the master list
        all_pits.extend(pits_from_directory)
        total_pits_parsed += directory_pit_count

        print(f"  ✅ Parsed {directory_pit_count} snowpits")

    except Exception as e:
        # parse_pits already reports individual file failures itself; reaching
        # this handler means the directory as a whole could not be processed.
        # Report it and carry on with the remaining directories.
        print(f"  ❌ Error processing directory {directory}: {e}")

print("\n🎉 Parsing complete!")
print(f"Total directories processed: {len(extracted_directories)}")
print(f"Total snowpits parsed: {total_pits_parsed}")
# Printed next to the running total as a cross-check; the two should agree.
print(f"Length of all_pits list: {len(all_pits)}")

# Display some basic info about the parsed data
if all_pits:
    print("\nSample pit information:")
    sample_pit = all_pits[0]
    print(f"  First pit ID: {sample_pit.core_info.pit_id}")
    print(f"  Date: {sample_pit.core_info.date}")
    print(f"  Number of layers: {len(sample_pit.snow_profile.layers)}")

    # Count distinct pit IDs, ignoring pits whose ID is missing/empty
    pit_ids = [pit.core_info.pit_id for pit in all_pits if pit.core_info.pit_id]
    print(f"  Unique pit IDs: {len(set(pit_ids))}")
else:
    print("⚠️  No pits were successfully parsed")


Found 45 extracted directories to process:
  📁 sverplanck-snowpits-25-07-21-07-03
  📁 sverplanck-snowpits-25-07-21-07-04
  📁 sverplanck-snowpits-25-07-21-07-05
  📁 sverplanck-snowpits-25-07-21-07-06
  📁 sverplanck-snowpits-25-07-21-07-07
  📁 sverplanck-snowpits-25-07-21-07-10
  📁 sverplanck-snowpits-25-07-21-07-11
  📁 sverplanck-snowpits-25-07-21-07-12
  📁 sverplanck-snowpits-25-07-21-07-13
  📁 sverplanck-snowpits-25-07-21-07-14
  📁 sverplanck-snowpits-25-07-21-07-15
  📁 sverplanck-snowpits-25-07-21-07-16
  📁 sverplanck-snowpits-25-07-21-07-17
  📁 sverplanck-snowpits-25-07-21-07-18
  📁 sverplanck-snowpits-25-07-21-07-19
  📁 sverplanck-snowpits-25-07-21-07-20
  📁 sverplanck-snowpits-25-07-21-07-21
  📁 sverplanck-snowpits-25-07-21-07-22
  📁 sverplanck-snowpits-25-07-21-07-23
  📁 sverplanck-snowpits-25-07-21-07-24
  📁 sverplanck-snowpits-25-07-21-07-25
  📁 sverplanck-snowpits-25-07-21-07-26
  📁 sverplanck-snowpits-25-07-21-07-27
  📁 sverplanck-snowpits-25-07-21-07-28
  📁 sverplanck-snowpi