# Summary Statistics for Whumpf Dataset

Whumpf pits exported on 2/19/25

* Number of pits
* Number of layers
* Number of layers with Grain Form
* Number of layers with Grain Size
* Number of pits with Density Profile
* Number of density measurements
* Number of pits with Temp Profile
* Number of temp measurements
* Number of ECT Results
* Number of CT Results
* Number of PST Results
* Number of RBT Results
* Whumpf data summary

# To do

- map grain forms to match paper
- Reference Philipps code to plot
- programmatically query the SnowPilot database (snowPilotQueryEngine)
- look at PEP 8 conformance
- install and use Ruff formatter
- GitHub Actions using twine (ref weac .toml)
- bump-my-version (python package GitHub release workflow)
- Sphinx documentation


In [2]:
# Import libraries
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
from snowpylot.caaml_parser import caaml_parser

ModuleNotFoundError: No module named 'snowpylot'

In [2]:
# Locate the CAAML (.xml) files to analyse
folder_path = "snowpits/wumph_pits"  # whumpf data set pits exported on 2/19/25

# os.listdir returns entries in arbitrary order, so sort the names: this keeps
# the row order of any per-file summary, and whichever file is picked as
# caaml_files[0], reproducible across runs and machines.
caaml_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".xml"))

In [3]:
# Parse every CAAML file and build one summary row per pit.
#   pits    -> parsed pit objects, in the same order as the rows of df
#   pitList -> one dict of summary values per pit
#   df      -> pitList as a DataFrame (one row per pit)
pits = []
pitList = []

for file in caaml_files:
    file_path = os.path.join(folder_path, file)
    pit = caaml_parser(file_path)

    # Bind the sub-objects once instead of repeating the attribute chains
    profile = pit.snowProfile
    tests = pit.stabilityTests
    whumpf = pit.whumpfData

    # Count layers that have a primary grain form, and of those, the ones
    # that also carry an average grain size.
    numPrimaryGrainForm = 0
    numPrimaryGrainSize = 0
    for layer in profile.layers:
        if layer.grainFormPrimary is None:
            continue
        numPrimaryGrainForm += 1
        if layer.grainFormPrimary.grainSizeAvg is not None:
            numPrimaryGrainSize += 1

    # A missing temperature/density profile (None) counts as 0 measurements
    tempProfile = profile.tempProfile
    densityProfile = profile.densityProfile

    pitDict = {
        "PitID": pit.coreInfo.pitID,
        "Operation Name": pit.coreInfo.user.operationName,
        "SnowPilot Username": pit.coreInfo.user.username,
        "num Layers": len(profile.layers),
        "num Layers wPrimary Grain Form": numPrimaryGrainForm,
        "num Layers wPrimary Grain Size": numPrimaryGrainSize,
        "tempMeasurements": len(tempProfile) if tempProfile is not None else 0,
        "densityMeasurements": (
            len(densityProfile) if densityProfile is not None else 0
        ),
        "ECT_qty": len(tests.ECT),
        "CT_qty": len(tests.CT),
        "PST_qty": len(tests.PST),
        "RBT_qty": len(tests.RBlock),
        "whumpfCracking": whumpf.whumpfCracking,
        "whumpfNoCracking": whumpf.whumpfNoCracking,
        "crackingNoWhumpf": whumpf.crackingNoWhumpf,
        "whumpfNearPit": whumpf.whumpfNearPit,
        "whumpfDepthWeakLayer": whumpf.whumpfDepthWeakLayer,
        "whumpfTriggeredRemoteAva": whumpf.whumpfTriggeredRemoteAva,
        "whumpfSize": whumpf.whumpfSize,
    }
    pitList.append(pitDict)
    pits.append(pit)

df = pd.DataFrame(pitList)

# Optional export of the per-pit summary table:
# df.to_csv('wumph-dataset-summary.csv', index=False)

In [None]:
# Show the per-pit summary table as text, without the index column and
# truncated to at most 10 rows
summary_table = df.to_string(index=False, max_rows=10)
print(summary_table)

In [None]:
# Aggregate the per-pit table (df) into dataset-wide totals and print them.
# NOTE(review): the whumpf flag columns are compared against the string
# "true" -- this assumes the parser stores the flags as text; confirm.
summary_info = {
    "Pits": int(df["PitID"].count()),
    "Layers": int(df["num Layers"].sum()),
    "Layers wPrimary Grain Form": int(df["num Layers wPrimary Grain Form"].sum()),
    "Layers wPrimary Grain Size": int(df["num Layers wPrimary Grain Size"].sum()),
    "Pits with Density Info": int((df["densityMeasurements"] != 0).sum()),
    "Density Measurements": int(df["densityMeasurements"].sum()),
    "Pits with Temp Info": int((df["tempMeasurements"] != 0).sum()),
    "Temp Measurements": int(df["tempMeasurements"].sum()),
    "ECT Results": int(df["ECT_qty"].sum()),
    "CT Results": int(df["CT_qty"].sum()),
    "PST Results": int(df["PST_qty"].sum()),
    "RBT Results": int(df["RBT_qty"].sum()),
    "Pits where whumpfCracking = true": int((df["whumpfCracking"] == "true").sum()),
    "Pits where whumpfNoCracking = true": int((df["whumpfNoCracking"] == "true").sum()),
    "Pits where crackingNoWhumpf = true": int((df["crackingNoWhumpf"] == "true").sum()),
    "Pits where whumpfNearPit = true": int((df["whumpfNearPit"] == "true").sum()),
    "Pits where whumpfDepthWeakLayer = true": int(
        (df["whumpfDepthWeakLayer"] == "true").sum()
    ),
    "Pits where whumpfTriggeredRemoteAva = true": int(
        (df["whumpfTriggeredRemoteAva"] == "true").sum()
    ),
    # An element-wise `!= None` comparison is True for every row in pandas
    # (missing values included), so it would always report the total number
    # of pits. notna() counts only the rows that actually hold a value.
    # NOTE(review): an empty string would still count as "specified" --
    # confirm how the parser represents a missing whumpfSize.
    "Pits where whumpfSize is specified": int(df["whumpfSize"].notna().sum()),
}

for key, value in summary_info.items():
    print(f"{key}: {value}")

In [None]:
# Number of pits per operation; computed once and reused for both the plot
# and the printed table. value_counts() skips missing operation names by
# default, so the total below can be smaller than the number of pits.
operation_counts = df["Operation Name"].value_counts()

# Bar chart of pits by Operation
plt.figure(figsize=(12, 6))
operation_counts.plot(kind="bar")
plt.title("Distribution of Snow Pits by Operation")
plt.xlabel("Operation Name")
plt.ylabel("Number of Pits")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()  # Adjusts layout to prevent label cutoff
plt.show()

# Summary table
print("Summary of Snow Pits by Operation:")
print("----------------------------------")
print(f"Total number of Operations: {len(operation_counts)}")
print(f"Total number of Professional Pits: {operation_counts.sum()}")
print("\nBreakdown by Operation:")
# Name the column explicitly: the Series returned by value_counts() is called
# "Operation Name" on pandas < 2.0 but "count" on pandas >= 2.0, so renaming
# by the old column label silently does nothing on current pandas.
print(operation_counts.to_frame(name="Number of Pits"))

In [None]:
# Number of pits per SnowPilot user; computed once and reused for both the
# plot and the printed table. The variable keeps its original name
# (operation_counts) in case later cells read it.
operation_counts = df["SnowPilot Username"].value_counts()

# Bar chart of pits by User
plt.figure(figsize=(12, 6))
operation_counts.plot(kind="bar")
plt.title("Distribution of Snow Pits by User")
plt.xlabel("SnowPilot Username")
plt.ylabel("Number of Pits")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()  # Adjusts layout to prevent label cutoff
plt.show()

# Summary table
print("Summary of Snow Pits by Individual:")
print("----------------------------------")
print(f"Total number of Individual Users: {len(operation_counts)}")
print(f"Total number of pits: {operation_counts.sum()}")
print("\nBreakdown by Individual:")
# Name the column explicitly instead of renaming afterwards: the previous
# rename key ("SnowPilot Username:" with a stray colon) never matched, and the
# Series name produced by value_counts() differs between pandas versions.
print(operation_counts.to_frame(name="Number of Pits"))

In [None]:
# Parse the first CAAML file in the list again and print it as an example pit.
# Raises IndexError if no CAAML files were found in folder_path.
pit1 = caaml_parser(os.path.join(folder_path, caaml_files[0]))
print(pit1)

In [None]:
# Print the coreInfo section of the example pit
print(pit1.coreInfo)

In [None]:
# Print the stabilityTests section of the example pit
print(pit1.stabilityTests)