In [1]:
# Libraries used throughout the notebook.
import pandas as pd
import numpy as np
import altair as alt
# Select Altair's 'json' data transformer and its 'jupyterlab' renderer.
alt.data_transformers.enable('json')
alt.renderers.enable('jupyterlab')

RendererRegistry.enable('jupyterlab')

# Open precipitation dataset

In [2]:
precip_file = "/data2/elilouis/sublimationofsnow/tilden_precip_data/kettle_ponds_precip.csv"

# Read the precipitation table and index it by its parsed 'date' column once,
# so both lookups below share the same date-indexed frame.
precip_by_date = pd.read_csv(precip_file)
precip_by_date['date'] = pd.to_datetime(precip_by_date['date'])
precip_by_date = precip_by_date.set_index('date')

# Accumulated precipitation recorded on the first day of the study window.
acc_precip_on_first_day = precip_by_date.loc['20221130'].acc_prec

# Keep only the study window and re-zero the accumulation at its first day.
precip_df = precip_by_date.loc['20221130':'20230510'].reset_index()
precip_df['acc_prec'] = precip_df['acc_prec'].sub(acc_precip_on_first_day)

FileNotFoundError: [Errno 2] No such file or directory: '/data2/elilouis/sublimationofsnow/tilden_precip_data/kettle_ponds_precip.csv'

# Open SOS Measurement Dataset

In [3]:
start_date = '20221130'
end_date = '20230509'


def _limit_to_dates(tidy_df, first_day, last_day):
    """Return the rows of ``tidy_df`` whose 'time' falls on first_day..last_day inclusive.

    The previous implementation label-sliced a 'time' index with date strings
    (``.loc[first_day:last_day]``). When that index is not sorted and the string
    bounds are not exact index entries, pandas 1.5 emits a FutureWarning and
    pandas >= 2.0 raises KeyError. A boolean mask selects the same rows, in the
    same order, on every pandas version.

    Parameters
    ----------
    tidy_df : pandas.DataFrame
        Frame with a datetime 'time' column.
        NOTE(review): assumes timezone-naive timestamps -- confirm.
    first_day, last_day : str
        Date strings parseable by ``pd.Timestamp``; the whole of ``last_day``
        is included, matching partial-string slicing.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with 'time' as the first column and a fresh integer index
        (the same layout ``set_index('time')...reset_index()`` produced before).
    """
    lower = pd.Timestamp(first_day)
    # Half-open upper bound: everything strictly before the day after last_day.
    upper = pd.Timestamp(last_day) + pd.Timedelta(days=1)
    in_window = ((tidy_df['time'] >= lower) & (tidy_df['time'] < upper)).to_numpy()
    return tidy_df.set_index('time').loc[in_window].reset_index()


# open files
tidy_df_5Min = pd.read_parquet('../sos/tidy_df_20221130_20230517_noplanar_fit.parquet')
tidy_df_30Min = pd.read_parquet('../sos/tidy_df_30Min_20221130_20230517_noplanar_fit.parquet')
# convert time column to datetime
tidy_df_5Min['time'] = pd.to_datetime(tidy_df_5Min['time'])
tidy_df_30Min['time'] = pd.to_datetime(tidy_df_30Min['time'])
# limit data to our dates of interest, based on continuous snow cover at Kettle Ponds
tidy_df_5Min = _limit_to_dates(tidy_df_5Min, start_date, end_date)
tidy_df_30Min = _limit_to_dates(tidy_df_30Min, start_date, end_date)

  tidy_df_5Min = tidy_df_5Min.set_index('time').loc[start_date:end_date].reset_index()


In [4]:
# quick way to get variable info if we want it 
# import xarray as xr
# ds = xr.open_dataset("/data2/elilouis/sublimationofsnow/sosnoqc/isfs_20221228.nc")
# ds['SWE_p2_c']

## Clean the data

### Step 1: remove all LH flux data points for which fewer than 90% of the 20 Hz samples in the 5-minute period are good (i.e. fewer than 5400 of 6000)
### Step 2: remove all remaining LH flux data points that lie more than 5 standard deviations from the mean of all LH flux measurements (g/m^2/s)

In [5]:
# Each entry pairs a latent heat flux variable with the sample-count variable
# used to quality-check it; entries are grouped by the tower suffix in the name.
ec_lhflux_and_counts_variables = [
    # tower "c"
    ('w_h2o__2m_c', 'counts_2m_c_1'),
    ('w_h2o__3m_c', 'counts_3m_c_1'),
    ('w_h2o__5m_c', 'counts_5m_c_1'),
    ('w_h2o__10m_c', 'counts_10m_c_1'),
    ('w_h2o__15m_c', 'counts_15m_c_1'),
    ('w_h2o__20m_c', 'counts_20m_c_1'),
    # tower "d"
    ('w_h2o__1m_d', 'counts_1m_d_1'),
    ('w_h2o__3m_d', 'counts_3m_d_1'),
    ('w_h2o__10m_d', 'counts_10m_d_1'),
    # tower "ue"
    ('w_h2o__1m_ue', 'counts_1m_ue_1'),
    ('w_h2o__3m_ue', 'counts_3m_ue_1'),
    ('w_h2o__10m_ue', 'counts_10m_ue_1'),
    # tower "uw"
    ('w_h2o__1m_uw', 'counts_1m_uw_1'),
    ('w_h2o__3m_uw', 'counts_3m_uw_1'),
    ('w_h2o__10m_uw', 'counts_10m_uw_1'),
]
# Tuple holding just the flux variable names, in the same order as the pairs.
ec_lhflux_variables = tuple(
    flux_name for flux_name, _counts_name in ec_lhflux_and_counts_variables
)

In [6]:
# Mean, standard deviation, minimum and maximum over every EC latent heat flux
# value in the 5-minute frame, evaluated before the cleaning cell below runs.
is_lhflux_row = tidy_df_5Min['variable'].isin(ec_lhflux_variables)
all_lhflux_measurements = tidy_df_5Min.loc[is_lhflux_row, 'value']
(
    all_lhflux_measurements.mean(),
    all_lhflux_measurements.std(),
    all_lhflux_measurements.min(),
    all_lhflux_measurements.max(),
)

(0.005835441437612255, 2.719713578112172, -123.26387023925781, 2131.2578125)

In [7]:
####################################################################################
# Clean the eddy-covariance latent heat flux data in two separate passes.
# (An earlier variant of this cell ran both passes inside a single loop and used a
# fixed |value| > 1 bound for the second pass; the code below replaces it.)
####################################################################################
# 20 Hz sampling over a 5-minute averaging period gives 6000 samples per data point.
samples_per_5min = 20 * 60 * 5
# Pass 1 keeps a flux value only if at least 90% of those samples were good (5400).
min_good_samples = samples_per_5min * 9 // 10
# Pass 2 removes values further than this many standard deviations from the mean.
n_stddev_outlier = 5

# Pass 1: blank out flux values whose matching counts value is below the threshold
# (or for which no counts value at that time meets it).
for flux_var, counts_var in ec_lhflux_and_counts_variables:
    counts_src = tidy_df_5Min[tidy_df_5Min.variable == counts_var]
    times_with_enough_good_samples = counts_src[counts_src.value >= min_good_samples].time
    tidy_df_5Min.loc[
        (~tidy_df_5Min['time'].isin(times_with_enough_good_samples)) &
        (tidy_df_5Min['variable'] == flux_var),
        'value'
    ] = np.nan

# Statistics of the surviving flux values, pooled over all flux variables. They are
# computed once, before pass 2, so every variable is judged against the same bounds.
all_lhflux_measurements = tidy_df_5Min[tidy_df_5Min.variable.isin(ec_lhflux_variables)].value
mean = all_lhflux_measurements.mean()
stddev = all_lhflux_measurements.std()
print(mean, stddev, all_lhflux_measurements.min(), all_lhflux_measurements.max())

# Loop-invariant outlier bounds.
upper_bound = mean + n_stddev_outlier*stddev
lower_bound = mean - n_stddev_outlier*stddev

# Pass 2: blank out flux values outside [lower_bound, upper_bound].
for flux_var, counts_var in ec_lhflux_and_counts_variables:
    variable_src = tidy_df_5Min[tidy_df_5Min.variable == flux_var]
    times_with_outofbounds_values = variable_src[
        (variable_src.value > upper_bound) |
        (variable_src.value < lower_bound)
    ].time
    tidy_df_5Min.loc[
        (tidy_df_5Min['time'].isin(times_with_outofbounds_values)) &
        (tidy_df_5Min['variable'] == flux_var),
        'value'
    ] = np.nan

0.005999783807095677 2.7375154019082126 -28.619958877563477 2131.2578125


In [8]:
# Re-compute the pooled flux statistics now that both cleaning passes have run.
is_lhflux_row = tidy_df_5Min['variable'].isin(ec_lhflux_variables)
all_lhflux_measurements = tidy_df_5Min.loc[is_lhflux_row, 'value']
print(
    all_lhflux_measurements.mean(),
    all_lhflux_measurements.std(),
    all_lhflux_measurements.min(),
    all_lhflux_measurements.max(),
)

0.002559840776554578 0.018658400299491427 -8.453908920288086 4.933438777923584


# Open Model Ensemble Dataset

In [9]:
def _config_z0(config):
    """Last space-separated token of the config string, as a float."""
    return float(config.split(' ')[-1])


def _config_e_sat_curve(config):
    """'metpy' when the config string mentions it, otherwise 'alduchov'."""
    return 'metpy' if 'metpy' in config else 'alduchov'


def _config_surface_measurement(config):
    """Third-from-last space-separated token of the config string."""
    return config.split(' ')[-3]


def _config_scheme(config):
    """'andreas' when the config contains 'andreas lengths', otherwise 'yang'."""
    return 'andreas' if 'andreas lengths' in config else 'yang'


def _config_most_config(config):
    """Config string with its last three space-separated tokens removed."""
    return ' '.join(config.split(' ')[:-3])


model_df = pd.read_parquet("model_results.parquet")

# The 'config' column packs several pieces of information into one string;
# unpack each piece into its own descriptive column (insertion order is kept).
_config_parsers = {
    'z0': _config_z0,
    'e_sat_curve': _config_e_sat_curve,
    'surface_measurement': _config_surface_measurement,
    'scheme': _config_scheme,
    'most_config': _config_most_config,
}
for _column_name, _parser in _config_parsers.items():
    model_df[_column_name] = model_df['config'].apply(_parser)

# The scalar roughness length parameterization is already captured by 'scheme',
# so strip it from 'most_config'.
model_df['most_config'] = model_df['most_config'].str.replace(' andreas lengths', '')
model_df.head()

Unnamed: 0,time,config,latent heat flux,sensible heat flux,zeta,latent heat conductance,sensible heat conductance,z0,e_sat_curve,surface_measurement,scheme,most_config
0,2022-11-29 17:00:00,Standard Tsurf_c e_sat_metpy 1e-05,0.003384,0.002263,,0.00349,0.00349,1e-05,metpy,Tsurf_c,yang,Standard
1,2022-11-29 17:30:00,Standard Tsurf_c e_sat_metpy 1e-05,0.005943,0.000919,,0.00469,0.00469,1e-05,metpy,Tsurf_c,yang,Standard
2,2022-11-29 18:00:00,Standard Tsurf_c e_sat_metpy 1e-05,0.00447,-0.001354,,0.003773,0.003773,1e-05,metpy,Tsurf_c,yang,Standard
3,2022-11-29 18:30:00,Standard Tsurf_c e_sat_metpy 1e-05,0.001274,-0.001298,,0.001657,0.001657,1e-05,metpy,Tsurf_c,yang,Standard
4,2022-11-29 19:00:00,Standard Tsurf_c e_sat_metpy 1e-05,0.000506,-0.000939,,0.00104,0.00104,1e-05,metpy,Tsurf_c,yang,Standard


### Handle a pesky outlier

In [10]:
# Zero both modelled fluxes for the Tsurf_d-driven runs at the one outlier timestamp.
is_tsurf_d_outlier = (
    (model_df['time'] == "2023-01-22 1400")
    & (model_df['surface_measurement'] == 'Tsurf_d')
)
for flux_column in ('latent heat flux', 'sensible heat flux'):
    model_df.loc[is_tsurf_d_outlier, flux_column] = 0

# Calculate cumulative sublimation (mm)

## EC Measurements

In [11]:
from sublimpy import tidy
import metpy.constants

In [12]:
# Display the (flux variable, counts variable) pairs defined earlier in the notebook.
ec_lhflux_and_counts_variables

[('w_h2o__2m_c', 'counts_2m_c_1'),
 ('w_h2o__3m_c', 'counts_3m_c_1'),
 ('w_h2o__5m_c', 'counts_5m_c_1'),
 ('w_h2o__10m_c', 'counts_10m_c_1'),
 ('w_h2o__15m_c', 'counts_15m_c_1'),
 ('w_h2o__20m_c', 'counts_20m_c_1'),
 ('w_h2o__1m_d', 'counts_1m_d_1'),
 ('w_h2o__3m_d', 'counts_3m_d_1'),
 ('w_h2o__10m_d', 'counts_10m_d_1'),
 ('w_h2o__1m_ue', 'counts_1m_ue_1'),
 ('w_h2o__3m_ue', 'counts_3m_ue_1'),
 ('w_h2o__10m_ue', 'counts_10m_ue_1'),
 ('w_h2o__1m_uw', 'counts_1m_uw_1'),
 ('w_h2o__3m_uw', 'counts_3m_uw_1'),
 ('w_h2o__10m_uw', 'counts_10m_uw_1')]

In [13]:
# Flux variable names only (first element of every (flux, counts) pair).
ec_lhflux_variables = tuple(
    flux_name for flux_name, _counts_name in ec_lhflux_and_counts_variables
)
seconds_per_5min = 5 * 60
water_density = metpy.constants.density_water.magnitude

for variable in ec_lhflux_variables:
    # Names look like 'w_h2o__3m_c': the last two '_'-separated tokens are the
    # height label ('3m') and the tower ('c').
    *_, height_label, tower = variable.split('_')
    height = int(height_label.split('m')[0])

    # Running total of flux * seconds, scaled by the density of water; NaNs are
    # treated as zero by nancumsum. The notebook's section heading gives the
    # resulting unit as mm.
    flux_values = tidy_df_5Min.query(f"variable == '{variable}'")['value']
    cumulative_sublimation = np.nancumsum(flux_values * seconds_per_5min) / water_density

    # Add the running total to the tidy frame as a new variable.
    tidy_df_5Min = tidy.tidy_df_add_variable(
        tidy_df_5Min,
        cumulative_sublimation,
        f"cumulative_sub_measured_{height}m_{tower}",
        "Cumulative sublimation measured",
        height,
        tower,
    )

## Model Results

In [14]:
from metpy.constants import density_water

# Columns that together identify one model run.
model_group_columns = ["config", "scheme", "z0", "e_sat_curve", "surface_measurement", "most_config"]

# Running sum of latent heat flux over time, separately for every model run.
model_df_cumsum = pd.DataFrame(
    model_df.sort_values("time")
    .set_index(["time"] + model_group_columns)
    .groupby(model_group_columns)['latent heat flux']
    .cumsum()
).reset_index()

# One value per run per 1440-minute (daily) bin: the maximum of the running sum.
# NOTE(review): if the intent is the end-of-day total, `.last()` may be the better
# aggregate, since the running sum can decrease when the flux is negative -- confirm.
model_df_cumsum_daily = pd.DataFrame(
    model_df_cumsum.set_index("time")
    .groupby(model_group_columns + [pd.Grouper(freq='1440Min')])['latent heat flux']
    .max()
).reset_index()

# Restrict to the study window. Both conditions are combined into a single mask:
# chaining two boolean selections (df[m1][m2]) builds the second mask from the
# unfiltered frame, which makes pandas warn that the key is being reindexed.
# .copy() makes the result an independent frame before a column is added below.
in_study_window = (
    (model_df_cumsum_daily.time >= '2022-11-30')
    & (model_df_cumsum_daily.time < '2023-05-10')
)
model_df_cumsum_daily = model_df_cumsum_daily[in_study_window].copy()

# Each model step covers 30 minutes: summed flux * seconds / water density.
seconds_per_30min = 60*30
model_df_cumsum_daily['latent heat flux (mm)'] = model_df_cumsum_daily['latent heat flux'].values * seconds_per_30min/density_water.magnitude

  model_df_cumsum_daily = model_df_cumsum_daily[


In [28]:
# Measured cumulative sublimation series from instruments above 1 m.
measured_cumulative_src = (
    tidy_df_5Min
    .query("measurement == 'Cumulative sublimation measured'")
    .query("height > 1")
)

# One thin line per variable, smoothed with a centred window of 49 five-minute
# samples (24 before, 24 after), i.e. roughly four hours.
measurements_chart = (
    alt.Chart(measured_cumulative_src)
    .transform_window(
        rolling_avg='mean(value)',
        frame=[-24, 24],
        groupby=['variable'],
    )
    .mark_line(opacity=0.5, strokeWidth=1)
    .encode(
        alt.X("time:T"),
        alt.Y(
            "rolling_avg:Q",
            title=["Cumulative sublimation (mm)", "(4 hour rolling average)"],
            scale=alt.Scale(domain=[-5, 150]),
        ),
        detail='variable:N',
        tooltip='variable',
    )
    .properties(width=250, height=250)
)

In [29]:
# Daily modelled cumulative sublimation for runs with z0 <= 1e-3, drawn as faint
# grey lines (one per model config) on the same y-range as the measurements.
model_runs_src = model_df_cumsum_daily.query("z0 <= 1.e-03").dropna()
models_chart = (
    alt.Chart(model_runs_src)
    .mark_line(opacity=0.5, color='grey', strokeWidth=0.2)
    .encode(
        alt.X("time:T"),
        alt.Y("latent heat flux (mm)", scale=alt.Scale(domain=[-5, 150])),
        detail='config',
    )
)

In [30]:
# Snow pillow series ('SWE_p2_c') from the 30-minute frame, smoothed with a centred
# rolling median over 97 samples (48 before, 48 after). The x-axis labels and title
# are hidden.
snowpillow_src = tidy_df_30Min.query("variable == 'SWE_p2_c'").dropna()
snowpillow_chart = (
    alt.Chart(snowpillow_src)
    .transform_window(
        rolling_median='median(value)',
        frame=[-48, 48],
    )
    .mark_line()
    .encode(
        alt.X("time:T", axis=alt.Axis(labels=False), title=None),
        alt.Y("rolling_median:Q", title=["Snow water", "equivalent (mm)"]),
    )
    .properties(width=250, height=83)
)

# snowpillow_and_precip_chart = snowpillow_chart +\
# alt.Chart(
#     precip_df
# ).mark_line(strokeDash=[2,4]).encode(
#     alt.X('date:T'),
#     alt.Y("acc_prec")
# )

In [31]:
# Snow pillow panel stacked above the measured cumulative sublimation panel.
alt.vconcat(snowpillow_chart, measurements_chart)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [32]:
# Snow pillow panel stacked above the measurements with the model runs layered on top.
alt.vconcat(
    snowpillow_chart,
    alt.layer(measurements_chart, models_chart),
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [33]:
# Distinct z0 values among the rows plotted in the model charts
# (z0 <= 1e-3, rows containing NaN dropped).
model_df_cumsum_daily.query("z0 <= 1.e-03").dropna().z0.unique()

array([1.e-04, 5.e-04, 1.e-03, 1.e-05, 5.e-05])

In [41]:
# Explicit colour mapping for three roughness lengths.
# NOTE(review): the plotted data also contains z0 values of 5e-4 and 5e-5 (see the
# unique() output in the previous cell), which are not in this domain -- confirm
# whether those runs should be filtered out or given colours of their own.
z0_color_scale = alt.Scale(
    domain=[1.e-03, 1.e-04, 1.e-05],
    range=['#d62728', '#ff7f0e', '#bcbd22'],
)

# Same model lines as before, now coloured by z0.
models_by_z0_src = model_df_cumsum_daily.query("z0 <= 1.e-03").dropna()
models_chart_by_z0 = (
    alt.Chart(models_by_z0_src)
    .mark_line(opacity=0.75, color='grey', strokeWidth=0.2)
    .encode(
        alt.X("time:T"),
        alt.Y("latent heat flux (mm)", scale=alt.Scale(domain=[-5, 150])),
        alt.Color('z0:O', scale=z0_color_scale),
        detail='config',
    )
)

# Snow pillow panel above the measurements with the z0-coloured model runs layered on.
alt.vconcat(
    snowpillow_chart,
    alt.layer(measurements_chart, models_chart_by_z0),
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [None]:



# Model lines for two configs that differ only in the surface-temperature input
# (Tsurf_rad_d vs Tsurf_d).
# NOTE(review): colour is still encoded by z0, which is 1e-05 for both configs, so
# the two lines share a colour -- encoding 'surface_measurement' may be the intent.
models_chart_by_ts_meas = alt.Chart(
    model_df_cumsum_daily[model_df_cumsum_daily.config.isin([
    'MO Holtslag de Bruin Tsurf_rad_d e_sat_alduchov 1e-05',
    'MO Holtslag de Bruin Tsurf_d e_sat_alduchov 1e-05',
])]
).mark_line(opacity = 0.75, color='grey', strokeWidth=0.2).encode(
    alt.X("time:T"),
    alt.Y("latent heat flux (mm)").scale(domain=[-5, 150]),
    alt.Color('z0:O').scale(
        domain = [1.e-03, 1.e-04,  1.e-05],
        range = ['#d62728', '#ff7f0e', '#bcbd22']
    ),
    detail = 'config'
)


# Layer the chart defined in this cell; previously models_chart_by_z0 was displayed
# here, so models_chart_by_ts_meas was built but never shown.
(
    snowpillow_chart &
    (measurements_chart + models_chart_by_ts_meas)
)

In [20]:
# NOTE: this reassigns the notebook-level start_date / end_date used by the data
# loading cell; re-running earlier cells after this one will see these values.
start_date = "2023-03-01"
end_date = "2023-03-05"

# Select the window with boolean masks rather than label-slicing a 'time' index.
# Slicing an unsorted DatetimeIndex with date strings that are not exact index
# entries warns in pandas 1.5 and raises KeyError in pandas >= 2.0. The half-open
# range [start, end + 1 day) covers the same rows as the inclusive string slice.
# NOTE(review): assumes timezone-naive 'time' columns -- confirm.
window_start = pd.Timestamp(start_date)
window_end = pd.Timestamp(end_date) + pd.Timedelta(days=1)

# Measured 30-minute latent heat flux at 3 m on tower c.
meas_src = tidy_df_30Min[
    (tidy_df_30Min['variable'] == 'w_h2o__3m_c')
    & (tidy_df_30Min['time'] >= window_start)
    & (tidy_df_30Min['time'] < window_end)
].reset_index(drop=True)
meas_chart = alt.Chart(
    meas_src
).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q")
)

# Modelled latent heat flux for a single model configuration, same window.
model_src = model_df[
    (model_df['config'] == 'MO Holtslag de Bruin andreas lengths Tsurf_c e_sat_alduchov 1e-05')
    & (model_df['time'] >= window_start)
    & (model_df['time'] < window_end)
].reset_index(drop=True)
model_chart = alt.Chart(
    model_src
).mark_line(color='red').encode(
    alt.X("time:T"),
    alt.Y("latent heat flux:Q")
)

(meas_chart + model_chart).properties(width = 600)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting
