In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('json')

from sublimpy import turbulence

# Open precipitation dataset

In [2]:
precip_file = "/storage/elilouis/sublimationofsnow/tilden_precip_data/kettle_ponds_precip.csv"

precip_df = pd.read_csv(precip_file)

precip_df['date'] = pd.to_datetime(precip_df['date'])

acc_precip_on_first_day = precip_df.set_index('date').loc['20221130'].acc_prec

precip_df = precip_df.set_index('date').loc['20221130': '20230510'].reset_index()
precip_df['acc_prec']  = precip_df['acc_prec'] - acc_precip_on_first_day

# Open SOS Measurement Dataset

In [3]:
start_date = '20221130'
end_date = '20230509'
# open files
tidy_df_5Min = pd.read_parquet('../sos/tidy_df_20221130_20230517_noplanar_fit.parquet')
tidy_df_30Min = pd.read_parquet('../sos/tidy_df_30Min_20221130_20230517_noplanar_fit.parquet')
# convert time column to datetime
tidy_df_5Min['time'] = pd.to_datetime(tidy_df_5Min['time'])
tidy_df_30Min['time'] = pd.to_datetime(tidy_df_30Min['time'])
# limit data to our dates of interest, based on continuous snow cover at Kettle Ponds
tidy_df_5Min = tidy_df_5Min.set_index('time').sort_index().loc[start_date:end_date].reset_index()
tidy_df_30Min = tidy_df_30Min.set_index('time').sort_index().loc[start_date:end_date].reset_index()

In [4]:
# quick way to get variable info if we want it 
# import xarray as xr
# ds = xr.open_dataset("/storage/elilouis/sublimationofsnow/sosnoqc/isfs_20221228.nc")
# ds['SWE_p2_c']

## Clean the data

In [5]:
ec_lhflux_and_counts_variables = [
    ('w_h2o__2m_c', 'counts_2m_c_1'), 
    ('w_h2o__3m_c', 'counts_3m_c_1'), 
    ('w_h2o__5m_c', 'counts_5m_c_1'), 
    ('w_h2o__10m_c', 'counts_10m_c_1'), 
    ('w_h2o__15m_c', 'counts_15m_c_1'), 
    ('w_h2o__20m_c', 'counts_20m_c_1'), 


    ('w_h2o__1m_d', 'counts_1m_d_1'), 
    ('w_h2o__3m_d', 'counts_3m_d_1'), 
    ('w_h2o__10m_d', 'counts_10m_d_1'), 
      
    ('w_h2o__1m_ue', 'counts_1m_ue_1'), 
    ('w_h2o__3m_ue', 'counts_3m_ue_1'), 
    ('w_h2o__10m_ue', 'counts_10m_ue_1'), 


    ('w_h2o__1m_uw',  'counts_1m_uw_1'), 
    ('w_h2o__3m_uw', 'counts_3m_uw_1'), 
    ('w_h2o__10m_uw', 'counts_10m_uw_1'), 
]
ec_lhflux_variables = list(zip(*ec_lhflux_and_counts_variables))[0]

### Calculate the upper and lower threshold for removing data - 5*std deviation of ALL latent heat flux measurements

In [6]:
all_lhflux_measurements = tidy_df_5Min[tidy_df_5Min.variable.isin(ec_lhflux_variables)].value
lower_threshold = all_lhflux_measurements.mean() - 5*all_lhflux_measurements.std()
upper_threshold = all_lhflux_measurements.mean() + 5*all_lhflux_measurements.std()
lower_threshold, upper_threshold

(-13.592732449123245, 13.604403331998471)

In [7]:
import pandas as pd
import numpy as np

good_data_fractions = [
    0, 
    # 0.05, 0.1, 0.25, 0.50, 
    # 0.75, 
    0.90, 0.95, 
    # 1
]
dataframes = []
for good_data_fraction in good_data_fractions:
    for flux_var, counts_var in ec_lhflux_and_counts_variables:
        new_values = turbulence.clean_eddy_covariance(
            tidy_df_5Min.query(f"variable == '{flux_var}'").value.values,
            tidy_df_5Min.query(f"variable == '{counts_var}'").value.values,
            lower_threshold,
            upper_threshold,
            fraction_good_data_reqd = good_data_fraction
        )
        dataframes.append(
            pd.DataFrame.from_dict({
                flux_var: new_values,
                'good_data_fraction': np.full(len(new_values), good_data_fraction)
            })
        )

## Calculate cumulative sublimation (mm)

In [8]:
from sublimpy import tidy
import metpy.constants

In [9]:
seconds_per_5min = 60*5

for dataframe in dataframes:
    flux_var_name = dataframe.columns[0]
    height = int(flux_var_name.split('_')[-2].split('m')[0])
    tower = flux_var_name.split('_')[-1]
    new_var_name = f"cumulative_sub_measured_{height}m_{tower}"
    cumulative_sublimation_values = np.nancumsum(dataframe[flux_var_name]*seconds_per_5min)/metpy.constants.density_water.magnitude
    print(flux_var_name, dataframe[dataframe.columns[1]].iloc[0], cumulative_sublimation_values.max())
    ec_lhflux_variables = list(zip(*ec_lhflux_and_counts_variables))[0]

w_h2o__2m_c 0 29.790966933577884
w_h2o__3m_c 0 34.53360434647222
w_h2o__5m_c 0 37.48928122584268
w_h2o__10m_c 0 41.38660060019198
w_h2o__15m_c 0 32.592802110746334
w_h2o__20m_c 0 38.78241373327484
w_h2o__1m_d 0 18.823154550740192
w_h2o__3m_d 0 34.101627180207615
w_h2o__10m_d 0 39.21210266709886
w_h2o__1m_ue 0 23.970117234020446
w_h2o__3m_ue 0 34.65807965253853
w_h2o__10m_ue 0 36.74532449217792
w_h2o__1m_uw 0 18.673352041982085
w_h2o__3m_uw 0 34.50582011197007
w_h2o__10m_uw 0 40.242610255656516
w_h2o__2m_c 0.9 28.394552217976653
w_h2o__3m_c 0.9 33.69303365193264
w_h2o__5m_c 0.9 36.422269288882596
w_h2o__10m_c 0.9 39.343705631184626
w_h2o__15m_c 0.9 25.62803047084436
w_h2o__20m_c 0.9 33.84820112107782
w_h2o__1m_d 0.9 18.25986046041312
w_h2o__3m_d 0.9 32.528020942190096
w_h2o__10m_d 0.9 37.51086285244327
w_h2o__1m_ue 0.9 23.472797278054948
w_h2o__3m_ue 0.9 32.38991287475709
w_h2o__10m_ue 0.9 34.22256572053085
w_h2o__1m_uw 0.9 18.220575186436847
w_h2o__3m_uw 0.9 33.39049534844093
w_h2o__10

In [10]:
from io import StringIO
qc_df = pd.read_csv(StringIO("""
variable percent_reqd cumulative_sublimation
w_h2o__2m_c 0 29.790966933577884
w_h2o__3m_c 0 34.53360434647222
w_h2o__5m_c 0 37.48928122584268
w_h2o__10m_c 0 41.38660060019198
w_h2o__15m_c 0 32.592802110746334
w_h2o__20m_c 0 38.78241373327484
w_h2o__1m_d 0 18.823154550740192
w_h2o__3m_d 0 34.101627180207615
w_h2o__10m_d 0 39.21210266709886
w_h2o__1m_ue 0 23.970117234020446
w_h2o__3m_ue 0 34.65807965253853
w_h2o__10m_ue 0 36.74532449217792
w_h2o__1m_uw 0 18.673352041982085
w_h2o__3m_uw 0 34.50582011197007
w_h2o__10m_uw 0 40.242610255656516
w_h2o__2m_c 0.9 28.394552217976653
w_h2o__3m_c 0.9 33.69303365193264
w_h2o__5m_c 0.9 36.422269288882596
w_h2o__10m_c 0.9 39.343705631184626
w_h2o__15m_c 0.9 25.62803047084436
w_h2o__20m_c 0.9 33.84820112107782
w_h2o__1m_d 0.9 18.25986046041312
w_h2o__3m_d 0.9 32.528020942190096
w_h2o__10m_d 0.9 37.51086285244327
w_h2o__1m_ue 0.9 23.472797278054948
w_h2o__3m_ue 0.9 32.38991287475709
w_h2o__10m_ue 0.9 34.22256572053085
w_h2o__1m_uw 0.9 18.220575186436847
w_h2o__3m_uw 0.9 33.39049534844093
w_h2o__10m_uw 0.9 38.29135489682637
w_h2o__2m_c 0.95 27.91364683624598
w_h2o__3m_c 0.95 33.27010045873039
w_h2o__5m_c 0.95 35.89753457951142
w_h2o__10m_c 0.95 38.56225650570379
w_h2o__15m_c 0.95 24.287958103886755
w_h2o__20m_c 0.95 33.34104611209206
w_h2o__1m_d 0.95 17.98046285802274
w_h2o__3m_d 0.95 31.676122860323897
w_h2o__10m_d 0.95 36.50756915181889
w_h2o__1m_ue 0.95 23.0371963999716
w_h2o__3m_ue 0.95 31.90166883999031
w_h2o__10m_ue 0.95 33.056911817846824
w_h2o__1m_uw 0.95 17.682383058495933
w_h2o__3m_uw 0.95 32.71763690905719
w_h2o__10m_uw 0.95 37.32808754439046
"""), delim_whitespace=True)

In [11]:
import altair as alt
alt.Chart(qc_df).mark_line(point=True).encode(
    alt.X("percent_reqd:Q"),
    alt.Y("cumulative_sublimation:Q"),
    alt.Color("variable:N")
)

## Select our data-required value, replace dirty EC data with clean EC data, and calculate cumulative sublimation

In [12]:
dataframes = [df for df in dataframes if df['good_data_fraction'].iloc[0] == 0.90]

In [15]:
dataframes[1]

Unnamed: 0,w_h2o__3m_c,good_data_fraction
0,-0.003711,0.9
1,-0.000613,0.9
2,-0.001677,0.9
3,-0.000651,0.9
4,-0.000149,0.9
...,...,...
46351,0.003134,0.9
46352,0.002378,0.9
46353,0.002704,0.9
46354,0.002306,0.9


In [16]:
print(len(tidy_df_5Min.time.unique()))

46357


In [14]:
seconds_per_5min = 60*5

for dataframe in dataframes:
    flux_var_name = dataframe.columns[0]
    height = int(flux_var_name.split('_')[-2].split('m')[0])
    tower = flux_var_name.split('_')[-1]
    new_var_name = f"cumulative_sub_measured_{height}m_{tower}"
    cumulative_sublimation_values = np.nancumsum(dataframe[flux_var_name]*seconds_per_5min)/metpy.constants.density_water.magnitude
    # remove the old flux values
    tidy_df_5Min = tidy_df_5Min[tidy_df_5Min.variable != flux_var_name]
    # add the cleaned values
    print(len(dataframe[flux_var_name]))
    print(len(tidy_df_5Min.time.unique()))
    tidy_df_5Min = tidy.tidy_df_add_variable(
        tidy_df_5Min,
        dataframe[flux_var_name],
        flux_var_name,
        'w_h2o_',
        height,
        tower
    )
    # add the cumulative calculations values
    tidy_df_5Min = tidy.tidy_df_add_variable(
        tidy_df_5Min,
        cumulative_sublimation_values,
        new_var_name,
        "Cumulative sublimation measured",  
        height,
        tower
    )

46356
46356


AssertionError: 

# Open Model Ensemble Dataset

In [None]:
model_df = pd.read_parquet("model_results.parquet")
# add a bunch of columns that are descriptive, from the config column which has multiple bits of info
model_df['z0'] = model_df['config'].apply(
    lambda v: float(v.split(' ')[-1])
)
model_df['e_sat_curve'] = model_df['config'].apply(
    lambda v: 'metpy' if 'metpy' in v else 'alduchov'
)
model_df['surface_measurement'] = model_df['config'].apply(
    lambda v: v.split(' ')[-3]
)
model_df['scheme'] = model_df['config'].apply(
    lambda v: 'andreas' if 'andreas lengths' in v else 'yang'
)
model_df['most_config'] = model_df['config'].apply(lambda s: ' '.join(s.split(' ')[:-3]))
# remove the scalar roughness length parameterization info 
model_df['most_config'] = model_df['most_config'].str.replace(' andreas lengths', '')
model_df.head()

### Handle a pesky outlier

In [None]:
model_df.loc[(model_df.time == "2023-01-22 1400") & (model_df.surface_measurement == 'Tsurf_d'), 'latent heat flux'] = 0
model_df.loc[(model_df.time == "2023-01-22 1400") & (model_df.surface_measurement == 'Tsurf_d'), 'sensible heat flux'] = 0

## Calculate cumulative sublimation (mm)

In [None]:
model_df_cumsum = pd.DataFrame(model_df.sort_values("time").set_index(
    ["time", "config", "scheme", "z0", "e_sat_curve", "surface_measurement", "most_config"]
).groupby(["config", "scheme", "z0", "e_sat_curve", "surface_measurement", "most_config"])['latent heat flux'].cumsum()).reset_index()
model_df_cumsum_daily = pd.DataFrame(model_df_cumsum.set_index("time").groupby(
    ['config', "scheme", "z0", "e_sat_curve", "surface_measurement", "most_config", pd.Grouper(freq='1440Min')]
)['latent heat flux'].max()).reset_index()

from metpy.constants import density_water
seconds_per_30min = 60*30
model_df_cumsum_daily['latent heat flux (mm)'] = model_df_cumsum_daily['latent heat flux'].values * seconds_per_30min/density_water.magnitude

In [None]:
measurements_chart = alt.Chart(
    tidy_df_5Min.query("measurement == 'Cumulative sublimation measured'").query("height > 1")
).transform_window(
    frame = [-24, 24],
    groupby=['variable'],
    rolling_avg = 'mean(value)'
).mark_line(opacity=0.5, strokeWidth=1).encode(
    alt.X("time:T"),
    alt.Y("rolling_avg:Q").title(["Cumulative sublimation (mm)", "(4 hour rolling average)"]),
    detail = 'variable:N',
    # alt.Color("variable:N"),
    tooltip = 'variable'
).properties(width = 250, height = 250)

In [None]:
models_chart = alt.Chart(
    model_df_cumsum_daily[model_df_cumsum_daily.time < '2023-05-10'].query("z0 < 1.e-03").dropna()
).mark_line(opacity = 0.5, color='grey', strokeWidth=0.2).encode(
    alt.X("time:T"),
    alt.Y("latent heat flux (mm)"),
    detail = 'config'
)

In [None]:
snowpillow_and_precip_chart = alt.Chart(
    tidy_df_30Min.query("variable == 'SWE_p2_c'").dropna()
).transform_window(
    frame = [-48, 48],
    rolling_median = 'median(value)'
).mark_line().encode(
    alt.X("time:T").axis(labels=False).title(None),
    alt.Y("rolling_median:Q").title(["Snow water", "equivalent (mm)"])
).properties(width = 250, height = 83) + alt.Chart(
    precip_df
).mark_line(strokeDash=[2,4]).encode(
    alt.X('date:T'),
    alt.Y("acc_prec")
)

In [None]:
(
    snowpillow_and_precip_chart &
    (measurements_chart + models_chart)
).interactive().display(renderer='svg')

In [None]:
start_date = "2023-03-01"
end_date = "2023-03-05"
meas_chart = alt.Chart(
    tidy_df_30Min.set_index('time').loc[start_date:end_date].reset_index().query("variable == 'w_h2o__3m_c'")
).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q")

)
model_chart = alt.Chart(
    model_df[model_df.config == 'MO Holtslag de Bruin andreas lengths Tsurf_c e_sat_alduchov 1e-05'].set_index('time').loc[start_date:end_date].reset_index()
).mark_line(color='red').encode(
    alt.X("time:T"),
    alt.Y("latent heat flux:Q")
)

(meas_chart + model_chart).properties(width = 600).display(renderer='svg')