In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

# Open SOS Measurement Dataset

In [3]:
# Date window of interest, based on continuous snow cover at Kettle Ponds.
# Both strings are used below as label-based slice endpoints on the time index.
start_date = '20221130'
end_date = '20230509'

# 5-minute dataset: read the file, parse `time` as datetime, order rows
# chronologically, and keep only the rows inside the date window.
tidy_df_5Min = (
    pd.read_parquet('../sos/tidy_df_20221130_20230517_noplanar_fit.parquet')
    .assign(time=lambda df: pd.to_datetime(df['time']))
    .set_index('time')
    .sort_index()
    .loc[start_date:end_date]
    .reset_index()
)

# 30-minute dataset: exactly the same treatment as the 5-minute dataset.
tidy_df_30Min = (
    pd.read_parquet('../sos/tidy_df_30Min_20221130_20230517_noplanar_fit.parquet')
    .assign(time=lambda df: pd.to_datetime(df['time']))
    .set_index('time')
    .sort_index()
    .loc[start_date:end_date]
    .reset_index()
)

In [4]:
# quick way to get variable info if we want it 
# import xarray as xr
# ds = xr.open_dataset("/storage/elilouis/sublimationofsnow/sosnoqc/isfs_20221228.nc")
# ds['SWE_p2_c']

## Clean the data

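### Step 1: remove all LH flux data points for which less than 90% of the 20 Hz data are good
### Step 2: remove all LH flux data points more than 5 standard deviations from the mean of all LH flux measurements remaining after step 1 (an earlier version, kept commented out in the cell below, instead removed points with magnitude greater than 1 g/m^2/s)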

In [5]:
# Measurement heights (in metres, as they appear in the variable names) available
# on each tower suffix. Insertion order fixes the order of the generated list.
heights_by_tower = {
    'c': (2, 3, 5, 10, 15, 20),
    'd': (1, 3, 10),
    'ue': (1, 3, 10),
    'uw': (1, 3, 10),
}

# (flux variable, counts variable) name pairs, one per tower/height combination.
ec_lhflux_and_counts_variables = [
    (f'w_h2o__{height}m_{tower}', f'counts_{height}m_{tower}_1')
    for tower, heights in heights_by_tower.items()
    for height in heights
]

# Just the flux variable names, as a tuple in the same order as the pairs above.
ec_lhflux_variables = tuple(
    flux_name for flux_name, _counts_name in ec_lhflux_and_counts_variables
)

In [6]:
# Pool the values of every LH flux variable and display their
# (mean, std, min, max) as a baseline for the cleaning steps that follow.
is_lhflux_row = tidy_df_5Min['variable'].isin(ec_lhflux_variables)
all_lhflux_measurements = tidy_df_5Min.loc[is_lhflux_row, 'value']
(
    all_lhflux_measurements.mean(),
    all_lhflux_measurements.std(),
    all_lhflux_measurements.min(),
    all_lhflux_measurements.max(),
)

(0.005835441437612257, 2.7197135781121715, -123.26387023925781, 2131.2578125)

In [7]:
####################################################################################
# Remove all data points at once - perform both steps 1 and 2 simultaneously
####################################################################################
# for flux_var, counts_var in ec_lhflux_and_counts_variables:
#     print(flux_var, counts_var)
#     counts_src = tidy_df_5Min[tidy_df_5Min.variable == counts_var]
#     times_with_good_data_50percent = counts_src[counts_src.value >= 5400].time
#     n_before_dropping = len(tidy_df_5Min.loc[(tidy_df_5Min['variable'] == flux_var)].dropna())
#     tidy_df_5Min.loc[
#         (~tidy_df_5Min['time'].isin(times_with_good_data_50percent)) &
#         (tidy_df_5Min['variable'] == flux_var),
#         'value'
#     ] = np.nan
#     n_after_step_1 = len(tidy_df_5Min.loc[(tidy_df_5Min['variable'] == flux_var)].dropna())

#     variable_src = tidy_df_5Min[tidy_df_5Min.variable == flux_var]
#     times_with_outofbounds_values = variable_src[np.abs(variable_src.value) > 1].time
#     tidy_df_5Min.loc[
#         (tidy_df_5Min['time'].isin(times_with_outofbounds_values)) & 
#         (tidy_df_5Min['variable'] == flux_var),
#         'value'
#     ] = np.nan
#     n_after_step_2 = len(tidy_df_5Min.loc[(tidy_df_5Min['variable'] == flux_var)].dropna())
#     print(n_before_dropping, n_after_step_1, n_after_step_2)
#     print(round((n_before_dropping-n_after_step_2)/n_before_dropping, 3))

####################################################################################
# Perform steps 1 and 2 separately
####################################################################################
# Step 1 cutoff. A 5-minute period of 20 Hz data holds 5 * 60 * 20 = 6000 samples,
# so requiring at least 5400 counts keeps periods with >= 90% good samples.
MIN_GOOD_SAMPLE_COUNT = 5400
# Step 2 cutoff: number of standard deviations from the pooled mean beyond which a
# flux value is treated as an outlier.
OUTLIER_N_STDDEV = 5

# Step 1: set to NaN every flux value whose timestamp is not among the timestamps
# where the paired counts variable is >= MIN_GOOD_SAMPLE_COUNT. A timestamp whose
# counts value is NaN or absent fails that comparison, so its flux is blanked too.
for flux_var, counts_var in ec_lhflux_and_counts_variables:
    counts_src = tidy_df_5Min[tidy_df_5Min.variable == counts_var]
    times_with_good_data_90percent = counts_src[
        counts_src.value >= MIN_GOOD_SAMPLE_COUNT
    ].time
    tidy_df_5Min.loc[
        (~tidy_df_5Min['time'].isin(times_with_good_data_90percent)) &
        (tidy_df_5Min['variable'] == flux_var),
        'value'
    ] = np.nan

# Statistics pooled over all flux variables after step 1 (NaNs are skipped by
# pandas); these define the outlier bounds used in step 2.
all_lhflux_measurements = tidy_df_5Min[tidy_df_5Min.variable.isin(ec_lhflux_variables)].value
mean = all_lhflux_measurements.mean()
stddev = all_lhflux_measurements.std()
print(mean, stddev, all_lhflux_measurements.min(), all_lhflux_measurements.max())

# The bounds do not depend on the variable being processed, so compute them once.
# NOTE: tidy_df_5Min is modified in place, so re-running this cell recomputes
# mean/stddev from already-filtered data and tightens the bounds. Re-run the
# notebook from the data-loading cell to reproduce the original result.
lower_bound = mean - OUTLIER_N_STDDEV * stddev
upper_bound = mean + OUTLIER_N_STDDEV * stddev

# Step 2: set to NaN every flux value recorded at a timestamp where that same
# variable falls outside [lower_bound, upper_bound].
for flux_var, counts_var in ec_lhflux_and_counts_variables:
    variable_src = tidy_df_5Min[tidy_df_5Min.variable == flux_var]
    times_with_outofbounds_values = variable_src[
        (variable_src.value > upper_bound) |
        (variable_src.value < lower_bound)
    ].time
    tidy_df_5Min.loc[
        (tidy_df_5Min['time'].isin(times_with_outofbounds_values)) &
        (tidy_df_5Min['variable'] == flux_var),
        'value'
    ] = np.nan

In [None]:
# Print the pooled LH flux statistics (mean, std, min, max) again so they can be
# compared with the values shown before the cleaning cell ran.
all_lhflux_measurements = tidy_df_5Min.loc[
    tidy_df_5Min['variable'].isin(ec_lhflux_variables), 'value'
]
print(
    all_lhflux_measurements.mean(),
    all_lhflux_measurements.std(),
    all_lhflux_measurements.min(),
    all_lhflux_measurements.max(),
)

0.002559840776554578 0.018658400299491427 -8.453908920288086 4.933438777923584


# Create dataset labeled by blowing snow/not

In [None]:
# Timestamps at which at least one of the two SF_avg variables has a value > 0
# (presumably the blowing-snow flux sensors at 1 m and 2 m on tower ue; verify).
blowing_snow_times = tidy_df_5Min.loc[
    tidy_df_5Min['variable'].isin(['SF_avg_1m_ue', 'SF_avg_2m_ue'])
    & (tidy_df_5Min['value'] > 0),
    'time'
]

# Compute the row mask once and reuse it for both subsets, so the two subsets are
# guaranteed to be exact complements of each other.
is_blowing_snow_time = tidy_df_5Min['time'].isin(blowing_snow_times)

# Every row (all variables) recorded at a blowing-snow timestamp.
blowing_snow_data = tidy_df_5Min[is_blowing_snow_time]
# Every remaining row. Timestamps where both SF_avg values are NaN or missing do
# not satisfy `value > 0`, so they also end up in this subset.
calm_data = tidy_df_5Min[~is_blowing_snow_time]

# Stack the two subsets (blowing-snow rows first) with a `type` label column.
tidy_5min_by_blowing_df = pd.concat([
    blowing_snow_data.assign(type = 'blowing snow'),
    calm_data.assign(type = 'clear')
])

In [None]:
# Build a wide table with one row per (time, type) and one column per selected
# variable, then append convenience time columns:
#   time_no_date - the timestamp with its date replaced by 2023-01-01, so rows
#                  from different days share a common time-of-day axis
#   month        - calendar month number of the original timestamp
#   date         - calendar date of the original timestamp
tidy_5min_by_blowing_lh_flux_df = (
    tidy_5min_by_blowing_df[
        tidy_5min_by_blowing_df["variable"].isin(
            ['w_h2o__3m_c', 'spd_3m_c', 'Rsw_in_9m_d', 'Rnet_9m_d', 'T_3m_c', 'tke_3m_c']
        )
    ]
    # pivot_table aggregates duplicate (time, type, variable) entries with its
    # default aggregation function (mean).
    .pivot_table(index=['time', 'type'], values='value', columns='variable')
    .reset_index()
    .assign(
        time_no_date=lambda df: df['time'].apply(
            lambda x: x.replace(year=2023, month=1, day=1)
        ),
        month=lambda df: df['time'].dt.month,
        date=lambda df: df['time'].dt.date,
    )
)

In [None]:
# Average each numeric column over all days for every (time of day, month, type)
# group. The frame also holds the non-numeric `time` and `date` columns, so
# numeric_only=True is passed explicitly: without it pandas 1.5 emits a
# FutureWarning while silently dropping those columns, and pandas >= 2.0 raises a
# TypeError when it tries to average the `date` objects.
src = (
    tidy_5min_by_blowing_lh_flux_df
    .groupby(["time_no_date", "month", "type"])
    .mean(numeric_only=True)
    .reset_index()
)

alt.Chart(src).transform_filter(
    # November and May are excluded from the plot.
    (alt.datum.month != 11) & (alt.datum.month != 5)
).transform_window(
    # Centered 13-point window (6 rows either side) of 5-minute data, i.e. about
    # one hour, computed separately for each month/type series.
    frame = [-6, 6],
    rolling_mean = "mean(w_h2o__3m_c)",
    groupby = ['month', 'type']
).mark_line().encode(
    alt.X("time_no_date:T", title='Time of day'),
    alt.Y("rolling_mean:Q", title=['LH Flux (g/m^2/s)','(1 hour rolling avg)']),
    alt.Color("month:O", sort=[12,1,2,3,4,5]).scale(scheme='turbo'),
    alt.Row("type:N", title=None)
).properties(
    width=300,
    height=150,
    title=['Daily average LH fluxes during each month,','separated by blowing snow and calm conditions']
)

  src = tidy_5min_by_blowing_lh_flux_df.groupby(["time_no_date", "month", "type"]).mean().reset_index()
