In [13]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

# Open SOS Measurement Dataset

In [14]:
start_date = '20221130'
end_date = '20230509'
# open files
tidy_df_5Min = pd.read_parquet(f'../sos/tidy_df_{start_date}_{end_date}_noplanar_fit.parquet')
tidy_df_30Min = pd.read_parquet(f'../sos/tidy_df_30Min_{start_date}_{end_date}_noplanar_fit.parquet')
# convert time column to datetime
tidy_df_5Min['time'] = pd.to_datetime(tidy_df_5Min['time'])
tidy_df_30Min['time'] = pd.to_datetime(tidy_df_30Min['time'])
# limit data to our dates of interest, based on continuous snow cover at Kettle Ponds
tidy_df_5Min = tidy_df_5Min.set_index('time').sort_index().loc[start_date:end_date].reset_index()
tidy_df_30Min = tidy_df_30Min.set_index('time').sort_index().loc[start_date:end_date].reset_index()

In [15]:
# quick way to get variable info if we want it 
# import xarray as xr
# ds = xr.open_dataset("/storage/elilouis/sublimationofsnow/sosnoqc/isfs_20221228.nc")
# ds['SWE_p2_c']

## Clean the data

### Step 1: remove all LH flux data points with less than 90% of 20hz data being good
### Step 2: remove all LH flux data points with magnitude greater than 1 g/m^2/s

In [16]:
ec_lhflux_and_counts_variables = [
    ('w_h2o__2m_c', 'counts_2m_c_1'), 
    ('w_h2o__3m_c', 'counts_3m_c_1'), 
    ('w_h2o__5m_c', 'counts_5m_c_1'), 
    ('w_h2o__10m_c', 'counts_10m_c_1'), 
    ('w_h2o__15m_c', 'counts_15m_c_1'), 
    ('w_h2o__20m_c', 'counts_20m_c_1'), 


    ('w_h2o__1m_d', 'counts_1m_d_1'), 
    ('w_h2o__3m_d', 'counts_3m_d_1'), 
    ('w_h2o__10m_d', 'counts_10m_d_1'), 
      
    ('w_h2o__1m_ue', 'counts_1m_ue_1'), 
    ('w_h2o__3m_ue', 'counts_3m_ue_1'), 
    ('w_h2o__10m_ue', 'counts_10m_ue_1'), 


    ('w_h2o__1m_uw',  'counts_1m_uw_1'), 
    ('w_h2o__3m_uw', 'counts_3m_uw_1'), 
    ('w_h2o__10m_uw', 'counts_10m_uw_1'), 
]
ec_lhflux_variables = list(zip(*ec_lhflux_and_counts_variables))[0]

In [17]:
all_lhflux_measurements = tidy_df_5Min[tidy_df_5Min.variable.isin(ec_lhflux_variables)].value
all_lhflux_measurements.mean(), all_lhflux_measurements.std(), all_lhflux_measurements.min(), all_lhflux_measurements.max()

(0.005839618240661465, 2.7220981387241916, -123.26387023925781, 2131.2578125)

In [18]:
import pandas as pd
import numpy as np

good_data_fractions = [
    0, 
    # 0.05, 0.1, 0.25, 0.50, 
    # 0.75, 
    0.90, 0.95, 
    # 1
]
dataframes = []
for good_data_fraction in good_data_fractions:
    for flux_var, counts_var in ec_lhflux_and_counts_variables:
        new_values = turbulence.clean_eddy_covariance(
            tidy_df_5Min.query(f"variable == '{flux_var}'").value.values,
            tidy_df_5Min.query(f"variable == '{counts_var}'").value.values,
            lower_threshold,
            upper_threshold,
            fraction_good_data_reqd = good_data_fraction
        )
        dataframes.append(
            pd.DataFrame.from_dict({
                flux_var: new_values,
                'good_data_fraction': np.full(len(new_values), good_data_fraction)
            })
        )

seconds_per_5min = 60*5

for dataframe in dataframes:
    flux_var_name = dataframe.columns[0]
    height = int(flux_var_name.split('_')[-2].split('m')[0])
    tower = flux_var_name.split('_')[-1]
    new_var_name = f"cumulative_sub_measured_{height}m_{tower}"
    cumulative_sublimation_values = np.nancumsum(dataframe[flux_var_name]*seconds_per_5min)/metpy.constants.density_water.magnitude
    # remove the old flux values
    tidy_df_5Min = tidy_df_5Min[tidy_df_5Min.variable != flux_var_name]
    # # add the new (cleaned) flux values
    tidy_df_5Min = tidy.tidy_df_add_variable(
        tidy_df_5Min,
        dataframe[flux_var_name],
        flux_var_name,
        'w_h2o_',
        height,
        tower
    )
    # add the cumulative calculations values
    tidy_df_5Min = tidy.tidy_df_add_variable(
        tidy_df_5Min,
        cumulative_sublimation_values,
        new_var_name,
        "Cumulative sublimation measured",  
        height,
        tower
    )

NameError: name 'turbulence' is not defined

In [None]:
all_lhflux_measurements = tidy_df_5Min[tidy_df_5Min.variable.isin(ec_lhflux_variables)].value
print(all_lhflux_measurements.mean(), all_lhflux_measurements.std(), all_lhflux_measurements.min(), all_lhflux_measurements.max())

0.0025582411593165024 0.018673257004782503 -8.453908920288086 4.933438777923584


In [None]:
dataframes = [df for df in dataframes if df['good_data_fraction'].iloc[0] == 0.90]

# Calculate daily sublimation

In [9]:
import metpy.constants
ec_lhflux_variables = list(zip(*ec_lhflux_and_counts_variables))[0]
seconds_per_5min = 60*5

In [10]:
daily_sub_by_blowingsnow_src = tidy_df_5Min[
    tidy_df_5Min.variable.isin(['w_h2o__3m_c', 'SF_avg_1m_ue', 'SF_avg_2m_ue'])
].pivot_table(
    values = 'value',
    index = 'time',
    columns = 'variable'
)
daily_sub_by_blowingsnow_src['SF_avg_max_ue'] = daily_sub_by_blowingsnow_src[['SF_avg_1m_ue', 'SF_avg_2m_ue']].max(axis=1)
daily_sub_by_blowingsnow_src['blowing snow'] = daily_sub_by_blowingsnow_src['SF_avg_max_ue'] > 0
daily_sub_by_blowingsnow_src['Sublimation (mm)'] = daily_sub_by_blowingsnow_src[
    'w_h2o__3m_c'
]*seconds_per_5min / metpy.constants.density_water.magnitude# calculate daily sublimation
daily_sub_by_blowingsnow_src = daily_sub_by_blowingsnow_src.groupby([pd.Grouper(freq='1440Min'), 'blowing snow']).sum()
daily_sub_by_blowingsnow_src = daily_sub_by_blowingsnow_src.reset_index()
daily_sub_by_blowingsnow_src

variable,time,blowing snow,SF_avg_1m_ue,SF_avg_2m_ue,w_h2o__3m_c,SF_avg_max_ue,Sublimation (mm)
0,2022-11-30,False,0.00,0.00,-0.173214,0.00,-0.051965
1,2022-12-01,False,0.00,0.00,0.573706,0.00,0.172116
2,2022-12-02,False,0.00,0.00,0.350991,0.00,0.105300
3,2022-12-02,True,17.70,23.98,1.006539,28.09,0.301969
4,2022-12-03,False,0.00,0.00,-0.070897,0.00,-0.021270
...,...,...,...,...,...,...,...
260,2023-05-06,False,0.00,0.00,1.898066,0.00,0.569434
261,2023-05-06,True,0.38,0.73,-0.006510,0.74,-0.001953
262,2023-05-07,False,0.00,0.00,0.751659,0.00,0.225503
263,2023-05-08,False,0.00,0.00,1.158780,0.00,0.347643


In [11]:
daily_sub_chart = alt.Chart(daily_sub_by_blowingsnow_src).mark_bar(width=2.5).encode(
    alt.X("time:T", title=None),
    alt.Y("Sublimation (mm):Q").title("Daily sublimation (mm)"), 
    alt.Color("blowing snow:N"),
    tooltip='time:T'
).properties(height = 100, width=500)

In [12]:
def temp_gradient_to_stability_regime(x):
    if np.isnan(x):
        return None
    elif x < -0.01:
        return "unstable"
    elif x >= -0.01 and x <= 0.01:
        return "neutral"
    elif x > 0.01:
        return "stable"
    else:
        raise ValueError("what?")
src = tidy_df_5Min.query("variable == 'temp_gradient_3m_c'")
src['date'] = src.time.dt.date
src[src.time.dt.hour.isin([0,1])]
src_day = src[src.time.dt.hour.isin([12,13])].assign(time_of_day = 'day')
src_night = src[src.time.dt.hour.isin([0,1])].assign(time_of_day = 'night')
src = pd.concat([src_day, src_night])
src = src.groupby(['date', 'time_of_day']).mean().reset_index()
src['stability regime'] = src['value'].apply(temp_gradient_to_stability_regime)

stability_regime_chart = alt.Chart(src).mark_bar().encode(
    alt.X("date:T"),
    alt.Color("stability regime:N").scale(domain = ['neutral', 'stable', 'unstable'], range=['#000000', '#1f77b4', '#ff7f0e']  ),
    alt.Y("time_of_day:N", sort=['night', 'day'], title=None)
).properties(width = 500)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  src['date'] = src.time.dt.date


TypeError: agg function failed [how->mean,dtype->object]

In [None]:
(daily_sub_chart & stability_regime_chart).resolve_scale(color='independent').interactive()

In [None]:
import datetime as dt

In [None]:
tidy_df_30Min[tidy_df_30Min.measurement.isin([
    'potential virtual temperature',
    'surface potential virtual temperature'
])].variable.unique()

array(['Tpotvirtual_10m_c', 'Tpotvirtual_11m_c', 'Tpotvirtual_12m_c',
       'Tpotvirtual_13m_c', 'Tpotvirtual_14m_c', 'Tpotvirtual_15m_c',
       'Tpotvirtual_16m_c', 'Tpotvirtual_17m_c', 'Tpotvirtual_18m_c',
       'Tpotvirtual_19m_c', 'Tpotvirtual_20m_c', 'Tpotvirtual_2m_c',
       'Tpotvirtual_3m_c', 'Tpotvirtual_4m_c', 'Tpotvirtual_5m_c',
       'Tpotvirtual_6m_c', 'Tpotvirtual_7m_c', 'Tpotvirtual_8m_c',
       'Tpotvirtual_9m_c', 'Tsurfpotvirtual_c', 'Tsurfpotvirtual_rad_d'],
      dtype=object)

In [None]:
src = tidy_df_30Min[tidy_df_30Min.measurement.isin([
    'potential virtual temperature',
    'surface potential virtual temperature'
])]
src = src.set_index("time").groupby([pd.Grouper(freq='60Min'), 'height']).mean().reset_index()
src['hour'] = src.time.dt.hour
src = src[src['hour']%4 == 0]

def profile_chart(src):
    return alt.Chart(src).mark_line().encode(
        alt.X("value:Q").sort('-y').title("Potential virtual temperature (˚C)"),
        alt.Y("height:Q"),
        alt.Color('hour:O').scale(scheme='rainbow')
    ).properties(width = 200, height = 150)

  src = src.set_index("time").groupby([pd.Grouper(freq='60Min'), 'height']).mean().reset_index()


In [None]:
profile_chart(src[src.time.dt.date == dt.date(2022, 12, 22)]).properties(title = "Blowing snow day (2022-12-22)") |\
profile_chart(src[src.time.dt.date == dt.date(2023, 2, 11)]).properties(title = "Sunny, midwinter day (2023-02-11)") |\
profile_chart(src[src.time.dt.date == dt.date(2023, 3, 28)]).properties(title = "Sunny, spring day (2023-03-28)")

In [None]:
(profile_chart(src.query("height != 3")[src.hour.isin([0,12])][src.time.dt.date == dt.date(2022, 12, 22)]).properties(title = "Blowing snow day (2022-12-22)") |\
profile_chart(src.query("height != 3")[src.hour.isin([0,12])][src.time.dt.date == dt.date(2023, 2, 11)]).properties(title = "Sunny, midwinter day (2023-02-11)") |\
profile_chart(src.query("height != 3")[src.hour.isin([0,12])][src.time.dt.date == dt.date(2023, 3, 28)]).properties(title = "Sunny, spring day (2023-03-28)")).display(renderer='svg')

  (profile_chart(src.query("height != 3")[src.hour.isin([0,12])][src.time.dt.date == dt.date(2022, 12, 22)]).properties(title = "Blowing snow day (2022-12-22)") |\
  profile_chart(src.query("height != 3")[src.hour.isin([0,12])][src.time.dt.date == dt.date(2023, 2, 11)]).properties(title = "Sunny, midwinter day (2023-02-11)") |\
  profile_chart(src.query("height != 3")[src.hour.isin([0,12])][src.time.dt.date == dt.date(2023, 3, 28)]).properties(title = "Sunny, spring day (2023-03-28)")).display(renderer='svg')


In [None]:
alt.Chart(
    tidy_df_5Min[tidy_df_5Min.time.dt.date == dt.date(2022,12,22)].query("measurement == 'snow flux'")
).mark_circle().encode(
    alt.X("time:T"),
    alt.Y("value:Q"),
    alt.Color("variable:N")
) | alt.Chart(
    tidy_df_5Min[tidy_df_5Min.time.dt.date == dt.date(2023,2,11)].query("measurement == 'snow flux'")
).mark_circle().encode(
    alt.X("time:T"),
    alt.Y("value:Q"),
    alt.Color("variable:N")
) | alt.Chart(
    tidy_df_5Min[tidy_df_5Min.time.dt.date == dt.date(2023,3,28)].query("measurement == 'snow flux'")
).mark_circle().encode(
    alt.X("time:T"),
    alt.Y("value:Q"),
    alt.Color("variable:N")
)
