In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

# Open SOS Measurement Dataset

In [2]:
start_date = '20221130'
end_date = '20230509'
# open files
tidy_df_5Min = pd.read_parquet('../sos/tidy_df_20221130_20230517_noplanar_fit.parquet')
tidy_df_30Min = pd.read_parquet('../sos/tidy_df_30Min_20221130_20230517_noplanar_fit.parquet')
# convert time column to datetime
tidy_df_5Min['time'] = pd.to_datetime(tidy_df_5Min['time'])
tidy_df_30Min['time'] = pd.to_datetime(tidy_df_30Min['time'])
# limit data to our dates of interest, based on continuous snow cover at Kettle Ponds
tidy_df_5Min = tidy_df_5Min.set_index('time').loc[start_date:end_date].reset_index()
tidy_df_30Min = tidy_df_30Min.set_index('time').loc[start_date:end_date].reset_index()

KeyError: 'Value based partial slicing on non-monotonic DatetimeIndexes with non-existing keys is not allowed.'

In [None]:
# quick way to get variable info if we want it 
# import xarray as xr
# ds = xr.open_dataset("/data2/elilouis/sublimationofsnow/sosnoqc/isfs_20221228.nc")
# ds['SWE_p2_c']

## Clean the data

### Step 1: remove all LH flux data points with less than 90% of 20hz data being good
### Step 2: remove all LH flux data points with magnitude greater than 1 g/m^2/s

In [None]:
ec_lhflux_and_counts_variables = [
    ('w_h2o__2m_c', 'counts_2m_c_1'), 
    ('w_h2o__3m_c', 'counts_3m_c_1'), 
    ('w_h2o__5m_c', 'counts_5m_c_1'), 
    ('w_h2o__10m_c', 'counts_10m_c_1'), 
    ('w_h2o__15m_c', 'counts_15m_c_1'), 
    ('w_h2o__20m_c', 'counts_20m_c_1'), 


    ('w_h2o__1m_d', 'counts_1m_d_1'), 
    ('w_h2o__3m_d', 'counts_3m_d_1'), 
    ('w_h2o__10m_d', 'counts_10m_d_1'), 
      
    ('w_h2o__1m_ue', 'counts_1m_ue_1'), 
    ('w_h2o__3m_ue', 'counts_3m_ue_1'), 
    ('w_h2o__10m_ue', 'counts_10m_ue_1'), 


    ('w_h2o__1m_uw',  'counts_1m_uw_1'), 
    ('w_h2o__3m_uw', 'counts_3m_uw_1'), 
    ('w_h2o__10m_uw', 'counts_10m_uw_1'), 
]
ec_lhflux_variables = list(zip(*ec_lhflux_and_counts_variables))[0]

In [None]:
all_lhflux_measurements = tidy_df_5Min[tidy_df_5Min.variable.isin(ec_lhflux_variables)].value
all_lhflux_measurements.mean(), all_lhflux_measurements.std(), all_lhflux_measurements.min(), all_lhflux_measurements.max()

In [None]:
####################################################################################
# Remove all data points at once - perform both steps 1 and 2 simultaneously
####################################################################################
# for flux_var, counts_var in ec_lhflux_and_counts_variables:
#     print(flux_var, counts_var)
#     counts_src = tidy_df_5Min[tidy_df_5Min.variable == counts_var]
#     times_with_good_data_50percent = counts_src[counts_src.value >= 5400].time
#     n_before_dropping = len(tidy_df_5Min.loc[(tidy_df_5Min['variable'] == flux_var)].dropna())
#     tidy_df_5Min.loc[
#         (~tidy_df_5Min['time'].isin(times_with_good_data_50percent)) &
#         (tidy_df_5Min['variable'] == flux_var),
#         'value'
#     ] = np.nan
#     n_after_step_1 = len(tidy_df_5Min.loc[(tidy_df_5Min['variable'] == flux_var)].dropna())

#     variable_src = tidy_df_5Min[tidy_df_5Min.variable == flux_var]
#     times_with_outofbounds_values = variable_src[np.abs(variable_src.value) > 1].time
#     tidy_df_5Min.loc[
#         (tidy_df_5Min['time'].isin(times_with_outofbounds_values)) & 
#         (tidy_df_5Min['variable'] == flux_var),
#         'value'
#     ] = np.nan
#     n_after_step_2 = len(tidy_df_5Min.loc[(tidy_df_5Min['variable'] == flux_var)].dropna())
#     print(n_before_dropping, n_after_step_1, n_after_step_2)
#     print(round((n_before_dropping-n_after_step_2)/n_before_dropping, 3))

####################################################################################
# Perform steps 1 and 2 separately 
####################################################################################
for flux_var, counts_var in ec_lhflux_and_counts_variables:
    counts_src = tidy_df_5Min[tidy_df_5Min.variable == counts_var]
    times_with_good_data_50percent = counts_src[counts_src.value >= 5400].time
    tidy_df_5Min.loc[
        (~tidy_df_5Min['time'].isin(times_with_good_data_50percent)) &
        (tidy_df_5Min['variable'] == flux_var),
        'value'
    ] = np.nan

all_lhflux_measurements = tidy_df_5Min[tidy_df_5Min.variable.isin(ec_lhflux_variables)].value
mean = all_lhflux_measurements.mean() 
stddev = all_lhflux_measurements.std()
print(mean, stddev, all_lhflux_measurements.min(), all_lhflux_measurements.max())

for flux_var, counts_var in ec_lhflux_and_counts_variables:
    variable_src = tidy_df_5Min[tidy_df_5Min.variable == flux_var]
    times_with_outofbounds_values = variable_src[
        ((variable_src.value) > (mean + 5*stddev)) |
        ((variable_src.value) < (mean - 5*stddev))
    ].time
    tidy_df_5Min.loc[
        (tidy_df_5Min['time'].isin(times_with_outofbounds_values)) & 
        (tidy_df_5Min['variable'] == flux_var),
        'value'
    ] = np.nan

In [None]:
all_lhflux_measurements = tidy_df_5Min[tidy_df_5Min.variable.isin(ec_lhflux_variables)].value
print(all_lhflux_measurements.mean(), all_lhflux_measurements.std(), all_lhflux_measurements.min(), all_lhflux_measurements.max())

# Open Model Ensemble Dataset

In [None]:
model_df = pd.read_parquet("model_results.parquet")
# add a bunch of columns that are descriptive, from the config column which has multiple bits of info
model_df['z0'] = model_df['config'].apply(
    lambda v: float(v.split(' ')[-1])
)
model_df['e_sat_curve'] = model_df['config'].apply(
    lambda v: 'metpy' if 'metpy' in v else 'alduchov'
)
model_df['surface_measurement'] = model_df['config'].apply(
    lambda v: v.split(' ')[-3]
)
model_df['scheme'] = model_df['config'].apply(
    lambda v: 'andreas' if 'andreas lengths' in v else 'yang'
)
model_df['most_config'] = model_df['config'].apply(lambda s: ' '.join(s.split(' ')[:-3]))
# remove the scalar roughness length parameterization info 
model_df['most_config'] = model_df['most_config'].str.replace(' andreas lengths', '')
model_df.head()

### Handle a pesky outlier

In [None]:
model_df.loc[(model_df.time == "2023-01-22 1400") & (model_df.surface_measurement == 'Tsurf_d'), 'latent heat flux'] = 0
model_df.loc[(model_df.time == "2023-01-22 1400") & (model_df.surface_measurement == 'Tsurf_d'), 'sensible heat flux'] = 0

# Scatterplots

In [None]:
# Find the best performing model configuration
best_model_df = model_df[
    model_df['config'].isin([
        'MO Holtslag de Bruin andreas lengths Tsurf_c e_sat_alduchov 1e-05',
        'MO Holtslag de Bruin andreas lengths Tsurf_c e_sat_alduchov 0.0001'
    ]) 
]
best_model_df['z0'] = best_model_df['z0'].astype('str')
best_model_df = best_model_df[['time', 'z0', 'latent heat flux']].pivot_table(
    index='time', columns='z0',
)
best_model_df.columns = best_model_df.columns.to_flat_index().str.join('_')
best_model_df

In [None]:
model_meas_df = tidy_df_30Min.query("variable == 'w_h2o__3m_c'").set_index('time')[['value']].join(
    best_model_df
).reset_index().rename(columns={
    'value': 'measured',
    'latent heat flux_0.0001': 'modeled z0=1e-4',
    'latent heat flux_1e-05': 'modeled z0=1e-5',
})

In [None]:
model_meas_df

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(
    model_meas_df.dropna()['measured'],
    model_meas_df.dropna()['modeled z0=1e-5'],
)

In [None]:
def model_comparison_plot(src):
    one_to_one_line = alt.Chart(pd.DataFrame({
        'x': [-0.05, -0.025, 0.0, 0.025, 0.05],
        'y': [-0.05, -0.025, 0.0, 0.025, 0.05]
    })).mark_line(
        color='grey'
    ).encode(x = 'x', y = 'y')

    value_r2_score_1eneg4 = round(
        r2_score(
            src.dropna()['measured'],
            src.dropna()['modeled z0=1e-4'],
        ),
        3
    )
    value_r2_score_1eneg5 = round(
        r2_score(
            src.dropna()['measured'],
            src.dropna()['modeled z0=1e-5'],
        ),
        3
    )
    scale = alt.Scale(domain = [-0.05, 0.05], clamp=True)
    axis = alt.Axis(values=[-0.05, -0.025, 0.0, 0.025, 0.05])
    return (
        # (
        #     one_to_one_line+alt.Chart(src).mark_circle(size=10, opacity=0.1).encode(
        #         alt.X("measured:Q").scale(scale).axis(axis),
        #         alt.Y("modeled z0=1e-4:Q").scale(scale).axis(axis),
        #     ).properties(width=200, height = 200, title=f"r² = {value_r2_score_1eneg4} (n = {len(src)})") | \
        #     one_to_one_line+alt.Chart(src).mark_rect().encode(
        #         alt.X("measured:Q").bin(maxbins=30).scale(scale).axis(axis),
        #         alt.Y("modeled z0=1e-4:Q").bin(maxbins=30).scale(scale).axis(axis),
        #         alt.Color("count():Q")
        #     ).properties(width=200, height = 200, title=f"r² = {value_r2_score_1eneg4} (n = {len(src)})")
        # ) &\
        (
            one_to_one_line+alt.Chart(src).mark_circle(size=10, opacity=0.1).encode(
                alt.X("measured:Q").scale(scale).axis(axis),
                alt.Y("modeled z0=1e-5:Q").scale(scale).axis(axis),
            ).properties(width=200, height = 200, title=f"r² = {value_r2_score_1eneg5} (n = {len(src)})") | \
            one_to_one_line+alt.Chart(src).mark_rect().encode(
                alt.X("measured:Q").bin(maxbins=30).scale(scale).axis(axis),
                alt.Y("modeled z0=1e-5:Q").bin(maxbins=30).scale(scale).axis(axis),
                alt.Color("count():Q")
            ).properties(width=200, height = 200, title=f"r² = {value_r2_score_1eneg5} (n = {len(src)})")   
        )
    )

In [None]:
unstable_times = tidy_df_30Min.query("variable == 'temp_gradient_3m_c'").query(
    "value < -0.01"
).time
stable_times = tidy_df_30Min.query("variable == 'temp_gradient_3m_c'").query(
    "value > 0.01"
).time
neutral_times = tidy_df_30Min.query("variable == 'temp_gradient_3m_c'").query(
    "value <= 0.01 & value >= -0.01"
).time

blowing_snow_times = pd.concat([
    tidy_df_30Min.query("variable == 'SF_avg_2m_ue'").query("value > 0").time,
    tidy_df_30Min.query("variable == 'SF_avg_1m_ue'").query("value > 0").time
])
clear_times = tidy_df_30Min.query("variable == 'SF_avg_2m_ue'").time[
    ~ tidy_df_30Min.query("variable == 'SF_avg_2m_ue'").time.isin(blowing_snow_times)
]

winter_times = tidy_df_30Min.query("variable == 'SF_avg_2m_ue'").set_index("time").loc[:"2023-02-28"].index.values
spring_times = tidy_df_30Min.query("variable == 'SF_avg_2m_ue'").set_index("time").loc["2023-03-01":].index.values

In [None]:
model_comparison_plot(model_meas_df)

In [None]:
(model_comparison_plot(model_meas_df[model_meas_df.time.isin(unstable_times.values)]).properties(title='Unstable data') |\
model_comparison_plot(model_meas_df[model_meas_df.time.isin(stable_times.values)]).properties(title='Stable data') |\
model_comparison_plot(model_meas_df[model_meas_df.time.isin(neutral_times.values)]).properties(title='Neutral data')).resolve_scale(color='independent')

In [None]:
(
model_comparison_plot(model_meas_df[model_meas_df.time.isin(blowing_snow_times.values)]).properties(title='Blowing snow data') |\
model_comparison_plot(model_meas_df[model_meas_df.time.isin(clear_times.values)]).properties(title='Clear data')).resolve_scale(color='independent')

In [None]:
(
model_comparison_plot(model_meas_df[model_meas_df.time.isin(winter_times.values)]).properties(title='Winter data') |\
model_comparison_plot(model_meas_df[model_meas_df.time.isin(spring_times.values)]).properties(title='Spring data')).resolve_scale(color='independent')