**Really should be called `clean latent heat flux data`**

This notebook:
- Examines how blowing snow fluxes and "bad" EC data (data with quality control flags raised) are related
- Removes LH flux data points from the 5 minute dataset that have flags raised for >10% of the instantaneous (20hz) measurements included in each 5-minute Reynolds average
  - Analyzes how different thresholds affect sublimation estimates
- Removes LH flux data points outside a threshold 5*$\sigma$ where $\sigma$ is standard deviation
- Interpolates data gaps up to 1 hour long
  - Analyzes how changing the "max gap size interpolated" affects sublimation estimates
- Adds "Cumulative sublimation measured" measurements to the 5 minute dataset, and saves it to disk

In [None]:
import datetime as dt
from io import StringIO

import altair as alt
import matplotlib.pyplot as plt
import metpy.constants
import numpy as np
import pandas as pd

from sublimpy import tidy
from sublimpy import turbulence

alt.data_transformers.enable('json')

# Open data

## Open SOS Measurement Dataset

In [None]:
# Analysis window; the same strings appear in the input file name.
start_date = '20221130'
end_date = '20230509'

# Read the tidy (long-format) dataset, parse its time column, and keep only
# rows inside the analysis window (dates of interest, based on continuous
# snow cover at Kettle Ponds). Rows come back sorted by time.
tidy_df = (
    pd.read_parquet(f'tidy_df_{start_date}_{end_date}_noplanar_fit.parquet')
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index('time')
    .sort_index()
    .loc[start_date:end_date]
    .reset_index()
)

## Set file path to save the modified tidy_df dataset

In [None]:
# File name for the cleaned dataset: the input file name with a "_clean" suffix.
output_fn = 'tidy_df_{}_{}_noplanar_fit_clean.parquet'.format(start_date, end_date)

# Identify snowfall dates (from NCAR report)

In [None]:
# Snowfall periods, one "start, end" pair per line.
snowfall_dates = """
Start Time,End Time
2022 Oct 23 04:00:00, 2022 Oct 23 07:00:00
2022 Oct 25 19:00:00, 2022 Oct 26 12:00:00
2022 Oct 26 21:00:00, 2022 Oct 27 05:00:00
2022 Nov 03 04:00:00, 2022 Nov 04 12:00:00
2022 Nov 05 02:00:00, 2022 Nov 05 17:00:00
2022 Nov 09 16:30:00, 2022 Nov 10 09:30:00
2022 Nov 18 05:00:00, 2022 Nov 18 08:00:00
2022 Nov 27 00:00:00, 2022 Nov 27 05:00:00
2022 Nov 28 18:00:00, 2022 Nov 29 14:00:00
2022 Dec 02 03:00:00, 2022 Dec 02 16:00:00
2022 Dec 03 17:00:00, 2022 Dec 04 17:00:00
2022 Dec 05 00:00:00, 2022 Dec 05 10:00:00
2022 Dec 06 00:00:00, 2022 Dec 07 14:00:00
2022 Dec 08 00:00:00, 2022 Dec 08 10:00:00
2022 Dec 20 18:00:00, 2022 Dec 22 07:00:00
2022 Dec 27 18:00:00, 2023 Jan 02 03:00:00
2023 Jan 02 10:00:00, 2023 Jan 04 06:00:00
2023 Jan 06 03:00:00, 2023 Jan 07 08:00:00
2023 Jan 09 22:00:00, 2023 Jan 12 00:00:00
2023 Jan 14 20:00:00, 2023 Jan 15 14:00:00
2023 Jan 16 00:00:00, 2023 Jan 18 10:00:00
2023 Jan 27 14:00:00, 2023 Jan 28 06:00:00
2023 Jan 29 00:00:00, 2023 Jan 31 17:00:00
2023 Feb 06 00:00:00, 2023 Feb 06 14:00:00
2023 Feb 13 12:00:00, 2023 Feb 15 07:00:00
2023 Feb 20 10:00:00, 2023 Feb 24 15:00:00
2023 Feb 28 00:00:00, 2023 Mar 02 06:00:00
2023 Mar 10 06:00:00, 2023 Mar 11 11:00:00
2023 Mar 12 00:00:00, 2023 Mar 16 00:00:00
2023 Mar 20 12:00:00, 2023 Mar 27 12:00:00
2023 Mar 30 15:00:00, 2023 Apr 01 07:00:00
2023 Apr 03 20:00:00, 2023 Apr 4 08:00:00
2023 Apr 14 10:00:00, 2023 Apr 15 14:00:00
2023 Apr 20 20:00:00, 2023 Apr 23 10:00:00
2023 Apr 24 22:00:00, 2023 Apr 26 00:00:00
2023 Apr 27 22:00:00, 2023 Apr 28 08:00:00
2023 May 10 22:00:00, 2023 May 12 08:00:00
"""

# The second column is written with a space after the comma; skipinitialspace
# strips it so both columns parse with the same explicit format instead of
# relying on per-column format inference.
snowfall_df = pd.read_csv(StringIO(snowfall_dates), skipinitialspace=True)
for _time_column in ('Start Time', 'End Time'):
    snowfall_df[_time_column] = pd.to_datetime(
        snowfall_df[_time_column], format='%Y %b %d %H:%M:%S'
    )

# Every 30-minute timestamp from the start to the end of each snowfall period
# (both endpoints included), flattened into a single 1-D datetime64 array.
timestamps_during_snowfall = np.concatenate([
    pd.date_range(period_start, period_end, freq='30min').to_numpy()
    for period_start, period_end in zip(snowfall_df['Start Time'], snowfall_df['End Time'])
])

## Identify the relevant turbulent flux measurement variables

In [None]:
# "<height>_<tower>" suffixes of the eddy-covariance measurements that are used.
# The 2m_c, 1m_d, 1m_ue and 1m_uw measurements are deliberately left out.
_ec_measurement_suffixes = [
    '3m_c', '5m_c', '10m_c', '15m_c', '20m_c',
    '3m_d', '10m_d',
    '3m_ue', '10m_ue',
    '3m_uw', '10m_uw',
]

# (flux variable, counts variable) name pairs, one per measurement.
ec_lhflux_and_counts_variables = [
    (f'w_h2o__{suffix}', f'counts_{suffix}_1')
    for suffix in _ec_measurement_suffixes
]

# Tuple of just the flux variable names, in the same order.
ec_lhflux_variables = tuple(
    flux_name for flux_name, _counts_name in ec_lhflux_and_counts_variables
)

# Analyze the quality of the EC dataset

## Add combined blowing snow variable to dataset

In [None]:
# Add a combined blowing-snow flux variable 'SF_avg_ue' = SF_avg_1m_ue + SF_avg_2m_ue.
# This cell reassigns tidy_df from itself, so it is guarded: re-running it must
# not add 'SF_avg_ue' a second time (duplicate rows would break the later pivot
# on time x variable).
if 'SF_avg_ue' not in set(tidy_df['variable'].unique()):
    # The two series are added positionally.
    # NOTE(review): this assumes both variables have one row per timestamp in
    # the same (time-sorted) order -- confirm against the input dataset.
    sf_1m_values = tidy_df.query("variable == 'SF_avg_1m_ue'")['value'].values
    sf_2m_values = tidy_df.query("variable == 'SF_avg_2m_ue'")['value'].values
    tidy_df = tidy.tidy_df_add_variable(
        tidy_df,
        sf_1m_values + sf_2m_values,
        'SF_avg_ue',
        'snow flux',
        1,
        'ue',
    )


## Plot data counts during blowing snow, as a function of blowing snow flux

In [None]:
# Names of the counts variables (second element of each flux/counts pair).
counts_variables = [counts_name for _flux_name, counts_name in ec_lhflux_and_counts_variables]

In [None]:
# Rows of the combined blowing-snow flux variable.
_sf_avg_ue_rows = tidy_df[tidy_df.variable == 'SF_avg_ue']
# Unique timestamps with a positive blowing-snow flux ...
bs_times = list(_sf_avg_ue_rows.query("value > 0").time.unique())
# ... and with a flux of exactly zero.
nobs_times = list(_sf_avg_ue_rows.query("value == 0").time.unique())

In [None]:
# Tag each 3 m tower-C counts record by whether blowing snow was observed at that time.
src = tidy_df[tidy_df.variable == 'counts_3m_c_1']
src = pd.concat([
    src[src.time.isin(bs_times)].assign(bs = 'blowing snow'),
    src[src.time.isin(nobs_times)].assign(bs = 'no blowing snow')
])

# Wide table (one row per timestamp) holding the blowing-snow fluxes plus, for
# every tower-C height, the counts, w'q', wind speed, RH and Ri variables.
tower_c_heights = (2, 3, 5, 10, 15, 20)
blowing_snow_vs_counts_src = tidy_df[tidy_df.variable.isin(
    ['SF_avg_2m_ue', 'SF_avg_1m_ue', 'SF_avg_ue']
    + [
        variable_name
        for h in tower_c_heights
        for variable_name in (
            f'counts_{h}m_c_1', f'w_h2o__{h}m_c', f'spd_{h}m_c', f'RH_{h}m_c', f'Ri_{h}m_c'
        )
    ]
)].pivot(
    index='time',
    columns='variable',
    values='value'
).reset_index()
# Boolean flag used to color points that fall on 22 Dec 2022.
blowing_snow_vs_counts_src['Dec 22'] = blowing_snow_vs_counts_src.time.dt.date == dt.date(2022, 12, 22)

# Dashed horizontal line at the "bad data" threshold of 32400 counts
# (90% of the ~36000 samples per averaging period seen in the counts columns).
rule = alt.Chart().transform_calculate(rule='32400').mark_rule(strokeDash=[4,2]).encode(y='rule:Q')

all_blowing_snow_data = blowing_snow_vs_counts_src.query("SF_avg_ue>0")

# 2 x 3 grid of scatter plots (counts vs. blowing snow flux), one panel per height.
# Only the bottom row carries an x title and only the first column a y title.
panel_rows = []
for row_number, row_heights in enumerate([(2, 3, 5), (10, 15, 20)]):
    row_panels = []
    for column_number, h in enumerate(row_heights):
        counts_col = f"counts_{h}m_c_1"
        # Computed outside the f-string: reusing double quotes inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        n_bad = len(all_blowing_snow_data.query(f"{counts_col} < 32400"))
        x_title = ["Blowing snow flux (g/m^2/s)"] if row_number == 1 else None
        y_title = ["Count unflagged 20hz w'q'", "measurements"] if column_number == 0 else None
        row_panels.append(
            alt.Chart(
                all_blowing_snow_data
            ).mark_circle(size=10).encode(
                alt.X("SF_avg_ue").scale(type='log').title(x_title),
                alt.Y(counts_col).title(y_title),
                alt.Color("Dec 22:N").title("On Dec. 22"),
                tooltip='time'
            ).properties(width=150, height=150, title=f"{h}m Tower C (n_bad = {n_bad})") + rule
        )
    panel_rows.append(alt.hconcat(*row_panels))

counts_and_blowingsnow_3m_and_20m_c_plot = alt.vconcat(*panel_rows)
counts_and_blowingsnow_3m_and_20m_c_plot.save("counts_and_blowingsnow_3m_c_plot.png", ppi=200)
counts_and_blowingsnow_3m_and_20m_c_plot.display(renderer='svg')

In [None]:
# Timestamps with zero blowing-snow flux, excluding the listed snowfall periods.
no_blowing_snow_data = blowing_snow_vs_counts_src.query("SF_avg_ue==0")
no_blowing_snow_data = no_blowing_snow_data[~no_blowing_snow_data.time.isin(timestamps_during_snowfall)]

# 2 x 3 grid of scatter plots (counts vs. wind speed), one panel per tower-C height.
# Only the bottom row carries an x title and only the first column a y title.
panel_rows = []
for row_number, row_heights in enumerate([(2, 3, 5), (10, 15, 20)]):
    row_panels = []
    for column_number, h in enumerate(row_heights):
        counts_col = f"counts_{h}m_c_1"
        # Computed outside the f-string: reusing double quotes inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        n_bad = len(no_blowing_snow_data.query(f"{counts_col} < 32400"))
        x_title = ["Wind speed (m/s)"] if row_number == 1 else None
        y_title = ["Count unflagged 20hz w'q'", "measurements"] if column_number == 0 else None
        row_panels.append(
            alt.Chart(
                no_blowing_snow_data
            ).mark_circle(size=10).encode(
                alt.X(f"spd_{h}m_c").title(x_title),
                alt.Y(counts_col).title(y_title),
                alt.Color("Dec 22:N").title("On Dec. 22"),
                tooltip='time'
            ).properties(width=150, height=150, title=f"{h}m Tower C (n_bad = {n_bad})") + rule
        )
    panel_rows.append(alt.hconcat(*row_panels))

winds_and_noblowingsnow_3m_and_20m_c_plot = alt.vconcat(*panel_rows)
winds_and_noblowingsnow_3m_and_20m_c_plot.display(renderer='svg')

In [None]:
# Timestamps with zero blowing-snow flux, excluding the listed snowfall periods.
no_blowing_snow_data = blowing_snow_vs_counts_src.query("SF_avg_ue==0")
no_blowing_snow_data = no_blowing_snow_data[~no_blowing_snow_data.time.isin(timestamps_during_snowfall)]

# 2 x 3 grid of scatter plots (counts vs. relative humidity), one panel per tower-C height.
# Only the bottom row carries an x title and only the first column a y title.
panel_rows = []
for row_number, row_heights in enumerate([(2, 3, 5), (10, 15, 20)]):
    row_panels = []
    for column_number, h in enumerate(row_heights):
        counts_col = f"counts_{h}m_c_1"
        # Computed outside the f-string: reusing double quotes inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        n_bad = len(no_blowing_snow_data.query(f"{counts_col} < 32400"))
        x_title = ["Relative humidity (%)"] if row_number == 1 else None
        y_title = ["Count unflagged 20hz w'q'", "measurements"] if column_number == 0 else None
        row_panels.append(
            alt.Chart(
                no_blowing_snow_data
            ).mark_circle(size=10).encode(
                alt.X(f"RH_{h}m_c").title(x_title),
                alt.Y(counts_col).title(y_title),
                alt.Color("Dec 22:N").title("On Dec. 22"),
                tooltip='time'
            ).properties(width=150, height=150, title=f"{h}m Tower C (n_bad = {n_bad})") + rule
        )
    panel_rows.append(alt.hconcat(*row_panels))

RH_and_noblowingsnow_3m_and_20m_c_plot = alt.vconcat(*panel_rows)
RH_and_noblowingsnow_3m_and_20m_c_plot.display(renderer='svg')

In [None]:
# Timestamps with zero blowing-snow flux, excluding the listed snowfall periods.
no_blowing_snow_data = blowing_snow_vs_counts_src.query("SF_avg_ue==0")
no_blowing_snow_data = no_blowing_snow_data[~no_blowing_snow_data.time.isin(timestamps_during_snowfall)]

# 2 x 3 grid of scatter plots (counts vs. Ri), one panel per tower-C height.
# The x axis is clamped to [-2, 2]. Only the bottom row carries an x title and
# only the first column a y title.
# (The result is stored under the same variable name as the RH plot.)
panel_rows = []
for row_number, row_heights in enumerate([(2, 3, 5), (10, 15, 20)]):
    row_panels = []
    for column_number, h in enumerate(row_heights):
        counts_col = f"counts_{h}m_c_1"
        # Computed outside the f-string: reusing double quotes inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        n_bad = len(no_blowing_snow_data.query(f"{counts_col} < 32400"))
        x_title = ["Ri"] if row_number == 1 else None
        y_title = ["Count unflagged 20hz w'q'", "measurements"] if column_number == 0 else None
        row_panels.append(
            alt.Chart(
                no_blowing_snow_data
            ).mark_circle(size=10).encode(
                alt.X(f"Ri_{h}m_c").title(x_title).scale(domain=[-2,2], clamp=True),
                alt.Y(counts_col).title(y_title),
                alt.Color("Dec 22:N").title("On Dec. 22"),
                tooltip='time'
            ).properties(width=150, height=150, title=f"{h}m Tower C (n_bad = {n_bad})") + rule
        )
    panel_rows.append(alt.hconcat(*row_panels))

RH_and_noblowingsnow_3m_and_20m_c_plot = alt.vconcat(*panel_rows)
RH_and_noblowingsnow_3m_and_20m_c_plot.display(renderer='svg')

In [None]:
# Timestamps with zero blowing-snow flux.
# NOTE(review): unlike the preceding plots, snowfall periods are NOT excluded
# here (the filter below is disabled). Later cells split no_blowing_snow_data
# into snowfall / no-snowfall subsets, so they need this unfiltered frame.
no_blowing_snow_data = blowing_snow_vs_counts_src.query("SF_avg_ue==0")
# no_blowing_snow_data = no_blowing_snow_data[~no_blowing_snow_data.time.isin(timestamps_during_snowfall)]

# 2 x 3 grid of scatter plots (counts vs. w'q'), one panel per tower-C height.
# The x axis is clamped to [-0.1, 0.1]. Only the bottom row carries an x title
# and only the first column a y title.
# (The result is stored under the same variable name as the RH plot.)
panel_rows = []
for row_number, row_heights in enumerate([(2, 3, 5), (10, 15, 20)]):
    row_panels = []
    for column_number, h in enumerate(row_heights):
        counts_col = f"counts_{h}m_c_1"
        # Computed outside the f-string: reusing double quotes inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        n_bad = len(no_blowing_snow_data.query(f"{counts_col} < 32400"))
        x_title = ["w'q'"] if row_number == 1 else None
        y_title = ["Count unflagged 20hz w'q'", "measurements"] if column_number == 0 else None
        row_panels.append(
            alt.Chart(
                no_blowing_snow_data
            ).mark_circle(size=10).encode(
                alt.X(f"w_h2o__{h}m_c").title(x_title).scale(domain=[-0.1,0.1], clamp=True),
                alt.Y(counts_col).title(y_title),
                alt.Color("Dec 22:N").title("On Dec. 22"),
                tooltip='time'
            ).properties(width=150, height=150, title=f"{h}m Tower C (n_bad = {n_bad})") + rule
        )
    panel_rows.append(alt.hconcat(*row_panels))

RH_and_noblowingsnow_3m_and_20m_c_plot = alt.vconcat(*panel_rows)
RH_and_noblowingsnow_3m_and_20m_c_plot.display(renderer='svg')

In [None]:

# Header: one label per count column printed below (the leading variable-name
# column has no label). A comma was missing after the third label, which made
# Python concatenate it with the fourth.
print(
    "n bad data",
    "n bad data, no BS",
    "n bad data, no BS during snowfall",
    "n bad data, during BS",
    "n bad data, during BS during snowfall"
)

# Row masks marking timestamps that fall inside the listed snowfall periods.
nobs_in_snowfall = no_blowing_snow_data.time.isin(timestamps_during_snowfall)
bs_in_snowfall = all_blowing_snow_data.time.isin(timestamps_during_snowfall)

# For each tower-C height: number of records below the 32400-count threshold,
# overall and split by blowing snow (BS) / snowfall conditions.
for counts_col in [
    "counts_2m_c_1",
    "counts_3m_c_1",
    "counts_5m_c_1",
    "counts_10m_c_1",
    "counts_15m_c_1",
    "counts_20m_c_1",
]:
    is_bad = f"{counts_col} < 32400"
    print(
        counts_col,
        len(blowing_snow_vs_counts_src.query(is_bad)),
        len(no_blowing_snow_data[~nobs_in_snowfall].query(is_bad)),
        len(no_blowing_snow_data[nobs_in_snowfall].query(is_bad)),
        len(all_blowing_snow_data[~bs_in_snowfall].query(is_bad)),
        len(all_blowing_snow_data[bs_in_snowfall].query(is_bad))
    )

# Examine when "bad" data occurs

In [None]:
# Hard-coded table of "bad" record counts per tower-C counts variable.
# NOTE(review): the columns follow the order of the printed counts in the
# preceding cell; presumably the numbers were transcribed from that output --
# confirm they are still current.
bad_data_df = pd.DataFrame(
    [
        ('counts_2m_c_1', 790, 166, 487, 36, 79),
        ('counts_3m_c_1', 813, 207, 501, 25, 58),
        ('counts_5m_c_1', 463, 84, 301, 19, 37),
        ('counts_10m_c_1', 625, 150, 365, 38, 50),
        ('counts_15m_c_1', 661, 375, 179, 24, 61),
        ('counts_20m_c_1', 1594, 650, 518, 211, 193),
    ],
    columns=[
        'var', 'total',
        'no bs no snowfall', 'no bs snowfall',
        'bs no snowfall', 'bs snowfall',
    ],
)
bad_data_df

In [None]:
# Stacked horizontal bars: number of "bad" records per counts variable,
# colored by the weather condition they occurred in.
condition_columns = ['no bs no snowfall', 'no bs snowfall', 'bs no snowfall', 'bs snowfall']
# Order the bars by measurement height rather than alphabetically.
counts_variable_order = [f'counts_{h}m_c_1' for h in (2, 3, 5, 10, 15, 20)]

bad_data_bars = (
    alt.Chart(bad_data_df)
    .transform_fold(condition_columns)
    .mark_bar()
    .encode(
        alt.Y("var").sort(counts_variable_order),
        alt.X("value:Q").title('n data points marked "bad"'),
        alt.Color("key:N").title("Conditions"),
    )
)
bad_data_bars.display(renderer='svg')

Examine distributions of data during these different bad data periods

In [102]:
# Peek at the 3 m counts. Reads from blowing_snow_vs_counts_src because
# all_data_df (a copy of it) is only created in a later cell, so referencing it
# here fails on a fresh kernel.
blowing_snow_vs_counts_src["counts_3m_c_1"].head()

0    35981.0
1    36000.0
2    36001.0
3    36000.0
4    35999.0
Name: counts_3m_c_1, dtype: float64

In [103]:
# Peek at the "bad data" mask (fewer than 32400 counts). Reads from
# blowing_snow_vs_counts_src because all_data_df (a copy of it) is only created
# in a later cell, so referencing it here fails on a fresh kernel.
(blowing_snow_vs_counts_src["counts_3m_c_1"] < 32400).head()

0    False
1    False
2    False
3    False
4    False
Name: counts_3m_c_1, dtype: bool

In [None]:
# Row masks marking timestamps that fall inside the listed snowfall periods.
_nobs_in_snowfall = no_blowing_snow_data.time.isin(timestamps_during_snowfall)
_bs_in_snowfall = all_blowing_snow_data.time.isin(timestamps_during_snowfall)

# Full dataset plus four subsets by blowing snow (bs) and snowfall conditions.
# Each subset is an explicit copy so that adding the 'qc' column below writes to
# an independent frame rather than to a slice of another DataFrame (which
# triggers pandas' SettingWithCopyWarning).
all_data_df = blowing_snow_vs_counts_src.copy()
nobs_nosnowfall_df = no_blowing_snow_data[~_nobs_in_snowfall].copy()
nobs_snowfall_df = no_blowing_snow_data[_nobs_in_snowfall].copy()
bs_nosnowfall_df = all_blowing_snow_data[~_bs_in_snowfall].copy()
bs_snowfall_df = all_blowing_snow_data[_bs_in_snowfall].copy()

# Label each record by the 3 m tower-C counts: "bad" if below 32400, else "good".
# Records with missing counts compare False and are therefore labelled "good".
for _subset_df in (
    all_data_df,
    nobs_nosnowfall_df,
    nobs_snowfall_df,
    bs_nosnowfall_df,
    bs_snowfall_df,
):
    _subset_df['qc'] = (_subset_df["counts_3m_c_1"] < 32400).map({True: "bad", False: "good"})

In [138]:
def _percent(part, whole):
    """Return part/whole as a rounded percentage, or 0 when whole is 0."""
    return round(100 * part / whole) if whole else 0


def _qc_histogram(subset_df, label):
    """Histogram of 3 m w'q' for one subset, with overlaid bars colored by qc.

    The two-line title reports the subset size, its share of all_data_df, and
    the percentage of the subset labelled "bad" and "good".
    """
    n_subset = len(subset_df)
    # Counts are computed outside the f-strings: reusing double quotes inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    n_bad = len(subset_df.query("qc =='bad'"))
    n_good = len(subset_df.query("qc =='good'"))
    return alt.Chart(subset_df).mark_bar().encode(
        alt.X("w_h2o__3m_c:Q").bin(maxbins=25),
        alt.Y("count():Q").stack(None),
        alt.Color("qc:N")
    ).properties(
        height = 150,
        title=[
            f"{label} (n = {n_subset}, {_percent(n_subset, len(all_data_df))}% of data)",
            f"{_percent(n_bad, n_subset)}% of this subset is bad, " +
            f"{_percent(n_good, n_subset)}% good"
        ]
    )


# Left column: all data and the two no-blowing-snow subsets.
_no_bs_column = alt.vconcat(
    _qc_histogram(all_data_df, "All data"),
    _qc_histogram(nobs_nosnowfall_df, "No BS, no snowfall"),
    _qc_histogram(nobs_snowfall_df, "No BS, during snowfall"),
).resolve_scale(x='shared')

# Right column: the two blowing-snow subsets.
_bs_column = alt.vconcat(
    _qc_histogram(bs_nosnowfall_df, "During BS, no snowfall"),
    _qc_histogram(bs_snowfall_df, "During BS, during snowfall"),
).resolve_scale(x='shared')

(
    _no_bs_column | _bs_column
).resolve_scale(x='shared').properties(title=[
    "Distributions of latent heat flux measurements during different weather conditions",
    "Latent heat fluxes are measured at 3m. \"bad\" data has more than 10% of ",
    "instantaneous (20Hz) measurements flagged by instruments."
]).display(renderer='svg')

In [105]:
def get_data_plot(df, title, variable):
    """Build one horizontal summary row for a data subset, split by its `qc` category.

    The row is three panels side by side: a text panel with the row count per
    `qc` value, a text panel with the summed sublimation per `qc` value, and a
    boxplot of `variable` per `qc` value.

    Parameters
    ----------
    df : DataFrame with a `qc` column and a column named by `variable`.
    title : title for the boxplot panel (string or list of strings).
    variable : name of the flux column to summarise.

    Returns
    -------
    The horizontally concatenated Altair chart.
    """
    # Sample size per qc category.
    count_panel = alt.Chart(df).mark_text().encode(
        alt.Text("count()"),
        alt.Y("qc:O").title(None).axis(None),
    ).properties(width=20, title=["Sample", "n"])

    # Each value is scaled by 30*60/1000 before summing
    # (presumably seconds per 30-min timestep and a conversion to mm SWE -- confirm).
    sublimation_panel = alt.Chart(df).transform_calculate(
        sublimation_mm = f"datum.{variable}*30*60/1000",
    ).mark_text().encode(
        alt.Text("sum(sublimation_mm):Q", format=".000f"),
        alt.Y("qc:O").title(None).axis(None),
    ).properties(width=20, title=["Sublimation", "(mm SWE)"])

    # Distribution of the flux, clamped to a fixed x range.
    boxplot_panel = alt.Chart(df).mark_boxplot(outliers={'size':1}).encode(
        alt.X(f"{variable}:Q").scale(domain=[-0.06,0.06], clamp=True),
        alt.Y("qc:O").axis(orient='right'),
        alt.Color("qc:N").legend(None)
    ).properties(width=200, title=title)

    return count_panel | sublimation_panel | boxplot_panel

In [106]:
# One summary row per weather-condition subset, stacked vertically in this order.
condition_rows = [
    (all_data_df,        ["All data"]),
    (nobs_nosnowfall_df, ["No BS,", "No Snowfall"]),
    (nobs_snowfall_df,   ["No BS,", "snowfall"]),
    (bs_nosnowfall_df,   ["BS,", "no snowfall"]),
    (bs_snowfall_df,     ["BS,", "snowfall"]),
]

stacked_rows = None
for subset_df, subset_title in condition_rows:
    summary_row = get_data_plot(subset_df, subset_title, "w_h2o__3m_c")
    stacked_rows = summary_row if stacked_rows is None else stacked_rows & summary_row

stacked_rows.resolve_scale(x='shared').properties(title='3m C').configure_axis(
    grid=False,
).configure_view(
    strokeOpacity=0
).display(renderer='svg')

In [None]:
# Count, per latent heat flux variable, how many timestamps have a value of exactly 0.
lhflux_wide = tidy_df[tidy_df.variable.isin(ec_lhflux_variables)].pivot(
    index='time',
    columns='variable',
    values='value',
)
lhflux_wide.eq(0).sum()

What are lh flux values when counts are 0?

In [None]:
# For each tower-C height, show the distinct flux values reported when the count column is 0.
for height_m in (2, 3, 5, 10, 15, 20):
    zero_count_rows = blowing_snow_vs_counts_src[
        blowing_snow_vs_counts_src[f'counts_{height_m}m_c_1'] == 0
    ]
    print(zero_count_rows[f'w_h2o__{height_m}m_c'].unique())

In [None]:
# Boolean frame marking where each tower-C count column equals 0 (displayed wrapped in a list).
tower_c_count_columns = [f'counts_{height_m}m_c_1' for height_m in (2, 3, 5, 10, 15, 20)]
[blowing_snow_vs_counts_src[tower_c_count_columns] == 0]

# Examine data quality during snowfall events

In [None]:
# "Bad" 3m tower-C rows: fewer than 32400 counts in `counts_3m_c_1`.
# For each blowing-snow condition, print the number of bad rows and then the
# number left after excluding timestamps that fall during snowfall.
bad_3m_rows = blowing_snow_vs_counts_src.query("counts_3m_c_1 < 32400")

# With blowing snow
src = bad_3m_rows.query("SF_avg_ue > 0")
print(len(src))
src = src[~src.time.isin(timestamps_during_snowfall)]
print(len(src))

print()

# Without blowing snow. This previously tested `SF_avg_1m_ue`; it now uses
# `SF_avg_ue`, the column every parallel cell in this notebook uses.
src = bad_3m_rows.query("SF_avg_ue == 0")
print(len(src))
src = src[~src.time.isin(timestamps_during_snowfall)]
print(len(src))

In [None]:
def bad_data_dist_plot(df, x, title):
    """Histogram of count column `x`, restricted to rows where it is below 32400.

    Parameters
    ----------
    df : DataFrame containing the column named by `x`.
    x : name of the count column to bin.
    title : title for the chart.

    Returns
    -------
    A small Altair bar chart with up to 50 bins on a fixed 0-36000 x domain.
    """
    flagged_rows = df.query(f"{x} < 32400")
    binned_x = alt.X(f"{x}:Q").bin(maxbins=50).scale(
        domain=[0, 36000]
    )
    histogram = alt.Chart(flagged_rows).mark_bar().encode(
        binned_x,
        alt.Y("count():Q"),
    )
    return histogram.properties(height = 75, width=150, title = title)

# Split rows by blowing snow (SF_avg_ue > 0 vs == 0), then by whether the
# timestamp falls during snowfall.
rows_with_bs = blowing_snow_vs_counts_src.query("SF_avg_ue > 0")
rows_without_bs = blowing_snow_vs_counts_src.query("SF_avg_ue == 0")

bad_yes_bs_no_snowfall = rows_with_bs[~rows_with_bs.time.isin(timestamps_during_snowfall)]
bad_yes_bs_yes_snowfall = rows_with_bs[rows_with_bs.time.isin(timestamps_during_snowfall)]
bad_no_bs_no_snowfall = rows_without_bs[~rows_without_bs.time.isin(timestamps_during_snowfall)]
bad_no_bs_yes_snowfall = rows_without_bs[rows_without_bs.time.isin(timestamps_during_snowfall)]

# Column order of every row in the grid.
condition_panels = [
    (bad_yes_bs_no_snowfall, 'During BS, not during snowfall'),
    (bad_yes_bs_yes_snowfall, 'During BS, during snowfall'),
    (bad_no_bs_no_snowfall, 'No BS, not during snowfall'),
    (bad_no_bs_yes_snowfall, 'No BS, during snowfall'),
]

# One row of four histograms per tower-C height, stacked vertically.
height_rows = None
for height_m in (3, 5, 10, 15, 20):
    counts_column = f'counts_{height_m}m_c_1'
    panel_row = None
    for subset_df, panel_title in condition_panels:
        panel = bad_data_dist_plot(subset_df, counts_column, panel_title)
        panel_row = panel if panel_row is None else panel_row | panel
    panel_row = panel_row.properties(
        title=f'{height_m}m, Tower C, Counts of bad data during conditions (<90% 20hz measurements unflagged)'
    ).resolve_scale(x='shared', y='independent')
    height_rows = panel_row if height_rows is None else height_rows & panel_row

height_rows.resolve_scale(x='shared', y='independent')

In [None]:


def _bad_counts_3m_chart(blowing_snow_query, during_snowfall, title):
    """Histogram of `counts_3m_c_1` for "bad" rows under one weather condition.

    "Bad" rows are those with `counts_3m_c_1 < 32400`. The subset is built here
    from `blowing_snow_vs_counts_src`, so the chart never depends on whatever a
    previous cell happened to leave in `src`.

    Parameters
    ----------
    blowing_snow_query : query string selecting the blowing-snow condition,
        e.g. "SF_avg_ue > 0".
    during_snowfall : if True keep only timestamps in `timestamps_during_snowfall`,
        otherwise keep only timestamps outside it.
    title : chart title.

    Returns
    -------
    An Altair bar chart. Prints the subset size before and after the snowfall filter.
    """
    subset = blowing_snow_vs_counts_src.query("counts_3m_c_1 < 32400").query(blowing_snow_query)
    print(len(subset))
    in_snowfall = subset.time.isin(timestamps_during_snowfall)
    subset = subset[in_snowfall if during_snowfall else ~in_snowfall]
    print(len(subset))
    return alt.Chart(
        subset
    ).mark_bar().encode(
        alt.X("counts_3m_c_1:Q").title(["Count unflagged 20hz w'q'", "measurements, 3m Tower C"]).bin(maxbins=50).scale(
            domain=[0, 36000]
        ),
        alt.Y("count():Q"),
    ).properties(height = 150, width=150, title = title)


bad_data_during_bs_nosnowfall_chart = _bad_counts_3m_chart(
    "SF_avg_ue > 0", False, 'Bad data during BS, not during snowfall'
)
bad_data_notduring_bs_nosnowfall_chart = _bad_counts_3m_chart(
    "SF_avg_ue == 0", False, 'Bad data during no BS, not during snowfall'
)
bad_data_during_bs_during_snowfall_chart = _bad_counts_3m_chart(
    "SF_avg_ue > 0", True, 'Bad data during BS, during snowfall'
)
bad_data_notduring_bs_during_snowfall_chart = _bad_counts_3m_chart(
    "SF_avg_ue == 0", True, 'Bad data during no BS, during snowfall'
)

(bad_data_during_bs_nosnowfall_chart | bad_data_during_bs_during_snowfall_chart) &\
(bad_data_notduring_bs_nosnowfall_chart | bad_data_notduring_bs_during_snowfall_chart)

## Remove data points with more than 10% of the data flagged

In [None]:
# All latent heat flux values, across every EC flux variable.
is_lhflux_row = tidy_df.variable.isin(ec_lhflux_variables)
all_lhflux_measurements = tidy_df.loc[is_lhflux_row, 'value']

# Outlier filtering on mean +/- 5*std is no longer applied. The thresholds are
# set just outside the observed range instead (min - 1, max + 1).
lower_threshold = all_lhflux_measurements.min() - 1
upper_threshold = all_lhflux_measurements.max() + 1
(lower_threshold, upper_threshold)

In [None]:
# Clean every flux variable once per candidate "fraction of good data required".
# `dataframes` ends up with one two-column frame per (fraction, flux variable),
# ordered fraction-major: the cleaned flux values and the fraction used.
# pandas and numpy are already imported in the notebook's first cell.
good_data_fractions = [
    0,
    0.05, 0.1,
    0.25, 0.50, 0.75,
    0.90, 0.95,
    1
]
dataframes = []
for good_data_fraction in good_data_fractions:
    for flux_var, counts_var in ec_lhflux_and_counts_variables:
        # NOTE(review): flux and count values are paired by position, which
        # assumes both variables have the same rows in the same order -- confirm.
        flux_values = tidy_df.query(f"variable == '{flux_var}'").value.values
        counts_values = tidy_df.query(f"variable == '{counts_var}'").value.values
        nan_count_b4_processing = pd.isnull(flux_values).sum()
        new_values = turbulence.clean_eddy_covariance(
            flux_values,
            counts_values,
            lower_threshold,
            upper_threshold,
            fraction_good_data_reqd = good_data_fraction,
            counts_per_datapoint = 36000
        )
        nan_count_after_processing = pd.isnull(new_values).sum()
        # Report how many NaNs the cleaning step introduced.
        print(good_data_fraction, flux_var, nan_count_b4_processing, nan_count_after_processing)
        dataframes.append(
            pd.DataFrame({
                flux_var: new_values,
                'good_data_fraction': np.full(len(new_values), good_data_fraction)
            })
        )

In [None]:
seconds_per_timestep = 60*30

# Flux variable names only (first element of each (flux, counts) pair).
# Loop-invariant, so it is assigned once here instead of on every iteration.
ec_lhflux_variables = list(zip(*ec_lhflux_and_counts_variables))[0]

variable_ls = []
height_ls = []
percent_reqd_ls = []
cumulative_sublimation_ls = []

# One entry per (fraction, flux variable) frame produced by the cleaning sweep.
for dataframe in dataframes:
    flux_var_name = dataframe.columns[0]
    # Height and tower are parsed from the variable name, e.g. 'w_h2o__3m_c' -> 3, 'c'.
    height = int(flux_var_name.split('_')[-2].split('m')[0])
    tower = flux_var_name.split('_')[-1]
    good_data_fraction = dataframe['good_data_fraction'].iloc[0]
    new_var_name = f"cumulative_sub_measured_{height}m_{tower}"
    # Running total of flux * timestep length, divided by the density of water;
    # NaNs are treated as zero by nancumsum.
    cumulative_sublimation_values = np.nancumsum(
        dataframe[flux_var_name]*seconds_per_timestep
    )/metpy.constants.density_water.magnitude
    variable_ls.append(new_var_name)
    height_ls.append(height)
    percent_reqd_ls.append(good_data_fraction)
    # NOTE(review): this records the maximum of the running total, not its final
    # value; the two differ if the total decreases late in the season -- confirm intent.
    cumulative_sublimation_ls.append(cumulative_sublimation_values.max())

In [None]:
# Seasonal sublimation total for every (variable, required good-data fraction) pair.
percentreqd_qc_df = pd.DataFrame({
    'variable': variable_ls,
    'height': height_ls,
    'percent_reqd': percent_reqd_ls,
    'cumulative_sublimation': cumulative_sublimation_ls,
})
# `percent_reqd` holds fractions (0-1); the '%' axis format shows them as
# percentages so the tick labels match the axis title.
percent_reqd_seasonal_sub_sensitivity_chart = alt.Chart(percentreqd_qc_df).mark_line(point=True).transform_filter(
    alt.datum.height > 1
).encode(
    alt.X("percent_reqd:Q").title("% 'good' data req'd per 5min average").axis(format='%'),
    alt.Y("cumulative_sublimation:Q").title("Seasonal sublimation (mm)"),
    alt.Color("height:O").scale(scheme='viridis'),
    detail = 'variable:N',
    tooltip = 'variable:N'
)
percent_reqd_seasonal_sub_sensitivity_chart.save("percent_reqd_seasonal_sub_sensitivity_chart.png", ppi=200)
percent_reqd_seasonal_sub_sensitivity_chart

## Select our data-required value

In [None]:
# Keep only the frames cleaned with the chosen requirement: 90% good data.
dataframes = [
    cleaned_frame
    for cleaned_frame in dataframes
    if cleaned_frame['good_data_fraction'].iloc[0] == 0.90
]

## Examine how calculated cumulative sublimation values vary for different interpolation-window limits, substitute clean data for the old/dirty data (using a 2-timestep, i.e. 1-hour, gap limit), and add cumulative sublimation values to the dataset

In [None]:
seconds_per_timestep = 60*30

flux_var_name_ls = []
height_ls = []
interp_window_limit_ls = []
cumsum_with_interp_ls = []

for dataframe in dataframes:
    flux_var_name = dataframe.columns[0]
    # Height and tower are parsed from the variable name, e.g. 'w_h2o__3m_c' -> 3, 'c'.
    height = int(flux_var_name.split('_')[-2].split('m')[0])
    tower = flux_var_name.split('_')[-1]
    new_var_name = f"cumulative_sub_measured_{height}m_{tower}"

    # Record the seasonal total for different interpolation-window limits.
    # pandas' `limit` caps how many consecutive NaNs are filled: a gap longer
    # than the limit is partially filled, not skipped, and `limit=None` means
    # no cap at all (every interior gap is filled).
    for interp_window_limit in [None, 1, 2, 5, 10, 12, 20, 30, 80, 100, 200, 300, 800, 1000]:
        flux_var_name_ls.append(flux_var_name)
        height_ls.append(height)
        interp_window_limit_ls.append(interp_window_limit)
        cumsum_with_interp_ls.append(
            (
                np.nancumsum(
                    (dataframe[flux_var_name]*seconds_per_timestep).interpolate(method='linear', limit=interp_window_limit)
                )/metpy.constants.density_water.magnitude
            ).max()
        )

    # Remove the old flux values, plus any cumulative rows added by a previous
    # run of this cell, so re-running it does not duplicate rows.
    tidy_df = tidy_df[~tidy_df.variable.isin([flux_var_name, new_var_name])]

    # add the new (cleaned) flux values, gap-filled with a 2-timestep limit
    tidy_df = tidy.tidy_df_add_variable(
        tidy_df,
        dataframe[flux_var_name].interpolate(method='linear', limit=2),
        flux_var_name,
        'w_h2o_',
        height,
        tower
    )
    # add the cumulative sublimation values, using the same 2-timestep limit
    tidy_df = tidy.tidy_df_add_variable(
        tidy_df,
        (
            np.nancumsum(
                (dataframe[flux_var_name]*seconds_per_timestep).interpolate(method='linear', limit=2)
            )/metpy.constants.density_water.magnitude
        ),
        new_var_name,
        "Cumulative sublimation measured",
        height,
        tower
    )

In [None]:
# Inspect the gap-filled, timestep-scaled flux for the last variable processed
# by the loop above (`dataframe` and `flux_var_name` are left over from it).
last_flux_scaled = dataframe[flux_var_name]*seconds_per_timestep
last_flux_scaled.interpolate(method='linear', limit=2)

In [None]:
# Seasonal total for every (variable, interpolation limit) pair from the sweep above.
gapsize_qc_df = pd.DataFrame({
    'variable': flux_var_name_ls,
    'height': height_ls,
    'max_interp_limit': interp_window_limit_ls,
    'cumulative_sub': cumsum_with_interp_ls,
})

# One line per variable, coloured by height, on a log-scaled x axis.
gap_limit_x = alt.X("max_interp_limit:Q").title("max gap size for interpolation").scale(type='log')
seasonal_total_y = alt.Y("cumulative_sub:Q").title("Seasonal sublimation (mm)")
height_colour = alt.Color("height:O").scale(scheme='viridis')

alt.Chart(gapsize_qc_df).mark_line(point=True).transform_filter(
    alt.datum.height > 1
).encode(
    gap_limit_x,
    seasonal_total_y,
    height_colour,
    detail = 'variable:N',
    tooltip='variable:N'
)

In [None]:
# Seasonal totals with no good-data requirement (fraction 0).
raw_totals = percentreqd_qc_df.query("percent_reqd == 0")[['variable','cumulative_sublimation']].rename(
    columns = {'cumulative_sublimation': 'cumulative_sublimation_raw'}
)
# Totals after gap-filling with a 2-timestep limit. The variable names are
# rewritten so they match the `cumulative_sub_measured_*` naming of the other tables.
gapfilled_totals = gapsize_qc_df.assign(
    variable = gapsize_qc_df.variable.str.replace('w_h2o__', 'cumulative_sub_measured_')
).query("max_interp_limit == 2").rename(
    columns = {'cumulative_sub': 'cumulative_sublimation_gapfilled'}
)
# Totals with the 90% good-data requirement.
filtered_totals = percentreqd_qc_df.query("percent_reqd == 0.9")[['variable','cumulative_sublimation']].rename(
    columns = {'cumulative_sublimation': 'cumulative_sublimation_filtered'}
)

data_clean_steps_df = pd.merge(
    raw_totals,
    gapfilled_totals,
    on='variable'
).merge(
    filtered_totals,
    on='variable'
)

In [None]:
# Add the tower (text after the last underscore of the variable name), keep
# the summary columns rounded to 2 decimals, and shorten the step column names.
data_clean_steps_df = (
    data_clean_steps_df
    .assign(tower=data_clean_steps_df.variable.map(lambda name: name.rsplit('_', 1)[-1]))
    [[
        'variable', 'tower', 'height',
        'cumulative_sublimation_raw',
        'cumulative_sublimation_filtered',
        'cumulative_sublimation_gapfilled',
    ]]
    .round(2)
    .rename(columns={
        'cumulative_sublimation_raw': 'raw',
        'cumulative_sublimation_filtered': 'filtered',
        'cumulative_sublimation_gapfilled': 'gapfilled',
    })
)

In [None]:
# Display the per-variable seasonal totals at each cleaning step (raw, filtered, gapfilled).
data_clean_steps_df

In [None]:
# Cleaning steps in the order they appear on the y axis.
# Named `cleaning_steps` rather than `vars` to avoid shadowing the builtin `vars()`.
cleaning_steps = [
    'raw', 'filtered', 'gapfilled'
]
# Fold the three step columns into key/value pairs and draw one line per
# variable showing how its seasonal total changes across the steps.
data_cleaning_steps_affects_chart = alt.Chart(data_clean_steps_df).mark_line().transform_fold(
    cleaning_steps
).encode(
    alt.Y("key:O", sort=cleaning_steps).title(None),
    alt.X("value:Q").title("Cumulative sublimation (mm)"),
    alt.Color('height:N'),
    detail = 'variable'
)
data_cleaning_steps_affects_chart.save("data_cleaning_steps_affects_chart.png", ppi=200)
data_cleaning_steps_affects_chart

# Save cleaned data

In [None]:
# Write the dataset (cleaned fluxes plus cumulative sublimation variables) to `output_fn`.
tidy_df.to_parquet(output_fn)

In [None]:
ls -lah | grep parquet

# Explore the data points we left out, based on the thresholds (for filtering data) that we determined earlier
# These are the datapoints we don't trust and are excluding - we should get an idea of what's happening during these limited times
* Measurements during times where SF_avg_XX_ue measurements are > 10^0, and have “good data” counts less than 0.9*36000 (= 32400, the threshold used in the filtering above)
* If too much data, only remove isolated incidents (I.e. less than 2 of those data points in a row)

In [None]:
# Cumulative sublimation at 10m on tower D, 24-26 April 2023.
cumulative_10m_d = tidy_df[
    tidy_df.variable == 'cumulative_sub_measured_10m_d'
].set_index('time')
late_april_window = cumulative_10m_d.loc["2023-04-24":"2023-04-26"].reset_index()

alt.Chart(late_april_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q")
)


In [None]:
# Rows of the two SF_avg_*_ue variables whose value exceeds 1.
is_blowing_snow_flux = tidy_df.variable.isin(['SF_avg_2m_ue', 'SF_avg_1m_ue'])
bad_data = tidy_df[is_blowing_snow_flux & (tidy_df.value > 1)]

In [None]:
# Order chronologically and tag each row with its calendar month for faceting.
bad_data = bad_data.sort_values('time').assign(
    month=lambda frame: frame.time.dt.month
)

In [None]:
# One tick per selected row, with a separate row (and independent x axis) per month.
month_rows = alt.Row("month:O").sort([12,1,2,3,4])
tick_chart = alt.Chart(bad_data).mark_tick().encode(
    alt.X("time:T").title(None),
    month_rows
)
tick_chart.resolve_scale(x='independent', y='shared').properties(width=400)

Select a few case studies based on the plot of bad data occurrences above

In [None]:
# (start, end) time windows for the case studies; each pair is later used to
# slice the time-indexed `src` frame, so entries may be dates or date-times.
case_study_ls = [
    ("20221213", "20221215"),
    ("20221221", "20221223"),
    ("2023-01-10 12:00:00" , "2023-01-10 14:00:00"),
    ("20230205", "20230207"),
    ("20230308", "20230310"),
    ("20230330", "20230401"),
    ("20230403", "20230404"),
    ("20230424 2300", "20230425 0200"),
    

]

In [None]:
# Reload the original (uncleaned) dataset so the case studies show the data as
# measured. NOTE: this overwrites the cleaned `tidy_df`; do not re-run the
# "Save cleaned data" cell after this point without re-running the cleaning.
tidy_df = pd.read_parquet(f'tidy_df_{start_date}_{end_date}_noplanar_fit.parquet')
# Same conversion as on first load; the cells below slice a time index by date strings.
tidy_df['time'] = pd.to_datetime(tidy_df['time'])

In [None]:
# Variables shown in the case studies: every EC flux and counts variable
# (the (flux, counts) pairs flattened into one list) plus selected wind speeds.
case_study_variables = list(sum(ec_lhflux_and_counts_variables, ())) + [
    'spd_3m_c', 'spd_5m_c', 'spd_10m_c', 'spd_15m_c', 'spd_20m_c',
    'spd_3m_d', 'spd_10m_d',
    'spd_3m_ue', 'spd_10m_ue',
    'spd_3m_uw', 'spd_10m_uw',
]
src = tidy_df[
        tidy_df.variable.isin(case_study_variables)
    ].set_index('time').sort_index()
# Facet label, e.g. "c 3". Built with vectorized column operations instead of a
# row-wise apply, which is much slower on a frame this size.
src['tower and height'] = src['tower'] + ' ' + src['height'].astype(str)

In [None]:
# Second case-study window, restricted to tower C at 3 m.
data_dropped_case_study_src = (
    src[case_study_ls[1][0]: case_study_ls[1][1]]
    .reset_index()
    .query("tower == 'c'")
    .query("height == 3")
)

# Shared mark and size for the three stacked panels.
base_chart = alt.Chart().mark_line().properties(
    height = 100,
    width = 200
)

dropped_counts_panel = alt.layer(
    base_chart,
    data=data_dropped_case_study_src.query("measurement == 'eddy covariance h2o high rate count'")
).encode(
    alt.X("time:T"),
    alt.Y("value:Q").title(["Count unflagged 20hz w'q'", "measurements, 3m Tower C"]),
)
dropped_flux_panel = alt.layer(
    base_chart,
    data=data_dropped_case_study_src.query("measurement == 'w_h2o_'")
).encode(
    alt.X("time:T"),
    alt.Y("value:Q").title(["w'q'", "3m Tower C"]),
)
dropped_wind_panel = alt.layer(
    base_chart,
    data=data_dropped_case_study_src.query("measurement == 'wind speed'")
).encode(
    alt.X("time:T"),
    alt.Y("value:Q").title(["Wind speed", "3m Tower C"]),
)

# Counts, flux and wind speed stacked vertically on a shared time axis.
data_dropped_case_study_plot = (
    dropped_counts_panel & dropped_flux_panel & dropped_wind_panel
).resolve_scale(x='shared')

In [None]:
# First case-study window, restricted to tower UE at 10 m.
no_data_dropped_case_study_src = (
    src[case_study_ls[0][0]: case_study_ls[0][1]]
    .reset_index()
    .query("tower == 'ue'")
    .query("height == 10")
)

# Shared mark and size for the three stacked panels.
base_chart = alt.Chart().mark_line().properties(
    height = 100,
    width = 200
)

kept_counts_panel = alt.layer(
    base_chart,
    data=no_data_dropped_case_study_src.query("measurement == 'eddy covariance h2o high rate count'")
).encode(
    alt.X("time:T"),
    alt.Y("value:Q").title(["Count unflagged 20hz w'q'", "measurements, 10m Tower UE"]),
)
kept_flux_panel = alt.layer(
    base_chart,
    data=no_data_dropped_case_study_src.query("measurement == 'w_h2o_'")
).encode(
    alt.X("time:T"),
    alt.Y("value:Q").title(["w'q'", "10m Tower UE"]),
)
kept_wind_panel = alt.layer(
    base_chart,
    data=no_data_dropped_case_study_src.query("measurement == 'wind speed'")
).encode(
    alt.X("time:T"),
    alt.Y("value:Q").title(["Wind speed", "10m Tower UE"]),
)

# Counts, flux and wind speed stacked vertically on a shared time axis.
no_data_dropped_case_study_plot = (
    kept_counts_panel & kept_flux_panel & kept_wind_panel
).resolve_scale(x='shared')

In [None]:
# Place the two case-study figures side by side, save the result as a PNG, and display it.
data_dropped_casestudies_chart = (no_data_dropped_case_study_plot | data_dropped_case_study_plot)
data_dropped_casestudies_chart.save("data_dropped_casestudies_chart.png", ppi=200)
data_dropped_casestudies_chart

In [None]:
# Case study 0: one row per tower/height, one column per measurement type.
case_start, case_end = case_study_ls[0]
case_window = src[case_start:case_end].reset_index()

alt.Chart(case_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q").scale(zero=False),
    alt.Row("tower and height:O"),
    alt.Column("measurement:N"),
).resolve_scale(y='independent').properties(height = 100)

In [None]:
# Case study 1: one row per tower/height, one column per measurement type.
case_start, case_end = case_study_ls[1]
case_window = src[case_start:case_end].reset_index()

alt.Chart(case_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q").scale(zero=False),
    alt.Row("tower and height:O"),
    alt.Column("measurement:N"),
).resolve_scale(y='independent').properties(height = 100)

In [None]:
# Case study 2: one row per tower/height, one column per measurement type.
case_start, case_end = case_study_ls[2]
case_window = src[case_start:case_end].reset_index()

alt.Chart(case_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q").scale(zero=False),
    alt.Row("tower and height:O"),
    alt.Column("measurement:N"),
).resolve_scale(y='independent').properties(height = 100)

In [None]:
# Case study 3: one row per tower/height, one column per measurement type.
case_start, case_end = case_study_ls[3]
case_window = src[case_start:case_end].reset_index()

alt.Chart(case_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q").scale(zero=False),
    alt.Row("tower and height:O"),
    alt.Column("measurement:N"),
).resolve_scale(y='independent').properties(height = 100)

In [None]:
# Case study 4: one row per tower/height, one column per measurement type.
case_start, case_end = case_study_ls[4]
case_window = src[case_start:case_end].reset_index()

alt.Chart(case_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q").scale(zero=False),
    alt.Row("tower and height:O"),
    alt.Column("measurement:N"),
).resolve_scale(y='independent').properties(height = 100)

In [None]:
# Case study 5: one row per tower/height, one column per measurement type.
case_start, case_end = case_study_ls[5]
case_window = src[case_start:case_end].reset_index()

alt.Chart(case_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q").scale(zero=False),
    alt.Row("tower and height:O"),
    alt.Column("measurement:N"),
).resolve_scale(y='independent').properties(height = 100)

In [None]:
# Case study 6: one row per tower/height, one column per measurement type.
case_start, case_end = case_study_ls[6]
case_window = src[case_start:case_end].reset_index()

alt.Chart(case_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q").scale(zero=False),
    alt.Row("tower and height:O"),
    alt.Column("measurement:N"),
).resolve_scale(y='independent').properties(height = 100)

In [None]:
# Case study 7: one row per tower/height, one column per measurement type.
case_start, case_end = case_study_ls[7]
case_window = src[case_start:case_end].reset_index()

alt.Chart(case_window).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q").scale(zero=False),
    alt.Row("tower and height:O"),
    alt.Column("measurement:N"),
).resolve_scale(y='independent').properties(height = 100)