In [76]:
import pandas as pd
import glob
import altair as alt
from sublimpy import utils
import datetime as dt
import numpy as np

In [None]:
files = glob.glob("turb_datasets/tidy_df**.parquet")
df = pd.DataFrame()
for file in files:
    local = pd.read_parquet(file).query("measurement == 'w_h2o_'")
    df = pd.concat([df, local.assign(filename = file)])
df = df[df.variable.isin([
    'w_h2o__2m_c', 'w_h2o__3m_c', 'w_h2o__5m_c', 'w_h2o__10m_c', 'w_h2o__15m_c', 'w_h2o__20m_c', 
    'w_h2o__3m_uw', 'w_h2o__10m_uw', 
    'w_h2o__3m_ue', 'w_h2o__10m_ue', 
    'w_h2o__3m_d', 'w_h2o__10m_d', 
])]
df = utils.modify_df_timezone(df, 'UTC', 'US/Mountain')
df = df[(df['time'] > '20221130') & (df['time'] < '20230508')]

In [None]:
df['filename'] = df['filename'].str.replace('turb_datasets/tidy_df_20221101_20230619_planar_fit_multiplane_STRAIGHTUP_', '')
df['processing'] = df['filename'].str.replace('.parquet', '')

In [None]:
df.head()

In [None]:
df.value = df.value.where(df.value < 0.5, other=0)

# Calculate and plot seasonal totals

In [None]:
totals_df = pd.DataFrame(df.groupby(['variable', 'processing'])['value'].sum()*30*60/1000).reset_index()

In [None]:
totals_df['despiking'] = totals_df['processing'].apply(lambda s: s.split('_')[0])
totals_df['instrument_flagging'] = totals_df['processing'].apply(lambda s: s.split('_')[1])
totals_df['snowfall_filtering'] = totals_df['processing'].apply(lambda s: s.split('_')[2])
totals_df['height'] = totals_df['variable'].str.split('_').str[3].str[:-1].astype(int)
totals_df['tower'] = totals_df['variable'].str.split('_').str[-1]
totals_df

In [74]:
alt.Chart(totals_df).mark_point(size=100).encode(
    alt.X('value:Q').scale(zero=False, domain=[20,40]),
    alt.Y('height:Q'),
    alt.Shape('tower:N'),
    alt.Column('instrument_flagging').sort(['flags36000', 'flags9000', 'flags3600']),
    alt.Row('despiking'),
    alt.Color('snowfall_filtering')
).properties(width=150, height = 150)

In [None]:
processing_options = df.processing.unique()
local = pd.DataFrame()
for process in processing_options:
    local = pd.concat([
        local,
        df.query("variable == 'w_h2o__20m_c'").query(f"processing == '{process}'").set_index('time').sort_index()[['value']].cumsum().assign(processing = process)
    ])
local['value'] = local['value']*1.8
local['despiking'] = local['processing'].apply(lambda s: s.split('_')[0])
local['instrument_flagging'] = local['processing'].apply(lambda s: s.split('_')[1])
local['snowfall_filtering'] = local['processing'].apply(lambda s: s.split('_')[2])

# Calculate and examine seasonal cumulative

In [None]:
alt.data_transformers.enable('json')

In [None]:
alt.Chart(local.reset_index().dropna()).mark_line().encode(
    alt.X('time:T'),
    alt.Y('value:Q'),
    alt.Color('processing:N'),
    alt.Row('snowfall_filtering:N'),
    tooltip = 'processing:N'
).properties(width=1000).resolve_scale(color='independent')

In [None]:
pd.options.display.max_rows = 500
display(df.groupby(['variable','processing']).value.max())

In [69]:
def restrict_time(src):
    return src[
        (
            src.time > '20230111'
        )&(
            src.time < '20230111 0600'
        )
    ]
alt.Chart(restrict_time(
    df.query("variable == 'w_h2o__20m_c'")
)).mark_line().encode(
    alt.X('time:T'),
    alt.Y('value:Q'),
    alt.Color('processing'),
    facet='processing'
)

In [70]:
def restrict_time(src):
    return src[
        (
            src.time > '20221221'
        )&(
            src.time < '20221223'
        )
    ]
alt.Chart(restrict_time(
    df.query("variable == 'w_h2o__20m_c'")
)).mark_line().encode(
    alt.X('time:T'),
    alt.Y('value:Q'),
    alt.Color('processing'),
    facet='processing'
)

In [None]:
restrict_time(
    df.query("variable == 'w_h2o__20m_c'")
).value.value_counts()

# Apply mean diurnal gap filling 

In [96]:
df_gapfilled = pd.DataFrame()
for lhflux_variable in df.variable.unique():
    for process in df.processing.unique():
        subset = df.query(f"variable == '{lhflux_variable}'")
        subset = subset.query(f"processing == '{process}'")
        subset = subset.set_index('time')
        for i,row in subset.iterrows():
            if np.isnan(row['value']) or row['value'] == 0:
                start_window = i - dt.timedelta(days=3, hours=12)
                end_window = i + dt.timedelta(days=3, hours=12)
                src = subset.loc[start_window: end_window].reset_index()
                means = pd.DataFrame(
                    src.groupby([src.time.dt.hour, src.time.dt.minute])['value'].mean()
                )
                subset.loc[i, 'value'] = means.loc[i.hour, i.minute].value
        df_gapfilled = pd.concat([df_gapfilled, subset])

In [97]:
totals_gapfilled_df = pd.DataFrame(df_gapfilled.groupby(['variable', 'processing'])['value'].sum()*30*60/1000).reset_index()

In [98]:
totals_gapfilled_df['despiking'] = totals_gapfilled_df['processing'].apply(lambda s: s.split('_')[0])
totals_gapfilled_df['instrument_flagging'] = totals_gapfilled_df['processing'].apply(lambda s: s.split('_')[1])
totals_gapfilled_df['snowfall_filtering'] = totals_gapfilled_df['processing'].apply(lambda s: s.split('_')[2])
totals_gapfilled_df['height'] = totals_gapfilled_df['variable'].str.split('_').str[3].str[:-1].astype(int)
totals_gapfilled_df['tower'] = totals_gapfilled_df['variable'].str.split('_').str[-1]
totals_gapfilled_df

Unnamed: 0,variable,processing,value,despiking,instrument_flagging,snowfall_filtering,height,tower
0,w_h2o__10m_c,nodespiking_flags36000_snowfallfiltered0.5mm,39.902017,nodespiking,flags36000,snowfallfiltered0.5mm,10,c
1,w_h2o__10m_c,nodespiking_flags36000_snowfallfiltered0mm,39.221165,nodespiking,flags36000,snowfallfiltered0mm,10,c
2,w_h2o__10m_c,nodespiking_flags36000_snowfallfilteredno,39.120948,nodespiking,flags36000,snowfallfilteredno,10,c
3,w_h2o__10m_c,nodespiking_flags3600_snowfallfiltered0.5mm,38.070777,nodespiking,flags3600,snowfallfiltered0.5mm,10,c
4,w_h2o__10m_c,nodespiking_flags3600_snowfallfiltered0mm,37.454751,nodespiking,flags3600,snowfallfiltered0mm,10,c
5,w_h2o__10m_c,nodespiking_flags3600_snowfallfilteredno,37.401353,nodespiking,flags3600,snowfallfilteredno,10,c
6,w_h2o__10m_c,nodespiking_flags9000_snowfallfiltered0.5mm,38.856018,nodespiking,flags9000,snowfallfiltered0.5mm,10,c
7,w_h2o__10m_c,nodespiking_flags9000_snowfallfiltered0mm,37.954073,nodespiking,flags9000,snowfallfiltered0mm,10,c
8,w_h2o__10m_c,nodespiking_flags9000_snowfallfilteredno,37.900676,nodespiking,flags9000,snowfallfilteredno,10,c
9,w_h2o__10m_c,q7_flags36000_snowfallfiltered0.5mm,38.011942,q7,flags36000,snowfallfiltered0.5mm,10,c


In [100]:
alt.Chart(totals_gapfilled_df).mark_point(size=100).encode(
    alt.X('value:Q').scale(zero=False, domain=[20,40]),
    alt.Y('height:Q'),
    alt.Shape('tower:N'),
    alt.Column('instrument_flagging').sort(['flags36000', 'flags9000', 'flags3600']),
    alt.Row('despiking'),
    alt.Color('snowfall_filtering')
).properties(width=150, height = 150).display(renderer='svg')

In [None]:
def restrict_time(src):
    return src[
        (
            src.time > '20221221'
        )&(
            src.time < '20221223'
        )
    ]
alt.Chart(restrict_time(
    df.query("variable == 'w_h2o__20m_c'")
)).mark_line().encode(
    alt.X('time:T'),
    alt.Y('value:Q'),
    alt.Color('processing'),
    facet='processing'
)