In [1]:
import sys
sys.path.append('/home/elilouis/sublimationofsnow/')
import sosutils

import numpy as np
import xarray as xr

import datetime as dt
import pandas as pd

import matplotlib.pyplot as plt

import altair as alt
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
file_list = []
for date in ['20221218']:
    print(f"Downloading {date}")
    for hour in range(0,24):
        file = sosutils.download_sos_highrate_data_hour(
            date = date,
            hour = "{:02d}".format(hour),
            local_download_dir='/data2/elilouis/sublimationofsnow/hr_noqc_geo/',
            cache=True
        )
        file_list.append(file)

Downloading 20221218
Caching...skipping download for 20221218, 00
Caching...skipping download for 20221218, 01
Caching...skipping download for 20221218, 02
Caching...skipping download for 20221218, 03
Caching...skipping download for 20221218, 04
Caching...skipping download for 20221218, 05
Caching...skipping download for 20221218, 06
Caching...skipping download for 20221218, 07
Caching...skipping download for 20221218, 08
Caching...skipping download for 20221218, 09
Caching...skipping download for 20221218, 10
Caching...skipping download for 20221218, 11
Caching...skipping download for 20221218, 12
Caching...skipping download for 20221218, 13
Caching...skipping download for 20221218, 14
Caching...skipping download for 20221218, 15
Caching...skipping download for 20221218, 16
Caching...skipping download for 20221218, 17
Caching...skipping download for 20221218, 18
Caching...skipping download for 20221218, 19
Caching...skipping download for 20221218, 20
Caching...skipping download for 20

In [3]:
ds = xr.open_mfdataset(file_list, concat_dim="time",
    combine="nested",)

# Create timestamp
To use the datam, its necessary to combine 3 columns of data from the dataset to get the full timestamp. This is demonstrated below. The 'time' column actually only incudes the second and minute information. For all datapoints, the hour according to the 'time' column is 1.  The 'base_time' column indicates the hour of the day. The 'sample' column indicates the 20hz sample number. 

We demonstrate this in the plots below

In [4]:
df1 = pd.DataFrame({'time': np.unique(ds['time'])})
df2 = pd.DataFrame({'base_time': np.unique(ds['base_time'])})
df3 = pd.DataFrame({'sample': np.unique(ds['sample'])})
(
    alt.Chart(df3).mark_tick(thickness=5).encode(
        alt.X("sample:Q").title(
            f'sample (n = {len(df3)})'
        )
    ).properties(width=600) & 

    alt.Chart(df1).mark_tick(thickness=1).encode(
        alt.X("time:T").axis(
            format='%H%M%p'
        ).title(
            f'time (n = {len(df1)})'
        )
    ).properties(width=600) & 

    alt.Chart(df2).mark_tick(thickness=5).encode(
        alt.X("base_time:T").title(
            f'base_time (n = {len(df2)})'
        )
    ).properties(width=600)
)

# Get subset of data and turn it into a dataframe

In [5]:
df = ds.to_dataframe().reset_index()

In [6]:
df

Unnamed: 0,time,sample,base_time,P_0_8m_c,P_0_9m_c,P_1_0m_c,P_1_1m_c,P_10m_c,P_10m_d,P_10m_ue,...,Vpile_c,Vpile_d,Vpile_ue,Vpile_uw,Vpile_Off_d,Vpile_On_d,Vtherm_c,Vtherm_d,Vtherm_ue,Vtherm_uw
0,2022-12-18 00:00:00.500,0,2022-12-18 00:00:00,,716.923157,,,716.148987,716.218018,716.062073,...,,-0.00013,,-0.000131,0.0,0.000445,,1.860211,,1.875406
1,2022-12-18 00:00:00.500,1,2022-12-18 00:00:00,,,,,716.148987,716.218262,716.062256,...,,-0.00013,,-0.000131,0.0,0.000445,,1.860211,,1.875406
2,2022-12-18 00:00:00.500,2,2022-12-18 00:00:00,,,,,,716.217896,716.062378,...,,-0.00013,,-0.000131,0.0,0.000445,,1.860211,,1.875406
3,2022-12-18 00:00:00.500,3,2022-12-18 00:00:00,,,,,716.149170,716.218018,716.062134,...,,-0.00013,,-0.000131,0.0,0.000445,,1.860211,,1.875406
4,2022-12-18 00:00:00.500,4,2022-12-18 00:00:00,,,,,716.148010,716.217957,716.062195,...,,-0.00013,,-0.000131,0.0,0.000445,,1.860211,,1.875406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1727995,2022-12-18 00:59:59.500,15,2022-12-18 23:00:00,714.671875,714.573364,713.525269,714.458496,713.813843,713.887207,713.722961,...,,,,,,,,,,
1727996,2022-12-18 00:59:59.500,16,2022-12-18 23:00:00,714.671753,714.573486,713.524963,714.458740,713.814148,713.887024,713.722656,...,,,,,,,,,,
1727997,2022-12-18 00:59:59.500,17,2022-12-18 23:00:00,714.671387,714.573608,713.524963,714.458435,713.814087,713.887024,713.723206,...,,,,,,,,,,
1727998,2022-12-18 00:59:59.500,18,2022-12-18 23:00:00,714.671021,714.573242,713.525085,714.458252,713.814087,713.887085,713.723083,...,,,,,,,,,,


# Create a complete time index

In [7]:
df['time'] = df.apply(lambda row: dt.datetime(
        year = row['time'].year,
        month = row['time'].month,
        day = row['time'].day,
        hour = row['base_time'].hour,
        minute = row['time'].minute,
        second = row['time'].second,
        microsecond = int(row['sample'] * (1e6/20))
    ),
    axis = 1
)

## Add spd variable to dataset 

In [8]:
df['spd_2m_c'] = np.sqrt(df['u_2m_c']**2 + df['v_2m_c']**2)
df['spd_3m_c'] = np.sqrt(df['u_3m_c']**2 + df['v_3m_c']**2)
df['spd_5m_c'] = np.sqrt(df['u_5m_c']**2 + df['v_5m_c']**2)
df['spd_10m_c'] = np.sqrt(df['u_10m_c']**2 + df['v_10m_c']**2)
df['spd_15m_c'] = np.sqrt(df['u_15m_c']**2 + df['v_15m_c']**2)
df['spd_20m_c'] = np.sqrt(df['u_20m_c']**2 + df['v_20m_c']**2)

## Get a subset of the large dataset and make it tidy

In [9]:
df.time.min(), df.time.max()

(Timestamp('2022-12-18 00:00:00'), Timestamp('2022-12-18 23:59:59.950000'))

In [10]:
src = df[
    (df.time.dt.hour == 0)
    & 
    (df.time.dt.minute >= 20)
    & 
    (df.time.dt.hour < 45)
]

In [11]:
variable_names = [
    'spd_2m_c', 'spd_3m_c', 'spd_5m_c', 'spd_10m_c', 'spd_15m_c', 'spd_20m_c', 
    'u_2m_c', 'u_3m_c', 'u_5m_c', 'u_10m_c', 'u_15m_c', 'u_20m_c', 
    'v_2m_c', 'v_3m_c', 'v_5m_c', 'v_10m_c', 'v_15m_c', 'v_20m_c', 
    'w_2m_c', 'w_3m_c', 'w_5m_c', 'w_10m_c', 'w_15m_c', 'w_20m_c', 
]

tidy_src = sosutils.get_tidy_dataset(src, variable_names)

AttributeError: module 'sosutils' has no attribute 'get_tidy_dataset'

# Plot time series of velocity

In [None]:
OFFSET = 1.5
tidy_src.loc[tidy_src['variable'] == 'spd_3m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'spd_3m_c', 'value'] + 1*OFFSET
tidy_src.loc[tidy_src['variable'] == 'spd_5m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'spd_5m_c', 'value'] + 2*OFFSET
tidy_src.loc[tidy_src['variable'] == 'spd_10m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'spd_10m_c', 'value'] + 3*OFFSET
tidy_src.loc[tidy_src['variable'] == 'spd_15m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'spd_15m_c', 'value'] + 4*OFFSET
tidy_src.loc[tidy_src['variable'] == 'spd_20m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'spd_20m_c', 'value'] + 5*OFFSET

tidy_src.loc[tidy_src['variable'] == 'u_3m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'u_3m_c', 'value'] + 1*OFFSET
tidy_src.loc[tidy_src['variable'] == 'u_5m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'u_5m_c', 'value'] + 2*OFFSET
tidy_src.loc[tidy_src['variable'] == 'u_10m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'u_10m_c', 'value'] + 3*OFFSET
tidy_src.loc[tidy_src['variable'] == 'u_15m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'u_15m_c', 'value'] + 4*OFFSET
tidy_src.loc[tidy_src['variable'] == 'u_20m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'u_20m_c', 'value'] + 5*OFFSET

tidy_src.loc[tidy_src['variable'] == 'v_3m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'v_3m_c', 'value'] + 1*OFFSET
tidy_src.loc[tidy_src['variable'] == 'v_5m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'v_5m_c', 'value'] + 2*OFFSET
tidy_src.loc[tidy_src['variable'] == 'v_10m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'v_10m_c', 'value'] + 3*OFFSET
tidy_src.loc[tidy_src['variable'] == 'v_15m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'v_15m_c', 'value'] + 4*OFFSET
tidy_src.loc[tidy_src['variable'] == 'v_20m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'v_20m_c', 'value'] + 5*OFFSET

tidy_src.loc[tidy_src['variable'] == 'w_3m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'w_3m_c', 'value'] + 1*OFFSET
tidy_src.loc[tidy_src['variable'] == 'w_5m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'w_5m_c', 'value'] + 2*OFFSET
tidy_src.loc[tidy_src['variable'] == 'w_10m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'w_10m_c', 'value'] + 3*OFFSET
tidy_src.loc[tidy_src['variable'] == 'w_15m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'w_15m_c', 'value'] + 4*OFFSET
tidy_src.loc[tidy_src['variable'] == 'w_20m_c', 'value'] = tidy_src.loc[tidy_src['variable'] == 'w_20m_c', 'value'] + 5*OFFSET

In [None]:
tidy_src.measurement.unique()

array(['wind speed', 'u', 'v', 'w'], dtype=object)

In [None]:
alt.Chart(tidy_src).transform_filter(
    alt.FieldOneOfPredicate('measurement', [
        'u', 'v', 'w', 'wind speed'
    ])
).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q", title='velocity (m/s)'),
    alt.Color("height:O").scale(scheme='rainbow'),
    alt.Facet("measurement:N", columns = 2)
).properties(width=400, height=400)

In [None]:
alt.Chart(tidy_src[tidy_src['time'].dt.minute > 55]).transform_filter(
    alt.FieldOneOfPredicate('measurement', [
        'u', 'v', 'w', 'wind speed'
    ])
).mark_line().encode(
    alt.X("time:T"),
    alt.Y("value:Q", title='velocity (m/s)'),
    alt.Color("height:O").scale(scheme='rainbow'),
    alt.Facet("measurement:N", columns = 2)
).properties(width=300, height=300)

# Calculate Planar Fit

In [None]:
(a,b,c), (tilt, tiltaz), W_f = sosutils.calculate_planar_fit(
    ds['u_1m_c'].values.flatten(),
    ds['v_1m_c'].values.flatten(),
    ds['w_1m_c'].values.flatten()
)

In [None]:
(a,b,c), (np.rad2deg(tilt), np.rad2deg(tiltaz)), W_f

In [None]:
u, v, w = sosutils.apply_planar_fit(
    ds['u_1m_c'].values.flatten(),
    ds['v_1m_c'].values.flatten(),
    ds['w_1m_c'].values.flatten(),
    a,
    W_f
)

In [None]:
ds['u_1m_c'].values.flatten(),

In [None]:
u,v,w