In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import altair as alt
alt.data_transformers.enable('json')
alt.renderers.enable('jupyterlab')

RendererRegistry.enable('jupyterlab')

# User inputs

In [2]:
start_date = '20221130'
end_date = '20230509'

tidy_dataset_fn = f"tidy_df_30Min_{start_date}_{end_date}_noplanar_fit.parquet"
tidy_dataset_5min_fn = f"tidy_df_{start_date}_{end_date}_noplanar_fit.parquet"
tidy_daily_dataset_output_fn = f"tidy_df_daily_{start_date}_{end_date}_noplanar_fit.parquet"

# Load data

In [3]:
try:
    tidy_df_30Min = pd.read_parquet(
        tidy_dataset_fn
    )
except FileNotFoundError:
    print("No file such file exists for these dates.")
tidy_df_30Min['time'] = pd.to_datetime(tidy_df_30Min['time'])

try:
    tidy_df_5Min = pd.read_parquet(
        tidy_dataset_5min_fn
    )
except FileNotFoundError:
    print("No file such file exists for these dates.")
tidy_df_5Min['time'] = pd.to_datetime(tidy_df_5Min['time'])

# Identify data outages

In [18]:
src_nans_only = tidy_df_5Min.query("variable == 'T_3m_c'").set_index('time')[['value']]
src_nans_only = src_nans_only[src_nans_only.value.isna()].reset_index()
src_nans_only['diff'] = (src_nans_only['time'] - src_nans_only['time'].shift(1)).apply(lambda t_delta: t_delta.seconds)
src_nans_only = src_nans_only.reset_index(drop=True)
src_nans_only

Unnamed: 0,time,value,diff
0,2023-01-22 23:42:30,,
1,2023-01-22 23:47:30,,300.0
2,2023-01-22 23:52:30,,300.0
3,2023-01-31 16:07:30,,58500.0
4,2023-01-31 16:12:30,,300.0
...,...,...,...
904,2023-04-23 22:07:30,,300.0
905,2023-04-23 22:12:30,,300.0
906,2023-05-05 15:32:30,,62400.0
907,2023-05-05 15:37:30,,300.0


In [19]:
data = tidy_df_5Min.query("variable == 'T_3m_c'").set_index('time')['value']
na_groups = data.notna().cumsum()[data.isna()]
t_lengths_consecutive_na = na_groups.groupby(na_groups).agg(len)

data = tidy_df_5Min.query("variable == 'RH_3m_c'").set_index('time')['value']
na_groups = data.notna().cumsum()[data.isna()]
rh_lengths_consecutive_na = na_groups.groupby(na_groups).agg(len)

In [20]:
all(t_lengths_consecutive_na == rh_lengths_consecutive_na)

True

In [21]:
t_lengths_consecutive_na.value_counts()

value
3      11
2      11
1       8
4       5
5       5
136     1
6       1
7       1
652     1
Name: count, dtype: int64

Using this info, we identify two major outages that lasted a significant amount of time (at least multiple hours).

In [22]:
src = tidy_df_5Min.query("variable == 'T_3m_c'").set_index('time')
src.loc[
    "2023-03-10T23:42:30.000" :
    "2023-03-11T11:07:30.000"
]

Unnamed: 0_level_0,variable,value,height,tower,measurement
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-03-10 23:42:30,T_3m_c,-2.704268,3.0,c,temperature
2023-03-10 23:47:30,T_3m_c,,3.0,c,temperature
2023-03-10 23:52:30,T_3m_c,,3.0,c,temperature
2023-03-10 23:57:30,T_3m_c,,3.0,c,temperature
2023-03-11 00:02:30,T_3m_c,,3.0,c,temperature
...,...,...,...,...,...
2023-03-11 10:47:30,T_3m_c,,3.0,c,temperature
2023-03-11 10:52:30,T_3m_c,,3.0,c,temperature
2023-03-11 10:57:30,T_3m_c,,3.0,c,temperature
2023-03-11 11:02:30,T_3m_c,,3.0,c,temperature


In [23]:
src.loc[
    "2023-04-21T15:52:30.000" :
    "2023-04-23T22:17:30.000"
]

Unnamed: 0_level_0,variable,value,height,tower,measurement
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-04-21 15:52:30,T_3m_c,-4.623627,3.0,c,temperature
2023-04-21 15:57:30,T_3m_c,,3.0,c,temperature
2023-04-21 16:02:30,T_3m_c,,3.0,c,temperature
2023-04-21 16:07:30,T_3m_c,,3.0,c,temperature
2023-04-21 16:12:30,T_3m_c,,3.0,c,temperature
...,...,...,...,...,...
2023-04-23 21:57:30,T_3m_c,,3.0,c,temperature
2023-04-23 22:02:30,T_3m_c,,3.0,c,temperature
2023-04-23 22:07:30,T_3m_c,,3.0,c,temperature
2023-04-23 22:12:30,T_3m_c,,3.0,c,temperature


Besides these two outages, there are 30 (11+11+8) cases where between 1 and 3 consecutive 5-minute-averages are missing. There are 10 cases (5+5) where 4 or 5 consecutive 5-minute-averages are missing. There is an additional 1 case of 6 consecutive and 1 case of 7 consecutive 5-minute averages missing. This accounts for all missing data (according to T and RH measurements at 3m on Tower C).

How many nans were there exactly, aside form the two major outages?

In [26]:
src_nans_only[
    (src_nans_only.time < "2023-03-10T23:42:30.000")
    |
    (src_nans_only.time > "2023-03-11T11:07:30.000")
][
    (src_nans_only.time < "2023-04-21T15:52:30.000")
    |
    (src_nans_only.time > "2023-04-23T22:17:30.000")
]

  src_nans_only[


Unnamed: 0,time,value,diff
0,2023-01-22 23:42:30,,
1,2023-01-22 23:47:30,,300.0
2,2023-01-22 23:52:30,,300.0
3,2023-01-31 16:07:30,,58500.0
4,2023-01-31 16:12:30,,300.0
...,...,...,...
252,2023-04-12 10:37:30,,300.0
253,2023-04-12 11:42:30,,3900.0
906,2023-05-05 15:32:30,,62400.0
907,2023-05-05 15:37:30,,300.0
