In [14]:
import pandas as pd
import os
from src.functions import replicate_seasonal_pattern
import matplotlib.pyplot as plt

In [15]:
# Load every labeled station CSV and stack them into a single long frame.
dfs = []

train_path = '../data/labeled'

# Sort for a reproducible row order, and keep only CSV files so stray
# directory entries (hidden files, checkpoint folders) are never parsed.
files = sorted(f for f in os.listdir(train_path) if f.lower().endswith('.csv'))
for f in files:
    df = pd.read_csv(os.path.join(train_path, f))
    # The station code is the file name without its extension.
    df['station_code'] = os.path.splitext(f)[0]
    dfs.append(df)

# Concatenate once: calling pd.concat inside the loop re-copies the growing
# frame on every iteration (quadratic in the number of rows).
combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame({})

# Keep only the identifier, the timestamp, the target and the four sensor columns.
combined_df = combined_df[['station_code', 'measure_date', 'HS', 'TSS_30MIN_MEAN', 'RSWR_30MIN_MEAN', 'TA_30MIN_MEAN', 'VW_30MIN_MEAN']]

In [16]:
# Count missing values per station for each sensor column.
feature_cols = ['TSS_30MIN_MEAN', 'RSWR_30MIN_MEAN', 'TA_30MIN_MEAN', 'VW_30MIN_MEAN']
# A station is listed as soon as one sensor column has at least this many gaps.
min_missing = 10

# Build the NaN mask first and sum it per station. This is vectorized and avoids
# groupby.apply on a frame that still contains the grouping column, which is
# deprecated in recent pandas releases.
missing_values_matrix = combined_df[feature_cols].isna().groupby(combined_df['station_code']).sum()
missing_values_matrix = missing_values_matrix[(missing_values_matrix >= min_missing).any(axis=1)]
missing_values_matrix.head(n=20)

Unnamed: 0_level_0,TSS_30MIN_MEAN,RSWR_30MIN_MEAN,TA_30MIN_MEAN,VW_30MIN_MEAN
station_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AMD2,0,0,0,64
ARO3,37027,37027,0,37027
FNH2,1024,0,0,0
KLO2,0,0,0,39062
LAG3,173040,173040,0,0
RNZ2,16659,16659,0,0
TRU2,0,4,0,51202
TUM2,38356,38356,38356,38356
WFJ2,119927,119927,119927,119927


In [23]:
def _label_station_plot(station_code, col):
    """Apply the shared axis labels, legend and title, then render the current figure."""
    plt.xlabel('Measure Date')
    plt.ylabel(col)
    plt.legend()
    plt.title(station_code)
    plt.show()


def display_missing_values(station_code, col, show_imputations=False):
    """Plot one sensor column of one station and highlight where it is missing.

    Parameters
    ----------
    station_code : str
        Value of ``station_code`` used to select rows from ``combined_df``.
    col : str
        Name of the column to plot.
    show_imputations : bool, default False
        When True and the column has gaps, draw a second figure that overlays
        the output of ``replicate_seasonal_pattern`` up to the last missing
        timestamp.

    Returns
    -------
    None
        The figures are rendered with ``plt.show()``.
    """
    station = combined_df[combined_df['station_code'] == station_code][['measure_date', col]].copy()
    # Index by parsed timestamp; the original 'measure_date' column is kept as-is.
    station = station.set_index(pd.to_datetime(station['measure_date']))

    is_missing = station[col].isna()
    has_missing = bool(is_missing.any())

    plt.figure(figsize=(10, 5))
    plt.plot(station[col], label='Non-missing values')

    if has_missing:
        # 0 where the value is missing, NaN elsewhere. matplotlib breaks the line
        # at NaN, so each gap is drawn at its real position and x/y always have
        # the same length, even when the gaps are scattered through the series.
        gap_markers = is_missing.map({True: 0.0, False: float('nan')})
        # The small marker keeps isolated single-sample gaps visible.
        plt.plot(gap_markers, color='red', marker='.', markersize=2, label='Missing values')

    _label_station_plot(station_code, col)

    # Without gaps there is nothing to impute (and no "last missing" timestamp),
    # so the second figure is skipped instead of failing on an empty index.
    if show_imputations and has_missing:
        plt.figure(figsize=(10, 5))
        plt.plot(station[col], label='Non-missing values')

        # NOTE(review): assumes replicate_seasonal_pattern returns an object whose
        # [col] aligns with station's index -- confirm against src.functions.
        station['replicated'] = replicate_seasonal_pattern(station, col)[col]
        last_missing = is_missing[is_missing].index[-1]
        plt.plot(station.loc[:last_missing, 'replicated'], color='orange', label='Imputed Values')

        _label_station_plot(station_code, col)

In [None]:
# Draw the gap overview for every (station, sensor column) pair in the matrix.
station_column_pairs = [
    (station_code, column)
    for station_code in missing_values_matrix.index
    for column in missing_values_matrix.columns
]
for station_code, column in station_column_pairs:
    display_missing_values(station_code, column, show_imputations=False)

In [38]:
def check_consecutive_missing_values(station_code, col):
    """Report whether all missing values of a column sit in one run at the start.

    Prints one line with the station, the column, whether every missing value
    belongs to the leading run, the timestamp of the first non-missing value
    (``None`` if there is none), the total number of missing values and the
    length of the leading run of missing values.

    Parameters
    ----------
    station_code : str
        Value of ``station_code`` used to select rows from ``combined_df``.
    col : str
        Name of the column to inspect.

    Returns
    -------
    None
    """
    station = combined_df[combined_df['station_code'] == station_code][['measure_date', col]].copy()
    station = station.set_index(pd.to_datetime(station['measure_date']))

    is_missing = station[col].isna()
    total_missing = int(is_missing.sum())

    # The running product stays 1 only while every value seen so far is missing,
    # so its sum is the length of the leading run. This is positional, so it also
    # handles all-missing and empty series, where idxmin() would mislead or raise.
    leading_missing = int(is_missing.astype(int).cumprod().sum())

    if leading_missing < len(station):
        first_non_missing_idx = station.index[leading_missing]
    else:
        first_non_missing_idx = None

    # The gaps are confined to the start only if the leading run accounts for all
    # of them. Testing the leading slice itself is always True by construction.
    missing_at_start = leading_missing == total_missing

    print(
        f'Station: {station_code}, Column: {col}, Missing: {missing_at_start}, '
        f'First idx: {first_non_missing_idx}, Missing len: {total_missing}, '
        f'Missing cons len: {leading_missing}'
    )

# Run the leading-gap check for each station and sensor column in the matrix.
for station_code in missing_values_matrix.index:
    for column in missing_values_matrix.columns:
        check_consecutive_missing_values(station_code, column)

Station: AMD2, Column: TSS_30MIN_MEAN, Missing: True, First idx: 1997-10-14 17:00:00+00:00, Missing len: 0, Missing cons len: 0
Station: AMD2, Column: RSWR_30MIN_MEAN, Missing: True, First idx: 1997-10-14 17:00:00+00:00, Missing len: 0, Missing cons len: 0
Station: AMD2, Column: TA_30MIN_MEAN, Missing: True, First idx: 1997-10-14 17:00:00+00:00, Missing len: 0, Missing cons len: 0
Station: AMD2, Column: VW_30MIN_MEAN, Missing: True, First idx: 1997-10-16 01:00:00+00:00, Missing len: 64, Missing cons len: 64
Station: ARO3, Column: TSS_30MIN_MEAN, Missing: True, First idx: 1998-11-11 08:30:00+00:00, Missing len: 37027, Missing cons len: 37027
Station: ARO3, Column: RSWR_30MIN_MEAN, Missing: True, First idx: 1998-11-11 08:30:00+00:00, Missing len: 37027, Missing cons len: 37027
Station: ARO3, Column: TA_30MIN_MEAN, Missing: True, First idx: 1996-09-30 23:00:00+00:00, Missing len: 0, Missing cons len: 0
Station: ARO3, Column: VW_30MIN_MEAN, Missing: True, First idx: 1998-11-11 08:30:00+00: