In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

from src.functions import replicate_seasonal_pattern

In [None]:
# Load every per-station CSV under `test_path` and stack them into one long frame,
# tagging each row with the station code taken from its file name.
test_path = '../data/labeled'

# os.listdir returns entries in arbitrary order and includes non-CSV entries;
# keep only CSV files and sort them so the row order is reproducible.
files = sorted(f for f in os.listdir(test_path) if f.lower().endswith('.csv'))

dfs = []
for f in files:
    df = pd.read_csv(os.path.join(test_path, f))
    # The file name without its extension is used as the station identifier.
    df['station_code'] = os.path.splitext(f)[0]
    dfs.append(df)

if not dfs:
    raise FileNotFoundError(f'No CSV files found in {test_path!r}')

# Concatenate once: calling pd.concat inside the loop re-copies every
# previously loaded row on each iteration.
combined_df = pd.concat(dfs, ignore_index=True)

# Keep only the columns used by the rest of the notebook.
combined_df = combined_df[['station_code', 'measure_date', 'HS', 'TSS_30MIN_MEAN', 'RSWR_30MIN_MEAN', 'TA_30MIN_MEAN', 'VW_30MIN_MEAN']]

In [None]:
# Per-station count of missing readings for each sensor column.
sensor_cols = ['TSS_30MIN_MEAN', 'RSWR_30MIN_MEAN', 'TA_30MIN_MEAN', 'VW_30MIN_MEAN']
min_missing = 10  # a station is kept when at least one sensor column reaches this count

# Summing the boolean NaN mask per station gives the same counts as
# groupby(...).apply(lambda g: g.isna().sum()) without running a Python callback per
# group and without applying the function to the grouping column itself.
missing_values_matrix = combined_df[sensor_cols].isna().groupby(combined_df['station_code']).sum()

# Drop stations where every sensor column has fewer than `min_missing` missing values.
missing_values_matrix = missing_values_matrix[(missing_values_matrix >= min_missing).any(axis=1)]
missing_values_matrix.head(n=20)

In [None]:
def _finish_station_plot(station_code, col):
    """Add axis labels, legend and title to the current figure, then show it."""
    plt.xlabel('Measure Date')
    plt.ylabel(col)
    plt.legend()
    plt.title(station_code)
    plt.show()


def display_missing_values(station_code, col, show_imputations=False):
    """Plot one station's series for ``col`` and mark the timestamps where it is missing.

    Parameters
    ----------
    station_code : str
        Value of ``combined_df['station_code']`` that selects the station.
    col : str
        Name of the ``combined_df`` column to plot.
    show_imputations : bool, default False
        When true and the series has missing values, draw a second figure that
        overlays the output of ``replicate_seasonal_pattern`` on the observed series.

    Returns
    -------
    None
        The figures are rendered with ``plt.show()``.

    Notes
    -----
    Reads the module-level ``combined_df``.
    """
    station = combined_df[combined_df['station_code'] == station_code][['measure_date', col]].copy()
    station = station.set_index(pd.to_datetime(station['measure_date']))

    is_missing = station[col].isna()
    missing_values = station[is_missing]

    plt.figure(figsize=(10, 5))
    plt.plot(station[col], label='Non-missing values')

    if len(missing_values):
        # 0 at every missing timestamp, NaN elsewhere, so x and y always have the
        # same length. Plotting all timestamps up to the last gap against one zero
        # per missing value only lines up when the gaps form a single leading run.
        missing_marker = is_missing.map({True: 0.0, False: float('nan')})
        # The marker keeps isolated single-point gaps visible; a bare line would
        # draw nothing for a point surrounded by NaN.
        plt.plot(missing_marker, color='red', marker='.', label='Missing values')

    _finish_station_plot(station_code, col)

    # Without missing values there is nothing to impute, and indexing the last
    # missing timestamp would raise IndexError.
    if show_imputations and len(missing_values):
        plt.figure(figsize=(10, 5))
        plt.plot(station[col], label='Non-missing values')

        station['replicated'] = replicate_seasonal_pattern(station, col)[col]
        # Show the replicated series from the start up to the last missing timestamp.
        plt.plot(station[:missing_values.index[-1]]['replicated'], color='orange', label='Imputed Values')

        _finish_station_plot(station_code, col)

In [None]:
def check_consecutive_missing_values(station_code, col):
    """Print whether every missing value of ``col`` for a station sits in one run at the start.

    Parameters
    ----------
    station_code : str
        Value of ``combined_df['station_code']`` that selects the station.
    col : str
        Name of the ``combined_df`` column to inspect.

    Returns
    -------
    None
        One line is printed with, in order: station code, column name, whether all
        missing values form a single leading run (False when nothing is missing),
        the timestamp of the first non-missing row (None when every row is missing),
        the total number of missing values, and the length of the leading missing run.

    Notes
    -----
    Reads the module-level ``combined_df``. Rows are inspected in the order they
    appear in that frame.
    """
    station = combined_df[combined_df['station_code'] == station_code][['measure_date', col]].copy()
    station = station.set_index(pd.to_datetime(station['measure_date']))

    is_missing = station[col].isna()
    total_missing = int(is_missing.sum())

    # The running product stays 1 only while every row so far is missing, so its
    # sum is the length of the leading run of missing values.
    leading_missing = int(is_missing.astype(int).cumprod().sum())

    # Positional lookup: idxmin would return the first label when *all* rows are
    # missing, which is not a non-missing row.
    if leading_missing < len(is_missing):
        first_non_missing_idx = is_missing.index[leading_missing]
    else:
        first_non_missing_idx = None

    # Comparing the leading run with the total is what makes this flag informative;
    # testing only the rows before the first non-missing one is true by construction.
    missing_at_start = total_missing > 0 and leading_missing == total_missing

    print(station_code, col, missing_at_start, first_non_missing_idx, total_missing, leading_missing)

In [None]:
# Print the missing-value summary for every station / column pair in the matrix.
for index in missing_values_matrix.index:
    for col in missing_values_matrix.columns:
        check_consecutive_missing_values(station_code=index, col=col)

In [None]:
# Plot every station / column pair whose entry in the matrix is non-zero.
for index, missing_counts in missing_values_matrix.iterrows():
    for col, n_missing in missing_counts.items():
        if n_missing != 0:
            display_missing_values(index, col, show_imputations=False)