In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd

In [2]:
def truncate_at_first_valid_index(df):
    """Drop the leading rows that come before every column has started.

    For each column the label of its first non-missing value is looked up.
    The frame is then sliced by label (inclusive) from the largest of those
    labels through to the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose leading rows should be trimmed.

    Returns
    -------
    pandas.DataFrame
        ``df.loc[start:]``, where ``start`` is the maximum of the columns'
        first valid index labels.
    """
    def _first_valid(column):
        # Label of the first non-missing entry in this column.
        return column.first_valid_index()

    start_label = df.apply(_first_valid).max()
    return df.loc[start_label:]

In [3]:
# Build the benchmark CSVs: read every per-station file from `read_path`,
# reduce it to daily values, and write it to the train or test directory
# depending on the station code (taken from the file name).
dfs = []  # one daily frame per station, indexed by measure_date

read_path = '../data/labeled'
test_path = '../data/labeled_benchmark/test'
train_path = '../data/labeled_benchmark/train'

# Stations written to `test_path`; every other station goes to `train_path`.
test_stations = {'STN2', 'SHE2', 'TRU2', 'WFJ2', 'SLF2', 'KLO2'}

cols = [
    'HS',
    'TSS_30MIN_MEAN',
    'RSWR_30MIN_MEAN',
    'TA_30MIN_MEAN',
    'VW_30MIN_MEAN',
    'no_snow'
]


def _most_frequent(values):
    """Return the most frequent non-missing value, or NaN if there is none.

    `resample('D')` also emits bins for days without any rows; calling
    `idxmax()` on the resulting empty `value_counts()` would raise.
    """
    counts = values.value_counts()
    return counts.idxmax() if len(counts) else float('nan')


# Every column is averaged per day, except `no_snow`, which takes the
# most frequent value of the day.
aggregations = {c: 'mean' for c in cols}
aggregations['no_snow'] = _most_frequent

# `to_csv` does not create missing directories.
for out_dir in (test_path, train_path):
    os.makedirs(out_dir, exist_ok=True)

# Only CSV files, in a deterministic order (os.listdir order is arbitrary and
# may include entries such as `.ipynb_checkpoints`).
files = sorted(
    name for name in os.listdir(read_path) if name.lower().endswith('.csv')
)

for f in files:
    station_code = os.path.splitext(f)[0]

    df = pd.read_csv(os.path.join(read_path, f))
    df['measure_date'] = pd.to_datetime(df['measure_date'])
    df = df.set_index('measure_date')[cols]

    # 1. Truncate values at first index
    df = truncate_at_first_valid_index(df)

    df = df.resample('D').agg(aggregations)

    # 2. Replicate seasonal pattern. Only enable if 1. is disabled
    # for col in cols[:-1]:
        # df[col] = replicate_seasonal_pattern(df, col)[col]
        # df[col] = df[col].fillna(df[col].mean())

    df['station_code'] = station_code

    # Keep the date-indexed frame in memory. `reset_index()` below returns a
    # new frame, so the one stored in `dfs` keeps its measure_date index.
    dfs.append(df)

    save_path = test_path if station_code in test_stations else train_path
    # Same file layout as before: measure_date is a regular column and the
    # default integer index is written as the unnamed first column.
    df.reset_index().to_csv(os.path.join(save_path, f))