In [1]:
import os
import random

import numpy as np
import pandas as pd


def merge_into_one(path='../data/labeled_daily', output='merged.csv'):
    """Concatenate every CSV file in ``path`` and write the result to ``output``.

    Files are read in sorted name order so the merged file is reproducible
    (``os.listdir`` returns entries in arbitrary order). Entries that do not
    end in ``.csv`` are skipped.

    Args:
        path: Directory holding the per-station CSV files.
        output: Destination path of the merged CSV.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If ``path`` contains no CSV files.
    """
    csv_names = sorted(name for name in os.listdir(path) if name.endswith('.csv'))
    if not csv_names:
        raise ValueError(f'No CSV files found in {path!r}')

    frames = [pd.read_csv(os.path.join(path, name)) for name in csv_names]

    # The default index is still written (each file's own row numbers), exactly
    # as before, so existing readers of merged.csv see the same layout.
    pd.concat(frames).to_csv(output)


# Needs the '../data/labeled_daily' directory to exist relative to the working
# directory; the recorded run of this cell failed with FileNotFoundError
# because it did not.
merge_into_one()

FileNotFoundError: [Errno 2] No such file or directory: '../data/labeled_daily'

In [None]:
def resample_and_save(read_path='../data/labeled', save_path='../data/labeled_daily'):
    """Resample every station CSV in ``read_path`` to daily data and save it.

    Each CSV file is read, tagged with a ``station_code`` column taken from the
    file name without its extension, passed through ``preprocess`` with
    ``resample='D'``, and written under the same file name in ``save_path``.

    Args:
        read_path: Directory containing the raw per-station CSV files.
        save_path: Directory receiving the resampled files; created if missing.

    Raises:
        FileNotFoundError: If ``read_path`` does not exist.
    """
    # List first so a missing input directory fails before anything is created.
    names = sorted(os.listdir(read_path))
    os.makedirs(save_path, exist_ok=True)

    for name in names:
        # Skip stray entries (notes, checkpoints, ...) instead of parsing them as CSV.
        if not name.endswith('.csv'):
            continue

        station = pd.read_csv(os.path.join(read_path, name))
        station['station_code'] = os.path.splitext(name)[0]

        station = preprocess(station, resample='D')

        station.to_csv(os.path.join(save_path, name))

In [None]:
def load_stations(path, count, shuffle=False):
    """Load up to ``count`` station CSV files from ``path`` into one DataFrame.

    Only ``.csv`` entries are considered, and they are filtered *before* the
    first ``count`` are taken, so unrelated files in the directory neither use
    up the budget nor appear in the returned file lists.

    Args:
        path: Directory containing the per-station CSV files.
        count: Number of station files to load.
        shuffle: If true, pick the files in random order (uses the ``random``
            module's global state); otherwise in sorted name order.

    Returns:
        A tuple ``(data, used, remaining)``: the concatenated DataFrame with an
        added ``station_code`` column (file name without extension), the list
        of file names that were loaded, and the list of CSV files left over.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If no CSV file is selected (empty directory or ``count == 0``).
    """
    # Sort even when shuffling: os.listdir order is arbitrary, so shuffling an
    # unsorted list would not be reproducible under a fixed random seed.
    csv_names = sorted(name for name in os.listdir(path) if name.endswith('.csv'))
    if shuffle:
        random.shuffle(csv_names)

    selected, remaining = csv_names[:count], csv_names[count:]
    if not selected:
        raise ValueError(f'No CSV station files selected from {path!r}')

    frames = []
    for name in selected:
        station = pd.read_csv(os.path.join(path, name))
        station['station_code'] = os.path.splitext(name)[0]
        frames.append(station)

    return pd.concat(frames, ignore_index=True), selected, remaining

In [None]:
def load_station(path, station_name, resample=None):
    """Load a single station CSV and optionally resample it.

    Args:
        path: Directory containing the station file.
        station_name: File name of the station CSV (including extension).
        resample: Optional resampling rule (e.g. ``'D'``). When given, the
            frame is passed through ``preprocess(df, resample=resample)``, so
            the result is restricted to the columns ``preprocess`` keeps.

    Returns:
        The station DataFrame with an added ``station_code`` column holding the
        file name without its extension.
    """
    df = pd.read_csv(os.path.join(path, station_name))
    df['station_code'] = os.path.splitext(station_name)[0]

    if resample is not None:
        # The previous code called the `resample` argument itself (a rule
        # string), which raised TypeError; delegate to the notebook's
        # resampling routine instead.
        df = preprocess(df, resample=resample)

    return df

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def _most_extreme(values):
    """Return the element of ``values`` farthest from their mean (NaN if empty)."""
    if values.empty:
        return np.nan
    return values.iloc[np.argmax(np.abs(values.values - values.mean()))]


def _majority(values):
    """Return the most frequent element of ``values`` (NaN if empty)."""
    if values.empty:
        return np.nan
    return values.value_counts().idxmax()


def preprocess(df, scale=False, resample=None):
    """Select, clean and optionally scale / resample station measurements.

    Args:
        df: DataFrame with at least the columns ``station_code``,
            ``measure_date``, ``HS``, ``no_snow`` and ``anomaly``. It is not
            modified.
        scale: If true, min-max scale ``HS`` over the whole frame (all
            stations together).
        resample: Optional resampling rule (e.g. ``'D'``). When given, rows are
            aggregated per station and period: ``HS`` keeps the value farthest
            from the period mean, ``no_snow`` the most frequent value, and
            ``anomaly`` is true if any row in the period is true.

    Returns:
        A new DataFrame with the five columns above, ``measure_date`` made
        timezone-naive and rows containing NaN removed. Periods without any
        measurement are omitted from the resampled result.
    """
    data = df[['station_code', 'measure_date', 'HS', 'no_snow', 'anomaly']].copy()
    data['measure_date'] = pd.to_datetime(data['measure_date']).dt.tz_localize(None)
    data.dropna(inplace=True)  # operates on the local copy only

    if scale:
        # TODO: Could try standardizing (StandardScaler) per station and then scaling.
        data['HS'] = MinMaxScaler().fit_transform(data[['HS']])

    if resample:
        data = (
            data.set_index('measure_date')
            .groupby('station_code')
            .resample(resample)
            .agg({
                'HS': _most_extreme,
                'no_snow': _majority,  # TODO: Could try x.any() to improve predictions?
                'anomaly': lambda x: x.any(),
            })
            # Resampling emits a bin for every period between a station's first
            # and last measurement; bins with no rows used to crash the
            # aggregators. They now yield NaN and are dropped here, which keeps
            # the "no NaN in the output" property established above.
            .dropna(subset=['HS'])
            .reset_index()
        )

    return data

In [None]:
from keras.callbacks import TensorBoard
from datetime import datetime

# Run configuration. Every value below is embedded in `model_name`, so each
# TensorBoard log directory records the settings it was produced with.
# NOTE(review): the meanings in the trailing comments are inferred from the
# names only; confirm against the training cells.
SEQ_SIZE = 20                 # presumably the length of each input sequence
EPOCHS = 50                   # presumably the number of training epochs
TRAIN_SIZE = 3                # presumably the number of stations used for training
VALIDATION_PERCENTAGE = 0.3   # presumably the fraction held out for validation
BATCH_SIZE = 64               # not part of the run name
SHUFFLE_SEQS = False
SCALE = True

# Timestamp prefix keeps successive runs in separate log directories.
current_time = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
model_name = "_".join([
    current_time,
    f"epochs-{EPOCHS}",
    f"seq-{SEQ_SIZE}",
    f"train-{TRAIN_SIZE}",
    f"test-{VALIDATION_PERCENTAGE}",
    f"shuffle-seqs-{SHUFFLE_SEQS}",
    f"scale-{SCALE}",
])
log_dir = "../logs/" + model_name
tensorboard_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)