# Build datasets

Convert stock data into datasets for AlphaNet.

In [7]:
import os
import numpy as np
import pandas as pd
import h5py

# Sliding-window configuration (alternative INPUT settings: 60, 120).
INPUT = 30       # minutes as predictor
OUTPUT = 1       # minutes as target
SHUFFLE = False  # shuffle dataset or not
SEED = 123       # shuffle seed

# Folder holding the source csv files, and the output folders, which are
# tagged with the window length so different settings do not overwrite
# each other.
DATA_DIR = "train_valid"
TRAIN_DIR = f"train_{INPUT}"
VALID_DIR = f"valid_{INPUT}"

In [8]:
def load_data(dir):
    r"""
    Load stock data from the csv files in `dir`.

    Parameters
    ----------
    dir: str
        Folder containing stock data. Each stock is in a csv file.

    Returns
    -------
    value: generator
        A generator with elements like (name, data, number_of_null).
        `name` is the file name without its ".csv" extension, `data` is the
        transposed table (first csv column used as index, "timestamp"
        column removed) and `number_of_null` is the number of missing
        cells in `data`.
    """
    # Sorted so the processing order does not depend on the file system.
    for file in sorted(os.listdir(dir)):
        # Match the real extension; a bare "csv" suffix would also accept
        # names that merely end in those three letters.
        if not file.endswith(".csv"):
            continue
        dt = pd.read_csv(os.path.join(dir, file), index_col=0) \
            .drop(columns=["timestamp"])
        dt = dt.transpose()
        # Strip only the extension so dotted names (e.g. "BRK.B_train")
        # stay whole instead of being cut at the first dot.
        name = os.path.splitext(file)[0]
        null = dt.isnull().values.sum()
        yield (name, dt, null)

def preprocess(dt, shuffle=False, seed=None, *, n_input=None, n_output=None):
    r"""
    Cut the data into sliding windows (X) and their following targets (y).

    Sample `i` uses columns `i .. i+n_input-1` of every row as predictors
    and the next `n_output` values of the "return" row as target.

    Parameters
    ----------
    dt: pd.DataFrame
        Raw data, one row per feature and one column per time step.
        Must contain a row labelled "return".
    shuffle: bool, optional
        Shuffle data or not. X and y are shuffled with the same
        permutation so every sample keeps its own target.
    seed: shuffle seed, optional
    n_input: int, optional
        Window length used as predictor. Defaults to the module-level
        `INPUT`.
    n_output: int, optional
        Number of steps used as target. Defaults to the module-level
        `OUTPUT`.

    Returns
    -------
    X: np.ndarray
        Dataset X for fitting, shape (n_samples, n_rows, n_input).
    y: np.ndarray
        Dataset y for fitting, shape (n_samples, n_output).

    Raises
    ------
    ValueError
        If `dt` has too few columns to build a single sample.
    """
    # Fall back to the module configuration so existing calls are unchanged.
    if n_input is None:
        n_input = INPUT
    if n_output is None:
        n_output = OUTPUT

    # NOTE(review): range(n) stops one window short of the last complete
    # one (i == n would still fit). Kept as-is so sample counts match
    # datasets that were generated earlier.
    n = dt.shape[1] - n_input - n_output
    if n <= 0:
        raise ValueError(
            f"Need more than {n_input + n_output} time steps to build a "
            f"sample, got {dt.shape[1]}."
        )

    # Convert once; slicing the arrays is much cheaper than slicing the
    # DataFrame on every iteration.
    values = dt.to_numpy()
    target = dt.loc["return"].to_numpy()

    X = np.stack([values[:, i:i + n_input] for i in range(n)], axis=0)
    y = np.stack(
        [target[i + n_input:i + n_input + n_output] for i in range(n)],
        axis=0,
    )
    if shuffle:
        rng = np.random.default_rng(seed=seed)
        # One shared permutation: shuffling X and y in two separate calls
        # would reorder them differently and break the X/y pairing.
        order = rng.permutation(n)
        X = X[order]
        y = y[order]
    return X, y

def save_data(dir, name, X, y):
    r"""
    Save dataset as HDF5 file.

    Parameters
    ----------
    dir: str
        Output folder. Created (including parents) if it does not exist.
    name: str
        Dataset name; the file is written as `<dir>/<name>.hdf5`.
    X: np.ndarray
        Stored in the file under the key "X".
    y: np.ndarray
        Stored in the file under the key "y".
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir, exist_ok=True)
    # Mode 'w' replaces any file already present under the same name.
    with h5py.File(os.path.join(dir, f"{name}.hdf5"), 'w') as f:
        f.create_dataset("X", data=X, chunks=True)
        f.create_dataset("y", data=y, chunks=True)
    print(f"Dataset {name} Saved.")

In [9]:
# Turn every stock file into an HDF5 dataset, routed by its name suffix.
for name, dt, null in load_data(DATA_DIR):
    # Files with missing values are reported and left out.
    if null > 0:
        print(f"Data {name} has NULL values. Pass.")
        continue

    X, y = preprocess(dt, shuffle=SHUFFLE, seed=SEED)
    print(X.shape, y.shape)

    # First matching suffix decides the output folder; names matching
    # neither suffix are not saved.
    for suffix, out_dir in (("train", TRAIN_DIR), ("valid", VALID_DIR)):
        if name.endswith(suffix):
            save_data(out_dir, name, X, y)
            break

(60533, 7, 30) (60533, 1)
Dataset AR_train Saved.
(18028, 7, 30) (18028, 1)
Dataset AR_valid Saved.
(46829, 7, 30) (46829, 1)
Dataset BAH_train Saved.
(13080, 7, 30) (13080, 1)
Dataset BAH_valid Saved.
(58857, 7, 30) (58857, 1)
Dataset FTI_train Saved.
(16737, 7, 30) (16737, 1)
Dataset FTI_valid Saved.
(29220, 7, 30) (29220, 1)
Dataset HII_train Saved.
(7937, 7, 30) (7937, 1)
Dataset HII_valid Saved.
(52782, 7, 30) (52782, 1)
Dataset LMT_train Saved.
(15032, 7, 30) (15032, 1)
Dataset LMT_valid Saved.
(25538, 7, 30) (25538, 1)
Dataset MLI_train Saved.
(6571, 7, 30) (6571, 1)
Dataset MLI_valid Saved.
(57542, 7, 30) (57542, 1)
Dataset NFE_train Saved.
(14528, 7, 30) (14528, 1)
Dataset NFE_valid Saved.
(44545, 7, 30) (44545, 1)
Dataset NOC_train Saved.
(12183, 7, 30) (12183, 1)
Dataset NOC_valid Saved.
(67865, 7, 30) (67865, 1)
Dataset PBR_train Saved.
(19898, 7, 30) (19898, 1)
Dataset PBR_valid Saved.
(71834, 7, 30) (71834, 1)
Dataset STLD_train Saved.
(16626, 7, 30) (16626, 1)
Dataset ST