# Build datasets

Convert raw stock data into datasets for AlphaNet.

In [6]:
import os
import numpy as np
import pandas as pd
import h5py

# Directory scanned by load_data() for per-stock csv files.
DATA_DIR = "test"

# Directory passed to save_data() and the matching predictor window length.
# Each SAVE_DIR/INPUT pair below belongs together; enable exactly one pair.
SAVE_DIR = "test_60"
INPUT = 60  # minutes as predictor
#SAVE_DIR = "test_30"
#INPUT = 30  # minutes as predictor
#SAVE_DIR = "test_120"
#INPUT = 120  # minutes as predictor
OUTPUT = 1  # minutes as target
# Both values below are forwarded to preprocess() by the driver loop.
SHUFFLE = False  # shuffle dataset or not
SEED = 123       # shuffle seed

In [7]:
def load_data(dir):
    r"""
    Load stock data from the csv files in `dir`.

    Each file is read with pandas, its "Datetime" column is dropped, and the
    table is transposed so that the remaining csv columns become rows.

    Parameters
    ----------
    dir: str
        Folder containing stock data. Each stock is in a csv file.

    Returns
    -------
    value: generator
        A generator with elements like (name, data, number_of_null), where
        `name` is the file name without its ".csv" extension, `data` is the
        transposed DataFrame and `number_of_null` is the count of null cells.

    Raises
    ------
    KeyError
        If a csv file has no "Datetime" column.
    """
    # os.listdir() returns entries in arbitrary order; sort so that the
    # processing order is reproducible across runs and machines.
    for file in sorted(os.listdir(dir)):
        # Match the real extension; a bare "csv" suffix would also accept
        # names such as "foo.notcsv".
        if not file.endswith(".csv"):
            continue
        dt = pd.read_csv(os.path.join(dir, file)).drop(columns=["Datetime"])
        # dt["return"] = -np.log(dt["return"])
        # dt["free_turn"] = -np.log(dt["free_turn"])
        dt = dt.transpose()
        # splitext removes only the final extension, so names that contain
        # dots themselves (e.g. "BRK.B_test.csv") are not truncated.
        name = os.path.splitext(file)[0]
        null = dt.isnull().values.sum()
        yield (name, dt, null)

def preprocess(dt, shuffle=False, seed=None, *, n_input=None, n_output=None):
    r"""
    Cut the given data into sliding-window samples.

    Sample `i` uses columns `i .. i+n_input-1` of every row as predictors and
    the following `n_output` values of the "return" row as targets.

    Parameters
    ----------
    dt: pd.DataFrame
        Raw data with one row per feature and one column per time step.
        Must contain a row labelled "return".
    shuffle: bool, optional
        Shuffle data or not. X and y are shuffled with the same permutation.
    seed: shuffle seed, optional
    n_input: int, optional
        Number of time steps per predictor window. Defaults to the
        module-level `INPUT`.
    n_output: int, optional
        Number of time steps per target. Defaults to the module-level
        `OUTPUT`.

    Returns
    -------
    X: np.ndarray
        Dataset X for fitting, shape (n_samples, n_rows, n_input).
    y: np.ndarray
        Dataset y for fitting, shape (n_samples, n_output).

    Raises
    ------
    ValueError
        If `dt` has too few columns to build a single sample.
    """
    if n_input is None:
        n_input = INPUT
    if n_output is None:
        n_output = OUTPUT

    # NOTE(review): range(n) stops one window short of the last complete
    # (input, target) pair; kept as-is so dataset sizes stay unchanged.
    n = dt.shape[1] - n_input - n_output
    if n <= 0:
        raise ValueError(
            f"Need more than {n_input + n_output} time steps to build a "
            f"sample, got {dt.shape[1]}."
        )

    # Convert to numpy once instead of once per window.
    values = dt.to_numpy()
    target = dt.loc["return"].to_numpy()

    X = np.stack([values[:, i:i + n_input] for i in range(n)], axis=0)
    y = np.stack(
        [target[i + n_input:i + n_input + n_output] for i in range(n)],
        axis=0,
    )

    if shuffle:
        rng = np.random.default_rng(seed=seed)
        # One shared permutation keeps every sample paired with its target;
        # shuffling X and y separately would draw two different orders.
        order = rng.permutation(n)
        X = X[order]
        y = y[order]
    return X, y

def save_data(dir, name, X, y):
    r"""
    Save dataset as HDF5 file.

    Writes `X` and `y` as chunked datasets named "X" and "y" into
    `<dir>/<name>.hdf5`, overwriting any existing file, and prints a
    confirmation message.

    Parameters
    ----------
    dir: str
        Output folder. Created if it does not exist yet. An empty string
        means the current working directory.
    name: str
        File name without extension.
    X: np.ndarray
        Predictor array.
    y: np.ndarray
        Target array.
    """
    # Opening the file in 'w' mode fails when the folder is missing, so make
    # sure it exists first. An empty `dir` needs no folder to be created.
    if dir:
        os.makedirs(dir, exist_ok=True)
    path = os.path.join(dir, f"{name}.hdf5")
    with h5py.File(path, 'w') as f:
        f.create_dataset("X", data=X, chunks=True)
        f.create_dataset("y", data=y, chunks=True)
    print(f"Dataset {name} Saved.")

In [8]:
# Build and store one dataset per stock file found in DATA_DIR.
for name, dt, null in load_data(DATA_DIR):
    # Files with missing values are reported and skipped entirely.
    if null > 0:
        print(f"Data {name} has NULL values. Pass.")
        continue

    X, y = preprocess(dt, shuffle=SHUFFLE, seed=SEED)
    print(X.shape, y.shape)
    save_data(SAVE_DIR, name, X, y)

(8099, 7, 60) (8099, 1)
Dataset AR_test Saved.
(7628, 7, 60) (7628, 1)
Dataset BAH_test Saved.
(8080, 7, 60) (8080, 1)
Dataset FTI_test Saved.
(5190, 7, 60) (5190, 1)
Dataset HII_test Saved.
(7783, 7, 60) (7783, 1)
Dataset LMT_test Saved.
(4815, 7, 60) (4815, 1)
Dataset MLI_test Saved.
(7244, 7, 60) (7244, 1)
Dataset NFE_test Saved.
(7524, 7, 60) (7524, 1)
Dataset NOC_test Saved.
(8111, 7, 60) (8111, 1)
Dataset PBR_test Saved.
(8021, 7, 60) (8021, 1)
Dataset STLD_test Saved.
