In [42]:
#from build_dataset_new import build_dataset
import pandas as pd
import numpy as np

In [91]:
import numpy as np
import pandas as pd

def get_sliding_window(data, window_size, slide=1):
    """Get sliding windows over the first axis of an array.

    Args:
        data: an n-dimensional ndarray of shape (d1, d2, ..., dn)
        window_size: number of consecutive entries along the first axis
            that make up one window
        slide: how many entries the window start advances between two
            consecutive windows (1 gives fully overlapping windows)

    Returns:
        An (n+1)-dimensional ndarray of shape
        (num_windows, window_size, d2, ..., dn) where result[i] is one
        window. The first ``d1 % slide`` entries of data are skipped before
        windowing. If no complete window fits, the first dimension is 0.
        The result is a strided view that shares memory with data (no copy
        is made).

    Raises:
        ValueError: if slide is smaller than 1.
    """
    if slide < 1:
        raise ValueError("slide must be a positive integer")

    # Drop leading entries so the remaining length is a multiple of slide.
    data_start = data.shape[0] % slide
    trimmed = data[data_start:]

    # Count only the windows that lie completely inside the trimmed data;
    # anything more would make as_strided read memory past the buffer.
    usable = trimmed.shape[0]
    if usable >= window_size:
        num_windows = (usable - window_size) // slide + 1
    else:
        num_windows = 0

    # data.shape[1:] is () for 1-D input, so one expression covers all ranks.
    shape = (num_windows, window_size) + trimmed.shape[1:]
    # Stepping one window forward moves `slide` entries along the first axis;
    # inside a window the original strides apply unchanged.
    strides = (trimmed.strides[0] * slide,) + trimmed.strides
    return np.lib.stride_tricks.as_strided(trimmed, shape, strides)

def get_moving_avg(stock_data, stock_code, column, n, skip_last = 0, **kwargs):
    """Compute the moving average of one column of a stock's data.

    Args:
        stock_data: a dict with stock codes as keys and the corresponding
            stock data (as returned by get_stock_data) as values
        stock_code: the stock whose data is read from stock_data
        column: name of the column to average
        n: the window size of the moving average
        skip_last: number of rows at the end of the column to leave out
            before windowing; 0 keeps every row
        **kwargs: ignored, so that a whole feature config dict (including
            its "type" key) can be passed with ``**config``

    Returns:
        An ndarray of shape (num_windows, 1); each row is the mean of one
        window of n consecutive values (assuming the column is 1-D).
    """
    values = stock_data[stock_code][column].values
    # values[:-0] would be empty, so only slice when there is something to drop.
    target = values[:-skip_last] if skip_last > 0 else values

    return get_sliding_window(target, n).mean(axis=1).reshape(-1,1)
    
def get_lookback(stock_data, stock_code, column, n, skip_last = 0, **kwargs):
    """Get windows of n consecutive values of one column of a stock's data.

    Args:
        stock_data: a dict with stock codes as keys and the corresponding
            stock data (as returned by get_stock_data) as values
        stock_code: the stock whose data is read from stock_data
        column: name of the column to take the values from
        n: how many consecutive values each window holds
        skip_last: number of rows at the end of the column to leave out
            before windowing; 0 keeps every row
        **kwargs: ignored, so that a whole feature config dict (including
            its "type" key) can be passed with ``**config``

    Returns:
        The result of get_sliding_window on the selected values: one window
        of n consecutive values per row.
    """
    values = stock_data[stock_code][column].values
    # values[:-0] would be empty, so only slice when there is something to drop.
    target = values[:-skip_last] if skip_last > 0 else values
    return get_sliding_window(target, n)

def get_index(stock_data, n, skip_last = 0, **kwargs):
    """Return the integers 1..n as a column vector of shape (n, 1).

    stock_data, skip_last and any extra keyword arguments are accepted so
    the function can be called like the other feature transforms, but they
    are not used.
    """
    first, last = 1, n
    sequence = np.arange(first, last + 1)
    return np.reshape(sequence, (-1, 1))

def get_stock_data(stock_codes):
    """Load the price table of every requested stock.

    Each stock is read from "data/stock_prices/<code>.csv" (relative to the
    current working directory) with the first CSV column as the index, and
    the row order of the file is reversed.
    NOTE(review): the reversal presumably turns newest-first files into
    oldest-first frames -- confirm against the CSV ordering.

    Args:
        stock_codes: <array_of_stock_codes_needed>

    Returns:
        a dict with stock codes as the keys and corresponding stock data as values

    """
    def load_reversed(code):
        frame = pd.read_csv("data/stock_prices/" + code + ".csv", index_col=0)
        return frame.iloc[::-1]

    return {code: load_reversed(code) for code in stock_codes}

def build_dataset(input_config, predict_n, training):
    """Build dataset.

    Args:
        input_config: A input config dict.
            Format:
            {
                "stock_codes": <array_of_stock_codes_needed_to_build_the_dataset>,
                "stock_code": "predicting stock code",
                "column": "predicting value column name",
                "config": [
                    {"type": "feature type", <other_feature_configs},
                    {"type": "feature type", <other_feature_configs},
                    ...
                ],
                "time_window": <optional, number of timesteps per sample>
            }
            Refer to train_models_sample.json.

        predict_n: Number of days of stock prices to predict
        training: True to get the training dataset, False to get the features for prediction.

    Returns:
        For training, a tuple (x, y): x is an m-data-by-n-features NumPy
        array (or m-data-by-t-timesteps-by-n-features when "time_window" is
        configured, e.g. for RNN/LSTM) and y is an
        m-data-by-predict-n-labels NumPy array.
        For prediction, only the feature array: the most recent feature
        rows taken predict_n rows apart (at most 10 of them), in their
        original order.

    Raises:
        ValueError: in training mode, if the target column has too few rows
            to build one label row for every feature row.
    """

    # Get all the stock data
    stock_data = get_stock_data(input_config["stock_codes"])
    target = stock_data[input_config["stock_code"]][input_config["column"]].values

    # Build feature vectors by applying transformations on dataset
    # specified in input_config
    transform = {
        "moving_avg": get_moving_avg,
        "lookback": get_lookback,
        "index_price": get_index,
    }

    # For training, the transforms are asked to leave out the last predict_n
    # rows, because those rows are used as labels below.
    extra_kwargs = {"skip_last": predict_n} if training else {}
    transformed_data = [
        transform[config["type"]](stock_data, **extra_kwargs, **config)
        for config in input_config["config"]
    ]
    dataset_size = min(feature.shape[0] for feature in transformed_data)

    # Keep the last dataset_size rows of every feature. An explicit start
    # index is used because feature[-0:] would return the whole array.
    features = [
        feature[feature.shape[0] - dataset_size:] for feature in transformed_data
    ]
    x = np.concatenate(features, axis=1)

    # Get a rolling time window if specified in config
    if "time_window" in input_config:
        time_window = input_config["time_window"]
        x = get_sliding_window(x, time_window)

    if training:
        # One label row (predict_n consecutive target values) per feature
        # row needs x.shape[0] + predict_n - 1 target values.
        y_size = x.shape[0] + predict_n - 1
        if y_size > target.shape[0]:
            raise ValueError(
                "not enough target rows (%d) to build %d label rows of length %d"
                % (target.shape[0], x.shape[0], predict_n)
            )
        # Explicit start index: target[-0:] would return the whole series.
        y = get_sliding_window(target[target.shape[0] - y_size:], predict_n)

        return x, y
    else:
        # Get non-overlapping windows, aligning to the end
        max_prediction_rows = 10
        return x[::-1][:predict_n * max_prediction_rows:predict_n][::-1]

In [92]:
# Dataset configuration: a single "index_price" feature with n=10, built
# from the GOOGL data and using its "adjusted_close" column as the target.
input_option = dict(
    config=[dict(type="index_price", n=10)],
    stock_codes=["GOOGL"],
    stock_code="GOOGL",
    column="adjusted_close",
)

In [93]:
# Build the training features (x) and labels (y) for predicting 10 values
# ahead, then show both arrays.
x, y = build_dataset(input_option, predict_n=10, training=True)
print(x, y, sep="\n")

[[ 1]
 [ 2]
 [ 3]
 [ 4]
 [ 5]
 [ 6]
 [ 7]
 [ 8]
 [ 9]
 [10]]
[[1061.65 1073.73 1073.54 1051.71 1025.65 1043.41 1035.46 1023.58  991.25
   984.67]
 [1073.73 1073.54 1051.71 1025.65 1043.41 1035.46 1023.58  991.25  984.67
  1047.85]
 [1073.54 1051.71 1025.65 1043.41 1035.46 1023.58  991.25  984.67 1047.85
  1052.9 ]
 [1051.71 1025.65 1043.41 1035.46 1023.58  991.25  984.67 1047.85 1052.9
  1046.68]
 [1025.65 1043.41 1035.46 1023.58  991.25  984.67 1047.85 1052.9  1046.68
  1044.96]
 [1043.41 1035.46 1023.58  991.25  984.67 1047.85 1052.9  1046.68 1044.96
  1054.68]
 [1035.46 1023.58  991.25  984.67 1047.85 1052.9  1046.68 1044.96 1054.68
  1025.47]
 [1023.58  991.25  984.67 1047.85 1052.9  1046.68 1044.96 1054.68 1025.47
  1078.07]
 [ 991.25  984.67 1047.85 1052.9  1046.68 1044.96 1054.68 1025.47 1078.07
  1075.92]
 [ 984.67 1047.85 1052.9  1046.68 1044.96 1054.68 1025.47 1078.07 1075.92
  1085.37]]


In [88]:
# Build the features used for prediction; in this mode build_dataset
# returns only the feature array (no labels).
# NOTE(review): the saved output of this cell appears to come from an
# earlier execution (In [88]) with a different input_option -- re-run to
# refresh it.
x = build_dataset(input_option, predict_n=10, training=False)
print(x)

[[[1248.64 1258.14 1232.22]
  [1258.14 1232.22 1224.06]
  [1232.22 1224.06 1215.85]
  [1224.06 1215.85 1221.95]
  [1215.85 1221.95 1217.41]
  [1221.95 1217.41 1221.75]
  [1217.41 1221.75 1221.16]
  [1221.75 1221.16 1236.75]
  [1221.16 1236.75 1256.27]
  [1236.75 1256.27 1245.86]]

 [[1256.27 1245.86 1264.65]
  [1245.86 1264.65 1254.44]
  [1264.65 1254.44 1231.8 ]
  [1254.44 1231.8  1211.31]
  [1231.8  1211.31 1199.1 ]
  [1211.31 1199.1  1183.99]
  [1199.1  1183.99 1177.59]
  [1183.99 1177.59 1175.06]
  [1177.59 1175.06 1189.99]
  [1175.06 1189.99 1171.6 ]]

 [[1189.99 1171.6  1182.14]
  [1171.6  1182.14 1177.98]
  [1182.14 1177.98 1159.83]
  [1177.98 1159.83 1167.11]
  [1159.83 1167.11 1174.27]
  [1167.11 1174.27 1191.57]
  [1174.27 1191.57 1172.12]
  [1191.57 1172.12 1179.56]
  [1172.12 1179.56 1193.89]
  [1179.56 1193.89 1194.06]]

 [[1193.89 1194.06 1207.36]
  [1194.06 1207.36 1207.08]
  [1207.36 1207.08 1208.53]
  [1207.08 1208.53 1207.64]
  [1208.53 1207.64 1211.53]
  [1207.64 121