In [1]:
from feature_engineering import *
import os
import pandas as pd
import numpy as np

In [2]:
base_path = "../price/raw"
stock_data = load_all_stock_data(base_path)

Loaded and processed data for BA: 1253 records.
Loaded and processed data for PCLN: 1253 records.
Loaded and processed data for GOOG: 1253 records.
Loaded and processed data for PPL: 1253 records.
Loaded and processed data for NGG: 1253 records.
Loaded and processed data for GMRE: 292 records.
Loaded and processed data for UNH: 1253 records.
Loaded and processed data for HRG: 1253 records.
Loaded and processed data for WMT: 1253 records.
Loaded and processed data for MSFT: 1253 records.
Loaded and processed data for AMGN: 1253 records.
Loaded and processed data for PFE: 1253 records.
Loaded and processed data for T: 1253 records.
Loaded and processed data for KO: 1253 records.
Loaded and processed data for EXC: 1253 records.
Loaded and processed data for SO: 1253 records.
Loaded and processed data for GD: 1253 records.
Loaded and processed data for AAPL: 1253 records.
Loaded and processed data for PM: 1253 records.
Loaded and processed data for TM: 1253 records.
Loaded and processed da

In [5]:
#count tickers in stock_data
tickers = list(stock_data.keys())
print("Number of tickers: ", len(tickers))


Number of tickers:  88


In [5]:
def align_stock_data(stock_data):
    """
    Given a dictionary of DataFrames (stock_data) where keys are ticker symbols and each DataFrame
    has a 'Date' column, filter each DataFrame so that only dates common to all DataFrames are retained.
    
    Args:
        stock_data: dict, mapping ticker -> DataFrame.
        
    Returns:
        stock_data_aligned: dict, same keys but each DataFrame is filtered to common dates.
        common_dates: list of dates (as strings or Timestamps) in sorted order.
    """
    # Get the set of unique dates from each DataFrame
    date_sets = [set(df['Date'].unique()) for df in stock_data.values()]
    # Compute the intersection (common dates across all stocks)
    common_dates = sorted(list(set.intersection(*date_sets)))
    
    # Filter each DataFrame to include only the common dates and sort by date
    stock_data_aligned = {}
    for ticker, df in stock_data.items():
        df_aligned = df[df['Date'].isin(common_dates)].copy()
        df_aligned.sort_values('Date', inplace=True)
        df_aligned.reset_index(drop=True, inplace=True)
        stock_data_aligned[ticker] = df_aligned
    
    return stock_data_aligned, common_dates

stock_data_aligned, common_dates = align_stock_data(stock_data)
print(f"Found {len(common_dates)} common dates across stocks.")


Found 292 common dates across stocks.


In [6]:
stock_data_aligned

{'BA':           Date        Open        High         Low       Close   Adj Close  \
 0   2016-07-08  128.600006  130.460007  127.660004  130.089996  125.234215   
 1   2016-07-11  130.839996  133.250000  130.800003  132.039993  127.111412   
 2   2016-07-12  132.800003  133.000000  130.330002  130.809998  125.927330   
 3   2016-07-13  131.279999  131.389999  129.369995  130.110001  125.253471   
 4   2016-07-14  131.630005  131.919998  130.199997  131.550003  126.639709   
 ..         ...         ...         ...         ...         ...         ...   
 287 2017-08-28  236.220001  237.550003  234.610001  237.179993  237.179993   
 288 2017-08-29  237.210007  241.169998  236.080002  240.490005  240.490005   
 289 2017-08-30  240.990005  241.440002  239.000000  240.460007  240.460007   
 290 2017-08-31  241.000000  241.100006  238.410004  239.660004  239.660004   
 291 2017-09-01  239.660004  242.529999  239.169998  240.330002  240.330002   
 
       Volume    Return      Diff   HL_Diff 

In [7]:
def create_windowed_data(stock_data, L, input_columns, target_column):
    """
    Given a dictionary of aligned DataFrames (each sorted by date), create sliding windows.
    
    Args:
        stock_data: dict, mapping ticker -> DataFrame (all with common dates).
        L: int, length of the historical window (number of days).
        input_columns: list of column names to be used as inputs.
        target_column: str, column name to be used as the target.
        
    Returns:
        X: numpy array of shape (num_samples, N, L, input_dim) for historical data.
        Y: numpy array of shape (num_samples, N, 1) for targets.
        tickers: list of ticker symbols.
    """
    tickers = list(stock_data.keys())
    # Assume each DataFrame has the same number of rows since they are aligned
    T = len(next(iter(stock_data.values())))
    # Number of sliding windows (samples): we need L days for the window and 1 extra for the target.
    num_samples = T - L - 1  
    N = len(tickers)
    input_dim = len(input_columns)
    
    # Initialize arrays to hold the windowed data and targets
    X = np.zeros((num_samples, N, L, input_dim), dtype=np.float32)
    Y = np.zeros((num_samples, N, 1), dtype=np.float32)
    
    for i, ticker in enumerate(tickers):
        df = stock_data[ticker]
        for j in range(num_samples):
            # Window of L days for inputs (rows j to j+L-1)
            window = df.iloc[j:j+L]
            X[j, i, :, :] = window[input_columns].values
            # Target is taken as the value of target_column at day j+L (the day after the window)
            Y[j, i, 0] = df.iloc[j+L][target_column]
    
    return X, Y, tickers

# Define which columns to use:
input_columns = ['Open', 'High', 'Low', 'Volume', 'Return', 'Diff', 'HL_Diff', 'MA5', 'Return_MA5']
target_column = 'Close'
L = 20  # for example, using 20 days of historical data

# Create windowed data
X, Y, tickers = create_windowed_data(stock_data_aligned, L, input_columns, target_column)
print(f"Windowed data shape: X {X.shape}, Y {Y.shape}")


Windowed data shape: X (271, 88, 20, 9), Y (271, 88, 1)
