In [49]:
import torch
from torch.utils.data import DataLoader
# from helper import TimeSeriesDataset
from pytorch_forecasting import TimeSeriesDataSet
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [53]:
def ts_transformations(data,group_id,features_cont,features_cat,label,window_size):
    ## cat features are already encoded so they dont need further processing
    features = features_cont + features_cat
    group_ids = data[group_id].unique()
    X_values, labels = [], []
    for id in group_ids:
        scaler = MinMaxScaler()
        group_data = data[data[group_id] == id].tail(window_size)
        ## This magic number can be any value as is is from the alert df and repeats across the ts
        y = group_data[label].iloc[-1]
        X = scaler.fit_transform(group_data[features])

        X_values.append(X)
        labels.append(y)
    
    X_tensor = torch.tensor(X_values).float().transpose(1, 2)
    y_tensor = torch.tensor(labels).long()
    return X_tensor, y_tensor

def prepare_data(data,features_cont,features_cat, window_size):
    # Load data
    data = add_labels(data)
    X, y = ts_transformations(data,'alert_identifier',features_cont,features_cat,'three_max_vol_label',window_size)
    return X, y

def add_labels(data):
    alerts_df = pd.read_csv(f'/Users/charlesmiller/Documents/model_tester_data/BF/2015-01-01_2023-12-23BF3.csv')
    alerts_df['one_max_vol'] = (alerts_df['one_max']/alerts_df['return_vol_10D']).round(3)
    alerts_df['three_max_vol'] = (alerts_df['three_max']/alerts_df['return_vol_10D']).round(3)
    oneD_target = alerts_df['one_max_vol'].quantile(0.6).round(3)
    threeD_target = alerts_df['three_max_vol'].quantile(0.6).round(3)
    alerts_df['one_max_vol_label'] = alerts_df['one_max_vol'].apply(lambda x: 1 if x >= oneD_target else 0)
    alerts_df['three_max_vol_label'] = alerts_df['three_max_vol'].apply(lambda x: 1 if x >= threeD_target else 0)
    alerts_df['alert_identifier'] = alerts_df.apply(lambda row: f"{row['symbol']}-{row['date']}-{row['hour']}",axis=1)
    data = data.merge(alerts_df[['alert_identifier','three_max_vol_label','one_max_vol_label']],on='alert_identifier',how='left')
    return data

In [54]:
features_cont = ['v', 'vw', 'o', 'c', 'h', 'l','n','sma_20','sma_5','bbu', 'bbl', 'bbm', 'bb_spread',
       'bb_trend','sma_20_trend', 'sma_5_trend', 'pct_5d_high',
       'pct_5d_low', 'stddev_close_diff_5d', 'stddev_close_diff_10d']
features_cat = ['bb_category']

df = pd.read_csv('/Users/charlesmiller/Documents/ts_data/day_aggs/test_recent_7-5K.csv')
df = df.sort_values('t',ascending=True).reset_index(drop=True)

In [55]:
group_ids = df['alert_identifier'].unique()
train_group_ids = group_ids[:6000]
print(f"Train group ids: {train_group_ids}")
test_group_ids = group_ids[10:12]
print(f"Test group ids: {test_group_ids}")
train = df.loc[df['alert_identifier'].isin(train_group_ids)]
test = df.loc[df['alert_identifier'].isin(test_group_ids)]
X_train, y_train = prepare_data(train,features_cont,features_cat,window_size=15)
X_test, y_test = prepare_data(test,features_cont,features_cat,window_size=15)

Train group ids: ['BA-2015-01-12-11.0' 'QCOM-2015-01-21-15.0' 'C-2015-03-27-10.0'
 'QQQ-2015-05-14-15.0' 'IWM-2015-10-05-12.0' 'C-2015-10-06-15.0'
 'FB-2015-11-13-13.0' 'SPY-2015-11-27-14.0' 'SBUX-2016-01-21-12.0'
 'AAPL-2016-03-09-12.0']
Test group ids: ['CSCO-2016-03-10-10.0' 'BA-2016-04-13-13.0']
Features: ['v', 'vw', 'o', 'c', 'h', 'l', 'n', 'sma_20', 'sma_5', 'bbu', 'bbl', 'bbm', 'bb_spread', 'bb_trend', 'sma_20_trend', 'sma_5_trend', 'pct_5d_high', 'pct_5d_low', 'stddev_close_diff_5d', 'stddev_close_diff_10d', 'bb_category']
Group data:              v            vw       o       c         h         l           t  \
95   7680925.0  1.263106e+02  125.63  126.23  127.4200  125.5000  1418965200   
97   3189753.0  1.280529e+02  126.73  128.22  128.5400  126.7300  1419224400   
101  4029074.0  1.301004e+02  129.04  130.03  130.6500  128.6519  1419310800   
104  2125345.0  1.315338e+02  130.30  131.24  132.3300  130.1700  1419397200   
108  2242889.0  1.314107e+02  131.29  131.63  131.9

In [56]:
print(f"X train shape: {X_train.shape}")
print(f"y train shape: {y_train.shape}")
print(f"X test shape: {X_test.shape}")
print(f"y test shape: {y_test.shape}")

X train shape: torch.Size([10, 21, 15])
y train shape: torch.Size([10])
X test shape: torch.Size([2, 21, 15])
y test shape: torch.Size([2])


In [61]:
print(f"X col: {X_train[0][0]}")
print(f"X col: {X_train[0][1]}")
print(f"X col: {X_train[0][2]}")
print(f"X col: {X_train[0][3]}")
print(f"X col: {X_train[0][4]}")
print(f"X col: {X_train[0][5]}")
print(f"X col: {X_train[0][6]}")
print(f"X col: {X_train[0][7]}")
print(f"X col: {X_train[0][8]}")
print(f"X col: {X_train[0][9]}")
print(f"X col: {X_train[0][10]}")
print(f"X col: {X_train[0][11]}")
print(f"X col: {X_train[0][12]}")
print(f"X col: {X_train[0][13]}")
print(f"X col: {X_train[0][14]}")
print(f"X col: {X_train[0][15]}")
print(f"X col: {X_train[0][16]}")
print(f"X col: {X_train[0][17]}")
print(f"X col: {X_train[0][18]}")
print(f"X col: {X_train[0][19]}")
print(f"X col: {X_train[0][20]}")
print(f"y: {y_train}")

X col: tensor([1.0000, 0.3169, 0.4446, 0.1550, 0.1729, 0.2453, 0.2041, 0.2443, 0.4849,
        0.5960, 0.5912, 0.3753, 0.5167, 0.2967, 0.0000])
X col: tensor([0.0000e+00, 1.5755e-06, 3.4269e-06, 4.7231e-06, 4.6118e-06, 5.4317e-06,
        5.0707e-06, 4.1175e-06, 3.3827e-06, 2.5292e-06, 1.2381e-06, 2.7256e-06,
        4.6684e-06, 4.8318e-06, 1.0000e+00])
X col: tensor([0.0000, 0.1662, 0.5151, 0.7054, 0.8550, 0.9199, 0.9622, 1.0000, 0.8218,
        0.6088, 0.5166, 0.4698, 0.7508, 0.9003, 0.9985])
X col: tensor([0.0000, 0.3284, 0.6271, 0.8267, 0.8911, 1.0000, 0.9241, 0.6188, 0.6139,
        0.4653, 0.2145, 0.5413, 0.9191, 0.8762, 0.7591])
X col: tensor([0.0000, 0.2109, 0.6083, 0.9247, 0.8493, 1.0000, 0.9153, 0.9190, 0.8324,
        0.4708, 0.4143, 0.4802, 0.8597, 0.9115, 0.9115])
X col: tensor([0.0000, 0.2043, 0.5236, 0.7757, 0.7857, 0.9419, 1.0000, 0.7243, 0.5963,
        0.5166, 0.1130, 0.5349, 0.8140, 0.9020, 0.8206])
X col: tensor([0.9625, 0.6082, 0.6296, 0.2738, 0.3121, 0.4071, 0.370

In [41]:
group_ids = df['alert_identifier'].unique()
df.drop_duplicates(subset=['symbol','date'],inplace=True)
train_group_ids = group_ids[:100]
test_group_ids = group_ids[100:120]
train = df.loc[df['alert_identifier'].isin(train_group_ids)]
test = df.loc[df['alert_identifier'].isin(test_group_ids)]
X_train, y_train = prepare_data(train,features_cont,features_cat,window_size=15)
X_test, y_test = prepare_data(test,features_cont,features_cat,window_size=15)

KeyError: Index(['symbol'], dtype='object')