In [3]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd


from sklearn.preprocessing import LabelEncoder, StandardScaler


In [47]:
# ==========================================
# STEP 1: Load and Clean Data
# ==========================================

def load_label_data(label_path):
    df = pd.read_csv(label_path, parse_dates=['date'])

    # Fill missing values in end_time with start_time
    df["entry"] = df["entry"].fillna(df["end time"])

    # df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
    # print(df.columns)
    
    # Combine date + start_time or end_time into a unified timestamp
    df['timestamp'] = pd.to_datetime(df['date'].astype(str) + ' ' + df['end time'])
    df = df[['timestamp', 'label', 'entry']].dropna()
    return df

def load_feature_data(feature_path):
    df = pd.read_csv(feature_path)
    df.columns = [c.strip().lower() for c in df.columns]
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')
    return df

label_path = os.path.join("data", "labels_timestamp.csv")
feature_path = os.path.join("data", "QQQ_1min_firstratedata.csv")

# df_feats = pd.read_csv(os.path.join("data", feature_path), parse_dates=['timestamp'])
features_df = load_feature_data(feature_path)
features_df

Unnamed: 0,timestamp,open,high,low,close,volume
0,2022-09-30 04:00:00,274.0000,274.8400,274.00,274.65,5209
1,2022-09-30 04:01:00,274.7000,274.7300,274.63,274.68,12542
2,2022-09-30 04:02:00,274.6100,274.6100,274.48,274.50,7092
3,2022-09-30 04:03:00,274.4700,274.5200,274.42,274.47,9774
4,2022-09-30 04:04:00,274.5500,274.6100,274.55,274.55,500
...,...,...,...,...,...,...
210477,2023-09-29 19:37:00,358.5400,358.5400,358.54,358.54,150
210478,2023-09-29 19:40:00,358.5800,358.5800,358.58,358.58,1100
210479,2023-09-29 19:41:00,358.6499,358.6499,358.60,358.60,2700
210480,2023-09-29 19:46:00,358.5700,358.5900,358.57,358.59,1736


In [48]:
labels_df = load_label_data(label_path)
labels_df

Unnamed: 0,timestamp,label,entry
0,2022-09-30 09:37:00,no,9:37
1,2022-09-30 09:50:00,no,9:50
2,2022-09-30 09:53:00,up,9:55
3,2022-09-30 10:02:00,no,10:02
4,2022-09-30 10:15:00,no,10:15
...,...,...,...
5433,2022-12-08 11:26:00,profit,11:26
5434,2022-12-08 11:27:00,no,11:27
5435,2022-12-08 11:28:00,no,11:28
5436,2022-12-08 11:29:00,no,11:29


In [57]:
# ==========================================
# STEP 2: Merge Labels with Features
# ==========================================

def merge_data(features_df, labels_df):
    merged = pd.merge_asof(
        features_df.sort_values('timestamp'),
        labels_df.sort_values('timestamp'),
        on='timestamp',
        direction='nearest',
        tolerance=pd.Timedelta('5min')  # optional time tolerance
    )
    # Fill missing labels with "no"
    # merged['label'] = merged['label'].fillna('no')
    merged = merged.dropna()
    return merged

merged_df = merge_data(features_df, labels_df)

merged_df


Unnamed: 0,timestamp,open,high,low,close,volume,label,entry
310,2022-09-30 09:32:00,271.5786,271.7800,271.090,271.250,232907,no,9:37
311,2022-09-30 09:33:00,271.2700,272.2700,271.260,272.240,326439,no,9:37
312,2022-09-30 09:34:00,272.2400,272.4650,272.020,272.120,244485,no,9:37
313,2022-09-30 09:35:00,272.1100,272.2700,271.440,271.680,241636,no,9:37
314,2022-09-30 09:36:00,271.7000,271.7000,270.850,270.860,219512,no,9:37
...,...,...,...,...,...,...,...,...
41790,2022-12-08 11:31:00,283.4000,283.4500,283.290,283.440,63410,no,11:30
41791,2022-12-08 11:32:00,283.4200,283.5200,283.310,283.450,55900,no,11:30
41792,2022-12-08 11:33:00,283.4600,283.5000,283.310,283.410,76953,no,11:30
41793,2022-12-08 11:34:00,283.4000,283.4400,283.310,283.365,37650,no,11:30


In [58]:
# Get counts of each label
label_counts = merged_df["label"].value_counts()

# Get percentage distribution
label_percent = merged_df["label"].value_counts(normalize=True) * 100

print("ðŸ“Š Label Counts:")
print(label_counts)
print("\nðŸ“ˆ Label Percentages (%):")
print(label_percent.round(2))

ðŸ“Š Label Counts:
label
no              2877
consolid        1671
hold             964
no               315
profit           184
up                90
down              54
adown             30
aup               19
vup                7
5min-rb-up         6
rb-up              3
mstop              3
5min-rb-down       2
stop               2
rb-down            1
Name: count, dtype: int64

ðŸ“ˆ Label Percentages (%):
label
no              46.19
consolid        26.83
hold            15.48
no               5.06
profit           2.95
up               1.45
down             0.87
adown            0.48
aup              0.31
vup              0.11
5min-rb-up       0.10
rb-up            0.05
mstop            0.05
5min-rb-down     0.03
stop             0.03
rb-down          0.02
Name: proportion, dtype: float64


In [1]:
# Example: features to include in LSTM input
FEATURE_COLS = ['open', 'high', 'low', 'close', 'volume']
START_TIME = pd.to_timedelta("09:31:00")  # starting timestamp each day

def create_lstm_sequences(df, feature_cols=FEATURE_COLS):
    """
    Convert each row into an LSTM input sequence from 9:31 to 'end_time'.
    Zero-padding is applied if sequences are shorter than the max length.
    """
    sequences = []
    labels = []
    
    # Ensure timestamp is datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    # Group by day
    for day, day_df in df.groupby(df['timestamp'].dt.date):
        print("day/ day_df: ", day)
        print(day_df)
        day_start = pd.Timestamp(f"{day} 09:31:00")
        
        # Iterate each row
        for _, row in day_df.iterrows():
            # Sequence: from 9:31 to end_time at 1-min intervals
            end_time = pd.to_datetime(row['timestamp'])  # use the row's timestamp as end
            time_index = pd.date_range(start=day_start, end=end_time, freq='T')
            print("time_index: ", time_index)
            
            # Extract features for timestamps in time_index
            day_features = day_df.set_index('timestamp').reindex(time_index, fill_value=0)
            print("day_features: ", day_features)
            sequence = day_features[feature_cols].to_numpy()
            
            sequences.append(sequence)
            labels.append(row['label'])
            print(sequences, labels)
            exit()
        exit()
    
    return sequences, labels



NameError: name 'pd' is not defined

In [59]:
# ==========================================
# STEP 3: Encode Labels & Normalize Features
# ==========================================


def preprocess_data(merged_df):
    feature_cols = [c for c in merged_df.columns if c not in ['timestamp', 'label']]
    
    # Scale features
    scaler = StandardScaler()
    merged_df[feature_cols] = scaler.fit_transform(merged_df[feature_cols])
    
    # Encode labels as integers
    le = LabelEncoder()
    merged_df['label_enc'] = le.fit_transform(merged_df['label'])
    
    return merged_df, feature_cols, le, scaler

# ==========================================
# STEP 4: Create Sequences for LSTM
# ==========================================

def create_sequences(df, feature_cols, lookback):
    X, y = [], []
    data = df[feature_cols].values
    labels = df['label_enc'].values
    
    for i in range(len(df) - lookback):
        X.append(data[i:i+lookback])
        y.append(labels[i+lookback])
        
    X = np.array(X)
    y = np.array(y)
    return X, y

In [None]:
X, y = create_lstm_sequences(merged_df)
