The final feature set was split into training, validation, and test sets using a chronological split (no shuffling).

This mirrors real-world use, where a model is trained only on past data and must never see information from the future.

Within each set, the input (X) and target (y) frames are column subsets of the same rows, so they share one date index and every feature row stays aligned with its targets.

In [1]:
import pandas as pd


# Split configuration. The date bounds are inclusive labels on the date index.
DATA_DIR = "../data"
TARGET_SUFFIX = "_target_5d"  # columns ending with this suffix are targets; all others are features
TRAIN_END = "2018-12-31"
VAL_START, VAL_END = "2019-01-01", "2021-12-31"
TEST_START = "2022-01-01"

# Load the combined feature/target table; the first CSV column is parsed as the date index.
# The frame is sorted by index because label-based date slicing relies on a monotonic
# index: on an unsorted index it can raise KeyError or yield a non-chronological slice.
# mergesort is stable, so rows sharing the same date keep their order from the file.
df = pd.read_csv(
    f"{DATA_DIR}/final_features_with_targets.csv", index_col=0, parse_dates=True
).sort_index(kind="mergesort")

# Chronological split (no shuffling): train -> validation -> test.
# NOTE(review): the targets are named "*_target_5d"; if they are forward-looking 5-day
# values, the last rows of each split depend on data from the next period. Consider
# leaving a gap between splits - confirm how the targets were built.
train = df.loc[:TRAIN_END]
val = df.loc[VAL_START:VAL_END]
test = df.loc[TEST_START:]

# The three date ranges are meant to partition the data; fail loudly if rows were lost.
assert len(train) + len(val) + len(test) == len(df), (
    "train/val/test do not cover every row of df - check the date index and split bounds"
)

print("Train:", train.shape)
print("Validation:", val.shape)
print("Test:", test.shape)

# Separate feature and target column names by the target suffix
feature_cols = [col for col in df.columns if not col.endswith(TARGET_SUFFIX)]
target_cols = [col for col in df.columns if col.endswith(TARGET_SUFFIX)]
assert feature_cols and target_cols, (
    f"expected both feature columns and '{TARGET_SUFFIX}' target columns in df"
)

# Input and target matrices (same rows, hence the same index, within each split)
X_train, y_train = train[feature_cols], train[target_cols]
X_val, y_val = val[feature_cols], val[target_cols]
X_test, y_test = test[feature_cols], test[target_cols]

# Persist every split to <DATA_DIR>/<name>.csv; existing files are overwritten.
splits = {
    "X_train": X_train,
    "y_train": y_train,
    "X_val": X_val,
    "y_val": y_val,
    "X_test": X_test,
    "y_test": y_test,
}
for split_name, split_frame in splits.items():
    split_frame.to_csv(f"{DATA_DIR}/{split_name}.csv")


Train: (2067, 43)
Validation: (692, 43)
Test: (695, 43)


In [3]:
# Sanity check: show the (rows, columns) shape of the training feature matrix.
display(X_train.shape)

(2067, 37)

In [5]:
# Sanity check: show the (rows, columns) shape of the training target matrix.
display(y_train.shape)

(2067, 6)