In [1]:
import numpy as np
import pandas as pd

# Monkey patch for pandas_ta: (re)create the legacy `np.NaN` alias and point it
# at `np.nan`, so code that still looks up `np.NaN` keeps working.
# NOTE(review): presumably required because the installed NumPy no longer ships
# this alias — confirm against the pinned NumPy / pandas_ta versions.
np.NaN = np.nan

In [2]:
import os
import sys

# The repository root is taken to be two directory levels above the current
# working directory; it is put at the front of `sys.path` so that the `src.*`
# imports below can be resolved.
repo_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
# Guard the insert so that re-running this cell does not keep stacking
# duplicate copies of the same entry onto `sys.path`.
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

from src.data.loaders.data_loader import DataLoader
from src.config.schemas.data import DataLoaderConfig, DatasetSchema
from src.config.schemas.features import ReturnsMethod, FeatureConfig, ReturnsConfig
from src.config.schemas.indicators import *
from src.data.processors.winsorization import winsorize
from src.data.processors.target_engineering import create_direction_target
from src.features.calculators.returns import calculate_returns
from src.features.feature_builder import FeatureBuilder
from src.data.splitters.cross_validation import GroupTimeSeriesSplit
from src.utils.visualization import plot_price, plot_returns, plot_correlation_matrix, plot_distbox

In [3]:
# Loader setup: a dataset schema, wrapped in a loader config together with the
# read options, is handed to the DataLoader.
# NOTE(review): presumably `timestamps="timestamp"` maps the raw "timestamp"
# column onto the `timestamps` field, and `read_options` is forwarded to the
# underlying file reader (";"-separated files) — confirm in DataLoader.
mapping = DatasetSchema(timestamps="timestamp")
config = DataLoaderConfig(
    mapping=mapping,
    read_options={"sep": ";"},
)
loader = DataLoader(config)

In [4]:
# Directory with the raw input data.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where this exact directory exists — consider deriving it from the
# repository root or a configurable data directory.
RAW_DATA_DIR = "/home/denisalpino/dev/stock_prediction/data/crypto/15min/raw"

# `dfs` is iterated further down as a collection of DataFrames.
dfs = loader.load(RAW_DATA_DIR)

In [5]:
# Indicators whose output columns are restricted explicitly.
selected_output_indicators = [
    # Only BBP is kept: BBB has a greater p-value than Volatility Ratio,
    # so BBB is removed.
    BBandsConfig(output=["BBP"]),
    MACDConfig(output=["MACD", "MACD_Signal"]),
]

# Indicators constructed with their default configuration.
default_indicators = [
    ADXConfig(),
    ERPConfig(),
    MIConfig(),
    VRConfig(),
    BodyConfig(),
    UpWConfig(),
    LowWConfig(),
    RetSkewConfig(),
    RetKurtConfig(),
    RetMeanConfig(),
]

# Full indicator list, in the same order as the two groups above.
# (The variable name is kept as-is because the feature-config cell refers to it.)
indicaors = selected_output_indicators + default_indicators

In [6]:
# Feature configuration: percent returns plus the indicator list defined above.
returns_cfg = ReturnsConfig(method=ReturnsMethod.PERCENT)
featue_cfg = FeatureConfig(returns=returns_cfg, indicators=indicaors)

# Builder used later to produce the feature frame for every DataFrame.
feature_builder = FeatureBuilder(featue_cfg)

In [7]:
# Work on copies so that the frames held in `dfs` are left untouched.
returns = [frame.copy() for frame in dfs]

for df in returns:
    # Timestamp conversion: parse the column into pandas datetimes.
    df["timestamps"] = pd.to_datetime(df["timestamps"])

    # Returns calculation (ReturnsMethod.PERCENT) on the close price.
    raw_ret = calculate_returns(df["close"], ReturnsMethod.PERCENT)

    # Returns winsorization: the same series is passed in all three input
    # slots and only the third result is kept.
    # NOTE(review): presumably the three slots are train/val/test inputs, in
    # which case the clipping bounds are derived from the whole series — confirm
    # in `winsorize` that this is intended.
    _, _, ret = winsorize(raw_ret, raw_ret, raw_ret, winsorize_percent=99.9)

    # Regression target: the winsorized returns shifted by -1, i.e. every row
    # receives the value of the following row.
    df["target"] = ret.shift(-1)

    # Direction target, computed by `create_direction_target` from "close".
    df["direction"] = create_direction_target(df, "close")

In [8]:
# Build the feature frame for every prepared DataFrame.
dfs_featured = list(map(feature_builder.build_features, returns))

# Raw price columns that are removed from the final frames.
drop_columns = ["open", "high", "low", "close"]

# Place each prepared frame side by side with its features (column-wise
# concat), then drop the raw price columns.
dfs_full = []
for frame, feats in zip(returns, dfs_featured):
    combined = pd.concat([frame, feats], axis=1)
    dfs_full.append(combined.drop(columns=drop_columns))

In [9]:
# Stack all frames row-wise (with a fresh index) and plot the Spearman
# correlation matrix of the result; the upper triangle is not masked.
corr_input = pd.concat(dfs_full, axis=0, ignore_index=True)
plot_correlation_matrix(corr_input, method="spearman", mask_upper=False, width=1800, height=1600)



In [13]:
# Stack every frame in `dfs_full` into one table, discard all rows that still
# contain a missing value, and renumber the index from zero.
stacked = pd.concat(dfs_full, axis=0, ignore_index=True)
mrgd = stacked.dropna().reset_index(drop=True)
mrgd

Unnamed: 0,timestamps,ticker,target,direction,day_of_week_sin,day_of_week_cos,returns,BBP,MACD,MACD_Signal,...,UpW,LowW,RetSkew_14,RetKurt_14,RetMean_14,returns_lag1,returns_lag2,returns_lag3,returns_lag4,returns_lag5
0,2023-01-01 05:15:00,JASMY,0.000661,1,-0.781831,0.623490,-0.006888,0.684676,0.000026,0.000012,...,0.000003,1.000000e-06,0.365236,-0.566520,0.000006,-0.006193,0.009543,0.010306,0.006020,0.012530
1,2023-01-01 05:30:00,JASMY,0.030363,1,-0.781831,0.623490,0.000661,0.668417,0.000027,0.000010,...,0.000003,8.000000e-06,0.506951,-0.422261,0.000005,-0.006888,-0.006193,0.009543,0.010306,0.006020
2,2023-01-01 05:45:00,JASMY,0.004164,1,-0.781831,0.623490,0.030363,0.998819,0.000034,0.000014,...,0.000038,6.000000e-06,1.721096,3.854197,0.000012,0.000661,-0.006888,-0.006193,0.009543,0.010306
3,2023-01-01 06:00:00,JASMY,-0.006699,0,-0.781831,0.623490,0.004164,0.922587,0.000040,0.000016,...,0.000035,4.000000e-05,1.654227,3.852728,0.000013,0.030363,0.000661,-0.006888,-0.006193,0.009543
4,2023-01-01 06:15:00,JASMY,-0.005780,0,-0.781831,0.623490,-0.006699,0.780413,0.000043,0.000015,...,0.000044,3.000000e-06,1.506443,3.229107,0.000011,0.004164,0.030363,0.000661,-0.006888,-0.006193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279331,2023-12-30 22:30:00,LDO,-0.003991,0,-0.974928,-0.222521,0.001453,0.270939,-0.011653,0.002610,...,0.001000,6.000000e-03,-0.040193,1.241436,-0.000786,-0.002175,-0.002171,-0.000362,0.001086,-0.001085
279332,2023-12-30 22:45:00,LDO,-0.002914,0,-0.974928,-0.222521,-0.003991,-0.043900,-0.012014,0.001799,...,0.000000,1.000000e-02,0.527862,1.050541,-0.000357,0.001453,-0.002175,-0.002171,-0.000362,0.001086
279333,2023-12-30 23:00:00,LDO,0.005480,1,-0.974928,-0.222521,-0.002914,-0.088318,-0.012799,0.000811,...,0.006000,5.000000e-03,0.606705,0.650371,-0.000857,-0.003991,0.001453,-0.002175,-0.002171,-0.000362
279334,2023-12-30 23:15:00,LDO,-0.007267,0,-0.974928,-0.222521,0.005480,0.384728,-0.012071,0.001231,...,0.002000,6.000000e-03,0.558520,-0.326134,0.000143,-0.002914,-0.003991,0.001453,-0.002175,-0.002171


In [14]:
# Destination of the preprocessed dataset.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from the repository root or a configurable data directory.
PREPROCESSED_CSV_PATH = "/home/denisalpino/dev/stock_prediction/data/crypto/15min/preprocessed/cryptos.csv"

# Persist the merged table; the row index is not written to the file.
mrgd.to_csv(PREPROCESSED_CSV_PATH, index=False)