In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import glob
import os

In [None]:
# Directory (relative to the notebook) that the cells below read from and write to.
DATA = "../data/"
# List the directory through the shell as a quick check that the path resolves.
!ls $DATA

In [None]:
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.utils.validation import check_is_fitted
from typing import List, Optional, Union


def is_fitted(scaler: Union[MinMaxScaler, StandardScaler]) -> bool:
    """Report whether ``scaler`` has already been fitted.

    Delegates to scikit-learn's ``check_is_fitted`` and turns its
    ``NotFittedError`` into a boolean answer.

    Args:
        scaler: The scaler instance to inspect.

    Returns:
        ``True`` when ``check_is_fitted`` accepts the scaler, ``False`` when it
        raises ``NotFittedError``. Any other exception propagates.
    """
    try:
        check_is_fitted(scaler)
    except NotFittedError:
        return False
    else:
        return True


class Pipeline:
    """Base interface shared by the preprocessing pipelines in this notebook.

    Subclasses store a scaler on ``self.scaler`` and override ``preprocess`` and
    ``inverse_transform``. The base implementations do nothing and return
    ``None``.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Accept and ignore any arguments; the base class keeps no state."""
        pass

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Placeholder to be overridden by subclasses; returns ``None`` here.

        Args:
            df: Source frame.
            targets: Names of the columns to preprocess.
        """
        pass

    def inverse_transform(self, x: np.ndarray, x_last: Optional[np.ndarray] = None) -> np.ndarray:
        """Placeholder to be overridden by subclasses; returns ``None`` here.

        ``x_last`` defaults to ``None`` so the base signature accepts the same
        calls as ``ScalerPipeline.inverse_transform``.

        Args:
            x: Preprocessed values to map back.
            x_last: Optional reference values; unused by the base class.
        """
        pass

    def __repr__(self) -> str:
        """Return ``"<ClassName> (scaler=<scaler>)"``."""
        # The base class never sets ``scaler``; fall back to None instead of
        # raising AttributeError when a bare Pipeline is displayed.
        scaler = getattr(self, "scaler", None)
        return f"{self.__class__.__name__} (scaler={scaler})"


class ScalerPipeline(Pipeline):
    """Pipeline that rescales the target columns with a scikit-learn scaler."""

    def __init__(self, scaler: Union[MinMaxScaler, StandardScaler]) -> None:
        """Keep a reference to the scaler that ``preprocess`` fits and applies."""
        super().__init__()
        self.scaler = scaler

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Scale ``df[targets]``, fitting the scaler first if it is not fitted yet.

        Args:
            df: Source frame.
            targets: Names of the columns to scale.

        Returns:
            The result of ``scaler.transform`` on the selected columns.
        """
        selected = df[targets]
        needs_fit = not is_fitted(self.scaler)
        if needs_fit:
            # Only the first call fits; later calls reuse the fitted parameters.
            self.scaler.fit(selected)
        scaled = self.scaler.transform(selected)
        return scaled

    def inverse_transform(self, x: np.ndarray, x_last: Optional[np.ndarray] = None) -> np.ndarray:
        """Undo the scaling of ``x``; ``x_last`` is accepted but not used."""
        restored = self.scaler.inverse_transform(x)
        return restored


class LogReturnPipeline(Pipeline):
    """Pipeline that converts the target columns to scaled log-returns."""

    def __init__(self, scaler: Union[MinMaxScaler, StandardScaler]) -> None:
        """Keep a reference to the scaler applied to the log-returns."""
        super().__init__()
        self.scaler = scaler

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Compute log-returns of ``df[targets]`` and scale them.

        The log-return of a row is ``log(value / previous value)``. NaN entries
        (such as the first row, which has no previous value) are replaced by 0.
        The scaler is fitted on the log-returns only if it is not fitted yet.

        Args:
            df: Source frame.
            targets: Names of the columns to transform.

        Returns:
            The result of ``scaler.transform`` on the log-returns.
        """
        prices = df[targets]
        ratio_to_previous = prices / prices.shift(1)
        log_returns = np.log(ratio_to_previous).fillna(0).to_numpy()

        needs_fit = not is_fitted(self.scaler)
        if needs_fit:
            self.scaler.fit(log_returns)

        return self.scaler.transform(log_returns)

    def inverse_transform(self, x: np.ndarray, x_last: np.ndarray) -> np.ndarray:
        """Map scaled log-returns back to levels, starting from ``x_last``.

        Args:
            x: Scaled log-returns.
            x_last: Value(s) multiplied with the cumulative growth factors.

        Returns:
            ``x_last`` times the cumulative product (along axis 0) of the
            exponentiated, un-scaled log-returns.
        """
        growth_factors = np.exp(self.scaler.inverse_transform(x))
        cumulative_growth = np.cumprod(growth_factors, axis=0)
        return x_last * cumulative_growth

In [None]:
def compute_holder_exponent(x, delta, q=1, k=2):
    """Estimate a Hölder exponent on consecutive windows of ``log(x + 1)``.

    With ``y = log(x + 1)``, ``n = len(x)`` and ``K = np.var(x)`` (variance of
    the raw input), the value for each ``t`` in ``range(delta, n, delta)`` is::

        S(t) = 1 / (delta - q + 1) * sum_{j = t - delta}^{t - q} |y[j + q] - y[j]| ** k
        H(t) = log(sqrt(pi) * S(t) / (2 ** (k / 2) * Gamma((k + 1) / 2) * K ** k))
               / (k * log(q / (n - 1)))

    Args:
        x: One-dimensional array-like of values; ``x + 1`` must be positive.
        delta: Window length, also the step between consecutive estimates.
        q: Lag of the increments (``1 <= q <= delta``).
        k: Order of the absolute moment.

    Returns:
        A list with one estimate per window (empty when ``len(x) <= delta``).

    Raises:
        ValueError: If ``delta < 1`` or ``q`` is outside ``[1, delta]``.
    """
    import math

    if delta < 1 or q < 1 or q > delta:
        raise ValueError("expected delta >= 1 and 1 <= q <= delta")

    x = np.asarray(x)
    n = len(x)
    if n <= delta:
        # range(delta, n, delta) is empty: nothing to estimate.
        return []

    # NOTE(review): K is the variance of the raw (pre-log) input, as in the
    # original code — confirm this is the intended scale constant.
    K = np.var(x)
    y = np.log(x + 1)

    # Window-independent parts of the formula, hoisted out of the loop.
    # Gamma((k + 1) / 2) equals sqrt(pi) / 2 for k = 2 and stays correct for other k.
    numerator_scale = np.sqrt(np.pi)
    denominator_scale = (2 ** (k / 2)) * math.gamma((k + 1) / 2) * (K**k)
    log_lag = k * np.log(q / (n - 1))

    H = list()
    for t in range(delta, n, delta):
        # Lag-q increments for j = t - delta, ..., t - q (inclusive): exactly
        # delta - q + 1 terms, matching the normalisation below.
        increments = np.abs(y[t - delta + q : t + 1] - y[t - delta : t - q + 1]) ** k
        # Normalise once, after the whole sum has been accumulated.
        S = increments.sum() / (delta - q + 1)
        H.append(np.log((numerator_scale * S) / denominator_scale) / log_lag)
    return H

In [None]:
def compute_avg_log_returns(df, target, window):
    """Average the standard-scaled log-returns of ``target`` over rolling windows.

    A fresh ``LogReturnPipeline`` with a ``StandardScaler`` is fitted on every
    call. The rolling mean is sampled every ``window`` rows and the first
    sample (row 0) is dropped.

    Args:
        df: Source frame.
        target: List of column names handed to ``LogReturnPipeline.preprocess``.
        window: Rolling-window length, also the sampling stride.

    Returns:
        NumPy array with the retained rolling means.
    """
    pipeline = LogReturnPipeline(StandardScaler())
    scaled_returns = pd.DataFrame(pipeline.preprocess(df, target))
    rolling_mean = scaled_returns.rolling(window).mean()
    flattened = rolling_mean.to_numpy().squeeze()
    # Row 0 precedes the first complete window when window > 1, so skip it.
    sampled = flattened[::window]
    return sampled[1:]

In [None]:
# Validation split of the OHLC data; path built the same way as the later cells.
df = pd.read_csv(os.path.join(DATA, "ohlc_KO_PEP_NVDA_KSU_val.csv"))

In [None]:
# Ticker whose volume and mid-price series are analysed below.
STOCK = "PEP"
# Window length in rows. NOTE(review): presumably one trading session of
# 1-minute bars — confirm against the data.
WINDOW = 390

# Named after its content rather than a specific ticker, since STOCK selects it.
stock_volume = df[f"volume_{STOCK}"].to_numpy()
holder_exponent = compute_holder_exponent(stock_volume, delta=WINDOW)
avg_log_returns = compute_avg_log_returns(df, [f"mid_price_{STOCK}"], window=WINDOW)

In [None]:
# Show the sizes of both series; the scatter plot below needs them to match.
len(holder_exponent), avg_log_returns.shape

In [None]:
# Two stacked panels: windowed average log-returns on top, Hölder exponent below.
fig, axs = plt.subplots(2, figsize=(4, 4))
axs[0].plot(avg_log_returns)
axs[0].set_title("Average scaled log-return per window")
axs[1].plot(holder_exponent)
axs[1].set_title("Hölder exponent of volume per window")
axs[1].set_xlabel("window index")
# Keep titles and labels from overlapping; trailing ';' hides the repr output.
fig.tight_layout();

In [None]:
# One point per window: average scaled log-return against the Hölder exponent.
plt.scatter(avg_log_returns, holder_exponent)
plt.xlabel("average scaled log-return per window")
plt.ylabel("Hölder exponent of volume")
plt.title("Log-returns vs. Hölder exponent");

In [None]:
# glob returns paths in arbitrary order. Sort them so that the column order and
# the copy kept by the de-duplication below (first occurrence wins) do not
# depend on the filesystem.
all_files = sorted(glob.glob(os.path.join(DATA, "volumes_metrics", "*.csv")))
# Place the files side by side (axis=1), keeping their own column names.
df = pd.concat((pd.read_csv(f) for f in all_files), axis=1, ignore_index=False)
# Columns shared by several files appear more than once; keep the first copy.
df = df.loc[:, ~df.columns.duplicated()]
df.shape

In [None]:
# Run name that prefixes every metric column.
SWEEPNAME = "stoic-jazz-65"
STOCK_NAMES = ["PEP", "KO", "KSU", "NVDA"]
METRIC_NAMES = ["Max", "Skew", "Min", "Mean", "Std", "Kurtosis"]
REAL_PRED = ["Real", "Pred"]

# "epoch" first, then one column per (stock, metric, Real/Pred) combination,
# with Real/Pred varying fastest and stock slowest.
COLUMNS = ["epoch"] + [
    f"{SWEEPNAME} - {kind} Volume: {metric}/{stock}_epoch"
    for stock in STOCK_NAMES
    for metric in METRIC_NAMES
    for kind in REAL_PRED
]

In [None]:
# Select the row at position 65 of the chosen metric columns.
# NOTE(review): 65 is a hard-coded row position — presumably the epoch of
# interest for this run; confirm.
a = df[COLUMNS].iloc[65]

In [None]:
# Write the selected row to volume_quality_metrics.csv inside DATA.
a.to_csv(os.path.join(DATA, "volume_quality_metrics.csv"))

In [None]:
# Load avg_volume_corr_dist.csv from DATA.
df_avg_volume_corr_dist = pd.read_csv(os.path.join(DATA, "avg_volume_corr_dist.csv"))

In [None]:
# Column suffixes, one per pair of volume series.
COLS = [
    "KO_volume-KSU_volume_epoch",
    "KO_volume-NVDA_volume_epoch",
    "KO_volume-PEP_volume_epoch",
    "NVDA_volume-KSU_volume_epoch",
    "PEP_volume-KSU_volume_epoch",
    "PEP_volume-NVDA_volume_epoch",
]
# Full column names for the current sweep. This rebinds COLUMNS, replacing the
# metric column list built in an earlier cell.
COLUMNS = [f"{SWEEPNAME} - corr_dist/{suffix}" for suffix in COLS]

In [None]:
# Display the selected corr_dist columns.
df_avg_volume_corr_dist[COLUMNS]

In [None]:
# Row-wise mean over the selected corr_dist columns. "avg" is not part of
# COLUMNS, so re-running this cell yields the same values.
df_avg_volume_corr_dist["avg"] = df_avg_volume_corr_dist[COLUMNS].mean(axis=1)
# Print the individual values and their mean for the row at position 65.
# NOTE(review): 65 is hard-coded — presumably the same row inspected for the
# volume metrics; confirm.
print(df_avg_volume_corr_dist[COLUMNS].iloc[65])
print(df_avg_volume_corr_dist[["avg"]].iloc[65])

In [None]:
# Load both splits; df_train and df_val stay untouched for later cells.
df_train = pd.read_csv(DATA + "ohlc_KO_PEP_NVDA_KSU_train.csv")
df_val = pd.read_csv(DATA + "ohlc_KO_PEP_NVDA_KSU_val.csv")

# Stack train on top of validation, then drop the calendar/symbol columns and,
# for every ticker, the open/high/low/norders/mid_price columns.
df = pd.concat([df_train, df_val]).drop(
    columns=["hour_slot", "minute_slot", "weekday", "symbol"]
    + [
        f"{field}_{ticker}"
        for ticker in ("KO", "PEP", "NVDA", "KSU")
        for field in ("open", "high", "low", "norders", "mid_price")
    ]
)

In [None]:
# Volume and close-price column names, in the same ticker order.
targets_v = [f"volume_{ticker}" for ticker in ("KO", "PEP", "NVDA", "KSU")]
targets_p = [f"close_{ticker}" for ticker in ("KO", "PEP", "NVDA", "KSU")]

In [None]:
# Display the close-price columns of the combined frame.
df[targets_p]

In [None]:
# Pairwise correlation of the raw close prices in the training split,
# drawn as an annotated heatmap.
corr = df_train[targets_p].corr(numeric_only=True)
sb.heatmap(corr, cmap="Blues", annot=True)

In [None]:
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.utils.validation import check_is_fitted
from typing import List, Optional, Union


def is_fitted(scaler: Union[MinMaxScaler, StandardScaler]) -> bool:
    """Report whether ``scaler`` has already been fitted.

    Delegates to scikit-learn's ``check_is_fitted`` and turns its
    ``NotFittedError`` into a boolean answer.

    Note: an identical function is defined in an earlier cell of this
    notebook; this later definition is the one subsequent cells see.

    Args:
        scaler: The scaler instance to inspect.

    Returns:
        ``True`` when ``check_is_fitted`` accepts the scaler, ``False`` when it
        raises ``NotFittedError``. Any other exception propagates.
    """
    try:
        check_is_fitted(scaler)
    except NotFittedError:
        return False
    else:
        return True


class Pipeline:
    """Base interface shared by the preprocessing pipelines in this notebook.

    Subclasses store a scaler on ``self.scaler`` and override ``preprocess`` and
    ``inverse_transform``. The base implementations do nothing and return
    ``None``.

    Note: a class with the same name is defined in an earlier cell of this
    notebook; this later definition is the one subsequent cells see.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Accept and ignore any arguments; the base class keeps no state."""
        pass

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Placeholder to be overridden by subclasses; returns ``None`` here.

        Args:
            df: Source frame.
            targets: Names of the columns to preprocess.
        """
        pass

    def inverse_transform(self, x: np.ndarray, x_last: Optional[np.ndarray] = None) -> np.ndarray:
        """Placeholder to be overridden by subclasses; returns ``None`` here.

        ``x_last`` defaults to ``None`` so the base signature accepts the same
        calls as ``ScalerPipeline.inverse_transform``.

        Args:
            x: Preprocessed values to map back.
            x_last: Optional reference values; unused by the base class.
        """
        pass

    def __repr__(self) -> str:
        """Return ``"<ClassName> (scaler=<scaler>)"``."""
        # The base class never sets ``scaler``; fall back to None instead of
        # raising AttributeError when a bare Pipeline is displayed.
        scaler = getattr(self, "scaler", None)
        return f"{self.__class__.__name__} (scaler={scaler})"


class ScalerPipeline(Pipeline):
    """Pipeline that rescales the target columns with a scikit-learn scaler.

    Note: a class with the same name is defined in an earlier cell of this
    notebook; this later definition is the one subsequent cells see.
    """

    def __init__(self, scaler: Union[MinMaxScaler, StandardScaler]) -> None:
        """Keep a reference to the scaler that ``preprocess`` fits and applies."""
        super().__init__()
        self.scaler = scaler

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Scale ``df[targets]``, fitting the scaler first if it is not fitted yet.

        Args:
            df: Source frame.
            targets: Names of the columns to scale.

        Returns:
            The result of ``scaler.transform`` on the selected columns.
        """
        selected = df[targets]
        needs_fit = not is_fitted(self.scaler)
        if needs_fit:
            # Only the first call fits; later calls reuse the fitted parameters.
            self.scaler.fit(selected)
        scaled = self.scaler.transform(selected)
        return scaled

    def inverse_transform(self, x: np.ndarray, x_last: Optional[np.ndarray] = None) -> np.ndarray:
        """Undo the scaling of ``x``; ``x_last`` is accepted but not used."""
        restored = self.scaler.inverse_transform(x)
        return restored


class LogReturnPipeline(Pipeline):
    """Pipeline that converts the target columns to scaled log-returns.

    Note: a class with the same name is defined in an earlier cell of this
    notebook; this later definition is the one subsequent cells see.
    """

    def __init__(self, scaler: Union[MinMaxScaler, StandardScaler]) -> None:
        """Keep a reference to the scaler applied to the log-returns."""
        super().__init__()
        self.scaler = scaler

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Compute log-returns of ``df[targets]`` and scale them.

        The log-return of a row is ``log(value / previous value)``. NaN entries
        (such as the first row, which has no previous value) are replaced by 0.
        The scaler is fitted on the log-returns only if it is not fitted yet.

        Args:
            df: Source frame.
            targets: Names of the columns to transform.

        Returns:
            The result of ``scaler.transform`` on the log-returns.
        """
        prices = df[targets]
        ratio_to_previous = prices / prices.shift(1)
        log_returns = np.log(ratio_to_previous).fillna(0).to_numpy()

        needs_fit = not is_fitted(self.scaler)
        if needs_fit:
            self.scaler.fit(log_returns)

        return self.scaler.transform(log_returns)

    def inverse_transform(self, x: np.ndarray, x_last: np.ndarray) -> np.ndarray:
        """Map scaled log-returns back to levels, starting from ``x_last``.

        Args:
            x: Scaled log-returns.
            x_last: Value(s) multiplied with the cumulative growth factors.

        Returns:
            ``x_last`` times the cumulative product (along axis 0) of the
            exponentiated, un-scaled log-returns.
        """
        growth_factors = np.exp(self.scaler.inverse_transform(x))
        cumulative_growth = np.cumprod(growth_factors, axis=0)
        return x_last * cumulative_growth

In [None]:
# Standard-scaled log-returns of the training close prices, one column per ticker.
log_return_pipeline = LogReturnPipeline(StandardScaler())
df_pre = pd.DataFrame(
    log_return_pipeline.preprocess(df_train, targets_p),
    columns=["KO", "PEP", "NVDA", "KSU"],
)

In [None]:
# Correlation matrix of the current df_pre columns, drawn as an annotated heatmap.
corr = df_pre.corr(numeric_only=True)
sb.heatmap(corr, cmap="Blues", annot=True)

In [None]:
# Min-max scaled training volumes, one column per ticker (rebinds df_pre).
scaler_pipeline = ScalerPipeline(MinMaxScaler())
df_pre = pd.DataFrame(
    scaler_pipeline.preprocess(df_train, targets_v),
    columns=["KO", "PEP", "NVDA", "KSU"],
)

In [None]:
# Correlation matrix of the current df_pre columns, drawn as an annotated heatmap.
corr = df_pre.corr(numeric_only=True)
sb.heatmap(corr, cmap="Blues", annot=True)