In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import glob
import os
import pickle
from itertools import combinations
from matplotlib.lines import Line2D

In [None]:
# Show the kernel's working directory; the relative paths used below resolve against it.
!pwd

In [None]:
# Folder (relative to the working directory) that the CSV inputs are read from.
DATA = "../data/"
# List the folder contents as a quick sanity check.
!ls $DATA

In [None]:
# Pickled inputs of this notebook: predictions from run "ywbjynma" (price target),
# predictions from run "3chof3p2" (volume target) and the real data ("reals").
PATH_PICKLE_PRICE = "../storage/thesis-gan/ywbjynma/preds_epoch=109-target_price=mid_price-target_volume=None.pickle"
PATH_PICKLE_VOLUME = "../storage/thesis-gan/3chof3p2/preds_epoch=118-target_price=None-target_volume=volume.pickle"
PATH_PICKLE_REAL = "../storage/thesis-gan/reals.pickle"

In [None]:
# Pickled data pipelines stored in the checkpoint metadata of the same two runs
# (epoch 109 of "ywbjynma" and epoch 118 of "3chof3p2").
PATH_PICKLE_PIPELINE_PRICE = (
    "../storage/thesis-gan/ywbjynma/checkpoints/epoch=109-step=30799.ckpt/metadata/data_pipeline.pickle"
)
PATH_PICKLE_PIPELINE_VOLUME = (
    "../storage/thesis-gan/3chof3p2/checkpoints/epoch=118-step=33319.ckpt/metadata/data_pipeline.pickle"
)

In [None]:
# Load the three prediction / ground-truth dictionaries in one pass.
# NOTE: pickle.load can execute arbitrary code; only point these paths at trusted files.
loaded = []
for pickle_path in (PATH_PICKLE_PRICE, PATH_PICKLE_VOLUME, PATH_PICKLE_REAL):
    with open(pickle_path, "rb") as handle:
        loaded.append(pickle.load(handle))
pred_prices_dict, pred_volumes_dict, reals_dict = loaded

In [None]:
# Load the data pipelines saved alongside the two checkpoints.
# NOTE: pickle.load can execute arbitrary code; only point these paths at trusted files.
loaded_pipelines = []
for pipeline_path in (PATH_PICKLE_PIPELINE_PRICE, PATH_PICKLE_PIPELINE_VOLUME):
    with open(pipeline_path, "rb") as handle:
        loaded_pipelines.append(pickle.load(handle))
pipeline_price, pipeline_volume = loaded_pipelines

In [None]:
# Display the two unpickled pipeline objects.
pipeline_price, pipeline_volume

In [None]:
# Display the keys available in each loaded dictionary.
pred_prices_dict.keys(), pred_volumes_dict.keys(), reals_dict.keys()

In [None]:
# Pull the price-related entries out of the loaded dictionaries and show their shapes.
pred_sequence_price, pred_prices = (
    pred_prices_dict[key] for key in ("pred_sequence", "pred_prices")
)
prices = reals_dict["prices"]
tuple(entry.shape for entry in (pred_sequence_price, pred_prices, prices))

In [None]:
# Pull the volume-related entries out of the loaded dictionaries and show their shapes.
pred_sequence_volume, pred_volumes = (
    pred_volumes_dict[key] for key in ("pred_sequence", "pred_volumes")
)
volumes = reals_dict["volumes"]
tuple(entry.shape for entry in (pred_sequence_volume, pred_volumes, volumes))

In [None]:
# Split the real sequence along axis 1: the first four entries go to the price
# part, everything after them to the volume part.
sequence = reals_dict["sequence"]
sequence_price, sequence_volume = sequence[:, :4, :], sequence[:, 4:, :]
(sequence_price.shape, sequence_volume.shape)

In [None]:
# Ticker labels; the plotting cells below pair stock_names[i] with column i of the arrays.
# NOTE(review): assumes this order matches the channel order of the loaded data — confirm.
stock_names = ["KO", "PEP", "NVDA", "KSU"]

In [None]:
# Squeeze, convert to NumPy and transpose every price object in one statement.
# NOTE: the names are rebound to the converted objects, so re-running this cell
# calls .squeeze().numpy() on the already-converted values.
sequence_price, pred_sequence_price, prices, pred_prices = (
    tensor.squeeze().numpy().T
    for tensor in (sequence_price, pred_sequence_price, prices, pred_prices)
)
tuple(arr.shape for arr in (sequence_price, pred_sequence_price, prices, pred_prices))

In [None]:
# Squeeze, convert to NumPy and transpose every volume object in one statement.
# NOTE: the names are rebound to the converted objects, so re-running this cell
# calls .squeeze().numpy() on the already-converted values.
sequence_volume, pred_sequence_volume, volumes, pred_volumes = (
    tensor.squeeze().numpy().T
    for tensor in (sequence_volume, pred_sequence_volume, volumes, pred_volumes)
)
tuple(arr.shape for arr in (sequence_volume, pred_sequence_volume, volumes, pred_volumes))

In [None]:
# Display the shapes of the converted price and volume arrays side by side.
sequence_price.shape, pred_sequence_price.shape, prices.shape, sequence_volume.shape, volumes.shape, pred_sequence_volume.shape,

In [None]:
# x-axis positions: the first 390 steps are the observed window, the remaining
# rows of `prices` are the continuation.
history_indexes = np.arange(0, 390)
continuation_indexes = np.arange(390, len(prices))
(history_indexes.shape, continuation_indexes.shape)

In [None]:
# Transpose first, then cut at step 390: observed part, real continuation and
# predicted continuation.
history = sequence_price.T[:, :390]
reals = sequence_price.T[:, 390:]
preds = pred_sequence_price.T[:, 390:]
tuple(part.shape for part in (history, reals, preds))

In [None]:
# Append each continuation (real, then predicted) to the observed history along axis 1.
history_and_reals, history_and_preds = (
    np.concatenate((history, continuation), axis=1) for continuation in (reals, preds)
)
(history_and_reals.shape, history_and_preds.shape)

In [None]:
# Volume figure: one panel per stock showing the observed window (C0), the real
# continuation (C1) and the predicted continuation (C2).
fig, ax = plt.subplots(2, 2, figsize=(20, 15))
legend_elements = [
    Line2D([0], [0], color=color, lw=2, label=label)
    for color, label in (
        ("C0", "Observed"),
        ("C1", "Real continuation"),
        ("C2", "Predicted continuation"),
    )
]

# ax.flat walks the 2x2 grid row by row, i.e. (0, 0), (0, 1), (1, 0), (1, 1),
# so stock i lands on the same panel it was previously assigned by hand.
for stock_idx, (panel, stock_name) in enumerate(zip(ax.flat, stock_names)):
    panel.set_title(f"{stock_name} - Volume")
    panel.plot(history_indexes, volumes[:390, stock_idx], color="C0")
    panel.plot(continuation_indexes, volumes[390:, stock_idx], color="C1")
    panel.plot(continuation_indexes, pred_volumes[390:, stock_idx], color="C2")

fig.legend(handles=legend_elements, loc="upper center", ncol=3)
fig.tight_layout()
# Save into the same relative storage folder the pickles are read from, instead
# of a user-specific absolute path that only exists on one machine.
fig.savefig("../storage/thesis-gan/volumes.png")
plt.show()
plt.close(fig)

In [None]:
# NOTE(review): compute_avg_log_returns and compute_avg_volumes are neither defined
# nor imported in any cell visible before this one — confirm where they come from,
# otherwise this cell fails on a fresh kernel.
# The second argument (15) is passed to both helpers; presumably a window size — confirm.
real_avg_log_returns = compute_avg_log_returns(sequence_price, 15)
real_avg_volumes = compute_avg_volumes(sequence_volume, 15)

pred_avg_log_returns = compute_avg_log_returns(pred_sequence_price, 15)
pred_avg_volumes = compute_avg_volumes(pred_sequence_volume, 15)

In [None]:
# Scatter of average log-returns against average volumes, one column per stock:
# top row uses the real series, bottom row the predicted ones.
n_stocks = len(stock_names)
# squeeze=False keeps `ax` two-dimensional even for a single stock.
fig, ax = plt.subplots(2, n_stocks, figsize=(7 * n_stocks, 10), squeeze=False)

panel_rows = (
    ("Real", real_avg_log_returns, real_avg_volumes, "C0"),
    ("Pred", pred_avg_log_returns, pred_avg_volumes, "C1"),
)
for target_idx, stock_name in enumerate(stock_names):
    for row_idx, (kind, avg_log_returns, avg_volumes, color) in enumerate(panel_rows):
        panel = ax[row_idx, target_idx]
        panel.set_title(f"{stock_name} - {kind}")
        panel.scatter(avg_log_returns[target_idx], avg_volumes[target_idx], color=color)
        panel.set_xlabel("Avg log-returns")
        panel.set_ylabel("Avg log-volumes")

fig.tight_layout()
title = "Volume-Volatility Corr"
# Save into the same relative storage folder the pickles are read from, instead
# of a user-specific absolute path that only exists on one machine.
fig.savefig(f"../storage/thesis-gan/{title}.png")

# The figure is only written to disk; close it so it is not kept in memory.
plt.close(fig)

In [None]:
# Read every metrics CSV and place them side by side (axis=1).
# glob returns paths in arbitrary order; sorting makes the column order, and
# therefore which copy of a duplicated column survives below, reproducible.
all_files = sorted(glob.glob(os.path.join(DATA, "volumes_metrics", "*.csv")))
df = pd.concat((pd.read_csv(f) for f in all_files), axis=1, ignore_index=False)
# Keep only the first occurrence of each column name.
df = df.loc[:, ~df.columns.duplicated()]
df.shape

In [None]:
# Column names of the per-stock volume metrics for one sweep, with "epoch" first.
SWEEPNAME = "stoic-jazz-65"
STOCK_NAMES = ["PEP", "KO", "KSU", "NVDA"]
METRIC_NAMES = ["Max", "Skew", "Min", "Mean", "Std", "Kurtosis"]
REAL_PRED = ["Real", "Pred"]
# Nesting order (stock, then metric, then Real/Pred) fixes the column order.
COLUMNS = ["epoch"] + [
    f"{SWEEPNAME} - {source} Volume: {metric}/{ticker}_epoch"
    for ticker in STOCK_NAMES
    for metric in METRIC_NAMES
    for source in REAL_PRED
]

In [None]:
# Restrict to the selected metric columns and take the row at position 65.
# NOTE(review): 65 is a hard-coded row position; presumably the epoch of interest — confirm.
a = df[COLUMNS].iloc[65]

In [None]:
# Write the selected row to a CSV file inside the DATA folder.
a.to_csv(os.path.join(DATA, "volume_quality_metrics.csv"))

In [None]:
# Load the correlation-distance table from the DATA folder.
df_avg_volume_corr_dist = pd.read_csv(os.path.join(DATA, "avg_volume_corr_dist.csv"))

In [None]:
# Pairwise volume correlation-distance columns, one per stock pair.
COLS = [
    f"{first}_volume-{second}_volume_epoch"
    for first, second in (
        ("KO", "KSU"),
        ("KO", "NVDA"),
        ("KO", "PEP"),
        ("NVDA", "KSU"),
        ("PEP", "KSU"),
        ("PEP", "NVDA"),
    )
]
# Prefix each pair column with the sweep name; note this rebinds COLUMNS.
COLUMNS = [f"{SWEEPNAME} - corr_dist/{col}" for col in COLS]

In [None]:
# Inspect the selected correlation-distance columns.
df_avg_volume_corr_dist[COLUMNS]

In [None]:
# Row-wise mean over the pairwise correlation-distance columns, stored as "avg".
# (The bare `df_avg_volume_corr_dist[COLUMNS]` expression that used to open this
# cell was dead code: it is not the last statement, so it displayed nothing.)
df_avg_volume_corr_dist["avg"] = df_avg_volume_corr_dist[COLUMNS].mean(axis=1)
# Show the individual distances and their mean for the row at position 65.
print(df_avg_volume_corr_dist[COLUMNS].iloc[65])
print(df_avg_volume_corr_dist[["avg"]].iloc[65])

In [None]:
# Read the train and validation OHLC tables from the DATA folder.
df_train = pd.read_csv(f"{DATA}ohlc_KO_PEP_NVDA_KSU_train.csv")
df_val = pd.read_csv(f"{DATA}ohlc_KO_PEP_NVDA_KSU_val.csv")

# Columns to discard from the stacked frame: the time/symbol columns plus five
# per-stock fields for each of the four tickers.
per_stock_fields = ("open", "high", "low", "norders", "mid_price")
dropped_columns = ["hour_slot", "minute_slot", "weekday", "symbol"] + [
    f"{field}_{ticker}"
    for ticker in ("KO", "PEP", "NVDA", "KSU")
    for field in per_stock_fields
]

# Stack train on top of validation, then drop the unwanted columns.
df = pd.concat([df_train, df_val]).drop(columns=dropped_columns)

In [None]:
# Target column names: one volume column and one mid-price column per ticker.
targets_v = [f"volume_{ticker}" for ticker in ("KO", "PEP", "NVDA", "KSU")]

targets_p = [f"mid_price_{ticker}" for ticker in ("KO", "PEP", "NVDA", "KSU")]

In [None]:
# Inspect the mid-price target columns of the validation table.
df_val[targets_p]

In [None]:
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.utils.validation import check_is_fitted
from typing import List, Optional, Union


def is_fitted(scaler: Union[MinMaxScaler, StandardScaler]) -> bool:
    """Return True when `check_is_fitted` accepts the scaler, False when it raises NotFittedError."""
    try:
        check_is_fitted(scaler)
    except NotFittedError:
        return False
    return True


class Pipeline:
    """Base interface for the preprocessing pipelines defined in this notebook.

    Subclasses implement `preprocess` and `inverse_transform` and keep their
    scaler in `self.scaler`, which `__repr__` reports.
    """

    def __init__(self, *args, **kwargs) -> None:
        pass

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Turn the `targets` columns of `df` into an array; subclasses must override.

        Raises:
            NotImplementedError: always, on the base class.
        """
        # Fail loudly instead of silently returning None, which contradicts the
        # declared np.ndarray return type.
        raise NotImplementedError(f"{self.__class__.__name__} must implement preprocess()")

    def inverse_transform(self, x: np.ndarray, x_last: Optional[np.ndarray] = None) -> np.ndarray:
        """Undo `preprocess` on `x`; subclasses must override.

        `x_last` defaults to None so the base signature agrees with
        `ScalerPipeline.inverse_transform`.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError(f"{self.__class__.__name__} must implement inverse_transform()")

    def __repr__(self) -> str:
        # The base class never sets `scaler`; fall back to None so that repr()
        # works on any instance instead of raising AttributeError.
        return f"{self.__class__.__name__} (scaler={getattr(self, 'scaler', None)})"


class ScalerPipeline(Pipeline):
    """Pipeline that applies the wrapped scaler directly to the target columns."""

    def __init__(self, scaler: Union[MinMaxScaler, StandardScaler]) -> None:
        super().__init__()
        self.scaler = scaler

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Scale the `targets` columns of `df`, fitting the scaler first if it is not fitted yet."""
        selected = df[targets]
        needs_fit = not is_fitted(self.scaler)
        if needs_fit:
            self.scaler.fit(selected)
        scaled = self.scaler.transform(selected)
        return scaled

    def inverse_transform(self, x: np.ndarray, x_last: Optional[np.ndarray] = None) -> np.ndarray:
        """Map scaled values back through the scaler; `x_last` is accepted but unused."""
        restored = self.scaler.inverse_transform(x)
        return restored


class LogReturnPipeline(Pipeline):
    """Pipeline that converts the target columns to log-returns before scaling them."""

    def __init__(self, scaler: Union[MinMaxScaler, StandardScaler]) -> None:
        super().__init__()
        self.scaler = scaler

    def preprocess(self, df: pd.DataFrame, targets: List[str]) -> np.ndarray:
        """Return the scaled log-returns of the `targets` columns of `df`.

        The scaler is fitted on these log-returns if it is not fitted yet.
        """
        selected = df[targets]
        # Ratio of each row to the previous one; the first row has no
        # predecessor, so its NaN log-return is replaced by 0.
        ratios = selected / selected.shift(1)
        log_returns = np.log(ratios).fillna(0).to_numpy()

        needs_fit = not is_fitted(self.scaler)
        if needs_fit:
            self.scaler.fit(log_returns)

        return self.scaler.transform(log_returns)

    def inverse_transform(self, x: np.ndarray, x_last: np.ndarray) -> np.ndarray:
        """Undo the scaling, then compound the log-returns starting from `x_last`."""
        log_returns = self.scaler.inverse_transform(x)
        # Cumulative product of exp(log-return) along axis 0 gives the growth
        # factor relative to `x_last` at every step.
        growth = np.cumprod(np.exp(log_returns), axis=0)
        return x_last * growth

In [None]:
STOCKS = ["KO", "PEP", "NVDA", "KSU"]
log_return_pipeline = LogReturnPipeline(StandardScaler())
# The frames are processed in order (train, then validation): the first call
# fits the scaler, the second call reuses the already-fitted scaler.
df_train_pre, df_val_pre = (
    pd.DataFrame(log_return_pipeline.preprocess(frame, targets_p), columns=STOCKS)
    for frame in (df_train, df_val)
)

In [None]:
# Inspect the preprocessed validation frame.
df_val_pre

In [None]:
# Pairwise correlation between the preprocessed validation columns,
# drawn as an annotated heatmap.
corr = df_val_pre.corr(numeric_only=True)
sb.heatmap(corr, cmap="Blues", annot=True)