In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import pickle
from matplotlib.lines import Line2D
import scipy.stats as scs

In [None]:
# Notebook-wide matplotlib defaults: 16x9 figures, serif font, enlarged text.
plt.rcParams.update(
    {
        "figure.figsize": [16, 9],
        # "figure.dpi": 300,
        "font.size": 20,
        "axes.labelsize": 20,
        "axes.titlesize": 24,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "font.family": "serif",
    }
)

In [None]:
# Ticker symbols. Positions in this list are used further down to index the stock axis of
# the price/volume arrays (via stock_names.index(...) and linear_index), so the order matters.
stock_names = ["KO", "PEP", "NVDA", "KSU"]

In [None]:
# Pickle files read by the loading cell: one with price predictions, one with volume
# predictions and one with the real data (roles taken from the variable names and file names).
PATH_PICKLE_PRICE = "../storage/thesis-gan/ywbjynma/preds_epoch=109-target_price=mid_price-target_volume=None.pickle"
PATH_PICKLE_VOLUME = "../storage/thesis-gan/3chof3p2/preds_epoch=118-target_price=None-target_volume=volume.pickle"
PATH_PICKLE_REAL = "../storage/thesis-gan/reals.pickle"

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle can execute arbitrary code while loading: only point this at trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


pred_prices_dict = _load_pickle(PATH_PICKLE_PRICE)
pred_volumes_dict = _load_pickle(PATH_PICKLE_VOLUME)
reals_dict = _load_pickle(PATH_PICKLE_REAL)

In [None]:
# Display which entries each loaded dictionary exposes.
pred_prices_dict.keys(), pred_volumes_dict.keys(), reals_dict.keys()

In [None]:
# Price-side entries: predicted sequence and predicted prices from the price run,
# real prices from the reals dictionary.
pred_sequence_price = pred_prices_dict["pred_sequence"]
pred_prices = pred_prices_dict["pred_prices"]
prices = reals_dict["prices"]
# Shapes are displayed as a sanity check.
pred_sequence_price.shape, pred_prices.shape, prices.shape

In [None]:
# Volume-side entries: predicted sequence and predicted volumes from the volume run,
# real volumes from the reals dictionary.
pred_sequence_volume = pred_volumes_dict["pred_sequence"]
pred_volumes = pred_volumes_dict["pred_volumes"]
volumes = reals_dict["volumes"]
# Shapes are displayed as a sanity check.
pred_sequence_volume.shape, pred_volumes.shape, volumes.shape

In [None]:
# Split the real sequence along axis 1: the first four channels go to the price variable,
# the remaining ones to the volume variable.
# NOTE(review): presumably one channel per ticker in stock_names for each half — confirm.
sequence = reals_dict["sequence"]
sequence_price = sequence[:, :4, :]
sequence_volume = sequence[:, 4:, :]
sequence_price.shape, sequence_volume.shape

In [None]:
# Drop singleton dimensions, convert to NumPy and transpose every price-side array.
# NOTE: this cell overwrites its own inputs, so it is not re-runnable without reloading
# (the converted objects are presumably NumPy arrays, which have no `.numpy()`).
sequence_price, pred_sequence_price, prices, pred_prices = (
    tensor.squeeze().numpy().T
    for tensor in (sequence_price, pred_sequence_price, prices, pred_prices)
)
sequence_price.shape, pred_sequence_price.shape, prices.shape, pred_prices.shape,

In [None]:
# Drop singleton dimensions, convert to NumPy and transpose every volume-side array.
# NOTE: this cell overwrites its own inputs, so it is not re-runnable without reloading
# (the converted objects are presumably NumPy arrays, which have no `.numpy()`).
sequence_volume, pred_sequence_volume, volumes, pred_volumes = (
    tensor.squeeze().numpy().T
    for tensor in (sequence_volume, pred_sequence_volume, volumes, pred_volumes)
)
sequence_volume.shape, pred_sequence_volume.shape, volumes.shape, pred_volumes.shape,

In [None]:
# Final shapes of the real/predicted price and volume arrays after conversion.
prices.shape, pred_prices.shape, volumes.shape, pred_volumes.shape,

In [None]:
# x-positions used by the plots below: steps 0..389 are drawn as the observed history,
# steps from 390 up to the number of rows in `prices` as the continuation.
history_indexes = np.arange(390)
continuation_indexes = np.arange(390, prices.shape[0])
history_indexes.shape, continuation_indexes.shape

In [None]:
# Cut the sequences at step 390 and transpose each piece back:
# `history` = first 390 rows of the real sequence, `reals` = its remaining rows,
# `preds` = the rows from 390 onwards of the predicted sequence.
history = sequence_price[:390, :].T
reals = sequence_price[390:, :].T
preds = pred_sequence_price[390:, :].T
history.shape, reals.shape, preds.shape

In [None]:
# Re-attach the shared history to the real and to the predicted continuation along axis 1.
history_and_reals = np.concatenate((history, reals), axis=1)
history_and_preds = np.concatenate((history, preds), axis=1)
history_and_reals.shape, history_and_preds.shape

# PRICES

In [None]:
fig, axes = plt.subplots(2, 2)

# Proxy artists for the figure-level legend (the per-axes lines carry no labels).
legend_elements = [
    Line2D([0], [0], color=color, lw=2, label=label)
    for color, label in (
        ("C0", "Observed"),
        ("C1", "Real continuation"),
        ("C2", "Predicted continuation"),
    )
]

# One panel per ticker, filled row by row; the flat position doubles as the stock column.
for linear_index, panel in enumerate(axes.flat):
    panel.set_title(f"{stock_names[linear_index]}", fontsize=20)

    panel.plot(history_indexes, prices[:390, linear_index], color="C0")
    panel.plot(continuation_indexes, prices[390:, linear_index], color="C1")
    panel.plot(continuation_indexes, pred_prices[390:, linear_index], color="C2")
    # Red marker where the observed history ends and the continuation starts.
    panel.axvline(x=390, color="r")

fig.suptitle("Prices", fontsize=24, y=1.04)
fig.legend(handles=legend_elements, loc="upper center", ncol=3, fontsize=15, bbox_to_anchor=(0.5, 1))
fig.tight_layout()
# plt.savefig("../storage/thesis-gan/prices.png")
plt.show()
plt.close(fig)

# VOLUMES

In [None]:
fig, axes = plt.subplots(2, 2)

# Proxy artists for the figure-level legend (the per-axes lines carry no labels).
legend_elements = [
    Line2D([0], [0], color=color, lw=2, label=label)
    for color, label in (
        ("C0", "Observed"),
        ("C1", "Real continuation"),
        ("C2", "Predicted continuation"),
    )
]

# One panel per ticker, filled row by row; the flat position doubles as the stock column.
for linear_index, panel in enumerate(axes.flat):
    panel.set_title(f"{stock_names[linear_index]}", fontsize=20)

    panel.plot(history_indexes, volumes[:390, linear_index], color="C0")
    panel.plot(continuation_indexes, volumes[390:, linear_index], color="C1")
    panel.plot(continuation_indexes, pred_volumes[390:, linear_index], color="C2")
    # Red marker where the observed history ends and the continuation starts.
    panel.axvline(x=390, color="r")

fig.suptitle("Volumes", fontsize=24, y=1.04)
fig.legend(handles=legend_elements, loc="upper center", ncol=3, fontsize=15, bbox_to_anchor=(0.5, 1))
fig.tight_layout()
# plt.savefig("../storage/thesis-gan/volumes.png")
plt.show()
plt.close(fig)

# STYLISED FACTS

## RETURNS DISTRIBUTION

In [None]:
def extract_data_stats(col):
    """Summarise a series and evaluate the normal PDF with the same mean and std.

    Args:
        col: object exposing ``describe()`` with "mean", "std", "min" and "max"
            entries (e.g. a numeric pandas Series).

    Returns:
        Tuple ``(mu, sigma, rtn_range, norm_pdf)`` where ``rtn_range`` is a grid of
        1000 evenly spaced points between the minimum and maximum of ``col`` and
        ``norm_pdf`` is the N(mu, sigma) density evaluated on that grid.
    """
    summary = col.describe()
    mu, sigma = summary["mean"], summary["std"]
    lower, upper = summary["min"], summary["max"]
    rtn_range = np.linspace(lower, upper, num=1000)
    return mu, sigma, rtn_range, scs.norm.pdf(rtn_range, loc=mu, scale=sigma)

In [None]:
def draw_hist(ax, col_real, col_pred, stock_name, xlim=(-0.02, 0.02)):
    """Draw real vs predicted return histograms with fitted normal PDFs on ``ax``.

    Args:
        ax: matplotlib Axes to draw on.
        col_real: series of real returns.
        col_pred: series of predicted returns.
        stock_name: text used as the axes title.
        xlim: x-axis limits applied after plotting.
    """
    # stat="density" normalises the bars (and the KDE) to unit area, so they share the
    # y-scale of the normal PDFs drawn below; the default stat="count" does not.
    sb.histplot(data=col_real, kde=True, stat="density", color="orange", ax=ax)
    sb.histplot(data=col_pred, kde=True, stat="density", color="green", ax=ax)

    # Mean/variance use scientific notation: with fixed ".5f" small values print as 0.00000.
    mu, sigma, rtn_range, norm_pdf = extract_data_stats(col_real)
    ax.plot(rtn_range, norm_pdf, color="orange", lw=3, label=f"Real: N({mu:.2e}, {sigma**2:.2e})")

    mu, sigma, rtn_range, norm_pdf = extract_data_stats(col_pred)
    ax.plot(rtn_range, norm_pdf, color="green", lw=3, label=f"Pred: N({mu:.2e}, {sigma**2:.2e})")

    # Dashed reference line at zero return.
    ax.axvline(x=0, c="c", linestyle="--", lw=3)
    ax.set_title(f"{stock_name}", fontsize=24)
    ax.set_xlim(xlim)
    ax.legend(loc="upper right", fontsize=10, frameon=True, fancybox=True, framealpha=1, shadow=True, borderpad=1)

In [None]:
def print_returns_distribution_stylised_fact(ax, stock_name):
    """Plot the distribution of one-step returns of ``stock_name`` on ``ax``.

    Reads the module-level ``prices`` / ``pred_prices`` arrays, selecting the column at
    the position of ``stock_name`` in ``stock_names``, and delegates the drawing to
    ``draw_hist``.
    """
    column = stock_names.index(stock_name)

    def _simple_returns(price_column):
        # Percentage change between consecutive rows; rows containing NaN
        # (at least the first one) are dropped.
        frame = pd.DataFrame(price_column, columns=["mid_price"])
        frame["Returns"] = frame["mid_price"].pct_change()
        return frame.dropna()["Returns"]

    returns_real = _simple_returns(prices[:, column])
    returns_pred = _simple_returns(pred_prices[:, column])
    draw_hist(ax, returns_real, returns_pred, stock_name)

In [None]:
fig, axs = plt.subplots(2, 2)

# One panel per ticker, filled row by row.
for panel, ticker in zip(axs.flat, ("NVDA", "KSU", "KO", "PEP")):
    print_returns_distribution_stylised_fact(panel, ticker)

fig.suptitle("Returns distribution", fontsize=24)
fig.tight_layout()
# plt.savefig("../storage/thesis-gan/stylised_fact_distribution_returns.png")
plt.show()
plt.close(fig)

## AGGREGATIONAL GAUSSIANITY

In [None]:
def draw_hist_multi(col_real, col_pred, xlim=(-0.02, 0.02), ax=None):
    """Draw real vs predicted return histograms with fitted normal PDFs on one panel.

    Args:
        col_real: series of real returns.
        col_pred: series of predicted returns.
        xlim: x-axis limits applied after plotting.
        ax: matplotlib Axes to draw on; the current axes are used when omitted.
    """
    if ax is None:
        # The default used to crash at ax.plot(); fall back to the active axes instead.
        ax = plt.gca()

    # stat="density" normalises the bars (and the KDE) to unit area, so they share the
    # y-scale of the normal PDFs drawn below; the default stat="count" does not.
    sb.histplot(data=col_real, kde=True, stat="density", color="orange", ax=ax)
    sb.histplot(data=col_pred, kde=True, stat="density", color="green", ax=ax)

    # Mean/variance use scientific notation: with fixed ".5f" small values print as 0.00000.
    mu, sigma, rtn_range, norm_pdf = extract_data_stats(col_real)
    ax.plot(rtn_range, norm_pdf, color="orange", lw=3, label=f"Real: N({mu:.2e}, {sigma**2:.2e})")

    mu, sigma, rtn_range, norm_pdf = extract_data_stats(col_pred)
    ax.plot(rtn_range, norm_pdf, color="green", lw=3, label=f"Pred: N({mu:.2e}, {sigma**2:.2e})")

    ax.set_xlim(xlim)
    ax.legend(loc="upper right", fontsize=10, frameon=True, fancybox=True, framealpha=1, shadow=True, borderpad=1)

In [None]:
def print_aggregational_gaussianity_stylised_fact(stock_name):
    """Plot return distributions of ``stock_name`` at horizons of 1 to 6 steps.

    Reads the module-level ``prices`` / ``pred_prices`` arrays, builds one returns column
    per horizon for each of them and hands both tables to ``draw_hist_subplots``.
    """
    stock_index = stock_names.index(stock_name)
    lags = 6

    def _multi_horizon_returns(price_column):
        # One column per horizon: percentage change over `lag` rows. Rows with any NaN
        # are dropped before the helper price column is removed.
        frame = pd.DataFrame(price_column, columns=["mid_price"])
        for lag in range(1, lags + 1):
            frame[f"Returns - Lag {lag}"] = frame["mid_price"].pct_change(periods=lag)
        return frame.dropna().drop(columns=["mid_price"])

    returns_real = _multi_horizon_returns(prices[:, stock_index])
    returns_pred = _multi_horizon_returns(pred_prices[:, stock_index])
    draw_hist_subplots(returns_real, returns_pred, stock_name)

In [None]:
def draw_hist_subplots(df_real, df_pred, stock_name):
    """Show a 2x3 grid with one ``draw_hist_multi`` panel per column of ``df_real``.

    Args:
        df_real: table of real returns, one column per panel.
        df_pred: table of predicted returns with the same column names.
        stock_name: text appended to the figure title.
    """
    fig, grid = plt.subplots(nrows=2, ncols=3, figsize=(16, 9))
    panels = grid.ravel()

    for position, column in enumerate(df_real.columns):
        draw_hist_multi(df_real[column], df_pred[column], ax=panels[position])

    fig.suptitle(f"Distribution of returns with increased time scale - {stock_name}", fontsize=24)
    fig.tight_layout()
    # plt.savefig(f"../storage/thesis-gan/stylised_fact_aggregational_gaussianity_{stock_name}.png")
    plt.show()
    plt.close(fig)

In [None]:
# Render the multi-horizon return distributions for a single ticker.
print_aggregational_gaussianity_stylised_fact("NVDA")

## ABSENCE OF AUTOCORRELATION

In [None]:
def corr_plot(corr, ax, title):
    """Draw an annotated heatmap of the correlation matrix ``corr`` on ``ax``.

    Args:
        corr: square table of correlations (e.g. the result of ``DataFrame.corr()``).
        ax: matplotlib Axes to draw on.
        title: axes title.
    """
    cmap = sb.diverging_palette(220, 20, as_cmap=True)
    # Scope the "white" style to this call. The previous sb.set(style="white") rewrote the
    # global rcParams (figure size, fonts, colour cycle) for every figure drawn afterwards.
    with sb.axes_style("white"):
        sb.heatmap(corr, annot=True, cmap=cmap, square=True, linewidths=3, linecolor="w", ax=ax)
    ax.set_title(title, fontsize=20)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment="center")
    ax.set_yticklabels(ax.get_yticklabels(), rotation=45, verticalalignment="center")

In [None]:
def print_absence_autocorrelation_stylised_fact(stock_name):
    """Show heatmaps of the return autocorrelations of ``stock_name`` (real vs predicted).

    Reads the module-level ``prices`` / ``pred_prices`` arrays. For each of them the
    one-step return series is correlated with copies of itself delayed by 1 to 6 steps,
    so the cell ("Lag i", "Lag j") is the autocorrelation at lag ``|i - j|`` and the
    "Lag 0" row reads as the autocorrelation function.
    """
    stock_index = stock_names.index(stock_name)
    lags = 6

    def _lagged_returns(price_column):
        # The previous version correlated k-period returns (pct_change(periods=k)); those
        # overlap and are correlated by construction, which is not an autocorrelation.
        # Here every column is the same one-step return series shifted by `lag` rows.
        returns = pd.Series(price_column).pct_change()
        lagged = pd.DataFrame({f"Lag {lag}": returns.shift(lag) for lag in range(lags + 1)})
        # Shifting introduces leading NaNs; keep only rows where every lag is available.
        return lagged.dropna()

    corr_real = _lagged_returns(prices[:, stock_index]).corr()
    corr_pred = _lagged_returns(pred_prices[:, stock_index]).corr()

    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))

    axs = axs.ravel()
    corr_plot(corr_real, ax=axs[0], title="Real - Returns")
    corr_plot(corr_pred, ax=axs[1], title="Pred - Returns")

    fig.suptitle(f"Returns Autocorrelations - {stock_name}", fontsize=24)
    fig.tight_layout()
    # plt.savefig(f"../storage/thesis-gan/stylised_fact_absence_autocorrelation_{stock_name}.png")
    plt.show()
    plt.close(fig)

In [None]:
# Render the autocorrelation heatmaps for a single ticker.
print_absence_autocorrelation_stylised_fact("NVDA")

## VOLATILITY CLUSTERING

In [None]:
def print_volatility_clustering_stylised_fact(ax, stock_name):
    """Plot the real and predicted one-step return series of ``stock_name`` on ``ax``.

    Reads the module-level ``prices`` / ``pred_prices`` arrays, selecting the column at
    the position of ``stock_name`` in ``stock_names``.
    """
    column = stock_names.index(stock_name)

    def _simple_returns(price_column):
        # Percentage change between consecutive rows; rows containing NaN
        # (at least the first one) are dropped.
        frame = pd.DataFrame(price_column, columns=["mid_price"])
        frame["Returns"] = frame["mid_price"].pct_change()
        return frame.dropna()["Returns"]

    returns_real = _simple_returns(prices[:, column])
    returns_pred = _simple_returns(pred_prices[:, column])

    # The predicted series is drawn second and semi-transparent so the real one stays visible.
    for series, label, line_style in (
        (returns_real, "Real", {"color": "C1"}),
        (returns_pred, "Pred", {"color": "C2", "alpha": 0.3}),
    ):
        ax.plot(series, label=label, **line_style)

    ax.set_ylabel("Returns")
    ax.set_title(stock_name, fontsize=20)

In [None]:
fig, axs = plt.subplots(2, 2)

# Proxy artists for the figure-level legend.
real_handle = Line2D([0], [0], color="C1", lw=2, label="Real")
synthetic_handle = Line2D([0], [0], color="C2", alpha=0.3, lw=2, label="Synthetic")
legend_elements = [real_handle, synthetic_handle]

# One panel per ticker, filled row by row.
for panel, ticker in zip(axs.flat, ("NVDA", "KSU", "KO", "PEP")):
    print_volatility_clustering_stylised_fact(panel, ticker)

fig.suptitle("Volatility clustering", fontsize=24, y=1.04)
fig.legend(handles=legend_elements, loc="upper center", ncol=2, fontsize=15, bbox_to_anchor=(0.5, 1))
fig.tight_layout()
# plt.savefig("../storage/thesis-gan/stylised_fact_volatility_clustering.png")
plt.show()
plt.close(fig)

## VOLUME VOLATILITY CORRELATION

In [None]:
def _block_means(x, delta):
    """Rolling mean of width ``delta`` sampled every ``delta`` rows, transposed.

    The sample at row 0 is discarded (for ``delta > 1`` it is NaN because the window
    is not yet full), so the first kept value is the one at row ``delta``.
    """
    rolled = pd.DataFrame(x).rolling(delta).mean().to_numpy().squeeze()
    return rolled[delta::delta].T


def compute_avg_log_returns(x, delta):
    """Average ``x`` over consecutive windows of ``delta`` rows.

    Args:
        x: array-like with x.shape = [sequence_length, n_stocks].
        delta: window length in rows.

    Returns:
        Array with the stock axis first and one entry per sampled window.
    """
    return _block_means(x, delta)


def compute_avg_volumes(x, delta):
    """Average ``x`` over consecutive windows of ``delta`` rows.

    Args:
        x: array-like with x.shape = [sequence_length, n_stocks].
        delta: window length in rows.

    Returns:
        Array with the stock axis first and one entry per sampled window.
    """
    return _block_means(x, delta)

In [None]:
# Windowed averages (window of 15 rows) of the real and predicted sequences,
# computed separately for the price-side and the volume-side channels.
real_avg_log_returns = compute_avg_log_returns(sequence_price, 15)
real_avg_volumes = compute_avg_volumes(sequence_volume, 15)

pred_avg_log_returns = compute_avg_log_returns(pred_sequence_price, 15)
pred_avg_volumes = compute_avg_volumes(pred_sequence_volume, 15)

In [None]:
fig, ax = plt.subplots(2, 4)

# (grid row, title suffix, averaged returns, averaged volumes, marker colour)
scatter_rows = (
    (0, "Real", real_avg_log_returns, real_avg_volumes, "C0"),
    (1, "Pred", pred_avg_log_returns, pred_avg_volumes, "C1"),
)

for target_idx in range(4):
    stock_name = stock_names[target_idx]

    # Top row: real data; bottom row: predictions for the same ticker.
    for row, kind, avg_returns, avg_volumes, color in scatter_rows:
        title = f"{stock_name} - {kind}"
        panel = ax[row, target_idx]
        panel.set_title(title)
        panel.scatter(avg_returns[target_idx], avg_volumes[target_idx], color=color)
        panel.set_xlabel("Avg log-returns")
        panel.set_ylabel("Avg log-volumes")

fig.suptitle("Volume-Volatility Correlation", fontsize=24)
fig.tight_layout()
# Keep the save before plt.show() if it is re-enabled, as in the other figure cells.
# plt.savefig("../storage/thesis-gan/stylised_fact_volume_volatility_correlation.png")
plt.show()
plt.close(fig)

# Correlations


In [None]:
# CSV files for the train / validation / test splits (roles taken from the names).
FILE_PATH_TRAIN = "../data/ohlc_KO_PEP_NVDA_KSU_train.csv"
FILE_PATH_VAL = "../data/ohlc_KO_PEP_NVDA_KSU_val.csv"
FILE_PATH_TEST = "../data/ohlc_KO_PEP_NVDA_KSU_test.csv"

In [None]:
# Keep only the mid-price column of every ticker (in stock_names order), then show
# their pairwise correlations on the training split.
df_train = pd.read_csv(FILE_PATH_TRAIN).loc[
    :, [f"mid_price_{stock_name}" for stock_name in stock_names]
]
df_train.corr()

In [None]:
# Keep only the mid-price column of every ticker (in stock_names order), then show
# their pairwise correlations on the validation split.
df_val = pd.read_csv(FILE_PATH_VAL).loc[
    :, [f"mid_price_{stock_name}" for stock_name in stock_names]
]
df_val.corr()

In [None]:
# Pairwise correlations of the predicted prices, labelled with the same column names as
# df_train so the table lines up with the two above.
df_val_preds = pd.DataFrame(pred_prices, columns=df_train.columns)
df_val_preds.corr()