In [None]:
import pickle
import numpy as np
import pandas as pd
from openTSNE import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Global matplotlib defaults applied to every figure created in this notebook.
plt.rcParams.update(
    {
        "figure.figsize": [16, 9],
        # "figure.dpi": 300,  # left disabled, as in the original configuration
        "font.size": 20,
        "axes.labelsize": 20,
        "axes.titlesize": 24,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "font.family": "serif",
    }
)

In [None]:
# Named font sizes for figure text elements.
# NOTE(review): none of the cells visible in this notebook reference these
# constants (the plot cell passes fontsize=16 directly) — confirm whether they
# are still needed.
# The trailing number on each line is presumably an earlier value — TODO confirm.
FONT_SIZE_TITLE_PLOT = 48  # 40
FONT_SIZE_TITLE_AX = 36  # 30
FONT_SIZE_LABEL = 30  # 24
FONT_SIZE_TICKS = 24  # 20
FONT_SIZE_LEGEND = 32  # 28

In [None]:
# Root folder that every data/storage path below is built from.
# NOTE(review): this is a relative path, so it resolves against the kernel's
# current working directory — confirm the notebook is started from the
# directory that contains "PycharmProjects".
PROJECT_FOLDER = "PycharmProjects/thesis-gan"

In [None]:
# Load the TimeGAN samples from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open(f"{PROJECT_FOLDER}/data/timegan/generated_data.pickle", "rb") as f:
    data_timegan = pickle.load(f)

# The three-way unpack doubles as a check that the loaded object is 3-dimensional.
n, seq_len, n_features = data_timegan.shape

# Alternative representation (disabled): flatten every window instead of averaging.
# data_timegan = np.reshape(data_timegan, newshape=(n, seq_len*n_features))

# Average over the last axis so each sample becomes one vector of length seq_len.
data_timegan = np.mean(data_timegan, axis=2)
data_timegan.shape

In [None]:
# Training split: min-max scale the table, then cut it into overlapping
# 100-row windows that advance one row at a time.
scaler = MinMaxScaler()
data = pd.read_csv(
    f"{PROJECT_FOLDER}/data/midprice_volume__KO_PEP_NVDA_KSU__train.csv",
    index_col=0,
).to_numpy()
data_scaled = scaler.fit_transform(data)

# Start offsets run over range(rows - 100), so the window that would begin at
# offset rows - 100 is not included.
data_real_scaled = np.array(
    [data_scaled[start : start + 100] for start in range(len(data) - 100)]
)
n, seq_len, n_features = data_real_scaled.shape

# Alternative representation (disabled): flatten every window instead of averaging.
# data_real_scaled_train = np.reshape(data_real_scaled, newshape=(n, seq_len*n_features))

# Average over the last axis: one vector of length seq_len per window.
data_real_scaled_train = np.mean(data_real_scaled, axis=2)
data_real_scaled_train.shape

In [None]:
# Validation split: same windowing as the training split.
# NOTE(review): a fresh MinMaxScaler is fitted on the validation rows here, so
# train and validation are scaled independently — confirm this is intended.
scaler = MinMaxScaler()
data = pd.read_csv(
    f"{PROJECT_FOLDER}/data/midprice_volume__KO_PEP_NVDA_KSU__val.csv",
    index_col=0,
).to_numpy()
data_scaled = scaler.fit_transform(data)

# Overlapping 100-row windows; start offsets run over range(rows - 100).
data_real_scaled = np.array(
    [data_scaled[start : start + 100] for start in range(len(data) - 100)]
)
n, seq_len, n_features = data_real_scaled.shape

# Alternative representation (disabled): flatten every window instead of averaging.
# data_real_scaled_val = np.reshape(data_real_scaled, newshape=(n, seq_len*n_features))

# Average over the last axis: one vector of length seq_len per window.
data_real_scaled_val = np.mean(data_real_scaled, axis=2)
data_real_scaled_val.shape

In [None]:
# Identifiers of the stored run whose pickles are read below.
RUN_ID_PRICE = "13v3dpxg"
EPOCH_PRICE = 121

# Both pickles live in the same per-run storage folder.
PATH_PICKLE_REAL_PRICE = f"{PROJECT_FOLDER}/storage/thesis-gan/{RUN_ID_PRICE}/reals.pickle"
PATH_PICKLE_PRED_PRICE = (
    f"{PROJECT_FOLDER}/storage/thesis-gan/{RUN_ID_PRICE}/"
    f"preds_epoch={EPOCH_PRICE}-seed=42-target_price=mid_price"
    "-target_volume=None-sampling_seed=599121577.pickle"
)

In [None]:
# Predictions of the stored run: load, transpose, min-max scale, then window
# exactly like the real data.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
scaler = MinMaxScaler()
with open(PATH_PICKLE_PRED_PRICE, "rb") as f:
    data_conv = pickle.load(f)["pred_prices"].T
data_conv_scaled = scaler.fit_transform(data_conv)

# Overlapping 100-row windows; start offsets run over range(rows - 100).
data_conv_scaled = np.array(
    [data_conv_scaled[start : start + 100] for start in range(len(data_conv_scaled) - 100)]
)
n, seq_len, n_features = data_conv_scaled.shape

# Alternative representation (disabled): flatten every window instead of averaging.
# data_conv_scaled = np.reshape(data_conv_scaled, newshape=(n, seq_len*n_features))

# Average over the last axis: one vector of length seq_len per window.
data_conv_scaled = np.mean(data_conv_scaled, axis=2)
data_conv_scaled.shape

In [None]:
# Single t-SNE configuration; the same `tsne` object is used for every
# `tsne.fit(...)` call in the embedding cells below.
tsne = TSNE(
    perplexity=15,
    metric="euclidean",
    n_jobs=8,  # NOTE(review): hard-coded job count — confirm it suits the host machine
    random_state=42,  # fixed seed
    verbose=False,
)

In [None]:
# Sub-folder of f"{PROJECT_FOLDER}/data/timegan/" that the embedding pickles are
# written to and read back from. The cells open files inside it directly and do
# not create it.
dir_saving_embeddings = "tsne_embeddings"

In [None]:
# Embed the TimeGAN samples in 2-D and cache the coordinates on disk.
#
# `tsne.fit` already returns the optimised embedding of the data it was fitted
# on, so that result is used directly. The earlier follow-up call
# `embedding_timegan.transform(data_timegan)` ran openTSNE's out-of-sample
# routine on the same rows: it positions points relative to the existing
# embedding only, ignoring interactions among them, so the saved coordinates
# were not the fitted layout and cost extra compute.
embedding_timegan = tsne.fit(data_timegan)

# Store plain ndarrays so the pickle can be read without openTSNE's array subclass.
embeddings2d_timegan = np.asarray(embedding_timegan)
x_timegan, y_timegan = embeddings2d_timegan[:, 0], embeddings2d_timegan[:, 1]
dict_timegan = {"x": x_timegan, "y": y_timegan}
with open(f"{PROJECT_FOLDER}/data/timegan/{dir_saving_embeddings}/timegan.pickle", "wb") as handle:
    pickle.dump(dict_timegan, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Embed the real training windows in 2-D and cache the coordinates on disk.
#
# `tsne.fit` already returns the optimised embedding of the data it was fitted
# on, so that result is used directly. The earlier follow-up call
# `embedding_real_train.transform(data_real_scaled_train)` ran openTSNE's
# out-of-sample routine on the same rows: it positions points relative to the
# existing embedding only, ignoring interactions among them, so the saved
# coordinates were not the fitted layout and cost extra compute.
embedding_real_train = tsne.fit(data_real_scaled_train)

# Store plain ndarrays so the pickle can be read without openTSNE's array subclass.
embeddings2d_real_train = np.asarray(embedding_real_train)
x_real_train, y_real_train = embeddings2d_real_train[:, 0], embeddings2d_real_train[:, 1]
dict_real_train = {"x": x_real_train, "y": y_real_train}
with open(f"{PROJECT_FOLDER}/data/timegan/{dir_saving_embeddings}/real_train.pickle", "wb") as handle:
    pickle.dump(dict_real_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Embed the real validation windows in 2-D and cache the coordinates on disk.
#
# `tsne.fit` already returns the optimised embedding of the data it was fitted
# on, so that result is used directly. The earlier follow-up call
# `embedding_real_val.transform(data_real_scaled_val)` ran openTSNE's
# out-of-sample routine on the same rows: it positions points relative to the
# existing embedding only, ignoring interactions among them, so the saved
# coordinates were not the fitted layout and cost extra compute.
embedding_real_val = tsne.fit(data_real_scaled_val)

# Store plain ndarrays so the pickle can be read without openTSNE's array subclass.
embeddings2d_real_val = np.asarray(embedding_real_val)
x_real_val, y_real_val = embeddings2d_real_val[:, 0], embeddings2d_real_val[:, 1]
dict_real_val = {"x": x_real_val, "y": y_real_val}
with open(f"{PROJECT_FOLDER}/data/timegan/{dir_saving_embeddings}/real_val.pickle", "wb") as handle:
    pickle.dump(dict_real_val, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Embed the Conv model's prediction windows in 2-D and cache the coordinates on disk.
#
# `tsne.fit` already returns the optimised embedding of the data it was fitted
# on, so that result is used directly. The earlier follow-up call
# `embedding_conv.transform(data_conv_scaled)` ran openTSNE's out-of-sample
# routine on the same rows: it positions points relative to the existing
# embedding only, ignoring interactions among them, so the saved coordinates
# were not the fitted layout and cost extra compute.
embedding_conv = tsne.fit(data_conv_scaled)

# Store plain ndarrays so the pickle can be read without openTSNE's array subclass.
embeddings2d_conv = np.asarray(embedding_conv)
x_conv, y_conv = embeddings2d_conv[:, 0], embeddings2d_conv[:, 1]
dict_conv = {"x": x_conv, "y": y_conv}
with open(f"{PROJECT_FOLDER}/data/timegan/{dir_saving_embeddings}/conv.pickle", "wb") as handle:
    pickle.dump(dict_conv, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Plots

In [None]:
# Reload the cached TimeGAN coordinates so plotting does not need a new t-SNE run.
with open(f"{PROJECT_FOLDER}/data/timegan/{dir_saving_embeddings}/timegan.pickle", "rb") as f:
    dict_timegan = pickle.load(f)
x_timegan = dict_timegan["x"]
y_timegan = dict_timegan["y"]

In [None]:
# Reload the cached real-train coordinates so plotting does not need a new t-SNE run.
with open(f"{PROJECT_FOLDER}/data/timegan/{dir_saving_embeddings}/real_train.pickle", "rb") as f:
    dict_real_train = pickle.load(f)
x_real_train = dict_real_train["x"]
y_real_train = dict_real_train["y"]

In [None]:
# Reload the cached real-validation coordinates so plotting does not need a new t-SNE run.
with open(f"{PROJECT_FOLDER}/data/timegan/{dir_saving_embeddings}/real_val.pickle", "rb") as f:
    dict_real_val = pickle.load(f)
x_real_val = dict_real_val["x"]
y_real_val = dict_real_val["y"]

In [None]:
# Reload the cached Conv coordinates so plotting does not need a new t-SNE run.
with open(f"{PROJECT_FOLDER}/data/timegan/{dir_saving_embeddings}/conv.pickle", "rb") as f:
    dict_conv = pickle.load(f)
x_conv = dict_conv["x"]
y_conv = dict_conv["y"]

In [None]:
# Two-panel scatter: real vs. TimeGAN (left) and real vs. Conv (right).
# NOTE(review): each point cloud comes from its own `tsne.fit` call, so the
# real and synthetic coordinates in a panel do not share one embedding space —
# confirm that overlaying them is the intended comparison.
fig, axes = plt.subplots(1, 2, figsize=(16, 9))
ax_left, ax_right = axes

# Real points are drawn first; the synthetic ones are layered on top with transparency.
ax_left.scatter(x_real_train, y_real_train, alpha=1, c="C0", label="Real")
ax_left.scatter(x_timegan, y_timegan, alpha=0.1, c="C1", label="Synthetic - TimeGAN")
ax_left.set_title("TimeGAN - Train")

ax_right.scatter(x_real_val, y_real_val, alpha=1, c="C2", label="Real")
ax_right.scatter(x_conv, y_conv, alpha=0.5, c="C3", label="Synthetic - Conv")
ax_right.set_title("Conv - Val")

# Settings shared by both panels: blank tick labels and a two-column legend.
for ax in axes:
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.legend(fontsize=16, loc="upper center", ncol=2)

fig.suptitle("Scatter plot using t-SNE")
fig.tight_layout()
plt.show()
# plt.savefig('data/tsne.pdf')
plt.close(fig)