In [2]:
# Standard
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from arch.univariate import GARCH, ConstantMean, Normal

# Custom
from lib.preprocessing import get_valid_df, build_data_dict_from_df, build_data_dict_from_arr

# Spec
# Three boundary dates shared by the preprocessing helpers: they are passed to
# get_valid_df and build_data_dict_from_df further down.
# NOTE(review): presumably the train/validation/test split boundaries --
# confirm against lib.preprocessing.
split_date_list = pd.to_datetime(
    [
        '2014-01-01',
        '2020-01-01',
        '2022-01-01',
    ]
)

# Preprocess Refinitiv

In [3]:
# Load data
# `dataset` selects which raw dump under data/cdm/ is preprocessed; the
# MID_PRICE filter below only applies when it is set to '500_fx'.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load dumps from a trusted source.
dataset = '18k_stock'
with open(f'data/cdm/{dataset}.pkl', 'rb') as f:
    df_list = pickle.load(f)
if dataset == '500_fx':
    # Keep only the FX frames that carry a MID_PRICE column.
    df_list = [df for df in df_list if 'MID_PRICE' in df.columns]

# Frames for which get_valid_df returns None are dropped.
df_list_valid = [get_valid_df(df, split_date_list) for df in df_list]
df_list_valid = [df for df in df_list_valid if df is not None]

# Shuffle in place with a fixed seed so the saved order is reproducible.
np.random.seed(0)
np.random.shuffle(df_list_valid)

# Use a context manager so the handle is flushed and closed deterministically.
with open(f'data/source/{dataset}.pkl', 'wb') as f:
    pickle.dump(df_list_valid, f)

# GARCH Simulation

In [28]:
def sample_parameters(alpha_max=0.4):
    """Draw one random (omega, alpha, beta) parameter triple.

    The three values are sampled, in this order, from NumPy's global random
    state (seed it with ``np.random.seed`` for reproducible draws):

    * omega ~ U(0.01, 0.10)
    * alpha ~ U(0.1, alpha_max)
    * beta  ~ U(1 - alpha_max, 1 - alpha), i.e. the upper bound depends on the
      alpha just drawn so that alpha + beta stays below 1.

    Parameters
    ----------
    alpha_max : float, default 0.4
        Upper bound for alpha; also fixes the lower bound of beta.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, 3) holding ``[omega, alpha, beta]``.
    """
    draw = np.random.uniform

    intercept = draw(0.01, 0.10)
    arch_coef = draw(0.1, alpha_max)

    # The beta range is tied to alpha_max (floor) and the sampled alpha (ceiling).
    beta_floor = 1 - alpha_max
    beta_ceiling = 1 - arch_coef
    garch_coef = draw(beta_floor, beta_ceiling)

    return np.array([[intercept, arch_coef, garch_coef]])

In [None]:
# Simulate GARCH parameters
# Each row of `params` is [0, omega, alpha, beta]: the parameter vector handed
# to ConstantMean.simulate below (the leading 0 is taken to be the constant
# mean, followed by the GARCH(1,1) volatility parameters).
np.random.seed(0)
N = int(2e5)
# Number of leading series / parameter rows that are written to disk.
# NOTE(review): N series are simulated (N * n_obs * 2 float32 values, about
# 6.4 GB) but only the first N_SAVED are saved -- confirm the rest is needed.
N_SAVED = 110000
params = np.zeros((N, 4))

for i in range(N):
    omega = np.random.uniform(0.01, 0.10)
    alpha = np.random.uniform(0.1, 0.3)
    # Upper bound 1 - alpha keeps alpha + beta below 1.
    beta = np.random.uniform(0.7, 1-alpha)
    params[i] = 0, omega, alpha, beta


# Simulate GARCH series
n_obs = 4000
simulated_data = np.zeros((params.shape[0], n_obs, 2), dtype=np.float32)

# One seeded distribution is shared by all simulations, so the series are drawn
# sequentially from a single reproducible random stream.
dist = Normal(seed=0)
# The model specification does not depend on `i`; build it once and reuse it.
vol = GARCH(p=1, o=0, q=1)
repro_mod = ConstantMean(None, volatility=vol, distribution=dist)
for i in range(params.shape[0]):
    # Keep the first two columns of the simulated frame
    # (presumably the simulated data and its volatility -- confirm in arch docs).
    simulated_data[i] = repro_mod.simulate(params[i], n_obs).values[:, :2]

# Context managers guarantee the dumps are flushed and the handles closed.
with open('data/source/110k_garch.pkl', 'wb') as f:
    pickle.dump(simulated_data[:N_SAVED], f)
# Path kept byte-identical to the original; the doubled slash resolves to the
# same file as 'data/source/110k_garch_params.pkl'.
with open('data/source//110k_garch_params.pkl', 'wb') as f:
    pickle.dump(params[:N_SAVED], f)

# Convert to tensor

In [3]:
# For each dataset: load the preprocessed source pickle, build its data dict
# with the matching lib.preprocessing helper, and write it to data/training/.
# Files are opened with context managers so every handle is closed (and every
# dump flushed) deterministically.
# NOTE: 'data/source/500_fx.pkl' is only written by the preprocessing cell when
# it is run with dataset = '500_fx'; run that cell for both datasets first.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.

# FX frames -> training dict
with open('data/source/500_fx.pkl', 'rb') as f:
    fx_list = pickle.load(f)
fx_dict = build_data_dict_from_df(fx_list, split_date_list)
with open('data/training/500_fx.pkl', 'wb') as f:
    pickle.dump(fx_dict, f)

# Stock frames -> training dict
with open('data/source/18k_stock.pkl', 'rb') as f:
    stock_list = pickle.load(f)
stock_dict = build_data_dict_from_df(stock_list, split_date_list)
with open('data/training/18k_stock.pkl', 'wb') as f:
    pickle.dump(stock_dict, f)

# Simulated GARCH array -> training dict (built from an array, no split dates)
with open('data/source/110k_garch.pkl', 'rb') as f:
    garch_arr = pickle.load(f)
garch_dict = build_data_dict_from_arr(garch_arr)
with open('data/training/110k_garch.pkl', 'wb') as f:
    pickle.dump(garch_dict, f)