In [2]:
import pandas as pd
import tick.hawkes as hk
import os
import numpy as np

# Order-book depth level passed to the pre-processing step as the base-imbalance level.
BASE_IMBALANCE_LEVEL = 10
# Training-time settings named after the Hawkes, moving-average and Poisson models.
# NOTE(review): units are not visible in this part of the notebook — confirm.
BEST_HAWKES_TRAINING_TIME = 10
BEST_MOVING_AVERAGE_TRAINING_TIME = 60
BEST_POISSON_TRAINING_TIME = 15

# Folder (relative to the working directory) that holds the density CSV.
DATA_FOLDER = 'data_ETH_USDT'
# Directory containing the `orderbook_changes_<timestamp>.tsv` files.
# The default is the original machine-specific location; set the
# ORDERBOOK_DF_PATH environment variable to run the notebook elsewhere
# without editing this cell.
ORDERBOOK_DF_PATH = os.environ.get(
    'ORDERBOOK_DF_PATH',
    'C:\\Users\\Admin\\Desktop\\phd\\hawkes_coe\\bitfinex_orderbook_live\\data_ETH_USDT\\orderbook_changes',
)
DENSITY_FILE_PATH = os.path.join(DATA_FOLDER, 'best_densities_full.csv')


In [None]:
def get_densities_df(path: str) -> pd.DataFrame:
    """Load the density CSV and gather the densities of each timestamp into a list.

    Args:
        path: Location of a CSV file with at least the columns
            ``timestamp`` and ``timestamp_density``.

    Returns:
        A frame with one row per distinct ``timestamp`` and a
        ``timestamp_density`` column holding the list of values that shared
        that timestamp. Any other CSV columns are discarded.
    """
    raw_densities = pd.read_csv(path)
    relevant_columns = raw_densities[['timestamp', 'timestamp_density']]
    per_timestamp = relevant_columns.groupby('timestamp')
    collected = per_timestamp.agg({'timestamp_density': list})
    return collected.reset_index()

def get_starting_event_df(
    orderbook_dfs_path: str, base_imbalance_level: int, timestamp_file: int
) -> pd.DataFrame:
    """Read the order-book dump for one timestamp and pre-process it.

    Args:
        orderbook_dfs_path: Directory that contains the order-book TSV files.
        base_imbalance_level: Order-book level forwarded to
            ``get_preprocessed_df``.
        timestamp_file: Timestamp identifying which TSV file to load.

    Returns:
        The result of ``get_preprocessed_df`` applied to the tab-separated
        file resolved by ``get_orderbook_df_file_path``.
    """
    source_path = get_orderbook_df_file_path(orderbook_dfs_path, timestamp_file)
    raw_orderbook = pd.read_csv(source_path, sep='\t')
    return get_preprocessed_df(raw_orderbook, base_imbalance_level)

def get_orderbook_df_file_path(
    orderbook_dfs_path: str,
    timestamp: str,
) -> str:
    """Resolve the order-book TSV file that belongs to ``timestamp``.

    The regular file ``orderbook_changes_<timestamp>.tsv`` is preferred; if
    it is not listed in the directory, the ``..._interrupted.tsv`` variant
    is tried next.

    Args:
        orderbook_dfs_path: Directory to search.
        timestamp: Value interpolated into the expected file names.

    Returns:
        ``orderbook_dfs_path`` joined with the matching file name.

    Raises:
        FileNotFoundError: Neither file name is present in the directory.
    """
    finished_name = f'orderbook_changes_{timestamp}.tsv'
    if finished_name in os.listdir(orderbook_dfs_path):
        return os.path.join(orderbook_dfs_path, finished_name)

    # Fall back to the dump whose name carries the "_interrupted" suffix.
    interrupted_name = f'orderbook_changes_{timestamp}_interrupted.tsv'
    if interrupted_name in os.listdir(orderbook_dfs_path):
        return os.path.join(orderbook_dfs_path, interrupted_name)

    raise FileNotFoundError(f'No file found for timestamp {timestamp}')

def get_preprocessed_df(df: pd.DataFrame, base_imbalance_orderbook_level: int) -> pd.DataFrame:
    """Derive the return and base-imbalance features from raw order-book rows.

    The input frame is left untouched: the derived columns are built on a
    new frame instead of being written into ``df``.

    Args:
        df: Order-book rows. Must contain ``Timestamp``, ``AskPrice1``,
            ``BidPrice1`` and the ``AskPrice<N>`` / ``BidPrice<N>`` columns
            for ``N = base_imbalance_orderbook_level``.
        base_imbalance_orderbook_level: Order-book level ``N`` whose prices
            are compared with level 1 to compute the imbalance.

    Returns:
        A frame with the columns ``Timestamp``, ``BaseImbalance`` and
        ``Return`` (in that order), keeping the original index labels of the
        rows that survive filtering. Rows where either feature is NaN, or
        where ``Return`` is exactly zero, are removed.

    Raises:
        KeyError: A required column is missing from ``df``.
    """
    mid_price = (df["AskPrice1"] + df["BidPrice1"]) / 2
    # Relative change from this row's mid price to the next row's; the last
    # row has no successor, so its value is NaN and it is dropped below.
    forward_return = (-mid_price + mid_price.shift(-1)) / mid_price

    pbid = df["BidPrice1"] - df[f"BidPrice{base_imbalance_orderbook_level}"]
    pask = df[f"AskPrice{base_imbalance_orderbook_level}"] - df["AskPrice1"]
    base_imbalance = (pbid - pask) / (pbid + pask)

    # `assign` returns a new frame, so the caller's `df` gains no columns.
    features = df[['Timestamp']].assign(
        BaseImbalance=base_imbalance,
        Return=forward_return,
    )
    features = features.dropna(subset=['Return', 'BaseImbalance'])
    features = features[features['Return'] != 0]

    return features

def get_attack_times(
    orderbook_df: pd.DataFrame, start_time_training: pd.Timestamp, end_time_simulation: pd.Timestamp, prediction_period_duration: pd.Timedelta
) -> np.ndarray:
    """Return the event times of the training/simulation window as float seconds.

    The window runs from ``start_time_training`` up to
    ``end_time_simulation + prediction_period_duration`` (both inclusive).
    One extra event, the first one located after the window, is appended at
    the end of the result.

    Args:
        orderbook_df: Frame with a ``Timestamp`` column holding epoch
            milliseconds.
        start_time_training: Lower bound of the window.
        end_time_simulation: End of the simulation period.
        prediction_period_duration: Extra span added after
            ``end_time_simulation`` to form the upper bound.

    Returns:
        1-D float array of epoch seconds (millisecond resolution): the
        events inside the window followed by the first event after it.

    Raises:
        IndexError: No event lies after the end of the window.

    NOTE(review): the appended event is the first one in row order, so this
    presumably expects ``Timestamp`` to be ascending — confirm.
    """
    event_times = pd.to_datetime(
        orderbook_df['Timestamp'], unit='ms'
    ).to_numpy()

    window_end = end_time_simulation + prediction_period_duration

    events_after_window = event_times[event_times > window_end]
    if events_after_window.size == 0:
        # Same exception type as the bare `[0]` lookup, but self-explanatory.
        raise IndexError(
            f'No event found after the end of the simulation window ({window_end})'
        )

    inside_window = (event_times >= start_time_training) & (event_times <= window_end)
    attack_times = np.append(event_times[inside_window], events_after_window[0])

    # Truncate to milliseconds, then express as seconds since the epoch.
    return attack_times.astype('datetime64[ms]').astype(float) / 1000

def get_training_attack_times(
    attack_times: np.ndarray, start_time_simulation_timestamp: float, start_time_training_timestamp: float
) -> np.ndarray:
    """Select the events up to the simulation start, shifted to the training start.

    Args:
        attack_times: Array of event times.
        start_time_simulation_timestamp: Inclusive upper bound; later events
            are left out.
        start_time_training_timestamp: Value subtracted from every kept
            event time.

    Returns:
        The kept event times minus ``start_time_training_timestamp``.
    """
    is_training_event = attack_times <= start_time_simulation_timestamp
    training_events = attack_times[is_training_event]
    return training_events - start_time_training_timestamp



In [None]:
# One row per `timestamp`, with `timestamp_density` holding the list of
# density values recorded for that timestamp in the density CSV.
best_densities_df = get_densities_df(DENSITY_FILE_PATH)

for row in best_densities_df.itertuples():
    timestamp_file = row.timestamp
        
    for timestamp_start_simulation in row.timestamp_density:
        orderbook_df = get_starting_event_df(
            ORDERBOOK_DF_PATH, BASE_IMBALANCE_LEVEL, timestamp_file
        )

        start_time_simulation = pd.Timestamp(timestamp_start_simulation, unit='s')
