In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm

In [None]:
# Column names assigned to the trade CSV when it is loaded, in file order.
_col_ = [
    'eventTime',
    'tradeId',
    'price',
    'quantity',
    'buyerId',
    'sellerId',
    'tradeTime',
    'isBuyer',
]

In [None]:
# Load the raw trades. Because column names are supplied, the first line of the
# file is read as data (header=None is what pandas infers when names is given).
txn_df = pd.read_csv(
    "./data/APEUSDT-20220920172734.csv",
    header=None,
    names=_col_,
)

In [None]:
# Show the first two rows of the loaded trades.
txn_df.head(n=2)

In [None]:
# seller orders are matched due to buyer's action: every fill sharing
# (sellerId, eventTime) is collapsed into one order with its lowest fill price
# and its total quantity.
# Aggregators are given as strings: passing the builtins min/max/sum to
# groupby().agg is deprecated in recent pandas and emits a FutureWarning.
sell_order_df = (
    txn_df[txn_df['isBuyer']]
    .groupby(['sellerId', 'eventTime'])
    .agg({'price': 'min', 'quantity': 'sum'})
)

# buyer orders are matched due to seller's action: same collapse keyed by
# (buyerId, eventTime), keeping the highest fill price.
buy_order_df = (
    txn_df[~txn_df['isBuyer']]
    .groupby(['buyerId', 'eventTime'])
    .agg({'price': 'max', 'quantity': 'sum'})
)

In [None]:
# Order both frames by eventTime (index level 1) so each line is drawn in time order.
sell_order_df = sell_order_df.sort_index(level=1)
buy_order_df = buy_order_df.sort_index(level=1)

# Quantity per aggregated order over time: sell side in red, buy side in green.
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(
    sell_order_df.index.get_level_values(1),
    sell_order_df['quantity'],
    color='r',
    alpha=0.5,
)
ax.plot(
    buy_order_df.index.get_level_values(1),
    buy_order_df['quantity'],
    color='g',
    alpha=0.5,
)
plt.show()

In [None]:
# One figure per side (sell in red, then buy in green): order price against
# eventTime. The very low alpha makes point density visible where orders pile up.
for side_df, side_color in ((sell_order_df, 'r'), (buy_order_df, 'g')):
    plt.figure(figsize=(20, 5))
    plt.scatter(
        side_df.index.get_level_values(1),
        side_df['price'],
        alpha=0.005,
        marker='.',
        color=side_color,
    )
    plt.show()

In [None]:
# Snapshot parameters.
test_time = 1663893105429   # centre of the inspected window (same unit as eventTime)
time_interval = 100000      # half-width of the window, in eventTime units
price_unit = 0.001          # spacing between neighbouring price buckets
price_str_fmt = "{:.3f}"    # bucket label; prices are matched to buckets by this string
price_tick_cnt = 5          # number of buckets on each side of the reference price

# NOTE(review): this picks the first trade *after* test_time although the message
# says "most recent" (getDataSet uses the last trade *before* its anchor time) —
# confirm which one is intended here.
recent_price = txn_df[txn_df['eventTime'] > test_time].iloc[0]['price']
print (f"most recent trade at price {recent_price}")
price_range_min = recent_price - price_tick_cnt * price_unit
price_range_max = recent_price + price_tick_cnt * price_unit
start_time = test_time - time_interval
end_time = test_time + time_interval

# Bucket labels are built from integer tick offsets. A float-step np.arange can
# return one element too many when rounding nudges the stop value, and comparing
# prices against float bounds can drop the outermost bucket; matching on the
# formatted label avoids both problems.
price_labels = [
    price_str_fmt.format(recent_price + tick * price_unit)
    for tick in range(-price_tick_cnt, price_tick_cnt + 1)
]


def _weighted_window(order_df):
    """Return the orders of order_df inside [start_time, end_time) whose price
    falls on one of price_labels, with time-weighted quantity columns added."""
    event_t = order_df.index.get_level_values(1)
    window_df = order_df[(event_t >= start_time) & (event_t < end_time)].copy()
    # Weight is measured against test_time, so it falls linearly from 2.0 at
    # start_time towards 0.0 at end_time.
    window_t = window_df.index.get_level_values(1)
    window_df['_time_weight'] = 1.0 - (window_t - test_time) / time_interval
    window_df['_time_weighted_q'] = window_df['quantity'] * window_df['_time_weight']
    window_df['_price_str_'] = window_df['price'].map(price_str_fmt.format)
    return window_df[window_df['_price_str_'].isin(price_labels)].copy()


sub_sell_df = _weighted_window(sell_order_df)
sub_buy_df = _weighted_window(buy_order_df)

# One row per price bucket; buckets without any order get 0.0 on that side.
clean_df = pd.DataFrame({'_price_str_': price_labels})
for side_df, side_col in ((sub_sell_df, '_sell_q_'), (sub_buy_df, '_buy_q_')):
    side_total = side_df.groupby('_price_str_')['_time_weighted_q'].sum().rename(side_col)
    clean_df = clean_df.merge(side_total, left_on='_price_str_', right_index=True, how='left')
clean_df = clean_df.fillna(0.0)

# plot
ax = clean_df.plot(x='_price_str_', kind='bar', figsize=(20, 5), alpha=0.5)
ax.set_xlabel('price')
ax.set_ylabel('time-weighted quantity')
plt.show()

In [None]:
def getFeature(start_time, end_time, df, anchor_price, price_tick_cnt = 5, price_unit = 0.001, price_str_fmt = "{:.3f}", flag_time_weighted = True):
    '''
    Sum order quantity per price bucket around anchor_price inside a time window.

    Calculate feature/label
    feature: flag_time_weighted = False
    label: flag_time_weighted = True

    Parameters
    ----------
    start_time, end_time : window bounds compared against index level 1 of df.
        The window is half-open, [start_time, end_time), so that back-to-back
        windows (feature window ending where the label window starts) neither
        drop nor double count an order that sits exactly on the boundary.
    df : can be either sell_df or buy_df; needs 'price' and 'quantity' columns
        and a two-level index whose second level is the event time.
    anchor_price : price at the centre of the bucket grid.
    price_tick_cnt : number of buckets on each side of anchor_price.
    price_unit : spacing between neighbouring buckets.
    price_str_fmt : format string turning a price into its bucket label; an
        order belongs to a bucket when its formatted price equals the label.
    flag_time_weighted : if True, each quantity is multiplied by
        1 - (event_time - start_time) / (end_time - start_time + 1).

    Returns
    -------
    numpy.ndarray of floats with 2 * price_tick_cnt + 1 entries, ordered from
    the lowest to the highest bucket; buckets without orders hold 0.0.
    '''
    # The +1 keeps the weight strictly positive for every event inside the window.
    time_interval = end_time - start_time + 1

    # Bucket labels from integer tick offsets: a float-step np.arange may return
    # one element too many or too few, and float range bounds can exclude the
    # outermost tick (anchor + k * unit is not always exactly the tick value).
    tick_offsets = np.arange(-price_tick_cnt, price_tick_cnt + 1)
    price_labels = [price_str_fmt.format(anchor_price + tick * price_unit) for tick in tick_offsets]

    event_time = df.index.get_level_values(1)
    sub_df = df[(event_time >= start_time) & (event_time < end_time)]
    if sub_df.empty:
        # Nothing traded in the window: every bucket is zero.
        return np.zeros(len(price_labels), dtype=float)

    quantity = sub_df['quantity'].to_numpy(dtype=float)
    if flag_time_weighted:
        elapsed = sub_df.index.get_level_values(1).to_numpy() - start_time
        quantity = quantity * (1.0 - elapsed / time_interval)

    # Group on the formatted price; prices outside the grid simply have no
    # matching label and are discarded by the reindex below.
    bucket_totals = (
        pd.DataFrame({
            '_price_str_': [price_str_fmt.format(price) for price in sub_df['price']],
            '_tw_q_': quantity,
        })
        .groupby('_price_str_')['_tw_q_']
        .sum()
    )
    return bucket_totals.reindex(price_labels, fill_value=0.0).to_numpy(dtype=float)

def getDataSet(txn_df, time_X, time_y, time_moving_step = 10, flag_norm = True):
    '''
    Prepare training dataset with moving window
    |----- time_X -----|--- time_y ---|
    |    buy/sell df (anchor) buy/sell|
    - step --> |----- time_X -----|--- time_y ---|

    Parameters
    ----------
    txn_df : trade frame with 'eventTime', 'price', 'quantity', 'buyerId',
        'sellerId' and a boolean 'isBuyer' column.
    time_X : length of the feature window that ends at each anchor time.
    time_y : length of the label window that starts at each anchor time.
    time_moving_step : distance between consecutive anchor times.
    flag_norm : if True, min-max scale every sample with the min/max of its own
        feature vector; the labels are scaled with the same two values.

    Returns
    -------
    (data_X, data_y_sell, data_y_buy) : three lists of numpy arrays, one entry
    per anchor time. data_X entries are the sell-side features followed by the
    buy-side features; the label entries are the time-weighted sell / buy
    quantities of the following window.
    '''
    # valid time range for dataset
    max_data_time = txn_df['eventTime'].max() - time_y
    min_data_time = txn_df['eventTime'].min() + time_X

    # Aggregators are passed as strings; handing the builtins min/max/sum to
    # groupby().agg is deprecated in recent pandas.
    is_buyer = txn_df['isBuyer']
    # seller orders are matched due to buyer's action
    sell_order_df = txn_df[is_buyer].groupby(['sellerId', 'eventTime']).agg({'price': 'min', 'quantity': 'sum'})
    # buyer orders are matched due to seller's action
    buy_order_df = txn_df[~is_buyer].groupby(['buyerId', 'eventTime']).agg({'price': 'max', 'quantity': 'sum'})

    # Anchor lookup table: sorting once (stable, so rows with equal eventTime
    # keep file order) lets every step find "the last trade before t" with a
    # binary search instead of filtering the whole frame.
    ordered_txn = txn_df.sort_values('eventTime', kind='mergesort')
    event_times = ordered_txn['eventTime'].to_numpy()
    trade_prices = ordered_txn['price'].to_numpy()

    # main loop
    data_X = []
    data_y_sell = []
    data_y_buy = []
    for datapoint_t in tqdm.tqdm(range(min_data_time, max_data_time, time_moving_step)):
        # anchor price: price of the last trade strictly before datapoint_t
        n_before = np.searchsorted(event_times, datapoint_t, side='left')
        if n_before == 0:
            # No earlier trade to anchor on (only possible when time_X <= 0).
            continue
        anchor_price = trade_prices[n_before - 1]

        sell_X = getFeature(datapoint_t - time_X, datapoint_t, sell_order_df, anchor_price, flag_time_weighted = False)
        buy_X = getFeature(datapoint_t - time_X, datapoint_t, buy_order_df, anchor_price, flag_time_weighted = False)
        sell_y = getFeature(datapoint_t, datapoint_t + time_y, sell_order_df, anchor_price, flag_time_weighted = True)
        buy_y = getFeature(datapoint_t, datapoint_t + time_y, buy_order_df, anchor_price, flag_time_weighted = True)

        X = np.concatenate([sell_X, buy_X])
        if flag_norm:
            min_X = np.min(X)
            span_X = np.max(X) - min_X
            if span_X == 0:
                # Constant feature vector (e.g. no trades in the window): scale
                # by 1 so the sample becomes zeros instead of NaN from 0/0.
                span_X = 1.0
            X = (X - min_X) / span_X
            sell_y = (sell_y - min_X) / span_X
            buy_y = (buy_y - min_X) / span_X
        data_X.append(X)
        data_y_sell.append(sell_y)
        data_y_buy.append(buy_y)
    return data_X, data_y_sell, data_y_buy

In [None]:
# getFeature(start_time, end_time, sell_order_df, 6.053, flag_time_weighted=False)
# getDataSet(txn_df, 200000, 100000, flag_norm=False)

In [None]:
# Build the normalised samples: X holds the features, y1 the sell-side labels
# and y2 the buy-side labels (the order in which getDataSet returns them).
X, y1, y2 = getDataSet(
    txn_df,
    time_X=200000,
    time_y=100000,
    time_moving_step=10000,
    flag_norm=True,
)

In [None]:
# The three arrays are written back to back into a single file, in the order
# X, y1, y2; reading them back takes three successive np.load calls on one handle.
with open("./feature/APE_data.npy", "wb") as f:
    for samples in (X, y1, y2):
        np.save(f, np.array(samples))