In [3]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import statistics
from scipy.stats import pearsonr
import networkx as nx
from tqdm import tqdm
import requests
from datetime import datetime

# Feature Engineering

In [17]:
# Load the two input tables from parquet files.
# NOTE(review): both paths are absolute and point into one user's Windows profile,
# so the notebook cannot run on another machine without editing them.
joined_df = pd.read_parquet(r'C:\Users\Эвелина Новикова\joined_df.parquet')
sol = pd.read_parquet(r'C:\Users\Эвелина Новикова\Documents\Downloads\binance-futures_book_snapshot_25_2024-11-01_SOLUSDT.parquet')

## LOB Features

### 1. Mid price (lagged)

In [25]:
# Use the 'SOL' column of joined_df as the mid-price series.
# NOTE(review): the section title says "lagged", but no shift is applied in this
# cell — confirm whether joined_df['SOL'] is already lagged upstream.
mid = joined_df['SOL']
# Displayed value: number of missing entries in the series.
mid.isna().sum()

0

### 2. Spread

In [19]:
# Convert 'local_timestamp' (interpreted with unit='us') to datetimes and make
# it the index of the snapshot frame.
sol = (
    sol
    .assign(local_timestamp=pd.to_datetime(sol['local_timestamp'], unit='us'))
    .set_index('local_timestamp')
)

In [21]:
# Level-0 bid price on a 100 ms grid: forward-fill from the snapshots, then
# discard grid points that still have no value.
resampled_bids = (
    sol['bids[0].price']
    .resample('100ms')
    .ffill()
    .dropna()
)

In [23]:
# Level-0 ask price on a 100 ms grid: forward-fill from the snapshots, then
# discard grid points that still have no value.
resampled_asks = (
    sol['asks[0].price']
    .resample('100ms')
    .ffill()
    .dropna()
)

In [27]:
# Spread: level-0 ask price minus level-0 bid price on the 100 ms grid
# (the subtraction aligns the two series on their index).
spread = resampled_asks - resampled_bids
# Displayed value: number of missing entries after alignment.
spread.isna().sum()

0

### 3. Отношение объёмов

In [29]:
# Start both per-side volume accumulators at zero for every snapshot row.
sol['ask_vol'] = 0
sol['bid_vol'] = 0

In [31]:
# Total resting amount over all 25 book levels, per side.
# Columns are selected by name (the same "asks[i].amount" / "bids[i].amount"
# naming used by calc_imbalance) rather than by position, so the result does not
# depend on the column order of the parquet file or on set_index having removed
# a column. Assigning the sum (instead of accumulating with +=) also makes the
# cell safe to re-run without double counting.
n_levels = 25
ask_amount_cols = [f"asks[{i}].amount" for i in range(n_levels)]
bid_amount_cols = [f"bids[{i}].amount" for i in range(n_levels)]
# skipna=False keeps the NaN-propagating behaviour of element-wise addition.
sol['ask_vol'] = sol[ask_amount_cols].sum(axis=1, skipna=False)
sol['bid_vol'] = sol[bid_amount_cols].sum(axis=1, skipna=False)

In [33]:
# Per-side total volume on the 100 ms grid: forward-fill from the snapshots and
# drop grid points that still have no value.
resampled_asks_vol = sol['ask_vol'].resample('100ms').ffill().dropna()
resampled_bids_vol = sol['bid_vol'].resample('100ms').ffill().dropna()

In [35]:
# Normalised difference of bid-side and ask-side volume: (bid - ask) / (bid + ask).
vol_ratio = (resampled_bids_vol - resampled_asks_vol)/(resampled_bids_vol + resampled_asks_vol)
# Displayed value: number of missing entries in the ratio.
vol_ratio.isna().sum()

0

### 4. Скорость изменения mid price

In [37]:
# First difference of the mid series (row-to-row change); the first element is NaN.
diff_mid = mid.diff()

In [39]:
# Back-fill missing entries of the differenced series from the next valid value.
diff_mid = diff_mid.bfill()

In [41]:
# Displayed value: number of missing entries left in diff_mid.
diff_mid.isna().sum()

0

### 5. Волатильность цены

In [43]:
# Rolling standard deviation of mid over a window of 10 consecutive rows.
sigma = mid.rolling(window=10).std()

In [45]:
# Back-fill the missing entries of the rolling std from the next valid value.
sigma = sigma.bfill()

In [47]:
# Displayed value: number of missing entries left in sigma.
sigma.isna().sum()

0

### 6. Imbalance

In [49]:
def calc_imbalance(df, lvl_count):
    """Return the bid/ask amount imbalance over the first ``lvl_count`` levels.

    For levels ``0 .. lvl_count - 1`` the columns ``bids[i].amount`` and
    ``asks[i].amount`` of ``df`` are summed per side, and the result is
    ``(bids - asks) / (bids + asks)`` computed element-wise.
    """
    bid_total = 0
    ask_total = 0
    for level in range(lvl_count):
        bid_total = bid_total + df[f"bids[{level}].amount"]
        ask_total = ask_total + df[f"asks[{level}].amount"]
    return (bid_total - ask_total) / (bid_total + ask_total)

In [51]:
# Amount imbalance over the first 3 levels and over level 0 only.
imbalance_3_lvl = calc_imbalance(sol, 3)
imbalance_1_lvl = calc_imbalance(sol, 1)

In [53]:
# Store both imbalance series as columns of the snapshot frame.
sol['imb_3_lvl'] = imbalance_3_lvl
sol['imb_1_lvl'] = imbalance_1_lvl

In [55]:
# Amount imbalance over the first 5 levels, also stored as a column of sol.
imbalance_5_lvl = calc_imbalance(sol, 5)
sol['imb_5_lvl'] = imbalance_5_lvl

In [57]:
# 5-level imbalance on a 100 ms grid, forward-filled from the snapshots.
resampled_imb_5 = sol['imb_5_lvl'].resample('100ms').ffill()

In [59]:
# Back-fill the remaining missing entries from the next valid value.
resampled_imb_5 = resampled_imb_5.bfill()

In [61]:
# Displayed value: number of missing entries left in resampled_imb_5.
resampled_imb_5.isna().sum()

0

### 7. Книжный импульс

In [63]:
# Book impulse: first difference of the resampled 5-level imbalance; the first element is NaN.
impulse = resampled_imb_5.diff()

In [65]:
# Back-fill missing entries of the impulse series from the next valid value.
impulse = impulse.bfill()

In [67]:
# Displayed value: number of missing entries left in impulse.
impulse.isna().sum()

0

### 8. Volume-Weighted Average Price, VWAP

In [74]:
# Snapshot-level mid price: average of the level-0 ask and bid prices.
# The vwap() function reads this 'mid' column.
sol['mid'] = (sol['asks[0].price'] + sol['bids[0].price'])/2

In [76]:
def vwap(df, lvl_count):
    """Return the amount-weighted book price over ``lvl_count`` levels, relative to mid.

    For levels ``0 .. lvl_count - 1`` on both sides, ``price * amount`` is
    summed and divided by the summed ``amount``; the result is then divided by
    the mid price, element-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``asks[i].price``, ``asks[i].amount``, ``bids[i].price``
        and ``bids[i].amount`` for every used level ``i``. If a ``mid`` column
        is present it is used as the mid price; otherwise the mid price is
        computed as the average of ``asks[0].price`` and ``bids[0].price``, so
        the function no longer depends on another cell having added ``mid``.
    lvl_count : int
        Number of book levels to include per side.

    Returns
    -------
    pandas.Series
        Weighted price divided by the mid price, indexed like ``df``.
    """
    levels = range(lvl_count)

    ask_weighted_price = sum(
        df[f"asks[{i}].price"] * df[f"asks[{i}].amount"] for i in levels
    )
    ask_volume = sum(df[f"asks[{i}].amount"] for i in levels)

    bid_weighted_price = sum(
        df[f"bids[{i}].price"] * df[f"bids[{i}].amount"] for i in levels
    )
    bid_volume = sum(df[f"bids[{i}].amount"] for i in levels)

    # Named book_vwap so the local does not shadow the function's own name.
    book_vwap = (ask_weighted_price + bid_weighted_price) / (ask_volume + bid_volume)

    if "mid" in df.columns:
        mid_price = df["mid"]
    else:
        mid_price = (df["asks[0].price"] + df["bids[0].price"]) / 2

    return book_vwap / mid_price

In [78]:
# Mid-relative weighted book price over the first 3 levels and over level 0 only.
vwap_3_lvl = vwap(sol, 3)
vwap_1_lvl = vwap(sol, 1)

In [80]:
# Store both series as columns of the snapshot frame.
sol['vwap_3_lvl'] = vwap_3_lvl
sol['vwap_1_lvl'] = vwap_1_lvl

In [82]:
# Mid-relative weighted book price over the first 5 levels, also stored as a column of sol.
vwap_5_lvl = vwap(sol, 5)
sol['vwap_5_lvl'] = vwap_5_lvl

In [84]:
# 5-level weighted-price ratio on a 100 ms grid, forward-filled from the snapshots.
# NOTE(review): unlike resampled_imb_5 (back-filled) and the resampled price/volume
# series (dropna), no NaN handling or NaN check follows here — confirm that any
# leading missing values are acceptable downstream.
resampled_vwap_5 = sol['vwap_5_lvl'].resample('100ms').ffill()

### 9. Отношение скоростей прихода новых ордеров на покупку и продажу

In [86]:
# Grid-step change in total volume on each side, with missing entries
# back-filled from the next valid value.
diff_bids = resampled_bids_vol.diff().bfill()
diff_asks = resampled_asks_vol.diff().bfill()
# Normalised difference of the two changes: (bid - ask) / (bid + ask).
diff_ratio = (diff_bids - diff_asks)/(diff_bids + diff_asks)

In [88]:
# A zero denominator produces NaN (0/0) or +/-inf (x/0). fillna alone would
# leave the infinities in the feature table, so map them to NaN first and
# then replace every non-finite entry with 0.
diff_ratio = diff_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

In [90]:
# Features derived from the order-book snapshots, all on the 100 ms resample grid.
# Building the frame from a dict of Series aligns them on their indexes.
fts_need_correct = pd.DataFrame({'spread' : spread,
                               'vol_ratio' : vol_ratio,
                               'imb_5' : resampled_imb_5,
                               'impulse' : impulse,
                               'vwap_5' : resampled_vwap_5,
                               'diff_ratio' : diff_ratio})

In [92]:
# Features derived from joined_df['SOL']; they share joined_df's index.
fts_no_correct = pd.DataFrame({'mid' : mid,
                             'diff_mid' : diff_mid,
                             'sigma' : sigma})

In [94]:
# Left join on the index: keep every row of fts_no_correct and attach the
# snapshot-derived features where the index values match (NaN otherwise).
lob_fts = fts_no_correct.join(fts_need_correct, how = 'left')

## Trades Features

### 10. Соотношение рыночных ордеров на продажу к лимитным

In [96]:
# Resampled total ask-side and bid-side volume, gathered into one frame for joining.
vol = pd.DataFrame({'asks_vol': resampled_asks_vol,
                   'bids_vol': resampled_bids_vol})

In [98]:
# Left join on the index: joined_df rows with the resampled side volumes attached.
new_joined_df = joined_df.join(vol, how='left')

In [100]:
# Ratio of 'volume_s' to the resting ask-side volume.
# The denominator used to be 'bids_vol' — the same one vol_bid_rat uses — which
# left the joined 'asks_vol' column unused and contradicted this variable's name.
# NOTE(review): assumes the intended pairing is sell-vs-ask and buy-vs-bid; if the
# intent is "volume consumed from the opposite side", swap the denominators instead.
vol_ask_rat = new_joined_df['volume_s']/new_joined_df['asks_vol']

### 11. Соотношение рыночных ордеров на покупку к лимитным

In [102]:
# Ratio of 'volume_b' to the resting bid-side volume.
vol_bid_rat = new_joined_df['volume_b']/new_joined_df['bids_vol']

### 12. Order Flow Imbalance

In [104]:
# Order flow imbalance: (volume_b - volume_s) / (volume_b + volume_s), element-wise.
ofi = (new_joined_df['volume_b'] - new_joined_df['volume_s'])/(new_joined_df['volume_b'] + new_joined_df['volume_s'])

In [106]:
# Replace missing values (e.g. 0/0 when both volumes are zero) with 0.
ofi = ofi.fillna(0)

### Собираем все фичи в один датасет

In [108]:
# Trade-based features gathered into one frame (aligned on their shared index).
trades_fts = pd.DataFrame({'ofi': ofi,
                          'vol_ask_rat': vol_ask_rat,
                          'vol_bid_rat': vol_bid_rat})

In [110]:
# Left join on the index: order-book features with the trade features attached.
fts = lob_fts.join(trades_fts, how = 'left')

In [112]:
# Append the other assets' columns from joined_df under lower-case names.
fts = fts.assign(
    eth=joined_df['ETH'],
    btc=joined_df['BTC'],
    pepe=joined_df['PEPE'],
    mew=joined_df['MEW'],
)

In [114]:
# Convert the feature frame to an Arrow table and write it to 'fts.parquet'
# in the current working directory.
fts_table = pa.Table.from_pandas(fts)
pq.write_table(table=fts_table, where='fts.parquet')