In [1]:
%load_ext autoreload
%autoreload 2

import os
import re

from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

from jaref_bot.data.http_api import ExchangeManager, BybitRestAPI
from jaref_bot.utils.files import load_tokens_from_file

import pandas as pd
pd.options.display.float_format = '{:.2f}'.format

import polars as pl
import polars_ols as pls

import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, coint
from itertools import combinations

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm

In [2]:
def get_saved_coins():
    data_folder = os.path.join('.', 'data')
    pattern = re.compile(r'^([^_]+)_agg_trades\.parquet$', re.IGNORECASE)
    coins = set()

    for filename in os.listdir(data_folder):
        file_path = os.path.join(data_folder, filename)
        
        # Проверяем, что это файл и соответствует шаблону
        if os.path.isfile(file_path):
            match = pattern.match(filename)
            if match:
                coin_name = match.group(1)
                coins.add(coin_name.upper())  # Для единообразия приводим к верхнему регистру

    return sorted(coins)

In [3]:
def read_and_scale_parquet(sym, period, start_date, end_date):
    df = pl.read_parquet(f'./data/agg_trades/{sym}_agg_trades.parquet')

    return df.group_by(pl.col("datetime").dt.truncate(period)
        ).agg([
                  pl.col("close").median().alias('price'),
                  pl.col("qty").sum()
              ]
        ).sort(by='datetime'
        ).filter((pl.col('datetime') >= start_date) & (pl.col('datetime') < end_date))

In [4]:
def prepare_normed_df(df):
    return df.with_columns([
        (
          (pl.col("price") - pl.col("price").mean())  # вычитаем среднее
          / pl.col("price").std()                     # делим на стандартное отклонение
        ).alias("normed_price"),
        (
        pl.col("price").log() 
        - pl.col("price").log().first()
        ).alias("log_price")
      ]).select('datetime', 'price', 'normed_price', 'log_price')

#### Скачиваем данные с Bybit

In [5]:
async def get_data(symbol, interval, n_iters):
    df = await exc_manager.get_candles(symbol=symbol, interval=interval, n_iters=n_iters)
    df = df['bybit_linear'].sort_index()
    df['Returns'] = df['Close'].pct_change()
    df['Log_price'] = np.log(df['Close'] + 1)
    df['Log_returns'] = df['Log_price'].pct_change()
    return df.dropna()

In [6]:
exc_manager = ExchangeManager()
exc_manager.add_market("bybit_linear", BybitRestAPI('linear'))

In [7]:
res = await get_data(symbol='ADA_USDT', interval='1h', n_iters=1)
res.shape

(999, 11)

In [8]:
top_1000_tokens = load_tokens_from_file("./data/top_1000_tokens.txt")
tokens_to_del = ['USDT', 'USDC', 'LEO', 'USDe', 'DAI', 'BGB', 'OKB', 'GT', 'FET', 'KCS', 'PYUSD', 'NEXO',
                'XAUt', 'RAY', 'FTT', 'TUSD', 'MATIC', 'NFT', 'AMP', 'MX', 'TFUEL', 'MOG', 'BabyDoge', 'WEMIX',
                'LAYER', 'DCR', 'PI', 'USD0',
                'ALT', 'ANKR', 'ARK', 'AXL', 'BRETT', 'DYDX', 'ID', 'IO', 'LQTY', 'LRC', 'MASK', 'MOVR', 
                 'PYTH', 'RVN', 'T', 'WLD' # Излишне волатильные активы
                ]
token_list = [x + '_USDT' for x in top_1000_tokens if x not in tokens_to_del]

In [9]:
prices = await exc_manager.get_prices()
prices = prices['bybit_linear']
len(prices)

565

In [10]:
interval = '1d'
n_iters = 1
start_date = datetime(2025, 1, 1, tzinfo=ZoneInfo("Europe/Moscow"))

main_df = pd.DataFrame()

token_dict = {'PEPE_USDT': '1000PEPE_USDT', 'SHIB_USDT': 'SHIB1000_USDT', 'FLOKI_USDT': '1000FLOKI_USDT',
             'BONK_USDT': '1000BONK_USDT', 'BTT_USDT': '1000BTT_USDT', 'XEC_USDT': '1000XEC_USDT',
             'LUNC_USDT': '1000LUNC_USDT', 'SATS_USDT': '10000SATS_USDT', 'TURBO_USDT': '1000TURBO_USDT',
             'CAT_USDT': '1000CAT_USDT'}

for token in tqdm(token_list):
    if token in token_dict.keys():
        token = token_dict[token]
    try:
        vol24 = prices.get(token).get('vol24h_usdt')
    except AttributeError:
        vol24 = 0

    if vol24 < 1_000_000:
        continue
    
    df = await get_data(symbol=token, interval=interval, n_iters=n_iters)
    
    if df.empty:
        continue
    df = df[df.index > start_date]

    first_date = df.index[0].floor('D')
    if first_date > pd.Timestamp('2025-01-01', tz='Europe/Moscow'):
        continue
        
    main_df[f'{token[:-5]}_price'] = df['Close']
    main_df[f'{token[:-5]}_return'] = df['Log_returns']

  0%|          | 0/957 [00:00<?, ?it/s]

In [11]:
ret_cols = [col for col in main_df.columns if '_return' in col]
corr_matrix = main_df[ret_cols].corr()

In [12]:
corr_matrix

Unnamed: 0,BTC_return,ETH_return,XRP_return,BNB_return,SOL_return,DOGE_return,TRX_return,ADA_return,HYPE_return,SUI_return,...,MAV_return,L3_return,PORTAL_return,GTC_return,IDEX_return,ZEREBRO_return,CATI_return,HOOK_return,BAKE_return,OG_return
BTC_return,1.00,0.80,0.77,0.67,0.79,0.80,0.48,0.74,0.62,0.76,...,0.60,0.31,0.52,0.63,0.37,0.43,0.38,0.59,0.45,0.17
ETH_return,0.80,1.00,0.73,0.73,0.77,0.86,0.52,0.73,0.59,0.76,...,0.69,0.32,0.56,0.68,0.42,0.44,0.50,0.70,0.50,0.27
XRP_return,0.77,0.73,1.00,0.61,0.77,0.77,0.45,0.84,0.50,0.68,...,0.58,0.23,0.53,0.57,0.36,0.37,0.42,0.55,0.40,0.25
BNB_return,0.67,0.73,0.61,1.00,0.66,0.72,0.49,0.58,0.55,0.64,...,0.65,0.34,0.54,0.63,0.40,0.36,0.47,0.70,0.65,0.25
SOL_return,0.79,0.77,0.77,0.66,1.00,0.78,0.45,0.75,0.60,0.76,...,0.64,0.27,0.52,0.63,0.37,0.42,0.50,0.64,0.43,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEREBRO_return,0.43,0.44,0.37,0.36,0.42,0.45,0.22,0.38,0.31,0.53,...,0.41,0.17,0.34,0.37,0.30,1.00,0.40,0.45,0.27,0.28
CATI_return,0.38,0.50,0.42,0.47,0.50,0.52,0.31,0.41,0.35,0.50,...,0.61,0.18,0.48,0.55,0.34,0.40,1.00,0.60,0.43,0.41
HOOK_return,0.59,0.70,0.55,0.70,0.64,0.71,0.44,0.55,0.56,0.66,...,0.77,0.35,0.66,0.70,0.40,0.45,0.60,1.00,0.69,0.31
BAKE_return,0.45,0.50,0.40,0.65,0.43,0.50,0.35,0.39,0.41,0.49,...,0.56,0.24,0.49,0.52,0.31,0.27,0.43,0.69,1.00,0.24


In [13]:
token = 'BTC'
token_df = corr_matrix[f'{token}_return']
token_df.shape

(192,)

In [14]:
token_df[abs(token_df) < 0.25]

PAXG_return    0.02
ZBCN_return    0.17
OM_return      0.25
BAN_return     0.24
AERGO_return   0.05
OG_return      0.17
Name: BTC_return, dtype: float64

In [15]:
# token_df[abs(token_df) > 0.75]

In [16]:
stacked = corr_matrix.stack().reset_index()
stacked.columns = ['stock1', 'stock2', 'correlation']
filtered_pairs = stacked[(stacked['correlation'] > 0.65) & (stacked['stock1'] < stacked['stock2'])].reset_index(drop=True)

In [17]:
%%time
results = []
n_rows = 300 # Сколько последних записей учитывать

for _, row in filtered_pairs.iterrows():
    stock1 = row['stock1'].replace('return', 'price')
    stock2 = row['stock2'].replace('return', 'price')
    corr_value = row['correlation']
    
    data = main_df[[stock1, stock2]].replace([np.inf, -np.inf], np.nan).dropna()
    data = data[-n_rows:]
    
    if data.empty:
        print(f"Пара {stock1}-{stock2} пропущена, т.к. нет данных после очистки")
        continue
    
    last_price_1 = data[stock1].iloc[-1]
    last_price_2 = data[stock2].iloc[-1]
    diff = last_price_2 / last_price_1
    if diff > 5 or diff < 0.2:
        continue
    
    adf_stat, p_value, _ = coint(data[stock1], data[stock2], trend='c')

    results.append({
        'stock1': stock1,
        'stock2': stock2,
        'correlation': corr_value,
        'adf_stat': adf_stat,
        'p_value': p_value,
    })

result_df = pd.DataFrame(results)

CPU times: total: 30.3 s
Wall time: 22.2 s


In [18]:
# token_1 = 'C98'
# token_2 = 'ENJ'
# toks = (token_1 + '_price', token_2 + '_price')

tokens = ['AKT', 'APT', 'ARB', 'ARKM', 'C98',  'CELO', 'CHR', 'ENJ', 'FIL', 'FLOW', 'GALA', 'GMT', 'GRT', 'GTC',
        'MANA', 'OGN', 'ONDO', 'ONG', 'OP', 'PHA', 'ROSE', 'SAND', 'STG', 'SNX', 'VET']
tok_names = [t+'_price' for t in tokens]

filter_df = pl.DataFrame(result_df).sort(by='p_value').filter(
    pl.col('p_value') < 0.05
).filter(
    (pl.col('stock1').is_in(tok_names)) & (pl.col('stock2').is_in(tok_names))
)

In [19]:
filter_df[:10]

stock1,stock2,correlation,adf_stat,p_value
str,str,f64,f64,f64
"""APT_price""","""FIL_price""",0.8678,-5.134268,9.3e-05
"""FIL_price""","""ONDO_price""",0.793049,-5.084821,0.000115
"""GTC_price""","""OP_price""",0.785636,-5.010067,0.000158
"""CELO_price""","""STG_price""",0.768769,-4.999131,0.000165
"""MANA_price""","""ONDO_price""",0.789312,-4.678131,0.000615
"""ARB_price""","""ONDO_price""",0.769994,-4.656378,0.00067
"""GRT_price""","""VET_price""",0.92622,-4.596883,0.000845
"""ARKM_price""","""ONDO_price""",0.794769,-4.48767,0.001283
"""APT_price""","""ONDO_price""",0.764191,-4.478912,0.001326
"""CELO_price""","""ONDO_price""",0.784902,-4.410593,0.001711


In [20]:
token_list = []

for row in filter_df.iter_rows(named=True):
    t1 = row["stock1"][:-6]
    t2 = row["stock2"][:-6]
    
    token_list.append((t1, t2))
print(token_list)

[('APT', 'FIL'), ('FIL', 'ONDO'), ('GTC', 'OP'), ('CELO', 'STG'), ('MANA', 'ONDO'), ('ARB', 'ONDO'), ('GRT', 'VET'), ('ARKM', 'ONDO'), ('APT', 'ONDO'), ('CELO', 'ONDO'), ('FIL', 'SNX'), ('GMT', 'VET'), ('GRT', 'SAND'), ('GALA', 'GMT'), ('CELO', 'GRT'), ('FLOW', 'SNX'), ('FLOW', 'STG'), ('FLOW', 'GTC'), ('FLOW', 'ONDO'), ('GMT', 'STG'), ('CELO', 'SAND'), ('SAND', 'SNX'), ('OGN', 'SAND'), ('OP', 'SNX'), ('OGN', 'VET')]


In [None]:
cointegrated_df = result_df[result_df['p_value'] < 0.05].sort_values(by='adf_stat').reset_index(drop=True)

In [None]:
coint_df = cointegrated_df[~((cointegrated_df['stock1'].isin(tok_names)) & (cointegrated_df['stock2'].isin(tok_names)))]

In [None]:
coint_df[0:10]

#### Загружаем данные с диска

In [None]:
period = '4h'

downloaded_tokens = get_saved_coins()

df = pl.DataFrame()
for token in downloaded_tokens:
    temp_df = pl.read_parquet(f'./data/{token}_agg_trades.parquet')
    temp_df = temp_df.group_by(pl.col("datetime").dt.truncate(period)
        ).agg(pl.col("close").median().alias('price')
        ).sort(by='datetime')
    temp_df = temp_df.with_columns(token = pl.lit(token))
    df = df.vstack(temp_df)

df = df.pivot(
        values="price",
        index="datetime",
        columns="token",
    ).sort(by='datetime'
    ).filter(
        (pl.col('datetime') >= datetime(2024, 4, 4)) & (pl.col('datetime') <= datetime(2025, 4, 1))
    ).drop('datetime'
)

In [None]:
%%time
start_date = datetime(2024, 4, 1)
end_date = datetime(2025, 2, 1)

trading_pairs = []

for sym_1, sym_2 in combinations(downloaded_tokens, 2):
    corr = df.select(f'{sym_1}', f'{sym_2}').corr()[0, 1]
    adf_stat, p_value, marks = coint(df[f"{sym_1}"], df[f"{sym_2}"], trend='ct')

    price_1 = df[f"{sym_1}"].tail(1).item()
    price_2 = df[f"{sym_2}"].tail(1).item()
    diff = price_2 / price_1
    
    if p_value < 0.05 and corr > 0.9 and diff > 0.2 and diff < 5:
        print(f'Pair: "{sym_1}" - "{sym_2}"')
        print(f'ADF stat: {adf_stat:.2f}, p_value: {p_value:.3f}; corr: {corr:.2f}')
        print(f'{sym_1}. mean: {df[f'{sym_1}'].mean():.4f}; std: {df[f'{sym_1}'].std():.4f}')
        print(f'{sym_2}. mean: {df[f'{sym_2}'].mean():.4f}; std: {df[f'{sym_2}'].std():.4f}')
        print()
        trading_pairs.append((sym_1, sym_2))
    else:
        continue

In [None]:
print(len(trading_pairs))
print(trading_pairs)

In [None]:
df_results = pl.DataFrame()

start_date = datetime(2024, 3, 1)
end_date = datetime(2025, 2, 1)

for sym_1, sym_2 in tqdm(trading_pairs):
    results = []
    for period, roll_wind, dev_in, dev_out in params:
        df, balance_hist = run_simulation(sym_1, sym_2, period, roll_wind, start_date, end_date, dev_in, dev_out, long_sl_dev=5,
                        short_sl_dev=5, balance=1_000, max_order_size=1000, fee_perc=0.00075, verbose=False)
        print(f'{sym_1} - {sym_2}: {period=}, {roll_wind=}, {dev_in=}, {dev_out=}', end=' ')
        metrics = analyze_strategy(balance_hist, start_date, end_date)
        record = {
            'period': period,
            'roll_wind': roll_wind,
            'dev_in': dev_in,
            'dev_out': dev_out
        }
        record.update(metrics)
        results.append(record)
    res_df = pl.DataFrame(results)
    res_df = res_df.with_columns(
        pl.lit(f'{sym_1}').alias('sym_1'),
        pl.lit(f'{sym_2}').alias('sym_2'),
        (pl.col('annual_return') / pl.col('std_return')).alias('norm_return')
    )
    df_results = df_results.vstack(res_df.sort(by='norm_return', descending=True).head(3))


In [None]:
df_results.drop('start_date', 'end_date', 'total_days', 'n_deals', 'initial_balance', 'total_return', 'avg_return',
               'calmar_ratio', 'norm_return', 'profit_factor', 'expected_return', 'avg_usdt_per_deal', 'max_profit',
               'avg_profit', 'avg_loss', 'max_loss', 'winning_trades', 'losing_trades'
               ).group_by(['sym_1', 'sym_2']
                         ).agg(pl.col('annual_return').mean().alias('best_res')
                ).sort(by='best_res', descending=True).head(10)

In [None]:
# "AUDIO" - "LRC"   : 31.866667
# "AUDIO" - "MINA"  : 24.266667
# "MANTA" - "WLD"   : 22.066667
# "SSV" - "TIA"     : 21.933333
# "AUDIO" - "ICX"   : 20.1
# "LRC" - "ONT"     : 16.8
# "MANTA" - "OP"    : 16.2
# "RVN" - "T"       : 15.7
# "TIA" - "WLD"     : 15.5
# "ICX" - "ONT"     : 13.366667
# "T" - "WAXP"      : 13.1
# "FLOW"	"ONT"	12.9
# "MINA"	"OP"	12.2
# "NEAR"	"OP"	11.7
# "RVN"	"WAXP"	11.066667
# "SAND"	"XTZ"	10.433333
# "DYM"	"MINA"	9.266667
# "FLOW"	"ICX"	5.933333

#### Рассчёты

In [None]:
coins = get_saved_coins()

In [None]:
start_date = datetime(2024, 4, 1)
end_date = datetime(2025, 2, 1)
period = '1h'

schema = {
    'datetime': pl.Datetime('ms'),
}

df = pl.DataFrame(schema=schema)
for coin in coins:
    tdf = read_and_scale_parquet(coin, period=period, start_date=start_date, end_date=end_date)
    tdf = prepare_normed_df(tdf).select('datetime', 'normed_price').rename({'normed_price': coin})
    df = df.join(tdf, on='datetime', how='outer', coalesce=True)

In [None]:
from scipy.spatial.distance import pdist, squareform
dist_condensed = pdist(df.drop('datetime').to_pandas().T, metric="euclidean")
dist_matrix_sqaure = squareform(dist_condensed, checks = True)

In [None]:
fig, ax = plt.subplots(1, 1,figsize = (10, 10), dpi=400)
cmap = sns.cubehelix_palette(light=0.9, as_cmap=True)

sns.heatmap(dist_matrix_sqaure, annot=True, fmt=".1f", ax=ax, linewidths=.5, cmap=cmap)
# ax.set_xticklabels(df.columns[1:], rotation = 90, fontsize = 10)
# img = ax.set_yticklabels(df.columns[1:], rotation = 0, fontsize = 10)
ax.set_title(r'$SS_{i,j} Matrix$')
fig.tight_layout()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
fig, ax = plt.subplots(1,1,figsize = (14, 4), dpi = 100)
Z = linkage(dist_condensed)
dn = dendrogram(Z, ax = ax, labels = df.columns[1:])
ax.set_ylabel(r'$SS_{i,j}$')

#### Графики

In [None]:
def read_parquet_and_make_pair_df(sym_1, sym_2, period, roll_wind, start_date, end_date):
    df_1 = read_and_scale_parquet(sym_1, period, start_date=start_date, end_date=end_date)
    df_2 = read_and_scale_parquet(sym_2, period, start_date=start_date, end_date=end_date)

    df = df_1.join(df_2, on='datetime', suffix=f'_{sym_2}', how='full'
    ).rename({'price': f'price_{sym_1}', 'qty': f'qty_{sym_1}'}
    ).drop(f'datetime_{sym_2}'
    ).with_columns(
    pl.col(f"price_{sym_2}")
      .least_squares.rolling_ols(
          f"price_{sym_1}",
          window_size=roll_wind,
          mode="coefficients",
          add_intercept=True
    ).alias("regression_coef").shift(1)
    ).with_columns([
    pl.col("regression_coef").struct.field(f"price_{sym_1}").alias("beta"),
    pl.col("regression_coef").struct.field("const").alias("alpha")
    ]).drop('regression_coef'
    ).with_columns(
        (pl.col(f'price_{sym_2}') - (pl.col('alpha') + pl.col('beta') * pl.col(f'price_{sym_1}'))
        ).alias('spread')
    ).with_columns(
        pl.col('spread').rolling_mean(window_size=roll_wind).alias('mean'),
        pl.col('spread').rolling_std(window_size=roll_wind).alias('std')
    )
    
    return df.filter((pl.col('datetime') >= start_date) & (pl.col('datetime') < end_date))

In [None]:
start_date = datetime(2025, 2, 5)
end_date = datetime(2025, 7, 25)

sym_1 = 'STRK'
sym_2 = 'XAI'

In [None]:
df = read_parquet_and_make_pair_df(sym_1, sym_2, period='1d', roll_wind=10, start_date=start_date, end_date=end_date)
print(f'{sym_1}. mean: {df[f"price_{sym_1}"].mean():.4f}; std: {df[f"price_{sym_1}"].std():.4f}')
print(f'{sym_2}. mean: {df[f"price_{sym_2}"].mean():.4f}; std: {df[f"price_{sym_2}"].std():.4f}')

In [None]:
df.tail(3)

In [None]:
df.drop_nulls()[[f'price_{sym_1}', f'price_{sym_2}']].corr()

In [None]:
import altair as alt

In [None]:
def print_pair(df, sym_1, sym_2):
    df_melt = df.melt(
        id_vars=['datetime'], 
        value_vars=[f'{sym_1}', f'{sym_2}'], 
        variable_name='price_type', 
        value_name='price_value'
    )
    
    prices_graph = alt.Chart(df_melt).mark_line().encode(
        x=alt.X('datetime:T', title=''),
        y=alt.Y('price_value:Q', title='Цена'),
        color=alt.Color('price_type:N', legend=alt.Legend(
            title="Token",
            orient='none',  # Убирает автоматическое позиционирование
            legendX=400,    # X-координата от левого края графика
            legendY=0,    # Y-координата от верхнего края графика
            direction='vertical',
            symbolStrokeWidth=3
        ))
    ).properties(
        title='Prices',
        width=480,
        height=150
    )
    
    spread_graph = alt.Chart(df).mark_line().encode(
        x=alt.X('datetime:T', title=''),
        y=alt.Y('spread:Q', title='Цена')
    ).properties(
        title='Spread',
        width=480,
        height=150
    )

    mean_line = alt.Chart(df).mark_line(color='black').encode(
        x=alt.X('datetime:T', title=''),
        y=alt.Y('mean:Q')
    )
    
    graph = alt.vconcat(prices_graph | (spread_graph + mean_line)).configure_view(
        strokeWidth=0
    )
        
    return graph

In [None]:
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm

In [None]:
sym_1 = 'STRK'
sym_2 = 'XAI'

start_date = datetime(2025, 2, 5)
end_date = datetime(2025, 7, 15)
period = '1h'

schema = {
    'datetime': pl.Datetime('ms'),
}

df = pl.DataFrame(schema=schema)
for coin in [sym_1, sym_2]:
    tdf = read_and_scale_parquet(coin, period=period, start_date=start_date, end_date=end_date
                                ).select('datetime', 'price').rename({'price': coin})
    # tdf = prepare_normed_df(tdf).select('datetime', 'log_price').rename({'log_price': coin})
    df = df.join(tdf, on='datetime', how='outer', coalesce=True)

In [None]:
# print_pair(df, sym_1, sym_2)

In [None]:
df.tail(2)