In [1]:
import polars as pl
from polars.exceptions import ColumnNotFoundError
import numpy as np
import ast
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['timezone'] = 'Europe/Moscow'

from datetime import datetime, timezone, timedelta
from zoneinfo import ZoneInfo
import pickle
from tqdm.notebook import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, root_mean_squared_error

from bot.core.db.postgres_manager import DBManager
from bot.config.credentials import host, user, password, db_name
db_params = {'host': host, 'user': user, 'password': password, 'dbname': db_name}
postgre_manager = DBManager(db_params)

from bot.analysis.pair_trading import backtest
from bot.utils.pair_trading import make_df_from_orderbooks, make_trunc_df, create_zscore_df, select_cols_1tf
from bot.analysis.strategy_analysis import analyze_strategy

#### Подготовка данных

In [2]:
# Загружаем полный датасет
# df = pl.scan_parquet('./data/test_data_dist_2.parquet')

# Выбрасываем столбцы с информацией о спреде, он нам сейчас не нужен
# all_cols = df.collect_schema().names()
# cols_to_drop = [col for col in all_cols if '_spread_' in col]
# df = df.drop(cols_to_drop).collect()
# f'{df.estimated_size():_}'

In [3]:
def get_thresholds(filename):
    data = []
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()  # Удаляем пробелы и переносы строк
            if line:  # Игнорируем пустые строки
                tuple_data = ast.literal_eval(line)
                data.append(tuple_data)
    return data

In [4]:
method = 'dist'
start_bt_time = datetime(2025, 10, 22, 0, tzinfo=ZoneInfo("Europe/Moscow"))
end_time = datetime(2025, 11, 12, 0, tzinfo=ZoneInfo("Europe/Moscow"))

with open("./data/coin_information.pkl", "rb") as f:
    coin_information = pickle.load(f)

token_pairs = []
with open('./data/token_pairs.txt', 'r') as file:
    for line in file:
        a, b = line.strip().split()
        token_pairs.append((a, b))

In [5]:
# Сначала отберём только те пары, которые с индиидуальным подбором параметров дают положительный результат
# Запускаем backtest.py, а потом этот скрипт
filename = f'./data/pair_selection/ind_thresholds_{method}.txt'
thresholds = get_thresholds(filename)

leverage = 2
dist_in = 0
dist_out = 0

metrics_arr = []

for _, token_1, token_2, tf, wind, in_, out_ in tqdm(thresholds):
    try:
        metrics = dict()
        trades_arr = []

        dp_1 = float(coin_information['bybit_linear'][token_1 + '_USDT']['qty_step'])
        ps_1 = int(coin_information['bybit_linear'][token_1 + '_USDT']['price_scale'])
        dp_2 = float(coin_information['bybit_linear'][token_2 + '_USDT']['qty_step'])
        ps_2 = int(coin_information['bybit_linear'][token_2 + '_USDT']['price_scale'])

        filepath = f'./data/pair_backtest/{token_1}_{token_2}_{method}_full.parquet'
        try:
            df = pl.read_parquet(filepath, low_memory=True, rechunk=True, use_pyarrow=True)
        except FileNotFoundError:
            print(token_1, token_2)
            continue

        cols = ['time', 'ts', token_1, f'{token_1}_size', f'{token_1}_bid_price', f'{token_1}_ask_price',
            f'{token_1}_bid_size', f'{token_1}_ask_size', token_2, f'{token_2}_size',
            f'{token_2}_bid_price', f'{token_2}_ask_price', f'{token_2}_bid_size', f'{token_2}_ask_size',
            f'z_score_{wind}_{tf}']

        tdf = df.select(cols).rename({f'z_score_{wind}_{tf}': 'z_score'}).drop_nulls()

        trades_df = backtest(tdf, token_1, token_2, dp_1, dp_2, ps_1, ps_2,
            thresh_low_in=-in_, thresh_low_out=-out_, thresh_high_in=in_, thresh_high_out=out_,
            long_possible=True, short_possible=True,
            balance=100, order_size=50, fee_rate=0.00055, stop_loss_std=5.0, sl_method='leave',
            sl_seconds = 60, leverage=leverage, dist_in=0, dist_out=0,
            verbose=0)

        if trades_df.height > 0:
            trades_arr.extend(trades_df.to_dicts())
            end_date = df['time'][-1]

        all_trades = pl.DataFrame(trades_arr)
        metrics = analyze_strategy(all_trades, start_date=start_bt_time, end_date=end_date, initial_balance=100.0)

        if metrics:
            metrics_arr.append({
                            'token_1': token_1, 'token_2': token_2, 'tf': tf, 'wind': wind,
                            'thresh_in': in_, 'thresh_out': out_, 'dist_in': dist_in, 'dist_out': dist_out,
                            'n_trades': metrics['n_trades'], 'duration_avg': metrics['duration_avg'],
                            'stop_losses': metrics['stop_losses'], 'liquidations': metrics['liquidations'],
                            'profit': metrics['profit'], 'profit_ratio': metrics['profit_ratio'],
                            'max_profit': metrics['max_profit'], 'max_drawdown': metrics['max_drawdown'],
                            'max_loss': metrics['max_loss'], 'avg_profit': metrics['avg_profit'],
                            'profit_std': metrics['profit_std']})
    except ColumnNotFoundError:
        print('err')
        continue

ind_output = pl.DataFrame(metrics_arr)
ind_output.select(
    'token_1', 'token_2', 'profit', 'profit_ratio', 'max_loss', 'max_drawdown', 'duration_avg', 'n_trades'
).write_parquet('./data/pair_selection/ind_params_result.parquet')

  0%|          | 0/300 [00:00<?, ?it/s]

In [25]:
good_pairs = ind_output.filter(
    (pl.col('profit_ratio') > 1) & (pl.col('profit') > 5) & (pl.col('max_drawdown') > -5)
)

In [26]:
token_pairs = []
for row in good_pairs.iter_rows(named=True):
    token_pairs.append([row['token_1'], row['token_2']])

In [27]:
len(token_pairs)

235

In [28]:
good_pairs

token_1,token_2,tf,wind,thresh_in,thresh_out,dist_in,dist_out,n_trades,duration_avg,stop_losses,liquidations,profit,profit_ratio,max_profit,max_drawdown,max_loss,avg_profit,profit_std
str,str,str,i64,f64,f64,i64,i64,i64,duration[μs],i64,i64,f64,f64,f64,f64,f64,f64,f64
"""BLUR""","""KAS""","""1h""",18,2.5,0.5,0,0,18,10h 43m,0,0,15.45,2.002,4.21,-4.79,-4.76,0.86,2.19
"""FLOW""","""SAND""","""4h""",24,1.6,0.5,0,0,15,22h 12m 50s,0,0,12.41,2.316,2.77,-0.52,-0.52,0.83,0.95
"""KAS""","""VET""","""4h""",16,2.5,0.0,0,0,8,1d 4h 47m 33s,0,0,11.46,1.6,3.49,-3.64,-3.64,1.43,2.53
"""GRT""","""MANTA""","""4h""",30,2.5,0.5,0,0,6,1d 18h 5m 10s,0,0,40.05,5.958,12.15,-2.15,-2.15,6.67,5.33
"""GMT""","""BLUR""","""4h""",12,1.6,0.5,0,0,27,12h 34m 6s,0,0,20.46,3.445,2.4,-1.71,-1.71,0.76,0.72
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""TIA""","""XRP""","""4h""",30,2.5,0.25,0,0,6,1d 22h 28m 14s,0,0,20.28,3.545,5.03,-0.94,-0.94,3.38,2.3
"""STRK""","""MOVE""","""4h""",12,2.5,0.0,0,0,5,20h 41m 42s,0,0,29.01,5.355,12.36,0.0,0.0,5.8,4.18
"""STRK""","""SEI""","""4h""",12,2.5,0.0,0,0,5,22h 8m 37s,0,0,17.11,2.22,15.44,-3.45,-3.45,3.42,7.3
"""DYDX""","""STRK""","""4h""",24,2.5,0.0,0,0,4,23h 37m 45s,0,0,22.64,4.269,8.71,0.0,0.0,5.66,3.02


In [29]:
t_1 = 'ARKM'
t_2 = 'MANTA'

ind_output.filter(
    ((pl.col('token_1') == t_1) & (pl.col('token_2') == t_2)) | 
    ((pl.col('token_1') == t_2) & (pl.col('token_2') == t_1))
)

token_1,token_2,tf,wind,thresh_in,thresh_out,dist_in,dist_out,n_trades,duration_avg,stop_losses,liquidations,profit,profit_ratio,max_profit,max_drawdown,max_loss,avg_profit,profit_std
str,str,str,i64,f64,f64,i64,i64,i64,duration[μs],i64,i64,f64,f64,f64,f64,f64,f64,f64
"""ARKM""","""MANTA""","""4h""",24,2.25,0.0,0,0,13,15h 34m 11s,0,0,42.03,7.959,9.75,0.0,0.0,3.23,2.81


In [36]:
# Теперь для отобранных на индивидуальных параметрах пар прогоним общий тест
leverage = 2

search_space = (
    ('4h', 12), ('4h', 14), ('4h', 16), ('4h', 18), ('4h', 24), ('4h', 30),
    ('1h', 18), ('1h', 24), ('1h', 36), ('1h', 48), ('1h', 64), ('1h', 72), ('1h', 96), ('1h', 120)
)

in_params = (2.5, 2.25, 2.0, 1.8, 1.6)
out_params = (0.0, 0.25, 0.5)
# out_params = (0.25, )

dist_in = 0
dist_out = 0

s_len= len(search_space) * len(in_params) * len(out_params) * len(token_pairs)

print('Whole search space size:', s_len)

Whole search space size: 16590


In [37]:
metrics_arr = []

with tqdm(total=s_len, desc="Обработка", unit="iter") as progress_bar:
    for token_1, token_2 in token_pairs:
        dp_1 = float(coin_information['bybit_linear'][token_1 + '_USDT']['qty_step'])
        ps_1 = int(coin_information['bybit_linear'][token_1 + '_USDT']['price_scale'])
        dp_2 = float(coin_information['bybit_linear'][token_2 + '_USDT']['qty_step'])
        ps_2 = int(coin_information['bybit_linear'][token_2 + '_USDT']['price_scale'])

        filepath = f'./data/pair_backtest/{token_1}_{token_2}_{method}_full.parquet'
        try:
            df = pl.read_parquet(filepath, low_memory=True, rechunk=True, use_pyarrow=True)
        except FileNotFoundError:
            print(token_1, token_2)
            continue

        end_date = df['time'][-1]

        for tf, wind in search_space:
            for in_ in (in_params):
                for out_ in (out_params):
                    progress_bar.update(1)
                    metrics = dict()

                    cols = ['time', 'ts', token_1, f'{token_1}_size', f'{token_1}_bid_price', f'{token_1}_ask_price',
                            f'{token_1}_bid_size', f'{token_1}_ask_size', token_2, f'{token_2}_size',
                            f'{token_2}_bid_price', f'{token_2}_ask_price', f'{token_2}_bid_size', f'{token_2}_ask_size',
                            f'z_score_{wind}_{tf}']

                    tdf = df.select(cols).rename({f'z_score_{wind}_{tf}': 'z_score'}).drop_nulls()

                    trades_df = backtest(tdf, token_1, token_2, dp_1, dp_2, ps_1, ps_2,
                            thresh_low_in=-in_, thresh_low_out=-out_, thresh_high_in=in_, thresh_high_out=out_,
                            long_possible=True, short_possible=True,
                            balance=100, order_size=50, fee_rate=0.00055, stop_loss_std=5.0, sl_method='leave',
                            sl_seconds = 60, leverage=leverage, dist_in=0, dist_out=0,
                            verbose=0)

                    if trades_df.height > 0:
                        metrics = analyze_strategy(trades_df, start_date=start_bt_time, end_date=end_date, initial_balance=100.0)

                    if metrics:
                        metrics_arr.append({
                            'token_1': token_1, 'token_2': token_2, 'tf': tf, 'wind': wind,
                            'thresh_in': in_, 'thresh_out': out_, 'dist_in': dist_in, 'dist_out': dist_out,
                            'n_trades': metrics['n_trades'], 'duration_avg': metrics['duration_avg'],
                            'stop_losses': metrics['stop_losses'], 'liquidations': metrics['liquidations'],
                            'profit': metrics['profit'], 'profit_ratio': metrics['profit_ratio'],
                            'max_profit': metrics['max_profit'], 'max_drawdown': metrics['max_drawdown'],
                            'max_loss': metrics['max_loss'], 'avg_profit': metrics['avg_profit'],
                            'profit_std': metrics['profit_std']})

mass_output = pl.DataFrame(metrics_arr).sort(by='profit', descending=True)

Обработка:   0%|          | 0/16590 [00:00<?, ?iter/s]

In [38]:
# Выбираем наилучшие параметры торговли
mass_output.group_by('tf', 'wind', 'thresh_in', 'thresh_out').agg(
    pl.col('profit').mean().round(2).alias('avg_profit'),
    pl.col('profit_ratio').mean().round(2).alias('avg_pr_ratio'),
    pl.col('max_loss').min().round(2).alias('max_loss'),
    pl.col('max_drawdown').mean().round(2).alias('avg_drawdown'),
    pl.col('max_drawdown').min().round(2).alias('max_drawdown'),
    pl.col('n_trades').mean().cast(pl.Int64).alias('avg_trades'),
).sort('avg_pr_ratio', descending=True).head(5)

tf,wind,thresh_in,thresh_out,avg_profit,avg_pr_ratio,max_loss,avg_drawdown,max_drawdown,avg_trades
str,i64,f64,f64,f64,f64,f64,f64,f64,i64
"""4h""",24,2.0,0.5,16.98,2.91,-11.68,-1.65,-11.68,8
"""4h""",24,1.6,0.5,17.47,2.87,-11.95,-2.22,-11.95,11
"""4h""",24,1.8,0.5,17.04,2.86,-11.78,-1.94,-11.78,9
"""4h""",24,2.25,0.5,16.08,2.81,-11.19,-1.33,-11.19,6
"""4h""",24,2.0,0.25,16.27,2.75,-12.33,-1.83,-12.33,9


In [40]:
# Результаты для конкретных пар на общем тесте
res_df = mass_output.group_by('token_1', 'token_2').agg(
            pl.col('profit').mean().round(2).alias('avg_profit'),
            pl.col('profit').std().round(2).alias('profit_std'),
            pl.col('max_loss').min().round(2).alias('max_loss'),
            pl.col('max_drawdown').mean().round(2).alias('avg_drawdown'),
            pl.col('max_drawdown').min().round(2).alias('max_drawdown'),
            pl.col('duration_avg').mean().alias('avg_duration'),
            pl.col('profit_ratio').mean().round(2).alias('avg_pr_ratio'),
            pl.col('n_trades').mean().cast(pl.Int64).alias('avg_trades'),
        ).sort('avg_pr_ratio', descending=True)
res_df.head(3)

token_1,token_2,avg_profit,profit_std,max_loss,avg_drawdown,max_drawdown,avg_duration,avg_pr_ratio,avg_trades
str,str,f64,f64,f64,f64,f64,duration[μs],f64,i64
"""IMX""","""LDO""",22.96,5.98,-7.7,-3.37,-10.9,17h 17m 46s 314285µs,3.53,17
"""STRK""","""MANTA""",22.18,12.89,-20.69,-8.28,-23.0,19h 30m 6s 190476µs,3.16,13
"""IMX""","""SEI""",17.31,4.05,-6.44,-1.79,-8.04,19h 28m 59s 390476µs,2.92,15


In [41]:
# Выкидываем все пары, показашие плохой результат, и сохраняем хорошие пары в файл
res_df = res_df.filter(
    (pl.col('avg_profit') > 1.0) & (pl.col('avg_pr_ratio') > 1.0)
)
res_df

token_1,token_2,avg_profit,profit_std,max_loss,avg_drawdown,max_drawdown,avg_duration,avg_pr_ratio,avg_trades
str,str,f64,f64,f64,f64,f64,duration[μs],f64,i64
"""IMX""","""LDO""",22.96,5.98,-7.7,-3.37,-10.9,17h 17m 46s 314285µs,3.53,17
"""STRK""","""MANTA""",22.18,12.89,-20.69,-8.28,-23.0,19h 30m 6s 190476µs,3.16,13
"""IMX""","""SEI""",17.31,4.05,-6.44,-1.79,-8.04,19h 28m 59s 390476µs,2.92,15
"""DYDX""","""OP""",16.88,4.83,-8.42,-2.25,-8.42,14h 22m 21s 166666µs,2.84,16
"""TIA""","""DOT""",19.28,5.88,-11.88,-4.56,-11.88,18h 58m 18s 333333µs,2.77,16
…,…,…,…,…,…,…,…,…,…
"""STX""","""TIA""",8.42,5.17,-11.67,-6.92,-15.74,20h 4m 9s 595238µs,1.08,15
"""XRP""","""LDO""",7.41,8.87,-8.44,-6.94,-22.42,1d 5h 45m 7s 785714µs,1.06,10
"""DYDX""","""ARB""",7.04,5.52,-15.24,-5.32,-15.24,1d 3h 54m 36s 142857µs,1.05,11
"""ARKM""","""TIA""",8.88,4.66,-18.97,-7.6,-21.97,13h 35m 47s 161904µs,1.04,21


In [42]:
# res_df.write_parquet(f'./data/pair_selection/common_params_{method}_result.parquet')

In [20]:
# res_df.filter(pl.col('token_1') == 'ARKM')

In [35]:
# Теперь для плюсовых пар ещё раз прогоним общий бектест для подбора наилучших параметров только на этих парах.
token_pairs = []
for row in res_df.iter_rows(named=True):
    token_pairs.append([row['token_1'], row['token_2']])
len(token_pairs)

79

In [43]:
# Оставляем только лучшие параметры
leverage = 2
tf = '4h'
wind = 24
in_ = 2.0
out_ = 0.5
dist_in = 0
dist_out = 0

metrics_arr = []

for token_1, token_2 in tqdm(token_pairs):
    dp_1 = float(coin_information['bybit_linear'][token_1 + '_USDT']['qty_step'])
    ps_1 = int(coin_information['bybit_linear'][token_1 + '_USDT']['price_scale'])
    dp_2 = float(coin_information['bybit_linear'][token_2 + '_USDT']['qty_step'])
    ps_2 = int(coin_information['bybit_linear'][token_2 + '_USDT']['price_scale'])

    filepath = f'./data/pair_backtest/{token_1}_{token_2}_{method}_full.parquet'
    try:
        df = pl.read_parquet(filepath, low_memory=True, rechunk=True, use_pyarrow=True)
    except FileNotFoundError:
        continue

    end_date = df['time'][-1]
    metrics = dict()

    cols = ['time', 'ts', token_1, f'{token_1}_size', f'{token_1}_bid_price', f'{token_1}_ask_price',
            f'{token_1}_bid_size', f'{token_1}_ask_size', token_2, f'{token_2}_size',
            f'{token_2}_bid_price', f'{token_2}_ask_price', f'{token_2}_bid_size', f'{token_2}_ask_size',
            f'z_score_{wind}_{tf}']

    tdf = df.select(cols).rename({f'z_score_{wind}_{tf}': 'z_score'}).drop_nulls()

    trades_df = backtest(tdf, token_1, token_2, dp_1, dp_2, ps_1, ps_2,
            thresh_low_in=-in_, thresh_low_out=-out_, thresh_high_in=in_, thresh_high_out=out_,
            long_possible=True, short_possible=True,
            balance=100, order_size=50, fee_rate=0.00055, stop_loss_std=5.0, sl_method='leave',
            sl_seconds = 60, leverage=leverage, dist_in=0, dist_out=0,
            verbose=0)

    if trades_df.height > 0:
        metrics = analyze_strategy(trades_df, start_date=start_bt_time, end_date=end_date, initial_balance=100.0)

    if metrics:
        metrics_arr.append({
            'token_1': token_1, 'token_2': token_2, 'tf': tf, 'wind': wind,
            'thresh_in': in_, 'thresh_out': out_, 'dist_in': dist_in, 'dist_out': dist_out,
            'n_trades': metrics['n_trades'], 'duration_avg': metrics['duration_avg'],
            'stop_losses': metrics['stop_losses'], 'liquidations': metrics['liquidations'],
            'profit': metrics['profit'], 'profit_ratio': metrics['profit_ratio'],
            'max_profit': metrics['max_profit'], 'max_drawdown': metrics['max_drawdown'],
            'max_loss': metrics['max_loss'], 'avg_profit': metrics['avg_profit'],
            'profit_std': metrics['profit_std']})

best_output = pl.DataFrame(metrics_arr).sort(by='profit', descending=True)

  0%|          | 0/79 [00:00<?, ?it/s]

In [None]:
best_output.tail(2)

In [None]:
best_output.select(
    'token_1', 'token_2', 'profit', 'profit_ratio', 'max_loss', 'max_drawdown', 'duration_avg', 'n_trades'
).write_parquet(f'./data/pair_selection/best_params_{method}_result.parquet')

#### Отбор монет

In [45]:
method = 'dist'

pairs = pl.read_parquet('./data/pair_selection/curr_pairs.parquet')

# ind_params = pl.read_parquet(f'./data/pair_selection/ind_params_{method}_result.parquet')
# ind_params = ind_params.rename({'token_1': 'coin1', 'token_2': 'coin2', 'profit': 'pr_ind', 'profit_ratio': 'pr_rat_ind',
#                                      'max_loss': 'loss_ind', 'max_drawdown': 'max_drdn_ind',
#                                      'duration_avg': 'dur_ind', 'n_trades': 'trades_ind'})

common_params = pl.read_parquet(f'./data/pair_selection/common_params_{method}_result.parquet')
common_params = common_params.rename({'token_1': 'coin1', 'token_2': 'coin2', 'avg_profit': 'avg_pr_all', 'profit_std': 'pr_std_all',
                                     'max_loss': 'loss_all', 'avg_drawdown': 'avg_drdn_all', 'max_drawdown': 'max_drdn_all',
                                     'avg_duration': 'dur_all', 'avg_pr_ratio': 'pr_rat_all', 'avg_trades': 'trades_all'})

best_params = pl.read_parquet(f'./data/pair_selection/best_params_{method}_result.parquet')
best_params = best_params.rename({'token_1': 'coin1', 'token_2': 'coin2', 'profit': 'pr_best', 'profit_ratio': 'pr_rat_best',
                                     'max_loss': 'loss_best', 'max_drawdown': 'max_drdn_best',
                                     'duration_avg': 'dur_best', 'n_trades': 'trades_best'})

pairs = pairs.with_columns(
    pl.col('spr_dist').round(2),
    pl.col('spr_std').round(2),
    pl.col('corr').round(2),
    pl.col('pv_1').round(2),
    pl.col('pv_2').round(2),
)

In [46]:
# pairs = pairs.join(ind_params, on=['coin1', 'coin2'])
# pairs = pairs.join(common_params, on=['coin1', 'coin2'])


In [47]:
pairs = pairs.join(best_params, on=['coin1', 'coin2'])

cols_to_rename = {'pr_best': 'profit', 'pr_rat_best': 'profit_ratio', 'loss_best': 'max_loss', 'max_drdn_best': 'max_drn',
                 'dur_best': 'avg_duration', 'trades_best': 'trades'}
pairs = pairs.rename(cols_to_rename).sort('profit_ratio', descending=True)

pairs = pairs.filter(
    (abs(pl.col('beta_1') - pl.col('beta_2')) < 1.6) & (pl.col('profit') > 5) & (pl.col('spr_dist') < 2.8)
)

In [48]:
pairs#[0:30]

coin1,coin2,spr_dist,spr_std,std_1,std_2,dist_1,dist_2,corr,pv_1,pv_2,beta_1,beta_2,profit,profit_ratio,max_loss,max_drn,avg_duration,trades
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,duration[μs],i64
"""IOTA""","""CELO""",1.52,0.11,0.027016,0.045753,0.1158,0.209,0.91,0.12,0.02,1.50282,0.523963,24.55,4.855,0.0,0.0,1d 6h 21m 33s,7
"""FLOW""","""MANTA""",2.27,0.11,0.051957,0.050604,0.2144,0.2343,0.91,0.62,0.45,0.86947,0.916603,33.74,4.827,-2.53,-2.53,2d 1h 16m 12s,7
"""ARKM""","""OP""",1.93,0.1,0.097966,0.127943,0.457,0.5281,0.92,0.06,0.16,1.195244,0.700768,24.74,4.693,-0.15,-0.15,1d 5h 10m 5s,12
"""ARKM""","""MANTA""",1.22,0.09,0.097966,0.050604,0.457,0.2343,0.93,0.37,0.57,0.472937,1.772534,23.74,4.525,0.0,0.0,1d 3h 24m 30s,11
"""MANTA""","""CELO""",1.72,0.13,0.050604,0.045753,0.2343,0.209,0.86,0.24,0.01,0.751369,0.919116,26.34,4.481,-1.2,-1.2,1d 9h 15m 4s,7
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""FLOW""","""CELO""",2.7,0.15,0.051957,0.045753,0.2144,0.209,0.83,0.28,0.03,0.705825,0.910208,9.94,1.947,0.0,0.0,1d 19h 27m 4s,5
"""CHZ""","""VET""",1.22,0.08,0.004244,0.003466,0.01915,0.01667,0.94,0.3,0.52,0.761168,1.141186,10.59,1.788,-1.45,-1.45,1d 9h 10m 21s,8
"""IOTA""","""SEI""",2.78,0.21,0.027016,0.058258,0.1158,0.2126,0.55,0.05,0.43,1.475574,0.317317,14.7,1.736,-6.02,-6.02,1d 11h 13m 3s,10
"""SUSHI""","""ONDO""",1.21,0.07,0.128021,0.123632,0.5969,0.5511,0.95,0.01,0.01,0.921122,0.987673,8.44,1.625,-0.17,-0.17,1d 13h 2m 59s,8


In [49]:
# Жадный метод
used_tokens = []
trade_pairs_list = []
total_profit = 0

for row in pairs.iter_rows(named=True):
    t1 = row['coin1']
    t2 = row['coin2']
    profit = row['profit']
    
    if t1 in used_tokens or t2 in used_tokens:
        continue

    trade_pairs_list.append((t1, t2))
    used_tokens.append(t1)
    used_tokens.append(t2)
    total_profit += profit
    print(t1, t2, profit)
print('Общий профит:', round(total_profit, 1))

IOTA CELO 24.55
FLOW MANTA 33.74
ARKM OP 24.74
SUSHI STX 22.02
ARB SAND 15.9
IMX LDO 23.28
GMT BLUR 13.4
DOT RENDER 14.13
DYDX SEI 15.46
CHZ VET 10.59
Общий профит: 197.8


In [50]:
import networkx as nx

In [51]:
df = pairs.filter(
    abs(pl.col('beta_1') - pl.col('beta_2')) < 1.8
).with_columns([
        pl.when(pl.col("coin1") < pl.col("coin2")).then(pl.col("coin1")).otherwise(pl.col("coin2")).alias("a"),
        pl.when(pl.col("coin1") < pl.col("coin2")).then(pl.col("coin2")).otherwise(pl.col("coin1")).alias("b"),
])

In [52]:
edges = df.group_by(["a","b"]).agg(
    pl.max("profit").alias("weight")
)

In [53]:
G = nx.Graph()
for row in edges.to_dicts():
    # row: {"a": "BLUR", "b":"MANTA", "weight": 28.33}
    G.add_edge(row["a"], row["b"], weight=float(row["weight"]))

In [54]:
matching = nx.max_weight_matching(G, maxcardinality=False, weight="weight")

In [55]:
matched_rows = []
total = 0.0
for u, v in matching:
    w = G[u][v]["weight"]
    matched_rows.append({"coin1": u, "coin2": v, "profit": w})
    total += w

In [56]:
matched_df = pl.DataFrame(matched_rows)

In [59]:
print("Matched pairs:")
print(matched_df[:])
print(f"Total profit: {total:.2f}")

Matched pairs:
shape: (11, 3)
┌───────┬───────┬────────┐
│ coin1 ┆ coin2 ┆ profit │
│ ---   ┆ ---   ┆ ---    │
│ str   ┆ str   ┆ f64    │
╞═══════╪═══════╪════════╡
│ CELO  ┆ POL   ┆ 20.76  │
│ SEI   ┆ DYDX  ┆ 15.46  │
│ BLUR  ┆ GMT   ┆ 13.4   │
│ ARKM  ┆ OP    ┆ 24.74  │
│ CHZ   ┆ VET   ┆ 10.59  │
│ …     ┆ …     ┆ …      │
│ IOTA  ┆ MANA  ┆ 15.51  │
│ LDO   ┆ IMX   ┆ 23.28  │
│ ARB   ┆ SAND  ┆ 15.9   │
│ SUSHI ┆ STX   ┆ 22.02  │
│ MANTA ┆ FLOW  ┆ 33.74  │
└───────┴───────┴────────┘
Total profit: 209.53


In [61]:
pairs.filter(pl.col('coin1') == 'DYDX')

coin1,coin2,spr_dist,spr_std,std_1,std_2,dist_1,dist_2,corr,pv_1,pv_2,beta_1,beta_2,profit,profit_ratio,max_loss,max_drn,avg_duration,trades
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,duration[μs],i64
"""DYDX""","""SEI""",2.77,0.18,0.121569,0.058258,0.4693,0.2126,0.7,0.39,0.23,0.368039,1.602643,15.46,2.158,-3.84,-3.84,23h 17m 38s,12


In [7]:
trading_history = postgre_manager.get_table('trading_history', df_type='polars')

res_df = trading_history.group_by('token_1', 'token_2').agg(
        pl.col('profit').sum().round(2),
        pl.col('profit').min().round(2).alias('min_profit'),
        pl.col('profit').max().round(2).alias('max_profit'),
        (pl.col('profit').sum() / pl.col('profit').len()).round(2).alias('avg_profit'),
        pl.col('profit').len().alias('n_trades'),
    
    ).sort(by='profit', descending=True)

res_df = res_df.join(pairs, left_on=('token_1', 'token_2'), right_on=('coin1', 'coin2')).sort('profit', descending=True)
# res_df = res_df.rename({'dist': 'spr_dist', 'std': 'spr_std'})
res_df.shape

(5, 24)

In [8]:
res_df

token_1,token_2,profit,min_profit,max_profit,avg_profit,n_trades,spr_dist,spr_std,std_1,std_2,dist_1,dist_2,corr,pv_1,pv_2,beta_1,beta_2,profit_right,profit_ratio,max_loss,max_drn,avg_duration,trades
str,str,f64,f64,f64,f64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,duration[μs],i64
"""ARKM""","""MANTA""",17.3,-0.83,8.0,2.47,7,1.22,0.09,0.097966,0.050604,0.457,0.2343,0.93,0.37,0.57,0.472937,1.772534,22.6,2.889,-4.96,-4.96,16h 28m 50s,17
"""IOTA""","""CELO""",6.87,6.87,6.87,6.87,1,1.52,0.11,0.027016,0.045753,0.1158,0.209,0.91,0.12,0.02,1.50282,0.523963,25.43,4.314,-1.27,-1.27,21h 13m 32s,9
"""GMT""","""BLUR""",0.63,0.63,0.63,0.63,1,0.94,0.07,0.00961,0.016732,0.04141,0.07046,0.97,0.52,0.42,1.674928,0.55249,8.51,1.337,-1.71,-2.47,18h 58m 54s,14
"""ARKM""","""OP""",0.06,0.06,0.06,0.06,1,1.93,0.1,0.097966,0.127943,0.457,0.5281,0.92,0.06,0.16,1.195244,0.700768,13.84,1.522,-7.15,-7.15,11h 46m 32s,13
"""GRT""","""MANTA""",-16.77,-16.77,-16.77,-16.77,1,0.87,0.05,0.014741,0.050604,0.06767,0.2343,0.98,0.09,0.12,3.350767,0.284352,20.76,3.103,-2.15,-2.57,15h 35m 41s,14


In [None]:
cols = ['token_1', 'token_2', 'profit', 'min_profit', 'max_profit', 'avg_profit', 'n_trades', 'spr_dist', 'spr_std', 'std_1', 'std_2',
       'dist_1', 'dist_2', 'corr', 'pv_1', 'pv_2', 'avg_pr_all', 'pr_std_all', 'loss_all', 'avg_drdn_all', 'max_drdn_all', 'pr_rat_all',
       'pr_best', 'pr_rat_best', 'loss_best', 'max_drdn_best']

In [None]:
res_df.select(cols).head(10)

In [None]:
res_df.select(cols).tail(5)

In [None]:
pairs.filter((pl.col('coin1') == 'CELO') & (pl.col('coin2') == 'IMX'))

#### ML

In [None]:
df = pairs.select('coin1', 'coin2', 'dist', 'std', 'corr', 'pv_1', 'pv_2', 'pr_ind', 'pr_rat_ind', 'avg_pr_all', 'pr_rat_all',
            'pr_best', 'pr_rat_best', 'loss_best', 'max_drdn_best', 'dur_best', 'trades_best', 
            # 'pr_test', 'pr_rat_test', 'loss_test', 'max_drdn_test', 'dur_test', 'trades_test'
                 )

In [None]:
df.shape

In [None]:
df.sample(3)

In [None]:
X, y = df.drop('pr_rat_test'), df['pr_rat_test']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
X_test.with_columns(
    pl.Series(y_test).alias('pr_rat_real')
).filter((pl.col('pr_rat_best') > 0.5) & (pl.col('pr_rat_all') > 0.5))#['pr_rat_real'].sum()

In [None]:
4.753

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train);
lr_preds = lr.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), lr_preds)

In [None]:
# 0.8532 -> 0.8286 (добавил фичу std)

In [None]:
for name, coef in zip(lr.feature_names_in_, lr.coef_):
    print(f'{name:>15}: {coef:>7.4f}')

In [None]:
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
params = {'eps': [0.0001, 0.001, 0.01]}

lasso = LassoCV(random_state=42)
gcv = GridSearchCV(lasso, params, cv=5, n_jobs=12, verbose=0)
gcv.fit(X_train, y_train);

In [None]:
gcv.best_params_

In [None]:
lasso = LassoCV(random_state=42, eps=0.0001)
lasso.fit(X_train, y_train);
lasso_preds = lasso.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), lasso_preds)

In [None]:
# 0.8211

In [None]:
for name, coef in zip(lasso.feature_names_in_, lasso.coef_):
    print(f'{name:>15}: {coef:>7.4f}')

In [None]:
ridge = RidgeCV()
ridge.fit(X_train, y_train);
ridge_preds = ridge.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), ridge_preds)

In [None]:
# 0.8118 -> 0.8117 (+ std)

In [None]:
for name, coef in zip(ridge.feature_names_in_, ridge.coef_):
    print(f'{name:>15}: {coef:>7.4f}')

In [None]:
params = {'max_features': [4, 6, 8, 10, 13], 'min_samples_leaf': [1, 3, 5, 7], 'max_depth': [5, 6, 8, 10, 12, 15],
          'n_estimators': [40, 60, 80, 100, 125, 150]}

rfr = RandomForestRegressor(random_state=42)
gcv = GridSearchCV(rfr, params, cv=5, n_jobs=12, verbose=0)
gcv.fit(X_train, y_train);

In [None]:
gcv.best_params_

In [None]:
rf = RandomForestRegressor(random_state=42, n_estimators=80, max_depth=12, max_features=6, min_samples_leaf=5)
rf.fit(X_train, y_train);
rf_preds = rf.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), rf_preds)

In [None]:
# 0.7634 -> 0.7750 (+std)

In [None]:
for name, coef in zip(rf.feature_names_in_, rf.feature_importances_):
    print(f'{name:>15}: {coef:>7.4f}')

In [None]:
X_test.with_columns(
    pl.Series(rf_preds).alias('pr_rat_pred'),
    pl.Series(y_test).alias('pr_rat_real')
).filter(pl.col('pr_rat_pred') > 0.4)#['pr_rat_real'].sum()

In [None]:
# 4.325

In [None]:
params = {'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2], 'max_features': [4, 6, 8, 10, 13], 
          'min_samples_leaf': [1, 3, 5, 7], 'max_depth': [5, 6, 8, 10, 12, 15],
          'n_estimators': [40, 60, 80, 100, 125, 150]}

gbr = GradientBoostingRegressor(random_state=42)
gcv = GridSearchCV(gbr, params, cv=5, n_jobs=12, verbose=0)
gcv.fit(X_train, y_train);

In [None]:
gcv.best_params_

In [None]:
gbr = GradientBoostingRegressor(random_state=42, learning_rate=0.01, n_estimators=80, max_depth=5, 
                                max_features=4, min_samples_leaf=7)
gbr.fit(X_train, y_train);
gbr_preds = gbr.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), gbr_preds)

In [None]:
# 0.8004 -> 0.8014

In [None]:
import catboost as cb

In [None]:
params = {'learning_rate': [0.01, 0.03, 0.1, 0.3], 'iterations': [250, 500, 1000], 
          'depth': [6, 8, 10],
          'l2_leaf_reg': [1, 3, 5, 7]}
cbr = cb.CatBoostRegressor(random_state=42, verbose=False)
gcv = GridSearchCV(cbr, params, cv=5, verbose=0)
gcv.fit(X_train.to_numpy(), y_train.to_numpy());

In [None]:
gcv.best_params_

In [None]:
train_pool = cb.Pool(X_train.to_numpy(), y_train.to_numpy())

In [None]:
param_grid = {
    'iterations': tune.randint(100, 1500),
    'learning_rate': tune.loguniform(1e-3, 0.5),
    'depth': tune.randint(4, 12),
    'l2_leaf_reg': tune.loguniform(1, 10),
}

In [None]:
cb.__version__

In [None]:
from catboost.utils import grid_search

In [None]:
cbr = cb.CatBoostRegressor(random_state=42, verbose=False, learning_rate=0.01, iterations=800, depth=8,
                          loss_function='Expectile:alpha=0.7'
                          )
cbr.fit(X_train.to_numpy(), y_train.to_numpy());
cbr_preds = cbr.predict(X_test.to_numpy())
root_mean_squared_error(y_test.to_numpy(), cbr_preds)

In [None]:
cbr.save_model('./data/catboost_model.json', format='json')

In [None]:
# 0.7172

In [None]:
for name, coef in zip(X_test.columns, cbr.feature_importances_):
    print(f'{name:>15}: {coef:>7.4f}')

In [None]:
X_test.with_columns(
    pl.Series(cbr_preds).alias('pr_rat_pred'),
    pl.Series(y_test).alias('pr_rat_real')
).filter(pl.col('pr_rat_pred') > 0.4)['pr_rat_real'].sum()

In [None]:
# RMSE: 5.73
# Quantile, alpha=0.7 : 9.45,  RMSE: 0.7362
# Quantile, alpha=0.65: 9.07,  RMSE: 0.7172
# Quantile, alpha=0.6 : 10.04, RMSE: 0.7522
# Quantile, alpha=0.55: 6.65,  RMSE: 0.7392
# Quantile, alpha=0.5 : 5.06, RMSE: 0.7650
# Quantile, alpha=0.4 : 3.56, RMSE: 0.7893
# Quantile, alpha=0.3 : 3.56,  RMSE: 0.8197

# Expectile, alpha=0.3 : 6.25, RMSE: 0.7571
# Expectile, alpha=0.4 : 8.97, RMSE: 0.7603
# Expectile, alpha=0.5 : 9.60, RMSE: 0.7435
# Expectile, alpha=0.6 : 8.97, RMSE: 0.7535
# Expectile, alpha=0.65: 9.60, RMSE: 0.7323
# Expectile, alpha=0.7 : 9.60, RMSE: 0.7214
# Expectile, alpha=0.75: 10.04, RMSE: 0.7260
# Expectile, alpha=0.8 : 9.87, RMSE: 0.7344

In [None]:
X_test.with_columns(
    pl.Series(cbr_preds).alias('pr_rat_pred'),
    pl.Series(y_test).alias('pr_rat_real')
).filter(pl.col('pr_rat_pred') > 0.4)

In [None]:
model = cb.CatBoostRegressor()
model.load_model('./data/catboost_model.json', format='json')

In [None]:
df.sample(2)

In [None]:
df = df.drop('coin1', 'coin2', 'dur_best', 
            # 'dur_test', 'trades_test', 'loss_test', 'max_drdn_test', 'pr_test'
            )

X = df.to_numpy()

In [None]:
preds = model.predict(X)

In [None]:
df = pairs.select('coin1', 'coin2', 'dist', 'std', 'corr', 'pv_1', 'pv_2', 'pr_ind', 'pr_rat_ind', 'avg_pr_all', 'pr_rat_all',
            'pr_best', 'pr_rat_best', 'loss_best', 'max_drdn_best', 'dur_best',
            # 'pr_test', 'pr_rat_test', 'loss_test', 'max_drdn_test', 'dur_test', 'trades_test'
                 )

trade_pairs_df = df.with_columns(
        pl.Series(preds).alias('pred'),
    ).filter(
        (pl.col('pr_rat_ind') > 0.4) & (pl.col('pr_rat_all') > 0.4) & (pl.col('pr_rat_best') > 0.4) #& (pl.col('pred') > 0.4)
    ).sort(by='pr_rat_all', descending=True)


In [None]:
trade_pairs_df

In [None]:
used_tokens = []
trade_pairs_list = []

for row in trade_pairs_df.iter_rows(named=True):
    t1 = row['coin1']
    t2 = row['coin2']
    
    if t1 in used_tokens or t2 in used_tokens:
        continue

    trade_pairs_list.append((t1, t2))
    used_tokens.append(t1)
    used_tokens.append(t2)

In [None]:
used_tokens

In [None]:
trade_pairs_list

In [None]:
from jaref_bot.db.postgres_manager import DBManager
from jaref_bot.config.credentials import host, user, password, db_name

db_params = {'host': host, 'user': user, 'password': password, 'dbname': db_name}
postgre_manager = DBManager(db_params)

In [None]:
current_pairs = postgre_manager.get_table('pairs', df_type='polars')
current_pairs

In [None]:
for row in current_pairs.iter_rows(named=True):
    t1 = row['token_1'][:-5]
    t2 = row['token_2'][:-5]
    
    if (t1, t2) not in trade_pairs_list:
        trade_pairs_list.append((t1, t2))
        print((t1, t2))

In [None]:
with open('./jaref_bot/config/token_pairs.txt', 'w') as file:
    for pair in trade_pairs_list:
        file.write(f"{pair[0]} {pair[1]}\n")