In [1]:
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
from scipy.stats import loguniform, randint

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Cointegration approach to selecting trading pairs
#          all params     best params    individ params
#   t_1   t_2     profit  train    test   train    test   dist
#   KSM - DOT  :  -1.68    1.44   -2.74    3.78   -1.84   1.37
#  STRK - C98  :  -2.06   -4.18  -11.89   13.52  -16.41   1.33
#   C98 - BLUR :  -5.82   -4.94   -0.11    1.43  -147.7   1.42
#   C98 - STG  :  -7.27    3.78  -21.97    6.74  -31.48   1.40
#  SAND - IOTA :  14.11   14.45    1.09   21.47    1.25   1.06
#   OGN - ZK   :  -1.09   -0.15    0.25    9.77   -7.63   1.67
#  FLOW - IMX  :  -2.60    4.99    4.95    23.0    1.94   1.59
#  SAND - IMX  :  -3.51   12.60  -12.36    24.1  -11.45   1.22
#   CHZ - ZK   :  -0.33   -0.27    4.10    2.35   12.61   1.21
#  NEAR - IMX  :   9.37    9.64   -11.3   35.17  -15.04   1.08
#   OGN - SAND :  -2.77    1.62    7.20    3.11  -11.92   2.13

In [3]:
# Minimum-distance method for selecting trading pairs
#          all params     best params    individ params
#  t_1   t_2     profit  train    test   train    test   dist
#  C98 - FIL  :  -8.10    1.93   -5.91    2.58  -13.72   0.98
# GALA - SAND :  -3.97   -2.06    2.69    0.08    0.86   0.75
# GALA - GRT  :   0.06    1.29   -0.23    2.69   -4.03   0.76
# GALA - VET  :  -3.40    4.23    1.63    4.23    1.63   0.87
# CELO - DOT  :   3.79    5.78  -12.17   10.61  -13.75   0.97
# CELO - VET  :   0.49    5.17  -150.5    8.47   -3.53   0.97
# GALA - IMX  :  -1.79    6.00   -5.91   17.84   -2.30   0.98
#  GRT - SAND :   0.34    0.91    0.80    4.75    1.63   0.98
# SAND - VET  :  -0.34   -0.91    0.60    6.20    3.66   0.99
#  KSM - ROSE :   0.73           -0.08   14.83    2.56   0.99
#  DOT - GALA :  -1.42    2.16   -0.70    6.27   -0.99   1.03

In [4]:
# Minimum-distance method for selecting trading pairs (take the top 11 pairs whose distance is greater than 1.0)
#            all params    best params    individ params
#   t_1   t_2     profit  train    test   train    test   dist
#   DOT - GALA :  -1.46    3.22   -0.79    6.22   -0.99   1.03
#   IMX - VET  :  11.39   41.19    1.69   41.19    1.69   1.05
#   GRT - VET  :  -0.68    1.61    1.44    4.89    1.66   1.05
#  ARKM - DOT  :   2.86    7.21    0.51    9.28    3.84   1.05
#  CELO - GALA :  -1.20   -1.34  -15.91    1.97  -26.63   1.06
#  GALA - NEAR :   2.18    2.86    1.46   16.13   -0.02   1.06
#  IOTA - SAND :  16.88   15.12    2.53   30.80    1.13   1.06
#   DOT - VET  :   3.44    4.83    1.55   14.66    5.14   1.07
#   IMX - NEAR :  13.44   37.97    6.67   37.97    6.67   1.08
#   SNX - ZK   :  -9.73   -2.85    1.23    2.41  -13.02   1.12
#   DOT - IMX  :  -2.84    2.71   -5.46   16.19   -3.30   1.14

In [5]:
# SNX_USDT is highly volatile, so it should not be in the test sample at all
# KSM_USDT, OGN_USDT have insufficient trading volume


In [24]:
# Candidate pairs: round the statistical columns to two decimals and shorten
# their names for display.
rounded_cols = ['dist', 'std', 'correlation', 'adf_stat_1', 'p_value_1', 'adf_stat_2', 'p_value_2']
test_pairs = (
    pl.read_parquet('./data/curr_pairs.parquet')
    .with_columns(pl.col(rounded_cols).round(2))
    .rename({
        'correlation': 'corr',
        'adf_stat_1': 'adf_1',
        'adf_stat_2': 'adf_2',
        'p_value_1': 'pv_1',
        'p_value_2': 'pv_2',
    })
)

# Each backtest result table gets its own column suffix (_ind / _all / _best)
# and shares the coin1/coin2 key columns used by the joins.
ind_renames = {
    'token_1': 'coin1', 'token_2': 'coin2',
    'profit': 'pr_ind', 'profit_ratio': 'pr_rat_ind',
    'max_loss': 'loss_ind', 'max_drawdown': 'max_drdn_ind',
    'duration_avg': 'dur_ind', 'n_trades': 'trades_ind',
}
ind_params = pl.read_parquet('./data/ind_params_result.parquet').rename(ind_renames)

common_renames = {
    'token_1': 'coin1', 'token_2': 'coin2',
    'avg_profit': 'avg_pr_all', 'profit_std': 'pr_std_all',
    'max_loss': 'loss_all', 'avg_drawdown': 'avg_drdn_all', 'max_drawdown': 'max_drdn_all',
    'avg_duration': 'dur_all', 'avg_pr_ratio': 'pr_rat_all', 'avg_trades': 'trades_all',
}
common_params = pl.read_parquet('./data/common_params_result.parquet').rename(common_renames)

best_renames = {
    'token_1': 'coin1', 'token_2': 'coin2',
    'profit': 'pr_best', 'profit_ratio': 'pr_rat_best',
    'max_loss': 'loss_best', 'max_drawdown': 'max_drdn_best',
    'duration_avg': 'dur_best', 'n_trades': 'trades_best',
}
best_params = pl.read_parquet('./data/best_params_result.parquet').rename(best_renames)

# Disabled: out-of-sample results (the *_test columns, including 'pr_rat_test').
# test_params = pl.read_parquet('./data/test_params_result.parquet')
# test_params = test_params.rename({'token_1': 'coin1', 'token_2': 'coin2', 'profit': 'pr_test', 'profit_ratio': 'pr_rat_test',
#                                      'max_loss': 'loss_test', 'max_drawdown': 'max_drdn_test',
#                                      'duration_avg': 'dur_test', 'n_trades': 'trades_test'})

In [25]:
# Attach the three backtest result tables to the candidate pairs, matching
# on the (coin1, coin2) key.
join_keys = ['coin1', 'coin2']
pairs = (
    test_pairs
    .join(ind_params, on=join_keys)
    .join(common_params, on=join_keys)
    .join(best_params, on=join_keys)
    # .join(test_params, on=join_keys)
)

In [26]:
# pairs.filter(
#     (pl.col('pr_rat_ind') > 1.0) & (pl.col('pr_rat_all') > 1.0) & (pl.col('pr_rat_best') > 1.0)
# ).sort(by='pr_best', descending=True)

In [36]:
# Columns kept for modelling / inspection: pair identifiers, pair statistics,
# and the backtest metrics of the three parameter sets.
model_columns = [
    'coin1', 'coin2',
    'dist', 'std', 'corr', 'pv_1', 'pv_2',
    'pr_ind', 'pr_rat_ind',
    'avg_pr_all', 'pr_rat_all',
    'pr_best', 'pr_rat_best', 'loss_best', 'max_drdn_best', 'dur_best', 'trades_best',
    # 'pr_test', 'pr_rat_test', 'loss_test', 'max_drdn_test', 'dur_test', 'trades_test'
]
df = pairs.select(model_columns)

In [37]:
# Show the (rows, columns) shape of the selected frame as a sanity check.
df.shape

(325, 17)

In [11]:
# Peek at three randomly sampled rows.
df.sample(3)

dist,std,corr,pv_1,pv_2,pr_ind,pr_rat_ind,avg_pr_all,pr_rat_all,pr_best,pr_rat_best,loss_best,max_drdn_best,trades_best
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
1.61,0.12,0.84,0.96,0.46,8.74,1.383,1.41,0.24,5.95,0.942,-2.3,-2.3,8
1.93,0.09,0.88,0.4,0.49,17.66,2.334,-26.63,-0.68,-93.96,-0.97,-94.14,-95.15,5
2.04,0.14,0.79,0.98,0.28,7.58,1.386,2.08,0.36,3.56,0.606,-0.82,-1.57,10


In [None]:
# Split features / target and hold out 20% of the rows for evaluation.
# NOTE(review): 'pr_rat_test' only exists when the commented-out test_params
# load, join and select entries are re-enabled; with the frame as currently
# built this cell cannot find the column. The non-numeric coin1/coin2/dur_best
# columns presumably also have to be dropped before fitting — confirm.
target_col = 'pr_rat_test'
X = df.drop(target_col)
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Shuffled 5-fold splitter for cross-validation.
# StratifiedKFold stratifies on class labels and rejects a continuous target
# such as the profit ratio modelled here, so plain KFold is used instead.
# The variable keeps its original name so existing references still work.
skf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Rule-based baseline: keep held-out pairs whose 'best' and 'all' profit
# ratios both exceed 0.5, shown next to the real target value.
baseline_rule = (pl.col('pr_rat_best') > 0.5) & (pl.col('pr_rat_all') > 0.5)
real_col = pl.Series(y_test).alias('pr_rat_real')
X_test.with_columns(real_col).filter(baseline_rule)  #['pr_rat_real'].sum()

In [None]:
# NOTE(review): bare literal; presumably the pasted result of the commented-out
# ['pr_rat_real'].sum() on the baseline filter in the previous cell — confirm.
4.753

In [None]:
# Ordinary least squares baseline; the cell displays the hold-out RMSE.
lr = LinearRegression().fit(X_train, y_train)
lr_preds = lr.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), lr_preds)

In [None]:
# 0.8532 -> 0.8286 (added the std feature)

In [None]:
# Print one right-aligned "feature: coefficient" line per input feature.
for feature, weight in zip(lr.feature_names_in_, lr.coef_):
    print('{:>15}: {:>7.4f}'.format(feature, weight))

In [None]:
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# Grid-search LassoCV's `eps` with 5-fold CV.
params = {
    'eps': [0.0001, 0.001, 0.01],
}

lasso = LassoCV(random_state=42)
gcv = GridSearchCV(estimator=lasso, param_grid=params, cv=5, n_jobs=12, verbose=0).fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the most recently fitted grid search.
gcv.best_params_

In [None]:
# Refit LassoCV with a fixed eps; the cell displays the hold-out RMSE.
lasso = LassoCV(random_state=42, eps=0.0001).fit(X_train, y_train)
lasso_preds = lasso.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), lasso_preds)

In [None]:
# 0.8211

In [None]:
# Print one right-aligned "feature: coefficient" line per input feature.
for feature, weight in zip(lasso.feature_names_in_, lasso.coef_):
    print('{:>15}: {:>7.4f}'.format(feature, weight))

In [None]:
# Ridge regression with RidgeCV's defaults; the cell displays the hold-out RMSE.
ridge = RidgeCV().fit(X_train, y_train)
ridge_preds = ridge.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), ridge_preds)

In [None]:
# 0.8118 -> 0.8117 (+ std)

In [None]:
# Print one right-aligned "feature: coefficient" line per input feature.
for feature, weight in zip(ridge.feature_names_in_, ridge.coef_):
    print('{:>15}: {:>7.4f}'.format(feature, weight))

In [None]:
# Exhaustive 5-fold grid search over random-forest hyper-parameters.
params = {
    'max_features': [4, 6, 8, 10, 13],
    'min_samples_leaf': [1, 3, 5, 7],
    'max_depth': [5, 6, 8, 10, 12, 15],
    'n_estimators': [40, 60, 80, 100, 125, 150],
}

rfr = RandomForestRegressor(random_state=42)
gcv = GridSearchCV(estimator=rfr, param_grid=params, cv=5, n_jobs=12, verbose=0).fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the most recently fitted grid search.
gcv.best_params_

In [None]:
# Random forest with fixed hyper-parameters; the cell displays the hold-out RMSE.
rf_settings = dict(n_estimators=80, max_depth=12, max_features=6, min_samples_leaf=5)
rf = RandomForestRegressor(random_state=42, **rf_settings).fit(X_train, y_train)
rf_preds = rf.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), rf_preds)

In [None]:
# 0.7634 -> 0.7750 (+std)

In [None]:
# Print one right-aligned "feature: importance" line per input feature.
for feature, importance in zip(rf.feature_names_in_, rf.feature_importances_):
    print('{:>15}: {:>7.4f}'.format(feature, importance))

In [None]:
# Held-out pairs the random forest scores above 0.4, with prediction and
# real target side by side.
pred_col = pl.Series(rf_preds).alias('pr_rat_pred')
real_col = pl.Series(y_test).alias('pr_rat_real')
X_test.with_columns(pred_col, real_col).filter(pl.col('pr_rat_pred') > 0.4)  #['pr_rat_real'].sum()

In [None]:
# 4.325

In [None]:
# Exhaustive 5-fold grid search over gradient-boosting hyper-parameters.
params = {
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2],
    'max_features': [4, 6, 8, 10, 13],
    'min_samples_leaf': [1, 3, 5, 7],
    'max_depth': [5, 6, 8, 10, 12, 15],
    'n_estimators': [40, 60, 80, 100, 125, 150],
}

gbr = GradientBoostingRegressor(random_state=42)
gcv = GridSearchCV(estimator=gbr, param_grid=params, cv=5, n_jobs=12, verbose=0).fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the most recently fitted grid search.
gcv.best_params_

In [None]:
# Gradient boosting with fixed hyper-parameters; the cell displays the hold-out RMSE.
gbr_settings = dict(
    learning_rate=0.01,
    n_estimators=80,
    max_depth=5,
    max_features=4,
    min_samples_leaf=7,
)
gbr = GradientBoostingRegressor(random_state=42, **gbr_settings).fit(X_train, y_train)
gbr_preds = gbr.predict(X_test)
root_mean_squared_error(y_test.to_numpy(), gbr_preds)

In [None]:
# 0.8004 -> 0.8014

In [12]:
import catboost as cb

In [None]:
# Exhaustive 5-fold grid search over CatBoost hyper-parameters, fed plain
# numpy arrays.
params = {
    'learning_rate': [0.01, 0.03, 0.1, 0.3],
    'iterations': [250, 500, 1000],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7],
}
cbr = cb.CatBoostRegressor(random_state=42, verbose=False)
gcv = GridSearchCV(estimator=cbr, param_grid=params, cv=5, verbose=0).fit(
    X_train.to_numpy(), y_train.to_numpy()
)

In [None]:
# Display the parameter combination selected by the most recently fitted grid search.
gcv.best_params_

In [None]:
# Wrap the training features and target in a CatBoost Pool.
train_features = X_train.to_numpy()
train_target = y_train.to_numpy()
train_pool = cb.Pool(data=train_features, label=train_target)

In [None]:
# Randomised search space for the CatBoost hyper-parameters.
# The original cell used `tune.randint` / `tune.loguniform`, but `tune` is never
# imported in this notebook, so it raised NameError. The same ranges are
# expressed here with scipy.stats frozen distributions, which expose `.rvs()`.
# NOTE(review): presumably meant for a randomised search (e.g. CatBoost's
# `randomized_search` or sklearn's `RandomizedSearchCV`) — confirm the consumer.
param_grid = {
    'iterations': randint(100, 1500),        # integers in [100, 1500)
    'learning_rate': loguniform(1e-3, 0.5),  # log-uniform on [1e-3, 0.5]
    'depth': randint(4, 12),                 # integers in [4, 12)
    'l2_leaf_reg': loguniform(1, 10),        # log-uniform on [1, 10]
}

In [None]:
# Display the installed CatBoost version.
cb.__version__

In [None]:
from catboost.utils import grid_search

In [None]:
# CatBoost regressor trained with an expectile loss (alpha=0.7); the cell
# displays the hold-out RMSE.
cbr_settings = dict(
    random_state=42,
    verbose=False,
    learning_rate=0.01,
    iterations=800,
    depth=8,
    loss_function='Expectile:alpha=0.7',
)
cbr = cb.CatBoostRegressor(**cbr_settings)
cbr.fit(X_train.to_numpy(), y_train.to_numpy());
cbr_preds = cbr.predict(X_test.to_numpy())
root_mean_squared_error(y_test.to_numpy(), cbr_preds)

In [None]:
# Persist the fitted CatBoost model as JSON; a later cell reloads it from this path.
cbr.save_model('./data/catboost_model.json', format='json')

In [None]:
# 0.7172

In [None]:
# Print one right-aligned "feature: importance" line per column of X_test.
for feature, importance in zip(X_test.columns, cbr.feature_importances_):
    print('{:>15}: {:>7.4f}'.format(feature, importance))

In [None]:
# Sum of the real target over held-out pairs that CatBoost scores above 0.4.
pred_col = pl.Series(cbr_preds).alias('pr_rat_pred')
real_col = pl.Series(y_test).alias('pr_rat_real')
cbr_selected = X_test.with_columns(pred_col, real_col).filter(pl.col('pr_rat_pred') > 0.4)
cbr_selected.get_column('pr_rat_real').sum()

In [None]:
# RMSE: 5.73
# Quantile, alpha=0.7 : 9.45,  RMSE: 0.7362
# Quantile, alpha=0.65: 9.07,  RMSE: 0.7172
# Quantile, alpha=0.6 : 10.04, RMSE: 0.7522
# Quantile, alpha=0.55: 6.65,  RMSE: 0.7392
# Quantile, alpha=0.5 : 5.06, RMSE: 0.7650
# Quantile, alpha=0.4 : 3.56, RMSE: 0.7893
# Quantile, alpha=0.3 : 3.56,  RMSE: 0.8197

# Expectile, alpha=0.3 : 6.25, RMSE: 0.7571
# Expectile, alpha=0.4 : 8.97, RMSE: 0.7603
# Expectile, alpha=0.5 : 9.60, RMSE: 0.7435
# Expectile, alpha=0.6 : 8.97, RMSE: 0.7535
# Expectile, alpha=0.65: 9.60, RMSE: 0.7323
# Expectile, alpha=0.7 : 9.60, RMSE: 0.7214
# Expectile, alpha=0.75: 10.04, RMSE: 0.7260
# Expectile, alpha=0.8 : 9.87, RMSE: 0.7344

In [None]:
# Held-out pairs that CatBoost scores above 0.4, with prediction and real
# target side by side.
pred_col = pl.Series(cbr_preds).alias('pr_rat_pred')
real_col = pl.Series(y_test).alias('pr_rat_real')
X_test.with_columns(pred_col, real_col).filter(pl.col('pr_rat_pred') > 0.4)

In [30]:
# Rebuild the regressor from the JSON file written by the save_model cell,
# so inference does not require retraining.
model = cb.CatBoostRegressor()
model.load_model('./data/catboost_model.json', format='json')

<catboost.core.CatBoostRegressor at 0x2935adea5d0>

In [33]:
# Peek at two randomly sampled rows before the non-feature columns are dropped.
df.sample(2)

coin1,coin2,dist,std,corr,pv_1,pv_2,pr_ind,pr_rat_ind,avg_pr_all,pr_rat_all,pr_best,pr_rat_best,loss_best,max_drdn_best,dur_best,trades_best
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,duration[μs],i64
"""1INCH""","""MANTA""",2.95,0.21,0.51,0.39,0.89,6.88,1.074,-0.83,0.03,3.03,0.397,-4.37,-4.37,1d 11h 49m 28s,5
"""GRT""","""SAND""",1.02,0.06,0.96,0.53,0.06,7.09,1.355,-0.62,-0.06,2.69,0.43,-2.1,-2.24,15h 54m 4s,9


In [34]:
# Strip the identifier and duration columns so only numeric model inputs
# remain, then convert to a numpy matrix for CatBoost.
# NOTE: `df` is rebound here, so re-running this cell without re-creating `df`
# fails because the columns are already gone.
non_feature_cols = [
    'coin1', 'coin2', 'dur_best',
    # 'dur_test', 'trades_test', 'loss_test', 'max_drdn_test', 'pr_test'
]
df = df.drop(non_feature_cols)

X = df.to_numpy()

In [35]:
# Score every row of X with the loaded model; preds has one value per row of X.
preds = model.predict(X)

In [56]:
# Re-select the display columns from `pairs` (the earlier `df` had its
# identifier columns dropped for inference).
report_columns = [
    'coin1', 'coin2',
    'dist', 'std', 'corr', 'pv_1', 'pv_2',
    'pr_ind', 'pr_rat_ind',
    'avg_pr_all', 'pr_rat_all',
    'pr_best', 'pr_rat_best', 'loss_best', 'max_drdn_best', 'dur_best',
    # 'pr_test', 'pr_rat_test', 'loss_test', 'max_drdn_test', 'dur_test', 'trades_test'
]
df = pairs.select(report_columns)

# A pair is tradable when all three profit ratios exceed 0.4; the model
# prediction is attached for reference only (its filter is disabled).
# `preds` was computed from the same `pairs` row order, so it lines up.
profitable_everywhere = (
    (pl.col('pr_rat_ind') > 0.4)
    & (pl.col('pr_rat_all') > 0.4)
    & (pl.col('pr_rat_best') > 0.4)
    # & (pl.col('pred') > 0.4)
)
pred_col = pl.Series(preds).alias('pred')
trade_pairs_df = (
    df.with_columns(pred_col)
    .filter(profitable_everywhere)
    .sort('pr_rat_all', descending=True)
)


In [61]:
# Display the filtered candidates, sorted by pr_rat_all in descending order.
trade_pairs_df

coin1,coin2,dist,std,corr,pv_1,pv_2,pr_ind,pr_rat_ind,avg_pr_all,pr_rat_all,pr_best,pr_rat_best,loss_best,max_drdn_best,dur_best,pred
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,duration[μs],f64
"""CELO""","""IMX""",2.84,0.21,0.51,0.38,0.62,26.48,4.778,9.48,1.52,26.48,4.778,-0.82,-0.82,10h 7s,0.168972
"""IMX""","""SEI""",3.83,0.28,0.32,0.53,0.73,18.96,3.623,7.18,1.33,13.77,2.554,-0.62,-0.62,10h 32m,0.204198
"""GALA""","""GMT""",1.33,0.1,0.88,0.12,0.37,13.27,2.584,6.68,1.28,10.84,2.013,-0.59,-0.59,12h 23m 30s,-0.017204
"""OP""","""TIA""",3.12,0.17,0.71,0.48,0.67,15.28,3.041,6.19,1.22,11.96,2.376,0.0,0.0,7h 37m 5s,-0.001512
"""IMX""","""LDO""",2.77,0.21,0.6,0.09,0.53,17.88,3.537,6.24,1.2,14.04,2.777,0.0,0.0,15h 9m 20s,0.334338
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""CVX""","""ONDO""",2.79,0.12,0.88,0.35,0.08,10.81,1.806,2.45,0.45,8.53,1.383,-1.99,-1.99,16h 36m 34s,0.311461
"""LDO""","""RENDER""",3.59,0.27,0.28,0.36,0.92,8.02,1.459,2.2,0.44,6.15,1.066,-1.27,-1.27,1d 32m 31s,0.17059
"""SAND""","""SUSHI""",1.74,0.11,0.85,0.42,0.28,9.02,1.721,2.21,0.43,7.68,1.412,-0.67,-0.67,15h 36m 17s,0.587028
"""BLUR""","""GRT""",0.77,0.05,0.96,0.0,0.0,9.67,1.755,2.12,0.42,6.5,1.14,-0.6,-1.2,9h 8m 41s,0.114824


In [62]:
# Greedy selection: walk the candidates in their sorted order and keep a pair
# only if neither of its coins already belongs to an accepted pair.
used_tokens = []
trade_pairs_list = []

for coin_a, coin_b in trade_pairs_df.select('coin1', 'coin2').iter_rows():
    both_free = coin_a not in used_tokens and coin_b not in used_tokens
    if both_free:
        trade_pairs_list.append((coin_a, coin_b))
        used_tokens.extend((coin_a, coin_b))

In [63]:
# Display the coins that belong to an accepted pair, in acceptance order.
used_tokens

['CELO',
 'IMX',
 'GALA',
 'GMT',
 'OP',
 'TIA',
 'ARKM',
 'MANTA',
 'IOTA',
 'MOVE',
 'CVX',
 'SUSHI',
 'CRV',
 'SUI',
 'CHZ',
 'KAS',
 'LDO',
 'SAND',
 'DOT',
 'NEAR',
 'GRT',
 'SEI',
 'BLUR',
 'VET',
 'MORPHO',
 'STX',
 'RENDER',
 'XRP',
 'ONDO',
 'POL']

In [68]:
# Display the accepted (coin1, coin2) pairs.
trade_pairs_list

[('CELO', 'IMX'),
 ('GALA', 'GMT'),
 ('OP', 'TIA'),
 ('ARKM', 'MANTA'),
 ('IOTA', 'MOVE'),
 ('CVX', 'SUSHI'),
 ('CRV', 'SUI'),
 ('CHZ', 'KAS'),
 ('LDO', 'SAND'),
 ('DOT', 'NEAR'),
 ('GRT', 'SEI'),
 ('BLUR', 'VET'),
 ('MORPHO', 'STX'),
 ('RENDER', 'XRP'),
 ('ONDO', 'POL')]

In [65]:
from jaref_bot.db.postgres_manager import DBManager
from jaref_bot.config.credentials import host, user, password, db_name

# Connection settings come from the project's credentials module; nothing is
# hard-coded in the notebook.
db_params = dict(host=host, user=user, password=password, dbname=db_name)
postgre_manager = DBManager(db_params)

In [66]:
# Fetch the 'pairs' table through the project's DBManager as a polars frame
# and display it. NOTE(review): presumably the currently open positions — confirm.
current_pairs = postgre_manager.get_table('pairs', df_type='polars')
current_pairs

token_1,token_2,side,qty_1,qty_2,status
null,null,null,null,null,null


In [67]:
# Make sure every pair already present in the database stays in the trade list.
# Stored symbols appear to carry a quote suffix (e.g. 'SNX_USDT'); the original
# code removed it with a blind [:-5] slice, which corrupts any symbol that lacks
# the suffix. removesuffix() strips it only when present.
# NOTE(review): assumes '_USDT' is the only quote suffix in the table — confirm.
QUOTE_SUFFIX = '_USDT'

for row in current_pairs.iter_rows(named=True):
    t1 = row['token_1'].removesuffix(QUOTE_SUFFIX)
    t2 = row['token_2'].removesuffix(QUOTE_SUFFIX)

    if (t1, t2) not in trade_pairs_list:
        trade_pairs_list.append((t1, t2))
        # Echo each pair that was added from the database.
        print((t1, t2))

In [70]:
# Write the final pair list to the bot's config file, one "COIN1 COIN2" line
# per pair.
with open('./jaref_bot/config/token_pairs.txt', 'w') as out:
    out.writelines(f"{first} {second}\n" for first, second in trade_pairs_list)