In [None]:
from modules.data_services.data_loaders import load_data
from modules.data_services.data_utils import merge_by_pair
from modules.pair_selection.statistical_tests import ssd_cumulative_returns, pearson_correlation, engle_granger_cointegration, johansen_cointegration

In [None]:
# Run configuration shared by every cell below: the date window passed to the
# loader, the bar interval, and the symbols screened for candidate pairs.
start, end = "2024-01-01", "2024-03-01"
interval = "1h"
tickers = [
    "BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT", "XRPUSDT",
    "ADAUSDT", "AVAXUSDT", "DOGEUSDT", "TRXUSDT", "DOTUSDT",
    "LINKUSDT", "SHIBUSDT", "LTCUSDT", "BCHUSDT", "UNIUSDT",
]

In [3]:
# 0. Load data
# Fetch a single table covering every symbol in `tickers` between `start` and
# `end` at the configured `interval`. The printed head shows one column per
# ticker, indexed by `open_time`. All later steps read from this `df`.
df = load_data(tickers=tickers, start=start, end=end, interval=interval)
print(df.head())

                      BTCUSDT  ETHUSDT  BNBUSDT  SOLUSDT  XRPUSDT  ADAUSDT  \
open_time                                                                    
2024-01-01 00:00:00  42475.23  2295.51    314.4   101.96   0.6162   0.5979   
2024-01-01 01:00:00  42613.56  2303.72    315.3   104.12   0.6185   0.6023   
2024-01-01 02:00:00  42581.10  2293.02    310.9   103.69   0.6154   0.5995   
2024-01-01 03:00:00  42330.49  2273.81    309.2   103.07   0.6130   0.5956   
2024-01-01 04:00:00  42399.99  2279.55    309.2   102.62   0.6116   0.5953   

                     AVAXUSDT  DOGEUSDT  TRXUSDT  DOTUSDT  LINKUSDT  SHIBUSDT  \
open_time                                                                       
2024-01-01 00:00:00     38.94   0.08983  0.10832    8.267    15.122   0.00001   
2024-01-01 01:00:00     39.39   0.09016  0.10820    8.294    15.167   0.00001   
2024-01-01 02:00:00     39.04   0.08980  0.10781    8.253    15.065   0.00001   
2024-01-01 03:00:00     38.51   0.08921  0.10711

In [4]:
# 1. Calculate SSD of cumulative returns
# The result holds one row per ticker pair with its `ssd` score; the printed
# head lists the smallest scores first.
# NOTE(review): SSD presumably stands for "sum of squared differences" between
# the two cumulative-return series -- confirm in modules.pair_selection.statistical_tests.

ssd_df = ssd_cumulative_returns(df)
print(ssd_df.head())

                pair       ssd
0  DOGEUSDT-SHIBUSDT  1.316605
1    XRPUSDT-DOTUSDT  2.393193
2    ADAUSDT-DOTUSDT  2.591386
3   SHIBUSDT-LTCUSDT  3.702435
4   ADAUSDT-AVAXUSDT  3.805218


In [5]:
# 2. Calculate Pearson correlation of prices for each pair
# source="prices" selects the price series as input. The result is a long
# table (one row per pair, column `corr_prices`), not a square matrix; the
# printed head shows the highest correlations first.

corr_prices_df = pearson_correlation(df, source="prices")
print(corr_prices_df.head())

                pair  corr_prices
0    ETHUSDT-BNBUSDT     0.939692
1  DOGEUSDT-SHIBUSDT     0.935798
2   SOLUSDT-AVAXUSDT     0.926325
3    ADAUSDT-DOTUSDT     0.917982
4    BTCUSDT-ETHUSDT     0.909241


In [6]:
# 3. Calculate returns correlation
# Same helper as step 2, switched to source="returns"; the result has one row
# per pair with a `corr_returns` column.
# NOTE(review): presumably simple (percentage) returns, as opposed to the log
# returns used in step 4 -- confirm in pearson_correlation.

corr_returns_df = pearson_correlation(df, source="returns")
print(corr_returns_df.head())

                pair  corr_returns
0    ADAUSDT-DOTUSDT      0.824841
1   SOLUSDT-AVAXUSDT      0.820279
2  DOGEUSDT-SHIBUSDT      0.812545
3   AVAXUSDT-DOTUSDT      0.784385
4   ADAUSDT-AVAXUSDT      0.762790


In [7]:
# 4. Calculate log-returns correlation
# Same helper again with source="log_returns"; the result has one row per pair
# with a `corr_log_returns` column. This is the correlation variant that the
# merge in step 7 carries into the combined table.

corr_log_returns_df = pearson_correlation(df, source="log_returns")
print(corr_log_returns_df.head())

                pair  corr_log_returns
0    ADAUSDT-DOTUSDT          0.826138
1   SOLUSDT-AVAXUSDT          0.821408
2  DOGEUSDT-SHIBUSDT          0.818520
3   AVAXUSDT-DOTUSDT          0.785760
4   ADAUSDT-AVAXUSDT          0.764557


In [8]:
# 5. Perform Engle-Granger cointegration test
# The result has one row per pair with `eg_p_value` and `adf_stat`; the printed
# head shows the lowest p-values first. `eg_p_value` is the column the merged
# table in step 7 is sorted by.

eg_df = engle_granger_cointegration(df)
print(eg_df.head())

              pair  eg_p_value  adf_stat
0  ETHUSDT-BCHUSDT    0.002988 -4.256718
1  BTCUSDT-BCHUSDT    0.004171 -4.161398
2  BNBUSDT-UNIUSDT    0.007593 -3.982967
3  BTCUSDT-ADAUSDT    0.013955 -3.790522
4  BTCUSDT-SOLUSDT    0.015325 -3.759751


In [9]:
# 6. Perform Johansen cointegration test
# The result has one row per pair with `trace_stat`, the `crit_95` / `crit_99`
# critical values, and the two margins `trace_stat - crit_95` and
# `trace_stat - crit_99`. A positive margin means the trace statistic exceeds
# that critical value.
# NOTE(review): which cointegration-rank hypothesis the trace statistic refers
# to is not visible here -- confirm in johansen_cointegration.

johansen_df = johansen_cointegration(df)
print(johansen_df.head())

                pair  trace_stat  crit_95  crit_99  trace_stat - crit_95  \
0    BTCUSDT-BCHUSDT   20.874688  15.4943  19.9349              5.380388   
1    ETHUSDT-BCHUSDT   18.198021  15.4943  19.9349              2.703721   
2  DOGEUSDT-SHIBUSDT   17.415646  15.4943  19.9349              1.921346   
3    BTCUSDT-ADAUSDT   17.156938  15.4943  19.9349              1.662638   
4    BNBUSDT-UNIUSDT   16.792436  15.4943  19.9349              1.298136   

   trace_stat - crit_99  
0              0.939788  
1             -1.736879  
2             -2.519254  
3             -2.777962  
4             -3.142464  


In [10]:
# 7. Merge dataframes
# Combine the per-pair metrics from steps 1, 4, 5 and 6 into one table, then
# rank pairs by Engle-Granger p-value (smallest first) with a fresh 0..n-1 index.
# NOTE(review): `keep_cols` appears to be positional -- the i-th list names the
# columns retained from the i-th frame in `dfs`; confirm in merge_by_pair.
merged_df = (
    merge_by_pair(
        dfs=[ssd_df, corr_log_returns_df, eg_df, johansen_df],
        keep_cols=[
            ['ssd'],
            ['corr_log_returns'],
            ['eg_p_value'],
            ['trace_stat - crit_99'],
        ],
    )
    .sort_values(by='eg_p_value', ascending=True)
    .reset_index(drop=True)
)
print(merged_df.head())

              pair        ssd  corr_log_returns  eg_p_value  \
0  ETHUSDT-BCHUSDT  35.760886          0.589011    0.002988   
1  BTCUSDT-BCHUSDT  24.048188          0.633006    0.004171   
2  BNBUSDT-UNIUSDT  30.106199          0.356806    0.007593   
3  BTCUSDT-ADAUSDT  49.567267          0.711226    0.013955   
4  BTCUSDT-SOLUSDT  19.336679          0.674951    0.015325   

   trace_stat - crit_99  
0             -1.736879  
1              0.939788  
2             -3.142464  
3             -2.777962  
4             -6.139169  
