In [1]:
# Install the necessary packages
!pip install -r ../requirements.txt




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# --- Run configuration -------------------------------------------------------
# These four names are read by the data-loading cell and passed to `load_data`;
# `tickers` is also handed to the cointegration tests further down.

# Date range of the sample, as ISO date strings.
start = "2024-01-01"
end = "2024-03-01"

# Sampling interval requested from the data pipeline.
interval = "1h"

# Symbols under study, one per line so additions/removals diff cleanly.
tickers = [
    "ADAUSDT",
    "AVAXUSDT",
    "BTCUSDT",
    "DOGEUSDT",
    "DOTUSDT",
    "ETHUSDT",
    "LINKUSDT",
    "LTCUSDT",
    "SOLUSDT",
    "XRPUSDT",
]

In [2]:
from src.data_services.data_pipeline import load_data

# Load the data for the configured tickers, date range and interval.
# As the printed head shows, the result has one column per ticker and is
# indexed by `open_time`.
df = load_data(tickers=tickers, start=start, end=end, interval=interval)
print(df.head())

                     ADAUSDT  AVAXUSDT   BTCUSDT  DOGEUSDT  DOTUSDT  ETHUSDT  \
open_time                                                                      
2024-01-01 00:00:00   0.5979     38.94  42475.23   0.08983    8.267  2295.51   
2024-01-01 01:00:00   0.6023     39.39  42613.56   0.09016    8.294  2303.72   
2024-01-01 02:00:00   0.5995     39.04  42581.10   0.08980    8.253  2293.02   
2024-01-01 03:00:00   0.5956     38.51  42330.49   0.08921    8.148  2273.81   
2024-01-01 04:00:00   0.5953     38.21  42399.99   0.08910    8.140  2279.55   

                     LINKUSDT  LTCUSDT  SOLUSDT  XRPUSDT  
open_time                                                 
2024-01-01 00:00:00    15.122    73.06   101.96   0.6162  
2024-01-01 01:00:00    15.167    73.34   104.12   0.6185  
2024-01-01 02:00:00    15.065    73.16   103.69   0.6154  
2024-01-01 03:00:00    14.931    72.67   103.07   0.6130  
2024-01-01 04:00:00    14.920    72.53   102.62   0.6116  


In [3]:
# 1. Pearson correlation of the raw price series, per pair

from src.pair_selection.corr_coint_tests import pearson_correlation

# `pearson_correlation` yields one row per pair with a `corr` column; rank the
# pairs from most to least correlated and give the column a name that will stay
# unambiguous once several metrics are merged into one table.
corr_prices_df = (
    pearson_correlation(df)
    .sort_values('corr', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'corr': 'corr_prices'})
)
print(corr_prices_df.head())

               pair  corr_prices
0  AVAXUSDT-SOLUSDT     0.926325
1   ADAUSDT-DOTUSDT     0.917982
2   BTCUSDT-ETHUSDT     0.909241
3  ADAUSDT-AVAXUSDT     0.844651
4   ADAUSDT-BTCUSDT     0.842703


In [8]:
# 2. Pearson correlation of simple returns, per pair

# Simple return r_t = p_t / p_{t-1} - 1. The shift leaves the first row without
# a predecessor (NaN); `dropna` removes it along with any other row that
# contains a missing value.
returns = df.div(df.shift(periods=1)).sub(1).dropna()

# Same ranking/renaming treatment as the price-level correlation table.
corr_returns_df = (
    pearson_correlation(returns)
    .sort_values('corr', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'corr': 'corr_returns'})
)
print(corr_returns_df.head())

               pair  corr_returns
0   ADAUSDT-DOTUSDT      0.824841
1  AVAXUSDT-SOLUSDT      0.820279
2  AVAXUSDT-DOTUSDT      0.784385
3  ADAUSDT-AVAXUSDT      0.762790
4   ADAUSDT-ETHUSDT      0.752311


In [9]:
# 3. Pearson correlation of log-returns, per pair
import numpy as np

# Log-return l_t = ln(p_t / p_{t-1}). As with simple returns, the first row has
# no predecessor and is discarded by `dropna` together with any row holding a
# missing value.
log_returns = np.log(df.div(df.shift(periods=1))).dropna()

# Rank pairs by correlation (highest first) and label the metric column.
corr_log_returns_df = (
    pearson_correlation(log_returns)
    .sort_values('corr', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'corr': 'corr_log_returns'})
)
print(corr_log_returns_df.head())

               pair  corr_log_returns
0   ADAUSDT-DOTUSDT          0.826138
1  AVAXUSDT-SOLUSDT          0.821408
2  AVAXUSDT-DOTUSDT          0.785760
3  ADAUSDT-AVAXUSDT          0.764557
4   ADAUSDT-ETHUSDT          0.753668


In [10]:
# 4. Engle-Granger cointegration test on the price series

from src.pair_selection.corr_coint_tests import engle_granger_cointegration

# The result carries `pair`, `score` and `p_value` columns (see printed head).
# Sort ascending by p-value so the pairs with the strongest evidence of
# cointegration come first.
eg_df = (
    engle_granger_cointegration(df, tickers)
    .sort_values('p_value', ascending=True)
    .reset_index(drop=True)
)
print(eg_df.head())

               pair     score   p_value
0   ADAUSDT-BTCUSDT -3.790522  0.013955
1   BTCUSDT-SOLUSDT -3.258414  0.060690
2   ADAUSDT-ETHUSDT -3.231670  0.064789
3   DOTUSDT-ETHUSDT -3.041725  0.100618
4  DOGEUSDT-ETHUSDT -2.992856  0.111914


In [11]:
# 5. Johansen cointegration test on the price series

from src.pair_selection.corr_coint_tests import johansen_cointegration

# The result carries `pair`, `trace_stat`, `crit_95` and `p_est` columns (see
# printed head). Sort ascending by `p_est` so the smallest values come first.
johansen_df = (
    johansen_cointegration(df, tickers)
    .sort_values('p_est', ascending=True)
    .reset_index(drop=True)
)
print(johansen_df.head())

               pair  trace_stat  crit_95     p_est
0   ADAUSDT-BTCUSDT   17.156938  15.4943  0.010000
1   DOTUSDT-XRPUSDT   14.918801  15.4943  0.037143
2  DOGEUSDT-LTCUSDT   14.389188  15.4943  0.071324
3   ETHUSDT-LTCUSDT   14.262345  15.4943  0.079510
4   BTCUSDT-LTCUSDT   14.218298  15.4943  0.082353


In [13]:
# 6. Combine every per-pair metric into a single table

from src.data_services.data_pipeline import merge_by_pair

# `dfs` and `keep_cols` are parallel lists: the i-th entry of `keep_cols` names
# the columns to take from the i-th frame. `p_value` comes from the
# Engle-Granger table and `p_est` from the Johansen table.
merged_df = (
    merge_by_pair(
        dfs=[corr_prices_df, corr_returns_df, corr_log_returns_df, eg_df, johansen_df],
        keep_cols=[['corr_prices'], ['corr_returns'], ['corr_log_returns'], ['p_value'], ['p_est']],
    )
    # Order by the Engle-Granger p-value, smallest first.
    .sort_values('p_value', ascending=True)
    .reset_index(drop=True)
)
print(merged_df.head())

               pair  corr_prices  corr_returns  corr_log_returns   p_value  \
0   ADAUSDT-BTCUSDT     0.842703      0.710358          0.711226  0.013955   
1   BTCUSDT-SOLUSDT     0.834386      0.672675          0.674951  0.060690   
2   ADAUSDT-ETHUSDT     0.804337      0.752311          0.753668  0.064789   
3   DOTUSDT-ETHUSDT     0.648552      0.738975          0.740521  0.100618   
4  DOGEUSDT-ETHUSDT     0.683948      0.565895          0.577014  0.111914   

      p_est  
0  0.010000  
1  0.109625  
2  0.243973  
3  0.093213  
4  0.259850  


In [17]:
# Composite ranking score: the returns correlation scaled by (1 - Engle-Granger
# p-value), so a pair scores high only when it is both strongly correlated in
# returns and has a low cointegration p-value.
score_col = 'corr_returns x (1 - p_value)'
merged_df[score_col] = merged_df['corr_returns'].mul(1 - merged_df['p_value'])

# Show the best-scoring pairs first; the original row labels are kept, so the
# index shows each pair's position in the p-value ordering.
print(merged_df.sort_values(score_col, ascending=False).head())

               pair  corr_prices  corr_returns  corr_log_returns   p_value  \
2   ADAUSDT-ETHUSDT     0.804337      0.752311          0.753668  0.064789   
0   ADAUSDT-BTCUSDT     0.842703      0.710358          0.711226  0.013955   
3   DOTUSDT-ETHUSDT     0.648552      0.738975          0.740521  0.100618   
1   BTCUSDT-SOLUSDT     0.834386      0.672675          0.674951  0.060690   
5  AVAXUSDT-ETHUSDT     0.580218      0.706856          0.709313  0.116415   

      p_est  corr_returns x (1 - p_value)  
2  0.243973                      0.703569  
0  0.010000                      0.700445  
3  0.093213                      0.664622  
1  0.109625                      0.631850  
5  0.191566                      0.624567  
